Diffstat (limited to 'src/rocksdb/db')
-rw-r--r--  src/rocksdb/db/arena_wrapped_db_iter.cc | 160
-rw-r--r--  src/rocksdb/db/arena_wrapped_db_iter.h | 127
-rw-r--r--  src/rocksdb/db/blob/blob_constants.h | 16
-rw-r--r--  src/rocksdb/db/blob/blob_contents.cc | 90
-rw-r--r--  src/rocksdb/db/blob/blob_contents.h | 56
-rw-r--r--  src/rocksdb/db/blob/blob_counting_iterator.h | 146
-rw-r--r--  src/rocksdb/db/blob/blob_counting_iterator_test.cc | 327
-rw-r--r--  src/rocksdb/db/blob/blob_fetcher.cc | 34
-rw-r--r--  src/rocksdb/db/blob/blob_fetcher.h | 37
-rw-r--r--  src/rocksdb/db/blob/blob_file_addition.cc | 156
-rw-r--r--  src/rocksdb/db/blob/blob_file_addition.h | 67
-rw-r--r--  src/rocksdb/db/blob/blob_file_addition_test.cc | 211
-rw-r--r--  src/rocksdb/db/blob/blob_file_builder.cc | 446
-rw-r--r--  src/rocksdb/db/blob/blob_file_builder.h | 112
-rw-r--r--  src/rocksdb/db/blob/blob_file_builder_test.cc | 680
-rw-r--r--  src/rocksdb/db/blob/blob_file_cache.cc | 102
-rw-r--r--  src/rocksdb/db/blob/blob_file_cache.h | 52
-rw-r--r--  src/rocksdb/db/blob/blob_file_cache_test.cc | 269
-rw-r--r--  src/rocksdb/db/blob/blob_file_completion_callback.h | 101
-rw-r--r--  src/rocksdb/db/blob/blob_file_garbage.cc | 134
-rw-r--r--  src/rocksdb/db/blob/blob_file_garbage.h | 57
-rw-r--r--  src/rocksdb/db/blob/blob_file_garbage_test.cc | 174
-rw-r--r--  src/rocksdb/db/blob/blob_file_meta.cc | 62
-rw-r--r--  src/rocksdb/db/blob/blob_file_meta.h | 170
-rw-r--r--  src/rocksdb/db/blob/blob_file_reader.cc | 610
-rw-r--r--  src/rocksdb/db/blob/blob_file_reader.h | 108
-rw-r--r--  src/rocksdb/db/blob/blob_file_reader_test.cc | 1024
-rw-r--r--  src/rocksdb/db/blob/blob_garbage_meter.cc | 100
-rw-r--r--  src/rocksdb/db/blob/blob_garbage_meter.h | 102
-rw-r--r--  src/rocksdb/db/blob/blob_garbage_meter_test.cc | 197
-rw-r--r--  src/rocksdb/db/blob/blob_index.h | 187
-rw-r--r--  src/rocksdb/db/blob/blob_log_format.cc | 143
-rw-r--r--  src/rocksdb/db/blob/blob_log_format.h | 164
-rw-r--r--  src/rocksdb/db/blob/blob_log_sequential_reader.cc | 134
-rw-r--r--  src/rocksdb/db/blob/blob_log_sequential_reader.h | 83
-rw-r--r--  src/rocksdb/db/blob/blob_log_writer.cc | 178
-rw-r--r--  src/rocksdb/db/blob/blob_log_writer.h | 83
-rw-r--r--  src/rocksdb/db/blob/blob_read_request.h | 58
-rw-r--r--  src/rocksdb/db/blob/blob_source.cc | 488
-rw-r--r--  src/rocksdb/db/blob/blob_source.h | 153
-rw-r--r--  src/rocksdb/db/blob/blob_source_test.cc | 1624
-rw-r--r--  src/rocksdb/db/blob/db_blob_basic_test.cc | 1789
-rw-r--r--  src/rocksdb/db/blob/db_blob_compaction_test.cc | 913
-rw-r--r--  src/rocksdb/db/blob/db_blob_corruption_test.cc | 82
-rw-r--r--  src/rocksdb/db/blob/db_blob_index_test.cc | 602
-rw-r--r--  src/rocksdb/db/blob/prefetch_buffer_collection.cc | 21
-rw-r--r--  src/rocksdb/db/blob/prefetch_buffer_collection.h | 38
-rw-r--r--  src/rocksdb/db/builder.cc | 434
-rw-r--r--  src/rocksdb/db/builder.h | 77
-rw-r--r--  src/rocksdb/db/c.cc | 6390
-rw-r--r--  src/rocksdb/db/c_test.c | 3476
-rw-r--r--  src/rocksdb/db/column_family.cc | 1683
-rw-r--r--  src/rocksdb/db/column_family.h | 845
-rw-r--r--  src/rocksdb/db/column_family_test.cc | 3453
-rw-r--r--  src/rocksdb/db/compact_files_test.cc | 502
-rw-r--r--  src/rocksdb/db/compaction/clipping_iterator.h | 276
-rw-r--r--  src/rocksdb/db/compaction/clipping_iterator_test.cc | 259
-rw-r--r--  src/rocksdb/db/compaction/compaction.cc | 855
-rw-r--r--  src/rocksdb/db/compaction/compaction.h | 559
-rw-r--r--  src/rocksdb/db/compaction/compaction_iteration_stats.h | 49
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.cc | 1338
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator.h | 513
-rw-r--r--  src/rocksdb/db/compaction/compaction_iterator_test.cc | 1618
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.cc | 2060
-rw-r--r--  src/rocksdb/db/compaction/compaction_job.h | 500
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_stats_test.cc | 975
-rw-r--r--  src/rocksdb/db/compaction/compaction_job_test.cc | 2451
-rw-r--r--  src/rocksdb/db/compaction/compaction_outputs.cc | 646
-rw-r--r--  src/rocksdb/db/compaction/compaction_outputs.h | 385
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.cc | 1234
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker.h | 323
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.cc | 433
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_fifo.h | 63
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.cc | 841
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_level.h | 33
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_test.cc | 3964
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.cc | 1450
-rw-r--r--  src/rocksdb/db/compaction/compaction_picker_universal.h | 32
-rw-r--r--  src/rocksdb/db/compaction/compaction_service_job.cc | 829
-rw-r--r--  src/rocksdb/db/compaction/compaction_service_test.cc | 966
-rw-r--r--  src/rocksdb/db/compaction/compaction_state.cc | 46
-rw-r--r--  src/rocksdb/db/compaction/compaction_state.h | 42
-rw-r--r--  src/rocksdb/db/compaction/file_pri.h | 92
-rw-r--r--  src/rocksdb/db/compaction/sst_partitioner.cc | 90
-rw-r--r--  src/rocksdb/db/compaction/subcompaction_state.cc | 106
-rw-r--r--  src/rocksdb/db/compaction/subcompaction_state.h | 214
-rw-r--r--  src/rocksdb/db/compaction/tiered_compaction_test.cc | 2028
-rw-r--r--  src/rocksdb/db/comparator_db_test.cc | 678
-rw-r--r--  src/rocksdb/db/convenience.cc | 81
-rw-r--r--  src/rocksdb/db/corruption_test.cc | 1587
-rw-r--r--  src/rocksdb/db/cuckoo_table_db_test.cc | 361
-rw-r--r--  src/rocksdb/db/db_basic_test.cc | 4643
-rw-r--r--  src/rocksdb/db/db_block_cache_test.cc | 2313
-rw-r--r--  src/rocksdb/db/db_bloom_filter_test.cc | 3498
-rw-r--r--  src/rocksdb/db/db_compaction_filter_test.cc | 1036
-rw-r--r--  src/rocksdb/db/db_compaction_test.cc | 8227
-rw-r--r--  src/rocksdb/db/db_dynamic_level_test.cc | 507
-rw-r--r--  src/rocksdb/db/db_encryption_test.cc | 130
-rw-r--r--  src/rocksdb/db/db_filesnapshot.cc | 442
-rw-r--r--  src/rocksdb/db/db_flush_test.cc | 3084
-rw-r--r--  src/rocksdb/db/db_impl/compacted_db_impl.cc | 257
-rw-r--r--  src/rocksdb/db/db_impl/compacted_db_impl.h | 154
-rw-r--r--  src/rocksdb/db/db_impl/db_impl.cc | 5918
-rw-r--r--  src/rocksdb/db/db_impl/db_impl.h | 2804
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_compaction_flush.cc | 3857
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_debug.cc | 312
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_experimental.cc | 158
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_files.cc | 1013
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_open.cc | 2106
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_readonly.cc | 341
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_readonly.h | 170
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_secondary.cc | 967
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_secondary.h | 410
-rw-r--r--  src/rocksdb/db/db_impl/db_impl_write.cc | 2435
-rw-r--r--  src/rocksdb/db/db_info_dumper.cc | 147
-rw-r--r--  src/rocksdb/db/db_info_dumper.h | 15
-rw-r--r--  src/rocksdb/db/db_inplace_update_test.cc | 262
-rw-r--r--  src/rocksdb/db/db_io_failure_test.cc | 593
-rw-r--r--  src/rocksdb/db/db_iter.cc | 1708
-rw-r--r--  src/rocksdb/db/db_iter.h | 420
-rw-r--r--  src/rocksdb/db/db_iter_stress_test.cc | 658
-rw-r--r--  src/rocksdb/db/db_iter_test.cc | 3195
-rw-r--r--  src/rocksdb/db/db_iterator_test.cc | 3265
-rw-r--r--  src/rocksdb/db/db_kv_checksum_test.cc | 885
-rw-r--r--  src/rocksdb/db/db_log_iter_test.cc | 305
-rw-r--r--  src/rocksdb/db/db_logical_block_size_cache_test.cc | 521
-rw-r--r--  src/rocksdb/db/db_memtable_test.cc | 344
-rw-r--r--  src/rocksdb/db/db_merge_operand_test.cc | 448
-rw-r--r--  src/rocksdb/db/db_merge_operator_test.cc | 669
-rw-r--r--  src/rocksdb/db/db_options_test.cc | 1219
-rw-r--r--  src/rocksdb/db/db_properties_test.cc | 2206
-rw-r--r--  src/rocksdb/db/db_range_del_test.cc | 2807
-rw-r--r--  src/rocksdb/db/db_rate_limiter_test.cc | 451
-rw-r--r--  src/rocksdb/db/db_readonly_with_timestamp_test.cc | 960
-rw-r--r--  src/rocksdb/db/db_secondary_test.cc | 1693
-rw-r--r--  src/rocksdb/db/db_sst_test.cc | 1868
-rw-r--r--  src/rocksdb/db/db_statistics_test.cc | 215
-rw-r--r--  src/rocksdb/db/db_table_properties_test.cc | 625
-rw-r--r--  src/rocksdb/db/db_tailing_iter_test.cc | 604
-rw-r--r--  src/rocksdb/db/db_test.cc | 7397
-rw-r--r--  src/rocksdb/db/db_test2.cc | 7652
-rw-r--r--  src/rocksdb/db/db_test_util.cc | 1773
-rw-r--r--  src/rocksdb/db/db_test_util.h | 1402
-rw-r--r--  src/rocksdb/db/db_universal_compaction_test.cc | 2235
-rw-r--r--  src/rocksdb/db/db_wal_test.cc | 2314
-rw-r--r--  src/rocksdb/db/db_with_timestamp_basic_test.cc | 3880
-rw-r--r--  src/rocksdb/db/db_with_timestamp_compaction_test.cc | 334
-rw-r--r--  src/rocksdb/db/db_with_timestamp_test_util.cc | 96
-rw-r--r--  src/rocksdb/db/db_with_timestamp_test_util.h | 126
-rw-r--r--  src/rocksdb/db/db_write_buffer_manager_test.cc | 862
-rw-r--r--  src/rocksdb/db/db_write_test.cc | 679
-rw-r--r--  src/rocksdb/db/dbformat.cc | 188
-rw-r--r--  src/rocksdb/db/dbformat.h | 865
-rw-r--r--  src/rocksdb/db/dbformat_test.cc | 214
-rw-r--r--  src/rocksdb/db/deletefile_test.cc | 614
-rw-r--r--  src/rocksdb/db/error_handler.cc | 819
-rw-r--r--  src/rocksdb/db/error_handler.h | 124
-rw-r--r--  src/rocksdb/db/error_handler_fs_test.cc | 2875
-rw-r--r--  src/rocksdb/db/event_helpers.cc | 371
-rw-r--r--  src/rocksdb/db/event_helpers.h | 82
-rw-r--r--  src/rocksdb/db/experimental.cc | 155
-rw-r--r--  src/rocksdb/db/external_sst_file_basic_test.cc | 1997
-rw-r--r--  src/rocksdb/db/external_sst_file_ingestion_job.cc | 1020
-rw-r--r--  src/rocksdb/db/external_sst_file_ingestion_job.h | 201
-rw-r--r--  src/rocksdb/db/external_sst_file_test.cc | 2967
-rw-r--r--  src/rocksdb/db/fault_injection_test.cc | 637
-rw-r--r--  src/rocksdb/db/file_indexer.cc | 218
-rw-r--r--  src/rocksdb/db/file_indexer.h | 140
-rw-r--r--  src/rocksdb/db/file_indexer_test.cc | 352
-rw-r--r--  src/rocksdb/db/filename_test.cc | 241
-rw-r--r--  src/rocksdb/db/flush_job.cc | 1094
-rw-r--r--  src/rocksdb/db/flush_job.h | 203
-rw-r--r--  src/rocksdb/db/flush_job_test.cc | 745
-rw-r--r--  src/rocksdb/db/flush_scheduler.cc | 86
-rw-r--r--  src/rocksdb/db/flush_scheduler.h | 55
-rw-r--r--  src/rocksdb/db/forward_iterator.cc | 1062
-rw-r--r--  src/rocksdb/db/forward_iterator.h | 168
-rw-r--r--  src/rocksdb/db/forward_iterator_bench.cc | 378
-rw-r--r--  src/rocksdb/db/history_trimming_iterator.h | 91
-rw-r--r--  src/rocksdb/db/import_column_family_job.cc | 312
-rw-r--r--  src/rocksdb/db/import_column_family_job.h | 82
-rw-r--r--  src/rocksdb/db/import_column_family_test.cc | 644
-rw-r--r--  src/rocksdb/db/internal_stats.cc | 2002
-rw-r--r--  src/rocksdb/db/internal_stats.h | 996
-rw-r--r--  src/rocksdb/db/job_context.h | 238
-rw-r--r--  src/rocksdb/db/kv_checksum.h | 398
-rw-r--r--  src/rocksdb/db/listener_test.cc | 1595
-rw-r--r--  src/rocksdb/db/log_format.h | 51
-rw-r--r--  src/rocksdb/db/log_reader.cc | 854
-rw-r--r--  src/rocksdb/db/log_reader.h | 225
-rw-r--r--  src/rocksdb/db/log_test.cc | 1062
-rw-r--r--  src/rocksdb/db/log_writer.cc | 249
-rw-r--r--  src/rocksdb/db/log_writer.h | 128
-rw-r--r--  src/rocksdb/db/logs_with_prep_tracker.cc | 67
-rw-r--r--  src/rocksdb/db/logs_with_prep_tracker.h | 62
-rw-r--r--  src/rocksdb/db/lookup_key.h | 68
-rw-r--r--  src/rocksdb/db/malloc_stats.cc | 55
-rw-r--r--  src/rocksdb/db/malloc_stats.h | 24
-rw-r--r--  src/rocksdb/db/manual_compaction_test.cc | 308
-rw-r--r--  src/rocksdb/db/memtable.cc | 1675
-rw-r--r--  src/rocksdb/db/memtable.h | 664
-rw-r--r--  src/rocksdb/db/memtable_list.cc | 991
-rw-r--r--  src/rocksdb/db/memtable_list.h | 471
-rw-r--r--  src/rocksdb/db/memtable_list_test.cc | 1039
-rw-r--r--  src/rocksdb/db/merge_context.h | 147
-rw-r--r--  src/rocksdb/db/merge_helper.cc | 583
-rw-r--r--  src/rocksdb/db/merge_helper.h | 216
-rw-r--r--  src/rocksdb/db/merge_helper_test.cc | 298
-rw-r--r--  src/rocksdb/db/merge_operator.cc | 85
-rw-r--r--  src/rocksdb/db/merge_test.cc | 629
-rw-r--r--  src/rocksdb/db/obsolete_files_test.cc | 328
-rw-r--r--  src/rocksdb/db/options_file_test.cc | 120
-rw-r--r--  src/rocksdb/db/output_validator.cc | 33
-rw-r--r--  src/rocksdb/db/output_validator.h | 48
-rw-r--r--  src/rocksdb/db/perf_context_test.cc | 1010
-rw-r--r--  src/rocksdb/db/periodic_task_scheduler.cc | 113
-rw-r--r--  src/rocksdb/db/periodic_task_scheduler.h | 110
-rw-r--r--  src/rocksdb/db/periodic_task_scheduler_test.cc | 231
-rw-r--r--  src/rocksdb/db/pinned_iterators_manager.h | 92
-rw-r--r--  src/rocksdb/db/plain_table_db_test.cc | 1357
-rw-r--r--  src/rocksdb/db/post_memtable_callback.h | 25
-rw-r--r--  src/rocksdb/db/pre_release_callback.h | 37
-rw-r--r--  src/rocksdb/db/prefix_test.cc | 906
-rw-r--r--  src/rocksdb/db/range_del_aggregator.cc | 524
-rw-r--r--  src/rocksdb/db/range_del_aggregator.h | 476
-rw-r--r--  src/rocksdb/db/range_del_aggregator_bench.cc | 280
-rw-r--r--  src/rocksdb/db/range_del_aggregator_test.cc | 715
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter.cc | 502
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter.h | 357
-rw-r--r--  src/rocksdb/db/range_tombstone_fragmenter_test.cc | 555
-rw-r--r--  src/rocksdb/db/read_callback.h | 54
-rw-r--r--  src/rocksdb/db/repair.cc | 771
-rw-r--r--  src/rocksdb/db/repair_test.cc | 442
-rw-r--r--  src/rocksdb/db/seqno_time_test.cc | 996
-rw-r--r--  src/rocksdb/db/seqno_to_time_mapping.cc | 341
-rw-r--r--  src/rocksdb/db/seqno_to_time_mapping.h | 189
-rw-r--r--  src/rocksdb/db/snapshot_checker.h | 60
-rw-r--r--  src/rocksdb/db/snapshot_impl.cc | 25
-rw-r--r--  src/rocksdb/db/snapshot_impl.h | 239
-rw-r--r--  src/rocksdb/db/table_cache.cc | 753
-rw-r--r--  src/rocksdb/db/table_cache.h | 275
-rw-r--r--  src/rocksdb/db/table_cache_sync_and_async.h | 135
-rw-r--r--  src/rocksdb/db/table_properties_collector.cc | 74
-rw-r--r--  src/rocksdb/db/table_properties_collector.h | 175
-rw-r--r--  src/rocksdb/db/table_properties_collector_test.cc | 513
-rw-r--r--  src/rocksdb/db/transaction_log_impl.cc | 298
-rw-r--r--  src/rocksdb/db/transaction_log_impl.h | 130
-rw-r--r--  src/rocksdb/db/trim_history_scheduler.cc | 54
-rw-r--r--  src/rocksdb/db/trim_history_scheduler.h | 46
-rw-r--r--  src/rocksdb/db/version_builder.cc | 1372
-rw-r--r--  src/rocksdb/db/version_builder.h | 72
-rw-r--r--  src/rocksdb/db/version_builder_test.cc | 1695
-rw-r--r--  src/rocksdb/db/version_edit.cc | 1043
-rw-r--r--  src/rocksdb/db/version_edit.h | 669
-rw-r--r--  src/rocksdb/db/version_edit_handler.cc | 1002
-rw-r--r--  src/rocksdb/db/version_edit_handler.h | 313
-rw-r--r--  src/rocksdb/db/version_edit_test.cc | 730
-rw-r--r--  src/rocksdb/db/version_set.cc | 6903
-rw-r--r--  src/rocksdb/db/version_set.h | 1652
-rw-r--r--  src/rocksdb/db/version_set_sync_and_async.h | 151
-rw-r--r--  src/rocksdb/db/version_set_test.cc | 3587
-rw-r--r--  src/rocksdb/db/version_util.h | 71
-rw-r--r--  src/rocksdb/db/wal_edit.cc | 211
-rw-r--r--  src/rocksdb/db/wal_edit.h | 177
-rw-r--r--  src/rocksdb/db/wal_edit_test.cc | 213
-rw-r--r--  src/rocksdb/db/wal_manager.cc | 529
-rw-r--r--  src/rocksdb/db/wal_manager.h | 138
-rw-r--r--  src/rocksdb/db/wal_manager_test.cc | 346
-rw-r--r--  src/rocksdb/db/wide/db_wide_basic_test.cc | 654
-rw-r--r--  src/rocksdb/db/wide/wide_column_serialization.cc | 182
-rw-r--r--  src/rocksdb/db/wide/wide_column_serialization.h | 77
-rw-r--r--  src/rocksdb/db/wide/wide_column_serialization_test.cc | 338
-rw-r--r--  src/rocksdb/db/wide/wide_columns.cc | 22
-rw-r--r--  src/rocksdb/db/write_batch.cc | 3137
-rw-r--r--  src/rocksdb/db/write_batch_base.cc | 94
-rw-r--r--  src/rocksdb/db/write_batch_internal.h | 401
-rw-r--r--  src/rocksdb/db/write_batch_test.cc | 1114
-rw-r--r--  src/rocksdb/db/write_callback.h | 27
-rw-r--r--  src/rocksdb/db/write_callback_test.cc | 465
-rw-r--r--  src/rocksdb/db/write_controller.cc | 121
-rw-r--r--  src/rocksdb/db/write_controller.h | 148
-rw-r--r--  src/rocksdb/db/write_controller_test.cc | 248
-rw-r--r--  src/rocksdb/db/write_thread.cc | 815
-rw-r--r--  src/rocksdb/db/write_thread.h | 440
284 files changed, 233845 insertions, 0 deletions
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.cc b/src/rocksdb/db/arena_wrapped_db_iter.cc
new file mode 100644
index 000000000..607403ccc
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.cc
@@ -0,0 +1,160 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/arena_wrapped_db_iter.h"
+
+#include "memory/arena.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
+ std::string* prop) {
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ // First try to pass the value returned from inner iterator.
+ if (!db_iter_->GetProperty(prop_name, prop).ok()) {
+ *prop = std::to_string(sv_number_);
+ }
+ return Status::OK();
+ }
+ return db_iter_->GetProperty(prop_name, prop);
+}
+
+void ArenaWrappedDBIter::Init(
+ Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const Version* version,
+ const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
+ uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
+ auto mem = arena_.AllocateAligned(sizeof(DBIter));
+ db_iter_ =
+ new (mem) DBIter(env, read_options, ioptions, mutable_cf_options,
+ ioptions.user_comparator, /* iter */ nullptr, version,
+ sequence, true, max_sequential_skip_in_iteration,
+ read_callback, db_impl, cfd, expose_blob_index);
+ sv_number_ = version_number;
+ read_options_ = read_options;
+ allow_refresh_ = allow_refresh;
+ memtable_range_tombstone_iter_ = nullptr;
+}
+
+Status ArenaWrappedDBIter::Refresh() {
+ if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) {
+ return Status::NotSupported("Creating renew iterator is not allowed.");
+ }
+ assert(db_iter_ != nullptr);
+ // TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the
+ // correct behavior. Will be corrected automatically when we take a snapshot
+ // here for the case of WritePreparedTxnDB.
+ uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
+ TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1");
+ TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2");
+ auto reinit_internal_iter = [&]() {
+ Env* env = db_iter_->env();
+ db_iter_->~DBIter();
+ arena_.~Arena();
+ new (&arena_) Arena();
+
+ SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
+ SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+ if (read_callback_) {
+ read_callback_->Refresh(latest_seq);
+ }
+ Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
+ sv->current, latest_seq,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_,
+ allow_refresh_);
+
+ InternalIterator* internal_iter = db_impl_->NewInternalIterator(
+ read_options_, cfd_, sv, &arena_, latest_seq,
+ /* allow_unprepared_value */ true, /* db_iter */ this);
+ SetIterUnderDBIter(internal_iter);
+ };
+ while (true) {
+ if (sv_number_ != cur_sv_number) {
+ reinit_internal_iter();
+ break;
+ } else {
+ SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+ // Refresh range-tombstones in MemTable
+ if (!read_options_.ignore_range_deletions) {
+ SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
+ TEST_SYNC_POINT_CALLBACK("ArenaWrappedDBIter::Refresh:SV", nullptr);
+ auto t = sv->mem->NewRangeTombstoneIterator(
+ read_options_, latest_seq, false /* immutable_memtable */);
+ if (!t || t->empty()) {
+ // If memtable_range_tombstone_iter_ points to a non-empty tombstone
+ // iterator, then it means sv->mem is not the memtable that
+ // memtable_range_tombstone_iter_ points to, so SV must have changed
+ // after the sv_number_ != cur_sv_number check above. We will fall
+ // back to re-init the InternalIterator, and the tombstone iterator
+ // will be freed during db_iter destruction there.
+ if (memtable_range_tombstone_iter_) {
+ assert(!*memtable_range_tombstone_iter_ ||
+ sv_number_ != cfd_->GetSuperVersionNumber());
+ }
+ delete t;
+ } else { // current mutable memtable has range tombstones
+ if (!memtable_range_tombstone_iter_) {
+ delete t;
+ db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
+ // The memtable under DBIter did not have range tombstone before
+ // refresh.
+ reinit_internal_iter();
+ break;
+ } else {
+ delete *memtable_range_tombstone_iter_;
+ *memtable_range_tombstone_iter_ = new TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(t),
+ &cfd_->internal_comparator(), nullptr, nullptr);
+ }
+ }
+ db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
+ }
+ // Refresh latest sequence number
+ db_iter_->set_sequence(latest_seq);
+ db_iter_->set_valid(false);
+ // Check again if the latest super version number is changed
+ uint64_t latest_sv_number = cfd_->GetSuperVersionNumber();
+ if (latest_sv_number != cur_sv_number) {
+ // If the super version number is changed after refreshing,
+ // fallback to Re-Init the InternalIterator
+ cur_sv_number = latest_sv_number;
+ continue;
+ }
+ break;
+ }
+ }
+ return Status::OK();
+}
+
+ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+ Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const Version* version,
+ const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+ uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
+ ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
+ iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence,
+ max_sequential_skip_in_iterations, version_number, read_callback,
+ db_impl, cfd, expose_blob_index, allow_refresh);
+ if (db_impl != nullptr && cfd != nullptr && allow_refresh) {
+ iter->StoreRefreshInfo(db_impl, cfd, read_callback, expose_blob_index);
+ }
+
+ return iter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
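The Refresh() implementation above is what backs the public rocksdb::Iterator::Refresh() API: if the column family's super version changed, the whole internal iterator tree is rebuilt in a fresh arena; otherwise only the sequence number and the memtable range-tombstone iterator are updated. Below is a minimal user-level sketch of that behavior using only public RocksDB headers; the scratch path is an assumption and the example is not part of this patch.

#include <cassert>
#include <iostream>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/refresh_example", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "k1", "v1");
  assert(s.ok());

  // The iterator is pinned to the DB state at creation time.
  rocksdb::Iterator* it = db->NewIterator(rocksdb::ReadOptions());

  s = db->Put(rocksdb::WriteOptions(), "k2", "v2");
  assert(s.ok());

  // Refresh() advances the iterator to the latest state. Internally this is
  // ArenaWrappedDBIter::Refresh(): it either bumps the sequence number or
  // rebuilds the internal iterator tree if the super version has changed.
  s = it->Refresh();
  assert(s.ok());

  std::string sv_number;
  if (it->GetProperty("rocksdb.iterator.super-version-number", &sv_number).ok()) {
    std::cout << "super version: " << sv_number << "\n";
  }

  // After Refresh() the iterator must be repositioned before use.
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    std::cout << it->key().ToString() << " -> " << it->value().ToString() << "\n";
  }
  assert(it->status().ok());

  delete it;
  delete db;
  return 0;
}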
diff --git a/src/rocksdb/db/arena_wrapped_db_iter.h b/src/rocksdb/db/arena_wrapped_db_iter.h
new file mode 100644
index 000000000..f15be306d
--- /dev/null
+++ b/src/rocksdb/db/arena_wrapped_db_iter.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class Version;
+
+// A wrapper iterator that bundles a DB iterator with the arena from which the
+// DB iterator is allocated. This class is used as the entry point of an
+// iterator hierarchy whose memory can be allocated inline. In that way,
+// accessing the iterator tree can be more cache friendly. It is also faster
+// to allocate.
+// When using the class's Iterator interface, the behavior is exactly
+// the same as the inner DBIter.
+class ArenaWrappedDBIter : public Iterator {
+ public:
+ ~ArenaWrappedDBIter() override {
+ if (db_iter_ != nullptr) {
+ db_iter_->~DBIter();
+ } else {
+ assert(false);
+ }
+ }
+
+ // Get the arena to be used to allocate memory for DBIter to be wrapped,
+ // as well as child iterators in it.
+ virtual Arena* GetArena() { return &arena_; }
+
+ const ReadOptions& GetReadOptions() { return read_options_; }
+
+ // Set the internal iterator wrapped inside the DB Iterator. Usually it is
+ // a merging iterator.
+ virtual void SetIterUnderDBIter(InternalIterator* iter) {
+ db_iter_->SetIter(iter);
+ }
+
+ void SetMemtableRangetombstoneIter(TruncatedRangeDelIterator** iter) {
+ memtable_range_tombstone_iter_ = iter;
+ }
+
+ bool Valid() const override { return db_iter_->Valid(); }
+ void SeekToFirst() override { db_iter_->SeekToFirst(); }
+ void SeekToLast() override { db_iter_->SeekToLast(); }
+  // 'target' does not contain a timestamp, even if the user timestamp feature
+  // is enabled.
+ void Seek(const Slice& target) override { db_iter_->Seek(target); }
+ void SeekForPrev(const Slice& target) override {
+ db_iter_->SeekForPrev(target);
+ }
+ void Next() override { db_iter_->Next(); }
+ void Prev() override { db_iter_->Prev(); }
+ Slice key() const override { return db_iter_->key(); }
+ Slice value() const override { return db_iter_->value(); }
+ const WideColumns& columns() const override { return db_iter_->columns(); }
+ Status status() const override { return db_iter_->status(); }
+ Slice timestamp() const override { return db_iter_->timestamp(); }
+ bool IsBlob() const { return db_iter_->IsBlob(); }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override;
+
+ Status Refresh() override;
+
+ void Init(Env* env, const ReadOptions& read_options,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const Version* version,
+ const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations, uint64_t version_number,
+ ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+ bool expose_blob_index, bool allow_refresh);
+
+ // Store some parameters so we can refresh the iterator at a later point
+ // with these same params
+ void StoreRefreshInfo(DBImpl* db_impl, ColumnFamilyData* cfd,
+ ReadCallback* read_callback, bool expose_blob_index) {
+ db_impl_ = db_impl;
+ cfd_ = cfd;
+ read_callback_ = read_callback;
+ expose_blob_index_ = expose_blob_index;
+ }
+
+ private:
+ DBIter* db_iter_ = nullptr;
+ Arena arena_;
+ uint64_t sv_number_;
+ ColumnFamilyData* cfd_ = nullptr;
+ DBImpl* db_impl_ = nullptr;
+ ReadOptions read_options_;
+ ReadCallback* read_callback_;
+ bool expose_blob_index_ = false;
+ bool allow_refresh_ = true;
+  // If this is nullptr, it means the mutable memtable did not contain any
+  // range tombstones when it was added under this DBIter.
+ TruncatedRangeDelIterator** memtable_range_tombstone_iter_ = nullptr;
+};
+
+// Creates an arena-wrapped DB iterator.
+// `db_impl` and `cfd` are used for renewal. If left null, renewal will not
+// be supported.
+extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
+ Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const Version* version,
+ const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
+ uint64_t version_number, ReadCallback* read_callback,
+ DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
+ bool expose_blob_index = false, bool allow_refresh = true);
+} // namespace ROCKSDB_NAMESPACE
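The header's comment motivates the arena: DBIter and its child iterators are placement-new'ed into one contiguous arena so the whole iterator tree sits close together in memory and is cheap to allocate. The toy sketch below illustrates only that allocation pattern; FixedArena and ToyIterator are hypothetical names, not RocksDB's Arena (memory/arena.h) or DBIter.

#include <cstddef>
#include <iostream>
#include <new>

// Toy bump allocator; RocksDB's real Arena adds block lists, huge pages, etc.
class FixedArena {
 public:
  explicit FixedArena(size_t capacity)
      : buf_(new char[capacity]), capacity_(capacity), used_(0) {}
  ~FixedArena() { delete[] buf_; }

  void* AllocateAligned(size_t bytes,
                        size_t alignment = alignof(std::max_align_t)) {
    size_t offset = (used_ + alignment - 1) & ~(alignment - 1);
    if (offset + bytes > capacity_) return nullptr;
    used_ = offset + bytes;
    return buf_ + offset;
  }

 private:
  char* buf_;
  size_t capacity_;
  size_t used_;
};

struct ToyIterator {
  explicit ToyIterator(int id) : id_(id) {}
  ~ToyIterator() { std::cout << "destroying iterator " << id_ << "\n"; }
  int id_;
};

int main() {
  FixedArena arena(1024);

  // Placement-new the "iterator" into the arena, mirroring
  // ArenaWrappedDBIter::Init(), which does `new (mem) DBIter(...)`.
  void* mem = arena.AllocateAligned(sizeof(ToyIterator));
  ToyIterator* it = new (mem) ToyIterator(1);

  std::cout << "iterator " << it->id_ << " lives inside the arena\n";

  // Arena-allocated objects are destroyed explicitly; the arena releases the
  // underlying memory in one shot, just like ~ArenaWrappedDBIter().
  it->~ToyIterator();
  return 0;
}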
diff --git a/src/rocksdb/db/blob/blob_constants.h b/src/rocksdb/db/blob/blob_constants.h
new file mode 100644
index 000000000..a5d09ac76
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_constants.h
@@ -0,0 +1,16 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint64_t kInvalidBlobFileNumber = 0;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_contents.cc b/src/rocksdb/db/blob/blob_contents.cc
new file mode 100644
index 000000000..9015609e7
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_contents.cc
@@ -0,0 +1,90 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_contents.h"
+
+#include <cassert>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_helpers.h"
+#include "port/malloc.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<BlobContents> BlobContents::Create(
+ CacheAllocationPtr&& allocation, size_t size) {
+ return std::unique_ptr<BlobContents>(
+ new BlobContents(std::move(allocation), size));
+}
+
+size_t BlobContents::ApproximateMemoryUsage() const {
+ size_t usage = 0;
+
+ if (allocation_) {
+ MemoryAllocator* const allocator = allocation_.get_deleter().allocator;
+
+ if (allocator) {
+ usage += allocator->UsableSize(allocation_.get(), data_.size());
+ } else {
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(allocation_.get());
+#else
+ usage += data_.size();
+#endif
+ }
+ }
+
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<BlobContents*>(this));
+#else
+ usage += sizeof(*this);
+#endif
+
+ return usage;
+}
+
+size_t BlobContents::SizeCallback(void* obj) {
+ assert(obj);
+
+ return static_cast<const BlobContents*>(obj)->size();
+}
+
+Status BlobContents::SaveToCallback(void* from_obj, size_t from_offset,
+ size_t length, void* out) {
+ assert(from_obj);
+
+ const BlobContents* buf = static_cast<const BlobContents*>(from_obj);
+ assert(buf->size() >= from_offset + length);
+
+ memcpy(out, buf->data().data() + from_offset, length);
+
+ return Status::OK();
+}
+
+Cache::CacheItemHelper* BlobContents::GetCacheItemHelper() {
+ static Cache::CacheItemHelper cache_helper(
+ &SizeCallback, &SaveToCallback,
+ GetCacheEntryDeleterForRole<BlobContents, CacheEntryRole::kBlobValue>());
+
+ return &cache_helper;
+}
+
+Status BlobContents::CreateCallback(CacheAllocationPtr&& allocation,
+ const void* buf, size_t size,
+ void** out_obj, size_t* charge) {
+ assert(allocation);
+
+ memcpy(allocation.get(), buf, size);
+
+ std::unique_ptr<BlobContents> obj = Create(std::move(allocation), size);
+ BlobContents* const contents = obj.release();
+
+ *out_obj = contents;
+ *charge = contents->ApproximateMemoryUsage();
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
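SizeCallback, SaveToCallback and CreateCallback implement the contract the (secondary) block cache uses to move a BlobContents to and from a flat buffer: report the byte size, copy a sub-range out, and rebuild an object from a buffer while reporting its memory charge. The sketch below is a standalone illustration of that round trip with std::string standing in for the cache allocation; it does not use the RocksDB cache API.

#include <cassert>
#include <cstring>
#include <iostream>
#include <memory>
#include <string>

// Stand-in for BlobContents: owns a flat buffer of blob bytes.
struct Contents {
  std::string data;
};

// SizeCallback analog: how many bytes need to be persisted.
size_t SizeOf(const void* obj) {
  return static_cast<const Contents*>(obj)->data.size();
}

// SaveToCallback analog: copy [from_offset, from_offset + length) into `out`.
void SaveTo(const void* from_obj, size_t from_offset, size_t length, void* out) {
  const auto* c = static_cast<const Contents*>(from_obj);
  assert(c->data.size() >= from_offset + length);
  std::memcpy(out, c->data.data() + from_offset, length);
}

// CreateCallback analog: rebuild an object from a flat buffer and report its
// charge (here simply the payload size).
std::unique_ptr<Contents> CreateFrom(const void* buf, size_t size,
                                     size_t* charge) {
  auto c = std::unique_ptr<Contents>(new Contents);
  c->data.assign(static_cast<const char*>(buf), size);
  *charge = c->data.size();
  return c;
}

int main() {
  Contents original{"some blob payload"};

  // "Save" the object into a flat buffer, as the secondary cache would.
  std::string flat(SizeOf(&original), '\0');
  SaveTo(&original, 0, flat.size(), &flat[0]);

  // "Create" a fresh object back from the buffer on a cache hit.
  size_t charge = 0;
  std::unique_ptr<Contents> restored = CreateFrom(flat.data(), flat.size(), &charge);

  assert(restored->data == original.data);
  std::cout << "round-tripped " << charge << " bytes\n";
  return 0;
}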
diff --git a/src/rocksdb/db/blob/blob_contents.h b/src/rocksdb/db/blob/blob_contents.h
new file mode 100644
index 000000000..9b7c5b969
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_contents.h
@@ -0,0 +1,56 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <memory>
+
+#include "memory/memory_allocator.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A class representing a single uncompressed value read from a blob file.
+class BlobContents {
+ public:
+ static std::unique_ptr<BlobContents> Create(CacheAllocationPtr&& allocation,
+ size_t size);
+
+ BlobContents(const BlobContents&) = delete;
+ BlobContents& operator=(const BlobContents&) = delete;
+
+ BlobContents(BlobContents&&) = default;
+ BlobContents& operator=(BlobContents&&) = default;
+
+ ~BlobContents() = default;
+
+ const Slice& data() const { return data_; }
+ size_t size() const { return data_.size(); }
+
+ size_t ApproximateMemoryUsage() const;
+
+ // Callbacks for secondary cache
+ static size_t SizeCallback(void* obj);
+
+ static Status SaveToCallback(void* from_obj, size_t from_offset,
+ size_t length, void* out);
+
+ static Cache::CacheItemHelper* GetCacheItemHelper();
+
+ static Status CreateCallback(CacheAllocationPtr&& allocation, const void* buf,
+ size_t size, void** out_obj, size_t* charge);
+
+ private:
+ BlobContents(CacheAllocationPtr&& allocation, size_t size)
+ : allocation_(std::move(allocation)), data_(allocation_.get(), size) {}
+
+ CacheAllocationPtr allocation_;
+ Slice data_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_counting_iterator.h b/src/rocksdb/db/blob/blob_counting_iterator.h
new file mode 100644
index 000000000..de549afa2
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_counting_iterator.h
@@ -0,0 +1,146 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "db/blob/blob_garbage_meter.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that passes each key-value encountered to
+// BlobGarbageMeter as inflow in order to measure the total number and size of
+// blobs in the compaction input on a per-blob file basis.
+class BlobCountingIterator : public InternalIterator {
+ public:
+ BlobCountingIterator(InternalIterator* iter,
+ BlobGarbageMeter* blob_garbage_meter)
+ : iter_(iter), blob_garbage_meter_(blob_garbage_meter) {
+ assert(iter_);
+ assert(blob_garbage_meter_);
+
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ bool Valid() const override { return iter_->Valid() && status_.ok(); }
+
+ void SeekToFirst() override {
+ iter_->SeekToFirst();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void SeekToLast() override {
+ iter_->SeekToLast();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void Seek(const Slice& target) override {
+ iter_->Seek(target);
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ iter_->SeekForPrev(target);
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ void Next() override {
+ assert(Valid());
+
+ iter_->Next();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ bool NextAndGetResult(IterateResult* result) override {
+ assert(Valid());
+
+ const bool res = iter_->NextAndGetResult(result);
+ UpdateAndCountBlobIfNeeded();
+ return res;
+ }
+
+ void Prev() override {
+ assert(Valid());
+
+ iter_->Prev();
+ UpdateAndCountBlobIfNeeded();
+ }
+
+ Slice key() const override {
+ assert(Valid());
+ return iter_->key();
+ }
+
+ Slice user_key() const override {
+ assert(Valid());
+ return iter_->user_key();
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ return iter_->value();
+ }
+
+ Status status() const override { return status_; }
+
+ bool PrepareValue() override {
+ assert(Valid());
+ return iter_->PrepareValue();
+ }
+
+ bool MayBeOutOfLowerBound() override {
+ assert(Valid());
+ return iter_->MayBeOutOfLowerBound();
+ }
+
+ IterBoundCheck UpperBoundCheckResult() override {
+ assert(Valid());
+ return iter_->UpperBoundCheckResult();
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ iter_->SetPinnedItersMgr(pinned_iters_mgr);
+ }
+
+ bool IsKeyPinned() const override {
+ assert(Valid());
+ return iter_->IsKeyPinned();
+ }
+
+ bool IsValuePinned() const override {
+ assert(Valid());
+ return iter_->IsValuePinned();
+ }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override {
+ return iter_->GetProperty(prop_name, prop);
+ }
+
+ private:
+ void UpdateAndCountBlobIfNeeded() {
+ assert(!iter_->Valid() || iter_->status().ok());
+
+ if (!iter_->Valid()) {
+ status_ = iter_->status();
+ return;
+ }
+
+ TEST_SYNC_POINT(
+ "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow");
+
+ status_ = blob_garbage_meter_->ProcessInFlow(key(), value());
+ }
+
+ InternalIterator* iter_;
+ BlobGarbageMeter* blob_garbage_meter_;
+ Status status_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
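BlobCountingIterator is a pure forwarding wrapper: every call goes straight to the underlying InternalIterator, and every positioning call is followed by UpdateAndCountBlobIfNeeded(), which feeds the current key/value into the garbage meter. The unit test that follows exercises the real class end to end; the sketch below only distills the wrapper-plus-hook shape with hypothetical names, and unlike BlobGarbageMeter::ProcessInFlow it meters every entry rather than just blob references.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Minimal iterator interface, standing in for InternalIterator.
class SimpleIterator {
 public:
  virtual ~SimpleIterator() = default;
  virtual void SeekToFirst() = 0;
  virtual void Next() = 0;
  virtual bool Valid() const = 0;
  virtual const std::string& value() const = 0;
};

class VectorIter : public SimpleIterator {
 public:
  explicit VectorIter(std::vector<std::string> v) : v_(std::move(v)) {}
  void SeekToFirst() override { i_ = 0; }
  void Next() override { ++i_; }
  bool Valid() const override { return i_ < v_.size(); }
  const std::string& value() const override { return v_[i_]; }

 private:
  std::vector<std::string> v_;
  size_t i_ = 0;
};

// Decorator: forwards every call and meters each value it passes over.
class CountingIter : public SimpleIterator {
 public:
  explicit CountingIter(SimpleIterator* base) : base_(base) {}
  void SeekToFirst() override { base_->SeekToFirst(); Count(); }
  void Next() override { base_->Next(); Count(); }
  bool Valid() const override { return base_->Valid(); }
  const std::string& value() const override { return base_->value(); }

  size_t count() const { return count_; }
  size_t bytes() const { return bytes_; }

 private:
  void Count() {
    if (!base_->Valid()) return;
    ++count_;
    bytes_ += base_->value().size();
  }

  SimpleIterator* base_;
  size_t count_ = 0;
  size_t bytes_ = 0;
};

int main() {
  VectorIter input({"blob-a", "blob-bb", "plain"});
  CountingIter counter(&input);
  for (counter.SeekToFirst(); counter.Valid(); counter.Next()) {
  }
  std::cout << counter.count() << " entries, " << counter.bytes() << " bytes\n";
  return 0;
}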
diff --git a/src/rocksdb/db/blob/blob_counting_iterator_test.cc b/src/rocksdb/db/blob/blob_counting_iterator_test.cc
new file mode 100644
index 000000000..c7bbc8f58
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_counting_iterator_test.cc
@@ -0,0 +1,327 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_counting_iterator.h"
+
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_garbage_meter.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CheckInFlow(const BlobGarbageMeter& blob_garbage_meter,
+ uint64_t blob_file_number, uint64_t count, uint64_t bytes) {
+ const auto& flows = blob_garbage_meter.flows();
+
+ const auto it = flows.find(blob_file_number);
+ if (it == flows.end()) {
+ ASSERT_EQ(count, 0);
+ ASSERT_EQ(bytes, 0);
+ return;
+ }
+
+ const auto& in = it->second.GetInFlow();
+
+ ASSERT_EQ(in.GetCount(), count);
+ ASSERT_EQ(in.GetBytes(), bytes);
+}
+
+TEST(BlobCountingIteratorTest, CountBlobs) {
+ // Note: the input consists of three key-values: two are blob references to
+ // different blob files, while the third one is a plain value.
+ constexpr char user_key0[] = "key0";
+ constexpr char user_key1[] = "key1";
+ constexpr char user_key2[] = "key2";
+
+ const std::vector<std::string> keys{
+ test::KeyStr(user_key0, 1, kTypeBlobIndex),
+ test::KeyStr(user_key1, 2, kTypeBlobIndex),
+ test::KeyStr(user_key2, 3, kTypeValue)};
+
+ constexpr uint64_t first_blob_file_number = 4;
+ constexpr uint64_t first_offset = 1000;
+ constexpr uint64_t first_size = 2000;
+
+ std::string first_blob_index;
+ BlobIndex::EncodeBlob(&first_blob_index, first_blob_file_number, first_offset,
+ first_size, kNoCompression);
+
+ constexpr uint64_t second_blob_file_number = 6;
+ constexpr uint64_t second_offset = 2000;
+ constexpr uint64_t second_size = 4000;
+
+ std::string second_blob_index;
+ BlobIndex::EncodeBlob(&second_blob_index, second_blob_file_number,
+ second_offset, second_size, kNoCompression);
+
+ const std::vector<std::string> values{first_blob_index, second_blob_index,
+ "raw_value"};
+
+ assert(keys.size() == values.size());
+
+ VectorIterator input(keys, values);
+ BlobGarbageMeter blob_garbage_meter;
+
+ BlobCountingIterator blob_counter(&input, &blob_garbage_meter);
+
+ constexpr uint64_t first_expected_bytes =
+ first_size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key0) - 1);
+ constexpr uint64_t second_expected_bytes =
+ second_size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(user_key1) - 1);
+
+ // Call SeekToFirst and iterate forward
+ blob_counter.SeekToFirst();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 0, 0);
+
+ blob_counter.Next();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ blob_counter.Next();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ blob_counter.Next();
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 1,
+ first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ // Do it again using NextAndGetResult
+ blob_counter.SeekToFirst();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 1,
+ second_expected_bytes);
+
+ {
+ IterateResult result;
+ ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
+ ASSERT_EQ(result.key, keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+ }
+
+ {
+ IterateResult result;
+ ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
+ ASSERT_EQ(result.key, keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+ }
+
+ {
+ IterateResult result;
+ ASSERT_FALSE(blob_counter.NextAndGetResult(&result));
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+ }
+
+ // Call SeekToLast and iterate backward
+ blob_counter.SeekToLast();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 2,
+ 2 * second_expected_bytes);
+
+ blob_counter.Prev();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 2,
+ 2 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ blob_counter.Prev();
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 3,
+ 3 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ blob_counter.Prev();
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 3,
+ 3 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ // Call Seek for all keys (plus one that's greater than all of them)
+ blob_counter.Seek(keys[0]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 3,
+ 3 * second_expected_bytes);
+
+ blob_counter.Seek(keys[1]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.Seek(keys[2]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.Seek("zzz");
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ // Call SeekForPrev for all keys (plus one that's less than all of them)
+ blob_counter.SeekForPrev("aaa");
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 4,
+ 4 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.SeekForPrev(keys[0]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[0]);
+ ASSERT_EQ(blob_counter.user_key(), user_key0);
+ ASSERT_EQ(blob_counter.value(), values[0]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+ 5 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 4,
+ 4 * second_expected_bytes);
+
+ blob_counter.SeekForPrev(keys[1]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[1]);
+ ASSERT_EQ(blob_counter.user_key(), user_key1);
+ ASSERT_EQ(blob_counter.value(), values[1]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+ 5 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 5,
+ 5 * second_expected_bytes);
+
+ blob_counter.SeekForPrev(keys[2]);
+ ASSERT_TRUE(blob_counter.Valid());
+ ASSERT_OK(blob_counter.status());
+ ASSERT_EQ(blob_counter.key(), keys[2]);
+ ASSERT_EQ(blob_counter.user_key(), user_key2);
+ ASSERT_EQ(blob_counter.value(), values[2]);
+ CheckInFlow(blob_garbage_meter, first_blob_file_number, 5,
+ 5 * first_expected_bytes);
+ CheckInFlow(blob_garbage_meter, second_blob_file_number, 5,
+ 5 * second_expected_bytes);
+}
+
+TEST(BlobCountingIteratorTest, CorruptBlobIndex) {
+ const std::vector<std::string> keys{
+ test::KeyStr("user_key", 1, kTypeBlobIndex)};
+ const std::vector<std::string> values{"i_am_not_a_blob_index"};
+
+ assert(keys.size() == values.size());
+
+ VectorIterator input(keys, values);
+ BlobGarbageMeter blob_garbage_meter;
+
+ BlobCountingIterator blob_counter(&input, &blob_garbage_meter);
+
+ blob_counter.SeekToFirst();
+ ASSERT_FALSE(blob_counter.Valid());
+ ASSERT_NOK(blob_counter.status());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_fetcher.cc b/src/rocksdb/db/blob/blob_fetcher.cc
new file mode 100644
index 000000000..124429f93
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_fetcher.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_fetcher.h"
+
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobFetcher::FetchBlob(const Slice& user_key,
+ const Slice& blob_index_slice,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value,
+ uint64_t* bytes_read) const {
+ assert(version_);
+
+ return version_->GetBlob(read_options_, user_key, blob_index_slice,
+ prefetch_buffer, blob_value, bytes_read);
+}
+
+Status BlobFetcher::FetchBlob(const Slice& user_key,
+ const BlobIndex& blob_index,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value,
+ uint64_t* bytes_read) const {
+ assert(version_);
+
+ return version_->GetBlob(read_options_, user_key, blob_index, prefetch_buffer,
+ blob_value, bytes_read);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_fetcher.h b/src/rocksdb/db/blob/blob_fetcher.h
new file mode 100644
index 000000000..8aeaf965d
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_fetcher.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class Slice;
+class FilePrefetchBuffer;
+class PinnableSlice;
+class BlobIndex;
+
+// A thin wrapper around the blob retrieval functionality of Version.
+class BlobFetcher {
+ public:
+ BlobFetcher(const Version* version, const ReadOptions& read_options)
+ : version_(version), read_options_(read_options) {}
+
+ Status FetchBlob(const Slice& user_key, const Slice& blob_index_slice,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value, uint64_t* bytes_read) const;
+
+ Status FetchBlob(const Slice& user_key, const BlobIndex& blob_index,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* blob_value, uint64_t* bytes_read) const;
+
+ private:
+ const Version* version_;
+ ReadOptions read_options_;
+};
+} // namespace ROCKSDB_NAMESPACE
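BlobFetcher only comes into play when values actually live in blob files, i.e. when integrated BlobDB is enabled on the column family. The minimal sketch below shows how such values are written and read back through the public API (reads resolve the blob reference on demand, which is the path BlobFetcher wraps for compaction and Get); the scratch path is an assumption and the example is not part of this patch.

#include <cassert>
#include <iostream>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Store values in blob files instead of inline in the SSTs.
  options.enable_blob_files = true;
  options.min_blob_size = 0;  // treat every value as a blob for the example

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/blob_example", &db);
  assert(s.ok());

  s = db->Put(rocksdb::WriteOptions(), "key",
              "a value that ends up in a blob file");
  assert(s.ok());
  s = db->Flush(rocksdb::FlushOptions());  // write out the SST and blob file
  assert(s.ok());

  std::string value;
  s = db->Get(rocksdb::ReadOptions(), "key", &value);
  assert(s.ok());
  std::cout << value << "\n";

  delete db;
  return 0;
}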
diff --git a/src/rocksdb/db/blob/blob_file_addition.cc b/src/rocksdb/db/blob/blob_file_addition.cc
new file mode 100644
index 000000000..71b1bb7fc
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition.cc
@@ -0,0 +1,156 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_addition.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tags for custom fields. Note that these get persisted in the manifest,
+// so existing tags should not be modified.
+enum BlobFileAddition::CustomFieldTags : uint32_t {
+ kEndMarker,
+
+ // Add forward compatible fields here
+
+ /////////////////////////////////////////////////////////////////////
+
+ kForwardIncompatibleMask = 1 << 6,
+
+ // Add forward incompatible fields here
+};
+
+void BlobFileAddition::EncodeTo(std::string* output) const {
+ PutVarint64(output, blob_file_number_);
+ PutVarint64(output, total_blob_count_);
+ PutVarint64(output, total_blob_bytes_);
+ PutLengthPrefixedSlice(output, checksum_method_);
+ PutLengthPrefixedSlice(output, checksum_value_);
+
+ // Encode any custom fields here. The format to use is a Varint32 tag (see
+ // CustomFieldTags above) followed by a length prefixed slice. Unknown custom
+ // fields will be ignored during decoding unless they're in the forward
+ // incompatible range.
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileAddition::EncodeTo::CustomFields", output);
+
+ PutVarint32(output, kEndMarker);
+}
+
+Status BlobFileAddition::DecodeFrom(Slice* input) {
+ constexpr char class_name[] = "BlobFileAddition";
+
+ if (!GetVarint64(input, &blob_file_number_)) {
+ return Status::Corruption(class_name, "Error decoding blob file number");
+ }
+
+ if (!GetVarint64(input, &total_blob_count_)) {
+ return Status::Corruption(class_name, "Error decoding total blob count");
+ }
+
+ if (!GetVarint64(input, &total_blob_bytes_)) {
+ return Status::Corruption(class_name, "Error decoding total blob bytes");
+ }
+
+ Slice checksum_method;
+ if (!GetLengthPrefixedSlice(input, &checksum_method)) {
+ return Status::Corruption(class_name, "Error decoding checksum method");
+ }
+ checksum_method_ = checksum_method.ToString();
+
+ Slice checksum_value;
+ if (!GetLengthPrefixedSlice(input, &checksum_value)) {
+ return Status::Corruption(class_name, "Error decoding checksum value");
+ }
+ checksum_value_ = checksum_value.ToString();
+
+ while (true) {
+ uint32_t custom_field_tag = 0;
+ if (!GetVarint32(input, &custom_field_tag)) {
+ return Status::Corruption(class_name, "Error decoding custom field tag");
+ }
+
+ if (custom_field_tag == kEndMarker) {
+ break;
+ }
+
+ if (custom_field_tag & kForwardIncompatibleMask) {
+ return Status::Corruption(
+ class_name, "Forward incompatible custom field encountered");
+ }
+
+ Slice custom_field_value;
+ if (!GetLengthPrefixedSlice(input, &custom_field_value)) {
+ return Status::Corruption(class_name,
+ "Error decoding custom field value");
+ }
+ }
+
+ return Status::OK();
+}
+
+std::string BlobFileAddition::DebugString() const {
+ std::ostringstream oss;
+
+ oss << *this;
+
+ return oss.str();
+}
+
+std::string BlobFileAddition::DebugJSON() const {
+ JSONWriter jw;
+
+ jw << *this;
+
+ jw.EndObject();
+
+ return jw.Get();
+}
+
+bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs) {
+ return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() &&
+ lhs.GetTotalBlobCount() == rhs.GetTotalBlobCount() &&
+ lhs.GetTotalBlobBytes() == rhs.GetTotalBlobBytes() &&
+ lhs.GetChecksumMethod() == rhs.GetChecksumMethod() &&
+ lhs.GetChecksumValue() == rhs.GetChecksumValue();
+}
+
+bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs) {
+ return !(lhs == rhs);
+}
+
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileAddition& blob_file_addition) {
+ os << "blob_file_number: " << blob_file_addition.GetBlobFileNumber()
+ << " total_blob_count: " << blob_file_addition.GetTotalBlobCount()
+ << " total_blob_bytes: " << blob_file_addition.GetTotalBlobBytes()
+ << " checksum_method: " << blob_file_addition.GetChecksumMethod()
+ << " checksum_value: "
+ << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true);
+
+ return os;
+}
+
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileAddition& blob_file_addition) {
+ jw << "BlobFileNumber" << blob_file_addition.GetBlobFileNumber()
+ << "TotalBlobCount" << blob_file_addition.GetTotalBlobCount()
+ << "TotalBlobBytes" << blob_file_addition.GetTotalBlobBytes()
+ << "ChecksumMethod" << blob_file_addition.GetChecksumMethod()
+ << "ChecksumValue"
+ << Slice(blob_file_addition.GetChecksumValue()).ToString(/* hex */ true);
+
+ return jw;
+}
+
+} // namespace ROCKSDB_NAMESPACE
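The manifest record produced by EncodeTo() is a sequence of varint64 fields, two length-prefixed strings, and a varint32 custom-field section terminated by kEndMarker, with bit 6 of a custom tag marking it forward-incompatible. The sketch below reproduces that wire format with hand-rolled varint helpers; these are illustrative re-implementations, not RocksDB's util/coding.h, and the example record values are arbitrary.

#include <cstdint>
#include <iostream>
#include <string>

// LEB128-style varint: 7 payload bits per byte, MSB set on continuation bytes.
void PutVarint64(std::string* dst, uint64_t v) {
  while (v >= 0x80) {
    dst->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

bool GetVarint64(const std::string& src, size_t* pos, uint64_t* v) {
  uint64_t result = 0;
  for (int shift = 0; shift <= 63 && *pos < src.size(); shift += 7) {
    uint8_t byte = static_cast<uint8_t>(src[(*pos)++]);
    result |= static_cast<uint64_t>(byte & 0x7f) << shift;
    if ((byte & 0x80) == 0) {
      *v = result;
      return true;
    }
  }
  return false;
}

void PutLengthPrefixed(std::string* dst, const std::string& s) {
  PutVarint64(dst, s.size());
  dst->append(s);
}

int main() {
  constexpr uint32_t kEndMarker = 0;
  constexpr uint32_t kForwardIncompatibleMask = 1 << 6;

  // Encode a BlobFileAddition-like record: file number, blob count, blob
  // bytes, checksum method/value, then an empty custom-field section.
  std::string record;
  PutVarint64(&record, 123);      // blob_file_number
  PutVarint64(&record, 2);        // total_blob_count
  PutVarint64(&record, 123456);   // total_blob_bytes
  PutLengthPrefixed(&record, "SHA1");
  PutLengthPrefixed(&record, "0123456789abcdef0123");
  PutVarint64(&record, kEndMarker);

  // Decode the three numeric fields back.
  size_t pos = 0;
  uint64_t file_number = 0, count = 0, bytes = 0;
  bool ok = GetVarint64(record, &pos, &file_number) &&
            GetVarint64(record, &pos, &count) &&
            GetVarint64(record, &pos, &bytes);

  std::cout << (ok ? "decoded " : "failed ") << file_number << " " << count
            << " " << bytes << "\n";
  std::cout << "tag 70 forward-incompatible? "
            << ((70u & kForwardIncompatibleMask) != 0) << "\n";
  return 0;
}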
diff --git a/src/rocksdb/db/blob/blob_file_addition.h b/src/rocksdb/db/blob/blob_file_addition.h
new file mode 100644
index 000000000..43b1a0bcb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition.h
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <iosfwd>
+#include <string>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+class BlobFileAddition {
+ public:
+ BlobFileAddition() = default;
+
+ BlobFileAddition(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value)
+ : blob_file_number_(blob_file_number),
+ total_blob_count_(total_blob_count),
+ total_blob_bytes_(total_blob_bytes),
+ checksum_method_(std::move(checksum_method)),
+ checksum_value_(std::move(checksum_value)) {
+ assert(checksum_method_.empty() == checksum_value_.empty());
+ }
+
+ uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+ uint64_t GetTotalBlobCount() const { return total_blob_count_; }
+ uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; }
+ const std::string& GetChecksumMethod() const { return checksum_method_; }
+ const std::string& GetChecksumValue() const { return checksum_value_; }
+
+ void EncodeTo(std::string* output) const;
+ Status DecodeFrom(Slice* input);
+
+ std::string DebugString() const;
+ std::string DebugJSON() const;
+
+ private:
+ enum CustomFieldTags : uint32_t;
+
+ uint64_t blob_file_number_ = kInvalidBlobFileNumber;
+ uint64_t total_blob_count_ = 0;
+ uint64_t total_blob_bytes_ = 0;
+ std::string checksum_method_;
+ std::string checksum_value_;
+};
+
+bool operator==(const BlobFileAddition& lhs, const BlobFileAddition& rhs);
+bool operator!=(const BlobFileAddition& lhs, const BlobFileAddition& rhs);
+
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileAddition& blob_file_addition);
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileAddition& blob_file_addition);
+
+} // namespace ROCKSDB_NAMESPACE
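
Note (not part of the patch): the class above is essentially a value type. EncodeTo()/DecodeFrom() round-trip it through a plain string, and operator== compares all five fields, which is exactly how the tests in the next file exercise it. A minimal standalone round-trip sketch against the declarations above:

#include <cassert>
#include <string>

#include "db/blob/blob_file_addition.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"

using ROCKSDB_NAMESPACE::BlobFileAddition;
using ROCKSDB_NAMESPACE::Slice;
using ROCKSDB_NAMESPACE::Status;

// Encode a BlobFileAddition and decode it back, as the tests below do.
Status RoundTripExample() {
  const BlobFileAddition original(/* blob_file_number */ 42,
                                  /* total_blob_count */ 1000,
                                  /* total_blob_bytes */ 1 << 20,
                                  /* checksum_method */ "CRC32",
                                  /* checksum_value */ "\x01\x02\x03\x04");

  std::string encoded;
  original.EncodeTo(&encoded);

  BlobFileAddition decoded;
  Slice input(encoded);
  const Status s = decoded.DecodeFrom(&input);
  if (s.ok()) {
    assert(decoded == original);  // operator== compares all five fields
  }
  return s;
}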
diff --git a/src/rocksdb/db/blob/blob_file_addition_test.cc b/src/rocksdb/db/blob/blob_file_addition_test.cc
new file mode 100644
index 000000000..64cb0a9d6
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_addition_test.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_addition.h"
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileAdditionTest : public testing::Test {
+ public:
+ static void TestEncodeDecode(const BlobFileAddition& blob_file_addition) {
+ std::string encoded;
+ blob_file_addition.EncodeTo(&encoded);
+
+ BlobFileAddition decoded;
+ Slice input(encoded);
+ ASSERT_OK(decoded.DecodeFrom(&input));
+
+ ASSERT_EQ(blob_file_addition, decoded);
+ }
+};
+
+TEST_F(BlobFileAdditionTest, Empty) {
+ BlobFileAddition blob_file_addition;
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), kInvalidBlobFileNumber);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 0);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), 0);
+ ASSERT_TRUE(blob_file_addition.GetChecksumMethod().empty());
+ ASSERT_TRUE(blob_file_addition.GetChecksumValue().empty());
+
+ TestEncodeDecode(blob_file_addition);
+}
+
+TEST_F(BlobFileAdditionTest, NonEmpty) {
+ constexpr uint64_t blob_file_number = 123;
+ constexpr uint64_t total_blob_count = 2;
+ constexpr uint64_t total_blob_bytes = 123456;
+ const std::string checksum_method("SHA1");
+ const std::string checksum_value(
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd");
+
+ BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+ total_blob_bytes, checksum_method,
+ checksum_value);
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), total_blob_count);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(), total_blob_bytes);
+ ASSERT_EQ(blob_file_addition.GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(blob_file_addition.GetChecksumValue(), checksum_value);
+
+ TestEncodeDecode(blob_file_addition);
+}
+
+TEST_F(BlobFileAdditionTest, DecodeErrors) {
+ std::string str;
+ Slice slice(str);
+
+ BlobFileAddition blob_file_addition;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "blob file number"));
+ }
+
+ constexpr uint64_t blob_file_number = 123;
+ PutVarint64(&str, blob_file_number);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "total blob count"));
+ }
+
+ constexpr uint64_t total_blob_count = 4567;
+ PutVarint64(&str, total_blob_count);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "total blob bytes"));
+ }
+
+ constexpr uint64_t total_blob_bytes = 12345678;
+ PutVarint64(&str, total_blob_bytes);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "checksum method"));
+ }
+
+ constexpr char checksum_method[] = "SHA1";
+ PutLengthPrefixedSlice(&str, checksum_method);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "checksum value"));
+ }
+
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+ PutLengthPrefixedSlice(&str, checksum_value);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field tag"));
+ }
+
+ constexpr uint32_t custom_tag = 2;
+ PutVarint32(&str, custom_tag);
+ slice = str;
+
+ {
+ const Status s = blob_file_addition.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field value"));
+ }
+}
+
+TEST_F(BlobFileAdditionTest, ForwardCompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_compatible_tag = 2;
+ PutVarint32(output, forward_compatible_tag);
+
+ PutLengthPrefixedSlice(output, "deadbeef");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 678;
+ constexpr uint64_t total_blob_count = 9999;
+ constexpr uint64_t total_blob_bytes = 100000000;
+ const std::string checksum_method("CRC32");
+ const std::string checksum_value("\x3d\x87\xff\x57");
+
+ BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+ total_blob_bytes, checksum_method,
+ checksum_value);
+
+ TestEncodeDecode(blob_file_addition);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileAdditionTest, ForwardIncompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileAddition::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1;
+ PutVarint32(output, forward_incompatible_tag);
+
+ PutLengthPrefixedSlice(output, "foobar");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 456;
+ constexpr uint64_t total_blob_count = 100;
+ constexpr uint64_t total_blob_bytes = 2000000;
+ const std::string checksum_method("CRC32B");
+ const std::string checksum_value("\x6d\xbd\xf2\x3a");
+
+ BlobFileAddition blob_file_addition(blob_file_number, total_blob_count,
+ total_blob_bytes, checksum_method,
+ checksum_value);
+
+ std::string encoded;
+ blob_file_addition.EncodeTo(&encoded);
+
+ BlobFileAddition decoded_blob_file_addition;
+ Slice input(encoded);
+ const Status s = decoded_blob_file_addition.DecodeFrom(&input);
+
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_builder.cc b/src/rocksdb/db/blob/blob_file_builder.cc
new file mode 100644
index 000000000..5e0e7f6cb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.cc
@@ -0,0 +1,446 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_builder.h"
+
+#include <cassert>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "db/event_helpers.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "options/cf_options.h"
+#include "options/options_helper.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/io_tracer.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobFileBuilder::BlobFileBuilder(
+ VersionSet* versions, FileSystem* fs,
+ const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
+ std::string db_id, std::string db_session_id, int job_id,
+ uint32_t column_family_id, const std::string& column_family_name,
+ Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions)
+ : BlobFileBuilder([versions]() { return versions->NewFileNumber(); }, fs,
+ immutable_options, mutable_cf_options, file_options,
+ db_id, db_session_id, job_id, column_family_id,
+ column_family_name, io_priority, write_hint, io_tracer,
+ blob_callback, creation_reason, blob_file_paths,
+ blob_file_additions) {}
+
+BlobFileBuilder::BlobFileBuilder(
+ std::function<uint64_t()> file_number_generator, FileSystem* fs,
+ const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options, const FileOptions* file_options,
+ std::string db_id, std::string db_session_id, int job_id,
+ uint32_t column_family_id, const std::string& column_family_name,
+ Env::IOPriority io_priority, Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions)
+ : file_number_generator_(std::move(file_number_generator)),
+ fs_(fs),
+ immutable_options_(immutable_options),
+ min_blob_size_(mutable_cf_options->min_blob_size),
+ blob_file_size_(mutable_cf_options->blob_file_size),
+ blob_compression_type_(mutable_cf_options->blob_compression_type),
+ prepopulate_blob_cache_(mutable_cf_options->prepopulate_blob_cache),
+ file_options_(file_options),
+ db_id_(std::move(db_id)),
+ db_session_id_(std::move(db_session_id)),
+ job_id_(job_id),
+ column_family_id_(column_family_id),
+ column_family_name_(column_family_name),
+ io_priority_(io_priority),
+ write_hint_(write_hint),
+ io_tracer_(io_tracer),
+ blob_callback_(blob_callback),
+ creation_reason_(creation_reason),
+ blob_file_paths_(blob_file_paths),
+ blob_file_additions_(blob_file_additions),
+ blob_count_(0),
+ blob_bytes_(0) {
+ assert(file_number_generator_);
+ assert(fs_);
+ assert(immutable_options_);
+ assert(file_options_);
+ assert(blob_file_paths_);
+ assert(blob_file_paths_->empty());
+ assert(blob_file_additions_);
+ assert(blob_file_additions_->empty());
+}
+
+BlobFileBuilder::~BlobFileBuilder() = default;
+
+Status BlobFileBuilder::Add(const Slice& key, const Slice& value,
+ std::string* blob_index) {
+ assert(blob_index);
+ assert(blob_index->empty());
+
+ if (value.size() < min_blob_size_) {
+ return Status::OK();
+ }
+
+ {
+ const Status s = OpenBlobFileIfNeeded();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ Slice blob = value;
+ std::string compressed_blob;
+
+ {
+ const Status s = CompressBlobIfNeeded(&blob, &compressed_blob);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ uint64_t blob_file_number = 0;
+ uint64_t blob_offset = 0;
+
+ {
+ const Status s =
+ WriteBlobToFile(key, blob, &blob_file_number, &blob_offset);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ {
+ const Status s = CloseBlobFileIfNeeded();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ {
+ const Status s =
+ PutBlobIntoCacheIfNeeded(value, blob_file_number, blob_offset);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_options_->info_log,
+ "Failed to pre-populate the blob into blob cache: %s",
+ s.ToString().c_str());
+ }
+ }
+
+ BlobIndex::EncodeBlob(blob_index, blob_file_number, blob_offset, blob.size(),
+ blob_compression_type_);
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::Finish() {
+ if (!IsBlobFileOpen()) {
+ return Status::OK();
+ }
+
+ return CloseBlobFile();
+}
+
+bool BlobFileBuilder::IsBlobFileOpen() const { return !!writer_; }
+
+Status BlobFileBuilder::OpenBlobFileIfNeeded() {
+ if (IsBlobFileOpen()) {
+ return Status::OK();
+ }
+
+ assert(!blob_count_);
+ assert(!blob_bytes_);
+
+ assert(file_number_generator_);
+ const uint64_t blob_file_number = file_number_generator_();
+
+ assert(immutable_options_);
+ assert(!immutable_options_->cf_paths.empty());
+ std::string blob_file_path =
+ BlobFileName(immutable_options_->cf_paths.front().path, blob_file_number);
+
+ if (blob_callback_) {
+ blob_callback_->OnBlobFileCreationStarted(
+ blob_file_path, column_family_name_, job_id_, creation_reason_);
+ }
+
+ std::unique_ptr<FSWritableFile> file;
+
+ {
+ assert(file_options_);
+ Status s = NewWritableFile(fs_, blob_file_path, &file, *file_options_);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Note: files get added to blob_file_paths_ right after the open, so they
+ // can be cleaned up upon failure. Contrast this with blob_file_additions_,
+ // which only contains successfully written files.
+ assert(blob_file_paths_);
+ blob_file_paths_->emplace_back(std::move(blob_file_path));
+
+ assert(file);
+ file->SetIOPriority(io_priority_);
+ file->SetWriteLifeTimeHint(write_hint_);
+ FileTypeSet tmp_set = immutable_options_->checksum_handoff_file_types;
+ Statistics* const statistics = immutable_options_->stats;
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), blob_file_paths_->back(), *file_options_,
+ immutable_options_->clock, io_tracer_, statistics,
+ immutable_options_->listeners,
+ immutable_options_->file_checksum_gen_factory.get(),
+ tmp_set.Contains(FileType::kBlobFile), false));
+
+ constexpr bool do_flush = false;
+
+ std::unique_ptr<BlobLogWriter> blob_log_writer(new BlobLogWriter(
+ std::move(file_writer), immutable_options_->clock, statistics,
+ blob_file_number, immutable_options_->use_fsync, do_flush));
+
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+
+ BlobLogHeader header(column_family_id_, blob_compression_type_, has_ttl,
+ expiration_range);
+
+ {
+ Status s = blob_log_writer->WriteHeader(header);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ writer_ = std::move(blob_log_writer);
+
+ assert(IsBlobFileOpen());
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::CompressBlobIfNeeded(
+ Slice* blob, std::string* compressed_blob) const {
+ assert(blob);
+ assert(compressed_blob);
+ assert(compressed_blob->empty());
+ assert(immutable_options_);
+
+ if (blob_compression_type_ == kNoCompression) {
+ return Status::OK();
+ }
+
+ CompressionOptions opts;
+ CompressionContext context(blob_compression_type_);
+ constexpr uint64_t sample_for_compression = 0;
+
+ CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+ blob_compression_type_, sample_for_compression);
+
+ constexpr uint32_t compression_format_version = 2;
+
+ bool success = false;
+
+ {
+ StopWatch stop_watch(immutable_options_->clock, immutable_options_->stats,
+ BLOB_DB_COMPRESSION_MICROS);
+ success =
+ CompressData(*blob, info, compression_format_version, compressed_blob);
+ }
+
+ if (!success) {
+ return Status::Corruption("Error compressing blob");
+ }
+
+ *blob = Slice(*compressed_blob);
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::WriteBlobToFile(const Slice& key, const Slice& blob,
+ uint64_t* blob_file_number,
+ uint64_t* blob_offset) {
+ assert(IsBlobFileOpen());
+ assert(blob_file_number);
+ assert(blob_offset);
+
+ uint64_t key_offset = 0;
+
+ Status s = writer_->AddRecord(key, blob, &key_offset, blob_offset);
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AddRecord", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ *blob_file_number = writer_->get_log_number();
+
+ ++blob_count_;
+ blob_bytes_ += BlobLogRecord::kHeaderSize + key.size() + blob.size();
+
+ return Status::OK();
+}
+
+Status BlobFileBuilder::CloseBlobFile() {
+ assert(IsBlobFileOpen());
+
+ BlobLogFooter footer;
+ footer.blob_count = blob_count_;
+
+ std::string checksum_method;
+ std::string checksum_value;
+
+ Status s = writer_->AppendFooter(footer, &checksum_method, &checksum_value);
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileBuilder::WriteBlobToFile:AppendFooter", &s);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ const uint64_t blob_file_number = writer_->get_log_number();
+
+ if (blob_callback_) {
+ s = blob_callback_->OnBlobFileCompleted(
+ blob_file_paths_->back(), column_family_name_, job_id_,
+ blob_file_number, creation_reason_, s, checksum_value, checksum_method,
+ blob_count_, blob_bytes_);
+ }
+
+ assert(blob_file_additions_);
+ blob_file_additions_->emplace_back(blob_file_number, blob_count_, blob_bytes_,
+ std::move(checksum_method),
+ std::move(checksum_value));
+
+ assert(immutable_options_);
+ ROCKS_LOG_INFO(immutable_options_->logger,
+ "[%s] [JOB %d] Generated blob file #%" PRIu64 ": %" PRIu64
+ " total blobs, %" PRIu64 " total bytes",
+ column_family_name_.c_str(), job_id_, blob_file_number,
+ blob_count_, blob_bytes_);
+
+ writer_.reset();
+ blob_count_ = 0;
+ blob_bytes_ = 0;
+
+ return s;
+}
+
+Status BlobFileBuilder::CloseBlobFileIfNeeded() {
+ assert(IsBlobFileOpen());
+
+ const WritableFileWriter* const file_writer = writer_->file();
+ assert(file_writer);
+
+ if (file_writer->GetFileSize() < blob_file_size_) {
+ return Status::OK();
+ }
+
+ return CloseBlobFile();
+}
+
+void BlobFileBuilder::Abandon(const Status& s) {
+ if (!IsBlobFileOpen()) {
+ return;
+ }
+ if (blob_callback_) {
+ // BlobFileBuilder::Abandon() is called because an error occurred while
+ // writing to the blob file, so the error returned by the callback below
+ // can be ignored.
+ blob_callback_
+ ->OnBlobFileCompleted(blob_file_paths_->back(), column_family_name_,
+ job_id_, writer_->get_log_number(),
+ creation_reason_, s, "", "", blob_count_,
+ blob_bytes_)
+ .PermitUncheckedError();
+ }
+
+ writer_.reset();
+ blob_count_ = 0;
+ blob_bytes_ = 0;
+}
+
+Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
+ uint64_t blob_file_number,
+ uint64_t blob_offset) const {
+ Status s = Status::OK();
+
+ auto blob_cache = immutable_options_->blob_cache;
+ auto statistics = immutable_options_->statistics.get();
+ bool warm_cache =
+ prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly &&
+ creation_reason_ == BlobFileCreationReason::kFlush;
+
+ if (blob_cache && warm_cache) {
+ const OffsetableCacheKey base_cache_key(db_id_, db_session_id_,
+ blob_file_number);
+ const CacheKey cache_key = base_cache_key.WithOffset(blob_offset);
+ const Slice key = cache_key.AsSlice();
+
+ const Cache::Priority priority = Cache::Priority::BOTTOM;
+
+ // Objects to be put into the cache have to be heap-allocated and
+ // self-contained, i.e. own their contents. The Cache has to be able to
+ // take unique ownership of them.
+ CacheAllocationPtr allocation =
+ AllocateBlock(blob.size(), blob_cache->memory_allocator());
+ memcpy(allocation.get(), blob.data(), blob.size());
+ std::unique_ptr<BlobContents> buf =
+ BlobContents::Create(std::move(allocation), blob.size());
+
+ Cache::CacheItemHelper* const cache_item_helper =
+ BlobContents::GetCacheItemHelper();
+ assert(cache_item_helper);
+
+ if (immutable_options_->lowest_used_cache_tier ==
+ CacheTier::kNonVolatileBlockTier) {
+ s = blob_cache->Insert(key, buf.get(), cache_item_helper,
+ buf->ApproximateMemoryUsage(),
+ nullptr /* cache_handle */, priority);
+ } else {
+ s = blob_cache->Insert(key, buf.get(), buf->ApproximateMemoryUsage(),
+ cache_item_helper->del_cb,
+ nullptr /* cache_handle */, priority);
+ }
+
+ if (s.ok()) {
+ RecordTick(statistics, BLOB_DB_CACHE_ADD);
+ RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, buf->size());
+ buf.release();
+ } else {
+ RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES);
+ }
+ }
+
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
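
Note (not part of the patch): the blob_index string returned by Add() is a serialized BlobIndex pointing at the record just written (blob file number, offset, on-disk value size, and compression type). A short illustrative decode sketch; the BlobIndex accessors used here are the same ones exercised by blob_file_builder_test.cc below, not an API introduced in this hunk:

#include <cassert>
#include <cstdint>
#include <string>

#include "db/blob/blob_index.h"
#include "rocksdb/status.h"

using ROCKSDB_NAMESPACE::BlobIndex;
using ROCKSDB_NAMESPACE::Status;

// Decode the reference produced by BlobFileBuilder::Add() to find out where
// the (possibly compressed) blob ended up.
Status InspectBlobReference(const std::string& blob_index_str) {
  BlobIndex index;
  const Status s = index.DecodeFrom(blob_index_str);
  if (!s.ok()) {
    return s;
  }
  assert(!index.IsInlined());  // the builder only writes file-based references
  assert(!index.HasTTL());
  const uint64_t file_number = index.file_number();  // blob file to open
  const uint64_t offset = index.offset();            // record offset in file
  const uint64_t size = index.size();                // stored (compressed) size
  (void)file_number;
  (void)offset;
  (void)size;
  return Status::OK();
}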
diff --git a/src/rocksdb/db/blob/blob_file_builder.h b/src/rocksdb/db/blob/blob_file_builder.h
new file mode 100644
index 000000000..8e7aab502
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder.h
@@ -0,0 +1,112 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <cinttypes>
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/env.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionSet;
+class FileSystem;
+class SystemClock;
+struct ImmutableOptions;
+struct MutableCFOptions;
+struct FileOptions;
+class BlobFileAddition;
+class Status;
+class Slice;
+class BlobLogWriter;
+class IOTracer;
+class BlobFileCompletionCallback;
+
+class BlobFileBuilder {
+ public:
+ BlobFileBuilder(VersionSet* versions, FileSystem* fs,
+ const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options,
+ const FileOptions* file_options, std::string db_id,
+ std::string db_session_id, int job_id,
+ uint32_t column_family_id,
+ const std::string& column_family_name,
+ Env::IOPriority io_priority,
+ Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions);
+
+ BlobFileBuilder(std::function<uint64_t()> file_number_generator,
+ FileSystem* fs, const ImmutableOptions* immutable_options,
+ const MutableCFOptions* mutable_cf_options,
+ const FileOptions* file_options, std::string db_id,
+ std::string db_session_id, int job_id,
+ uint32_t column_family_id,
+ const std::string& column_family_name,
+ Env::IOPriority io_priority,
+ Env::WriteLifeTimeHint write_hint,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCompletionCallback* blob_callback,
+ BlobFileCreationReason creation_reason,
+ std::vector<std::string>* blob_file_paths,
+ std::vector<BlobFileAddition>* blob_file_additions);
+
+ BlobFileBuilder(const BlobFileBuilder&) = delete;
+ BlobFileBuilder& operator=(const BlobFileBuilder&) = delete;
+
+ ~BlobFileBuilder();
+
+ Status Add(const Slice& key, const Slice& value, std::string* blob_index);
+ Status Finish();
+ void Abandon(const Status& s);
+
+ private:
+ bool IsBlobFileOpen() const;
+ Status OpenBlobFileIfNeeded();
+ Status CompressBlobIfNeeded(Slice* blob, std::string* compressed_blob) const;
+ Status WriteBlobToFile(const Slice& key, const Slice& blob,
+ uint64_t* blob_file_number, uint64_t* blob_offset);
+ Status CloseBlobFile();
+ Status CloseBlobFileIfNeeded();
+
+ Status PutBlobIntoCacheIfNeeded(const Slice& blob, uint64_t blob_file_number,
+ uint64_t blob_offset) const;
+
+ std::function<uint64_t()> file_number_generator_;
+ FileSystem* fs_;
+ const ImmutableOptions* immutable_options_;
+ uint64_t min_blob_size_;
+ uint64_t blob_file_size_;
+ CompressionType blob_compression_type_;
+ PrepopulateBlobCache prepopulate_blob_cache_;
+ const FileOptions* file_options_;
+ const std::string db_id_;
+ const std::string db_session_id_;
+ int job_id_;
+ uint32_t column_family_id_;
+ std::string column_family_name_;
+ Env::IOPriority io_priority_;
+ Env::WriteLifeTimeHint write_hint_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ BlobFileCompletionCallback* blob_callback_;
+ BlobFileCreationReason creation_reason_;
+ std::vector<std::string>* blob_file_paths_;
+ std::vector<BlobFileAddition>* blob_file_additions_;
+ std::unique_ptr<BlobLogWriter> writer_;
+ uint64_t blob_count_;
+ uint64_t blob_bytes_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
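
Note (not part of the patch): for orientation, a hypothetical end-to-end use of the second constructor above (the one taking a file-number generator), closely following the tests in blob_file_builder_test.cc below; the option wiring, column family ID/name, and priorities are placeholder values, not anything mandated by this header:

#include <cstdint>
#include <string>
#include <vector>

#include "db/blob/blob_file_addition.h"
#include "db/blob/blob_file_builder.h"
#include "options/cf_options.h"
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/status.h"

using namespace ROCKSDB_NAMESPACE;

// Feed one value through the builder; large values come back as blob
// references in `blob_index`, values below min_blob_size leave it empty.
Status WriteOneValue(FileSystem* fs, const ImmutableOptions& ioptions,
                     const MutableCFOptions& moptions,
                     const FileOptions& foptions) {
  uint64_t next_file_number = 1;
  std::vector<std::string> blob_file_paths;
  std::vector<BlobFileAddition> blob_file_additions;

  BlobFileBuilder builder(
      [&next_file_number]() { return next_file_number++; }, fs, &ioptions,
      &moptions, &foptions, /* db_id */ "", /* db_session_id */ "",
      /* job_id */ 1, /* column_family_id */ 0,
      /* column_family_name */ "default", Env::IO_HIGH, Env::WLTH_MEDIUM,
      /* io_tracer */ nullptr, /* blob_callback */ nullptr,
      BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);

  std::string blob_index;
  Status s = builder.Add("key", std::string(1024, 'v'), &blob_index);
  if (!s.ok()) {
    builder.Abandon(s);
    return s;
  }

  s = builder.Finish();
  // On success, blob_file_paths/blob_file_additions describe the new file(s).
  return s;
}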
diff --git a/src/rocksdb/db/blob/blob_file_builder_test.cc b/src/rocksdb/db/blob/blob_file_builder_test.cc
new file mode 100644
index 000000000..3a0feee45
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_builder_test.cc
@@ -0,0 +1,680 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_builder.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_sequential_reader.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/compression.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TestFileNumberGenerator {
+ public:
+ uint64_t operator()() { return ++next_file_number_; }
+
+ private:
+ uint64_t next_file_number_ = 1;
+};
+
+class BlobFileBuilderTest : public testing::Test {
+ protected:
+ BlobFileBuilderTest() {
+ mock_env_.reset(MockEnv::Create(Env::Default()));
+ fs_ = mock_env_->GetFileSystem().get();
+ clock_ = mock_env_->GetSystemClock().get();
+ }
+
+ void VerifyBlobFile(uint64_t blob_file_number,
+ const std::string& blob_file_path,
+ uint32_t column_family_id,
+ CompressionType blob_compression_type,
+ const std::vector<std::pair<std::string, std::string>>&
+ expected_key_value_pairs,
+ const std::vector<std::string>& blob_indexes) {
+ assert(expected_key_value_pairs.size() == blob_indexes.size());
+
+ std::unique_ptr<FSRandomAccessFile> file;
+ constexpr IODebugContext* dbg = nullptr;
+ ASSERT_OK(
+ fs_->NewRandomAccessFile(blob_file_path, file_options_, &file, dbg));
+
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(std::move(file), blob_file_path, clock_));
+
+ constexpr Statistics* statistics = nullptr;
+ BlobLogSequentialReader blob_log_reader(std::move(file_reader), clock_,
+ statistics);
+
+ BlobLogHeader header;
+ ASSERT_OK(blob_log_reader.ReadHeader(&header));
+ ASSERT_EQ(header.version, kVersion1);
+ ASSERT_EQ(header.column_family_id, column_family_id);
+ ASSERT_EQ(header.compression, blob_compression_type);
+ ASSERT_FALSE(header.has_ttl);
+ ASSERT_EQ(header.expiration_range, ExpirationRange());
+
+ for (size_t i = 0; i < expected_key_value_pairs.size(); ++i) {
+ BlobLogRecord record;
+ uint64_t blob_offset = 0;
+
+ ASSERT_OK(blob_log_reader.ReadRecord(
+ &record, BlobLogSequentialReader::kReadHeaderKeyBlob, &blob_offset));
+
+ // Check the contents of the blob file
+ const auto& expected_key_value = expected_key_value_pairs[i];
+ const auto& key = expected_key_value.first;
+ const auto& value = expected_key_value.second;
+
+ ASSERT_EQ(record.key_size, key.size());
+ ASSERT_EQ(record.value_size, value.size());
+ ASSERT_EQ(record.expiration, 0);
+ ASSERT_EQ(record.key, key);
+ ASSERT_EQ(record.value, value);
+
+ // Make sure the blob reference returned by the builder points to the
+ // right place
+ BlobIndex blob_index;
+ ASSERT_OK(blob_index.DecodeFrom(blob_indexes[i]));
+ ASSERT_FALSE(blob_index.IsInlined());
+ ASSERT_FALSE(blob_index.HasTTL());
+ ASSERT_EQ(blob_index.file_number(), blob_file_number);
+ ASSERT_EQ(blob_index.offset(), blob_offset);
+ ASSERT_EQ(blob_index.size(), value.size());
+ }
+
+ BlobLogFooter footer;
+ ASSERT_OK(blob_log_reader.ReadFooter(&footer));
+ ASSERT_EQ(footer.blob_count, expected_key_value_pairs.size());
+ ASSERT_EQ(footer.expiration_range, ExpirationRange());
+ }
+
+ std::unique_ptr<Env> mock_env_;
+ FileSystem* fs_;
+ SystemClock* clock_;
+ FileOptions file_options_;
+};
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckOneFile) {
+ // Build a single blob file
+ constexpr size_t number_of_blobs = 10;
+ constexpr size_t key_size = 1;
+ constexpr size_t value_size = 4;
+ constexpr size_t value_offset = 1234;
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileBuilderTest_BuildAndCheckOneFile"),
+ 0);
+ options.enable_blob_files = true;
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+ number_of_blobs);
+ std::vector<std::string> blob_indexes(number_of_blobs);
+
+ for (size_t i = 0; i < number_of_blobs; ++i) {
+ auto& expected_key_value = expected_key_value_pairs[i];
+
+ auto& key = expected_key_value.first;
+ key = std::to_string(i);
+ assert(key.size() == key_size);
+
+ auto& value = expected_key_value.second;
+ value = std::to_string(i + value_offset);
+ assert(value.size() == value_size);
+
+ auto& blob_index = blob_indexes[i];
+
+ ASSERT_OK(builder.Add(key, value, &blob_index));
+ ASSERT_FALSE(blob_index.empty());
+ }
+
+ ASSERT_OK(builder.Finish());
+
+ // Check the metadata generated
+ constexpr uint64_t blob_file_number = 2;
+
+ ASSERT_EQ(blob_file_paths.size(), 1);
+
+ const std::string& blob_file_path = blob_file_paths[0];
+
+ ASSERT_EQ(
+ blob_file_path,
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+ ASSERT_EQ(blob_file_additions.size(), 1);
+
+ const auto& blob_file_addition = blob_file_additions[0];
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), number_of_blobs);
+ ASSERT_EQ(
+ blob_file_addition.GetTotalBlobBytes(),
+ number_of_blobs * (BlobLogRecord::kHeaderSize + key_size + value_size));
+
+ // Verify the contents of the new blob file as well as the blob references
+ VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+ kNoCompression, expected_key_value_pairs, blob_indexes);
+}
+
+TEST_F(BlobFileBuilderTest, BuildAndCheckMultipleFiles) {
+ // Build multiple blob files: file size limit is set to the size of a single
+ // value, so each blob ends up in a file of its own
+ constexpr size_t number_of_blobs = 10;
+ constexpr size_t key_size = 1;
+ constexpr size_t value_size = 10;
+ constexpr size_t value_offset = 1234567890;
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileBuilderTest_BuildAndCheckMultipleFiles"),
+ 0);
+ options.enable_blob_files = true;
+ options.blob_file_size = value_size;
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ std::vector<std::pair<std::string, std::string>> expected_key_value_pairs(
+ number_of_blobs);
+ std::vector<std::string> blob_indexes(number_of_blobs);
+
+ for (size_t i = 0; i < number_of_blobs; ++i) {
+ auto& expected_key_value = expected_key_value_pairs[i];
+
+ auto& key = expected_key_value.first;
+ key = std::to_string(i);
+ assert(key.size() == key_size);
+
+ auto& value = expected_key_value.second;
+ value = std::to_string(i + value_offset);
+ assert(value.size() == value_size);
+
+ auto& blob_index = blob_indexes[i];
+
+ ASSERT_OK(builder.Add(key, value, &blob_index));
+ ASSERT_FALSE(blob_index.empty());
+ }
+
+ ASSERT_OK(builder.Finish());
+
+ // Check the metadata generated
+ ASSERT_EQ(blob_file_paths.size(), number_of_blobs);
+ ASSERT_EQ(blob_file_additions.size(), number_of_blobs);
+
+ for (size_t i = 0; i < number_of_blobs; ++i) {
+ const uint64_t blob_file_number = i + 2;
+
+ ASSERT_EQ(blob_file_paths[i],
+ BlobFileName(immutable_options.cf_paths.front().path,
+ blob_file_number));
+
+ const auto& blob_file_addition = blob_file_additions[i];
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+ BlobLogRecord::kHeaderSize + key_size + value_size);
+ }
+
+ // Verify the contents of the new blob files as well as the blob references
+ for (size_t i = 0; i < number_of_blobs; ++i) {
+ std::vector<std::pair<std::string, std::string>> expected_key_value_pair{
+ expected_key_value_pairs[i]};
+ std::vector<std::string> blob_index{blob_indexes[i]};
+
+ VerifyBlobFile(i + 2, blob_file_paths[i], column_family_id, kNoCompression,
+ expected_key_value_pair, blob_index);
+ }
+}
+
+TEST_F(BlobFileBuilderTest, InlinedValues) {
+ // All values are below the min_blob_size threshold; no blob files get written
+ constexpr size_t number_of_blobs = 10;
+ constexpr size_t key_size = 1;
+ constexpr size_t value_size = 10;
+ constexpr size_t value_offset = 1234567890;
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileBuilderTest_InlinedValues"),
+ 0);
+ options.enable_blob_files = true;
+ options.min_blob_size = 1024;
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ for (size_t i = 0; i < number_of_blobs; ++i) {
+ const std::string key = std::to_string(i);
+ assert(key.size() == key_size);
+
+ const std::string value = std::to_string(i + value_offset);
+ assert(value.size() == value_size);
+
+ std::string blob_index;
+ ASSERT_OK(builder.Add(key, value, &blob_index));
+ ASSERT_TRUE(blob_index.empty());
+ }
+
+ ASSERT_OK(builder.Finish());
+
+ // Check the metadata generated
+ ASSERT_TRUE(blob_file_paths.empty());
+ ASSERT_TRUE(blob_file_additions.empty());
+}
+
+TEST_F(BlobFileBuilderTest, Compression) {
+ // Build a blob file with a compressed blob
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ constexpr size_t key_size = 1;
+ constexpr size_t value_size = 100;
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Compression"),
+ 0);
+ options.enable_blob_files = true;
+ options.blob_compression_type = kSnappyCompression;
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ const std::string key("1");
+ const std::string uncompressed_value(value_size, 'x');
+
+ std::string blob_index;
+
+ ASSERT_OK(builder.Add(key, uncompressed_value, &blob_index));
+ ASSERT_FALSE(blob_index.empty());
+
+ ASSERT_OK(builder.Finish());
+
+ // Check the metadata generated
+ constexpr uint64_t blob_file_number = 2;
+
+ ASSERT_EQ(blob_file_paths.size(), 1);
+
+ const std::string& blob_file_path = blob_file_paths[0];
+
+ ASSERT_EQ(
+ blob_file_path,
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+ ASSERT_EQ(blob_file_additions.size(), 1);
+
+ const auto& blob_file_addition = blob_file_additions[0];
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+
+ CompressionOptions opts;
+ CompressionContext context(kSnappyCompression);
+ constexpr uint64_t sample_for_compression = 0;
+
+ CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+ kSnappyCompression, sample_for_compression);
+
+ std::string compressed_value;
+ ASSERT_TRUE(Snappy_Compress(info, uncompressed_value.data(),
+ uncompressed_value.size(), &compressed_value));
+
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+ BlobLogRecord::kHeaderSize + key_size + compressed_value.size());
+
+ // Verify the contents of the new blob file as well as the blob reference
+ std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
+ {key, compressed_value}};
+ std::vector<std::string> blob_indexes{blob_index};
+
+ VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+ kSnappyCompression, expected_key_value_pairs, blob_indexes);
+}
+
+TEST_F(BlobFileBuilderTest, CompressionError) {
+ // Simulate an error during compression
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileBuilderTest_CompressionError"),
+ 0);
+ options.enable_blob_files = true;
+ options.blob_compression_type = kSnappyCompression;
+ options.env = mock_env_.get();
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ SyncPoint::GetInstance()->SetCallBack("CompressData:TamperWithReturnValue",
+ [](void* arg) {
+ bool* ret = static_cast<bool*>(arg);
+ *ret = false;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr char key[] = "1";
+ constexpr char value[] = "deadbeef";
+
+ std::string blob_index;
+
+ ASSERT_TRUE(builder.Add(key, value, &blob_index).IsCorruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ constexpr uint64_t blob_file_number = 2;
+
+ ASSERT_EQ(blob_file_paths.size(), 1);
+ ASSERT_EQ(
+ blob_file_paths[0],
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+ ASSERT_TRUE(blob_file_additions.empty());
+}
+
+TEST_F(BlobFileBuilderTest, Checksum) {
+ // Build a blob file with checksum
+
+ class DummyFileChecksumGenerator : public FileChecksumGenerator {
+ public:
+ void Update(const char* /* data */, size_t /* n */) override {}
+
+ void Finalize() override {}
+
+ std::string GetChecksum() const override { return std::string("dummy"); }
+
+ const char* Name() const override { return "DummyFileChecksum"; }
+ };
+
+ class DummyFileChecksumGenFactory : public FileChecksumGenFactory {
+ public:
+ std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+ const FileChecksumGenContext& /* context */) override {
+ return std::unique_ptr<FileChecksumGenerator>(
+ new DummyFileChecksumGenerator);
+ }
+
+ const char* Name() const override { return "DummyFileChecksumGenFactory"; }
+ };
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileBuilderTest_Checksum"),
+ 0);
+ options.enable_blob_files = true;
+ options.file_checksum_gen_factory =
+ std::make_shared<DummyFileChecksumGenFactory>();
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ const std::string key("1");
+ const std::string value("deadbeef");
+
+ std::string blob_index;
+
+ ASSERT_OK(builder.Add(key, value, &blob_index));
+ ASSERT_FALSE(blob_index.empty());
+
+ ASSERT_OK(builder.Finish());
+
+ // Check the metadata generated
+ constexpr uint64_t blob_file_number = 2;
+
+ ASSERT_EQ(blob_file_paths.size(), 1);
+
+ const std::string& blob_file_path = blob_file_paths[0];
+
+ ASSERT_EQ(
+ blob_file_path,
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number));
+
+ ASSERT_EQ(blob_file_additions.size(), 1);
+
+ const auto& blob_file_addition = blob_file_additions[0];
+
+ ASSERT_EQ(blob_file_addition.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobCount(), 1);
+ ASSERT_EQ(blob_file_addition.GetTotalBlobBytes(),
+ BlobLogRecord::kHeaderSize + key.size() + value.size());
+ ASSERT_EQ(blob_file_addition.GetChecksumMethod(), "DummyFileChecksum");
+ ASSERT_EQ(blob_file_addition.GetChecksumValue(), "dummy");
+
+ // Verify the contents of the new blob file as well as the blob reference
+ std::vector<std::pair<std::string, std::string>> expected_key_value_pairs{
+ {key, value}};
+ std::vector<std::string> blob_indexes{blob_index};
+
+ VerifyBlobFile(blob_file_number, blob_file_path, column_family_id,
+ kNoCompression, expected_key_value_pairs, blob_indexes);
+}
+
+class BlobFileBuilderIOErrorTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ BlobFileBuilderIOErrorTest() : sync_point_(GetParam()) {
+ mock_env_.reset(MockEnv::Create(Env::Default()));
+ fs_ = mock_env_->GetFileSystem().get();
+ }
+
+ std::unique_ptr<Env> mock_env_;
+ FileSystem* fs_;
+ FileOptions file_options_;
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ BlobFileBuilderTest, BlobFileBuilderIOErrorTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile",
+ "BlobFileBuilder::OpenBlobFileIfNeeded:WriteHeader",
+ "BlobFileBuilder::WriteBlobToFile:AddRecord",
+ "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(BlobFileBuilderIOErrorTest, IOError) {
+ // Simulate an I/O error during the specified step of Add()
+ // Note: blob_file_size will be set to value_size in order for the first blob
+ // to trigger close
+ constexpr size_t value_size = 8;
+
+ Options options;
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileBuilderIOErrorTest_IOError"),
+ 0);
+ options.enable_blob_files = true;
+ options.blob_file_size = value_size;
+ options.env = mock_env_.get();
+
+ ImmutableOptions immutable_options(options);
+ MutableCFOptions mutable_cf_options(options);
+
+ constexpr int job_id = 1;
+ constexpr uint32_t column_family_id = 123;
+ constexpr char column_family_name[] = "foobar";
+ constexpr Env::IOPriority io_priority = Env::IO_HIGH;
+ constexpr Env::WriteLifeTimeHint write_hint = Env::WLTH_MEDIUM;
+
+ std::vector<std::string> blob_file_paths;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ BlobFileBuilder builder(
+ TestFileNumberGenerator(), fs_, &immutable_options, &mutable_cf_options,
+ &file_options_, "" /*db_id*/, "" /*db_session_id*/, job_id,
+ column_family_id, column_family_name, io_priority, write_hint,
+ nullptr /*IOTracer*/, nullptr /*BlobFileCompletionCallback*/,
+ BlobFileCreationReason::kFlush, &blob_file_paths, &blob_file_additions);
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+ Status* const s = static_cast<Status*>(arg);
+ assert(s);
+
+ (*s) = Status::IOError(sync_point_);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr char key[] = "1";
+ constexpr char value[] = "deadbeef";
+
+ std::string blob_index;
+
+ ASSERT_TRUE(builder.Add(key, value, &blob_index).IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ if (sync_point_ == "BlobFileBuilder::OpenBlobFileIfNeeded:NewWritableFile") {
+ ASSERT_TRUE(blob_file_paths.empty());
+ } else {
+ constexpr uint64_t blob_file_number = 2;
+
+ ASSERT_EQ(blob_file_paths.size(), 1);
+ ASSERT_EQ(blob_file_paths[0],
+ BlobFileName(immutable_options.cf_paths.front().path,
+ blob_file_number));
+ }
+
+ ASSERT_TRUE(blob_file_additions.empty());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_cache.cc b/src/rocksdb/db/blob/blob_file_cache.cc
new file mode 100644
index 000000000..1a6cdf688
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache.cc
@@ -0,0 +1,102 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_cache.h"
+
+#include <cassert>
+#include <memory>
+
+#include "db/blob/blob_file_reader.h"
+#include "options/cf_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/slice.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/io_tracer.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobFileCache::BlobFileCache(Cache* cache,
+ const ImmutableOptions* immutable_options,
+ const FileOptions* file_options,
+ uint32_t column_family_id,
+ HistogramImpl* blob_file_read_hist,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : cache_(cache),
+ mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr),
+ immutable_options_(immutable_options),
+ file_options_(file_options),
+ column_family_id_(column_family_id),
+ blob_file_read_hist_(blob_file_read_hist),
+ io_tracer_(io_tracer) {
+ assert(cache_);
+ assert(immutable_options_);
+ assert(file_options_);
+}
+
+Status BlobFileCache::GetBlobFileReader(
+ uint64_t blob_file_number,
+ CacheHandleGuard<BlobFileReader>* blob_file_reader) {
+ assert(blob_file_reader);
+ assert(blob_file_reader->IsEmpty());
+
+ const Slice key = GetSlice(&blob_file_number);
+
+ assert(cache_);
+
+ Cache::Handle* handle = cache_->Lookup(key);
+ if (handle) {
+ *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+ return Status::OK();
+ }
+
+ TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck");
+
+ // Check again while holding mutex
+ MutexLock lock(mutex_.get(key));
+
+ handle = cache_->Lookup(key);
+ if (handle) {
+ *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+ return Status::OK();
+ }
+
+ assert(immutable_options_);
+ Statistics* const statistics = immutable_options_->stats;
+
+ RecordTick(statistics, NO_FILE_OPENS);
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ {
+ assert(file_options_);
+ const Status s = BlobFileReader::Create(
+ *immutable_options_, *file_options_, column_family_id_,
+ blob_file_read_hist_, blob_file_number, io_tracer_, &reader);
+ if (!s.ok()) {
+ RecordTick(statistics, NO_FILE_ERRORS);
+ return s;
+ }
+ }
+
+ {
+ constexpr size_t charge = 1;
+
+ const Status s = cache_->Insert(key, reader.get(), charge,
+ &DeleteCacheEntry<BlobFileReader>, &handle);
+ if (!s.ok()) {
+ RecordTick(statistics, NO_FILE_ERRORS);
+ return s;
+ }
+ }
+
+ reader.release();
+
+ *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
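
Note (not part of the patch): GetBlobFileReader() above is a double-checked lookup. The Cache itself is thread-safe, so the first Lookup() takes no extra lock; the striped mutex only serializes the slow path, so a given blob file is opened by at most one thread while the others wait and then hit the cache. A rough, self-contained illustration of the same shape using toy types (plain std::mutex and std::unordered_map, not RocksDB's Cache or Striped<port::Mutex>):

#include <array>
#include <cstdint>
#include <functional>
#include <memory>
#include <mutex>
#include <unordered_map>

struct ToyReader {
  uint64_t file_number;
};

class ToyReaderCache {
 public:
  explicit ToyReaderCache(
      std::function<std::shared_ptr<ToyReader>(uint64_t)> open)
      : open_(std::move(open)) {}

  std::shared_ptr<ToyReader> Get(uint64_t file_number) {
    if (auto reader = Lookup(file_number)) {
      return reader;  // fast path: cache hit, no stripe lock taken
    }
    // Slow path: serialize per file so only one thread opens it.
    std::lock_guard<std::mutex> stripe_guard(stripes_[file_number % kStripes]);
    if (auto reader = Lookup(file_number)) {
      return reader;  // another thread opened it while we waited
    }
    std::shared_ptr<ToyReader> reader = open_(file_number);
    std::lock_guard<std::mutex> cache_guard(cache_mutex_);
    cache_[file_number] = reader;
    return reader;
  }

 private:
  std::shared_ptr<ToyReader> Lookup(uint64_t file_number) {
    std::lock_guard<std::mutex> cache_guard(cache_mutex_);
    auto it = cache_.find(file_number);
    if (it == cache_.end()) {
      return nullptr;
    }
    return it->second;
  }

  static constexpr size_t kStripes = 8;
  std::function<std::shared_ptr<ToyReader>(uint64_t)> open_;
  std::mutex cache_mutex_;
  std::unordered_map<uint64_t, std::shared_ptr<ToyReader>> cache_;
  std::array<std::mutex, kStripes> stripes_;
};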
diff --git a/src/rocksdb/db/blob/blob_file_cache.h b/src/rocksdb/db/blob/blob_file_cache.h
new file mode 100644
index 000000000..8eec05f18
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "cache/cache_helpers.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Cache;
+struct ImmutableOptions;
+struct FileOptions;
+class HistogramImpl;
+class Status;
+class BlobFileReader;
+class Slice;
+class IOTracer;
+
+class BlobFileCache {
+ public:
+ BlobFileCache(Cache* cache, const ImmutableOptions* immutable_options,
+ const FileOptions* file_options, uint32_t column_family_id,
+ HistogramImpl* blob_file_read_hist,
+ const std::shared_ptr<IOTracer>& io_tracer);
+
+ BlobFileCache(const BlobFileCache&) = delete;
+ BlobFileCache& operator=(const BlobFileCache&) = delete;
+
+ Status GetBlobFileReader(uint64_t blob_file_number,
+ CacheHandleGuard<BlobFileReader>* blob_file_reader);
+
+ private:
+ Cache* cache_;
+ // Note: mutex_ below is used to guard against multiple threads racing to open
+ // the same file.
+ Striped<port::Mutex, Slice> mutex_;
+ const ImmutableOptions* immutable_options_;
+ const FileOptions* file_options_;
+ uint32_t column_family_id_;
+ HistogramImpl* blob_file_read_hist_;
+ std::shared_ptr<IOTracer> io_tracer_;
+
+ static constexpr size_t kNumberOfMutexStripes = 1 << 7;
+};
+
+} // namespace ROCKSDB_NAMESPACE
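
Note (not part of the patch): a call-site sketch for the header above, mirroring the cache test that follows. The CacheHandleGuard keeps the reader pinned in the backing cache for as long as the guard is alive; once it goes out of scope, the entry becomes evictable again. (Illustrative only; the actual blob read path lives in the BlobFileReader code included by blob_file_cache.cc.)

#include <cassert>
#include <cstdint>

#include "cache/cache_helpers.h"
#include "db/blob/blob_file_cache.h"
#include "db/blob/blob_file_reader.h"
#include "rocksdb/status.h"

using ROCKSDB_NAMESPACE::BlobFileCache;
using ROCKSDB_NAMESPACE::BlobFileReader;
using ROCKSDB_NAMESPACE::CacheHandleGuard;
using ROCKSDB_NAMESPACE::Status;

// Open (or fetch from cache) the reader for one blob file and hold it only
// for the scope of the guard.
Status PinReaderBriefly(BlobFileCache& blob_file_cache,
                        uint64_t blob_file_number) {
  CacheHandleGuard<BlobFileReader> reader;
  const Status s =
      blob_file_cache.GetBlobFileReader(blob_file_number, &reader);
  if (!s.ok()) {
    return s;  // e.g. IOError if the blob file does not exist
  }
  assert(reader.GetValue() != nullptr);
  // reader.GetValue() exposes the actual BlobFileReader used to fetch blobs.
  return Status::OK();
}  // guard destroyed here; the cached reader is unpinned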
diff --git a/src/rocksdb/db/blob/blob_file_cache_test.cc b/src/rocksdb/db/blob/blob_file_cache_test.cc
new file mode 100644
index 000000000..d3a61b3c5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_cache_test.cc
@@ -0,0 +1,269 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_cache.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with a single blob in it.
+void WriteBlobFile(uint32_t column_family_id,
+ const ImmutableOptions& immutable_options,
+ uint64_t blob_file_number) {
+ assert(!immutable_options.cf_paths.empty());
+
+ const std::string blob_file_path =
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+ FileOptions()));
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+ constexpr Statistics* statistics = nullptr;
+ constexpr bool use_fsync = false;
+ constexpr bool do_flush = false;
+
+ BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+ statistics, blob_file_number, use_fsync,
+ do_flush);
+
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+
+ BlobLogHeader header(column_family_id, kNoCompression, has_ttl,
+ expiration_range);
+
+ ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ std::string compressed_blob;
+
+ uint64_t key_offset = 0;
+ uint64_t blob_offset = 0;
+
+ ASSERT_OK(blob_log_writer.AddRecord(key, blob, &key_offset, &blob_offset));
+
+ BlobLogFooter footer;
+ footer.blob_count = 1;
+ footer.expiration_range = expiration_range;
+
+ std::string checksum_method;
+ std::string checksum_value;
+
+ ASSERT_OK(
+ blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+} // anonymous namespace
+
+class BlobFileCacheTest : public testing::Test {
+ protected:
+ BlobFileCacheTest() { mock_env_.reset(MockEnv::Create(Env::Default())); }
+
+ std::unique_ptr<Env> mock_env_;
+};
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr uint32_t column_family_id = 1;
+ ImmutableOptions immutable_options(options);
+ constexpr uint64_t blob_file_number = 123;
+
+ WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+ // First try: reader should be opened and put in cache
+ CacheHandleGuard<BlobFileReader> first;
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
+ ASSERT_NE(first.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+ // Second try: reader should be served from cache
+ CacheHandleGuard<BlobFileReader> second;
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
+ ASSERT_NE(second.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+ ASSERT_EQ(first.GetValue(), second.GetValue());
+}
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader_Race"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr uint32_t column_family_id = 1;
+ ImmutableOptions immutable_options(options);
+ constexpr uint64_t blob_file_number = 123;
+
+ WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+ CacheHandleGuard<BlobFileReader> first;
+ CacheHandleGuard<BlobFileReader> second;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) {
+ // Disabling sync points to prevent infinite recursion
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
+ ASSERT_NE(second.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
+ ASSERT_NE(first.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
+
+ ASSERT_EQ(first.GetValue(), second.GetValue());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader_IOError"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ ImmutableOptions immutable_options(options);
+ FileOptions file_options;
+ constexpr uint32_t column_family_id = 1;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+  // Note: no blob file exists with the number below
+ constexpr uint64_t blob_file_number = 123;
+
+ CacheHandleGuard<BlobFileReader> reader;
+
+ ASSERT_TRUE(
+ blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError());
+ ASSERT_EQ(reader.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
+}
+
+TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) {
+ Options options;
+ options.env = mock_env_.get();
+ options.statistics = CreateDBStatistics();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileCacheTest_GetBlobFileReader_CacheFull"),
+ 0);
+ options.enable_blob_files = true;
+
+ constexpr uint32_t column_family_id = 1;
+ ImmutableOptions immutable_options(options);
+ constexpr uint64_t blob_file_number = 123;
+
+ WriteBlobFile(column_family_id, immutable_options, blob_file_number);
+
+ constexpr size_t capacity = 0;
+ constexpr int num_shard_bits = -1; // determined automatically
+ constexpr bool strict_capacity_limit = true;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity, num_shard_bits, strict_capacity_limit);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ BlobFileCache blob_file_cache(backing_cache.get(), &immutable_options,
+ &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/);
+
+  // Insertion into the cache should fail since it has zero capacity and
+  // strict_capacity_limit is set
+ CacheHandleGuard<BlobFileReader> reader;
+
+ ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader)
+ .IsMemoryLimit());
+ ASSERT_EQ(reader.GetValue(), nullptr);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
+ ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_completion_callback.h b/src/rocksdb/db/blob/blob_file_completion_callback.h
new file mode 100644
index 000000000..ffe65a0ff
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_completion_callback.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileCompletionCallback {
+ public:
+ BlobFileCompletionCallback(
+ SstFileManager* sst_file_manager, InstrumentedMutex* mutex,
+ ErrorHandler* error_handler, EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& dbname)
+ : event_logger_(event_logger), listeners_(listeners), dbname_(dbname) {
+#ifndef ROCKSDB_LITE
+ sst_file_manager_ = sst_file_manager;
+ mutex_ = mutex;
+ error_handler_ = error_handler;
+#else
+ (void)sst_file_manager;
+ (void)mutex;
+ (void)error_handler;
+#endif // ROCKSDB_LITE
+ }
+
+ void OnBlobFileCreationStarted(const std::string& file_name,
+ const std::string& column_family_name,
+ int job_id,
+ BlobFileCreationReason creation_reason) {
+#ifndef ROCKSDB_LITE
+ // Notify the listeners.
+ EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_,
+ column_family_name, file_name,
+ job_id, creation_reason);
+#else
+ (void)file_name;
+ (void)column_family_name;
+ (void)job_id;
+ (void)creation_reason;
+#endif
+ }
+
+ Status OnBlobFileCompleted(const std::string& file_name,
+ const std::string& column_family_name, int job_id,
+ uint64_t file_number,
+ BlobFileCreationReason creation_reason,
+ const Status& report_status,
+ const std::string& checksum_value,
+ const std::string& checksum_method,
+ uint64_t blob_count, uint64_t blob_bytes) {
+ Status s;
+
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager_);
+ if (sfm) {
+ // Report new blob files to SstFileManagerImpl
+ s = sfm->OnAddFile(file_name);
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ s = Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT(
+ "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached");
+ InstrumentedMutexLock l(mutex_);
+ error_handler_->SetBGError(s, BackgroundErrorReason::kFlush);
+ }
+ }
+#endif // !ROCKSDB_LITE
+
+ // Notify the listeners.
+ EventHelpers::LogAndNotifyBlobFileCreationFinished(
+ event_logger_, listeners_, dbname_, column_family_name, file_name,
+ job_id, file_number, creation_reason,
+ (!report_status.ok() ? report_status : s),
+ (checksum_value.empty() ? kUnknownFileChecksum : checksum_value),
+ (checksum_method.empty() ? kUnknownFileChecksumFuncName
+ : checksum_method),
+ blob_count, blob_bytes);
+ return s;
+ }
+
+ private:
+#ifndef ROCKSDB_LITE
+ SstFileManager* sst_file_manager_;
+ InstrumentedMutex* mutex_;
+ ErrorHandler* error_handler_;
+#endif // ROCKSDB_LITE
+ EventLogger* event_logger_;
+ std::vector<std::shared_ptr<EventListener>> listeners_;
+ std::string dbname_;
+};
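+
+// Editorial note: illustrative usage sketch only; it is not part of the
+// original change. The DB layer typically owns one callback per column family
+// and invokes it around blob file creation. The surrounding objects
+// (sst_file_manager, db_mutex, error_handler, event_logger, listeners, dbname,
+// file/cf names and counters) are placeholders assumed to exist in the caller.
+//
+//   BlobFileCompletionCallback callback(sst_file_manager, &db_mutex,
+//                                       &error_handler, &event_logger,
+//                                       listeners, dbname);
+//
+//   callback.OnBlobFileCreationStarted(file_name, cf_name, job_id,
+//                                      BlobFileCreationReason::kFlush);
+//   // ... write the blob file ...
+//   const Status s = callback.OnBlobFileCompleted(
+//       file_name, cf_name, job_id, file_number,
+//       BlobFileCreationReason::kFlush, Status::OK(), checksum_value,
+//       checksum_method, blob_count, blob_bytes);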
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage.cc b/src/rocksdb/db/blob/blob_file_garbage.cc
new file mode 100644
index 000000000..52c336f49
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_garbage.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tags for custom fields. Note that these get persisted in the manifest,
+// so existing tags should not be modified.
+enum BlobFileGarbage::CustomFieldTags : uint32_t {
+ kEndMarker,
+
+ // Add forward compatible fields here
+
+ /////////////////////////////////////////////////////////////////////
+
+ kForwardIncompatibleMask = 1 << 6,
+
+ // Add forward incompatible fields here
+};
+
+void BlobFileGarbage::EncodeTo(std::string* output) const {
+ PutVarint64(output, blob_file_number_);
+ PutVarint64(output, garbage_blob_count_);
+ PutVarint64(output, garbage_blob_bytes_);
+
+ // Encode any custom fields here. The format to use is a Varint32 tag (see
+  // CustomFieldTags above) followed by a length-prefixed slice. Unknown custom
+ // fields will be ignored during decoding unless they're in the forward
+ // incompatible range.
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileGarbage::EncodeTo::CustomFields", output);
+
+ PutVarint32(output, kEndMarker);
+}
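+
+// Editorial note: illustrative sketch of the custom field wire format described
+// above; it is not part of the original change. A forward compatible field is a
+// Varint32 tag below kForwardIncompatibleMask followed by a length-prefixed
+// slice, emitted before the end marker. With a hypothetical tag 2:
+//
+//   std::string encoded;
+//   PutVarint64(&encoded, blob_file_number);
+//   PutVarint64(&encoded, garbage_blob_count);
+//   PutVarint64(&encoded, garbage_blob_bytes);
+//   PutVarint32(&encoded, 2);                     // custom field tag
+//   PutLengthPrefixedSlice(&encoded, "payload");  // custom field value
+//   PutVarint32(&encoded, 0);                     // kEndMarker
+//
+// DecodeFrom below skips the unknown tag 2 because it is outside the
+// forward incompatible range (1 << 6 and above).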
+
+Status BlobFileGarbage::DecodeFrom(Slice* input) {
+ constexpr char class_name[] = "BlobFileGarbage";
+
+ if (!GetVarint64(input, &blob_file_number_)) {
+ return Status::Corruption(class_name, "Error decoding blob file number");
+ }
+
+ if (!GetVarint64(input, &garbage_blob_count_)) {
+ return Status::Corruption(class_name, "Error decoding garbage blob count");
+ }
+
+ if (!GetVarint64(input, &garbage_blob_bytes_)) {
+ return Status::Corruption(class_name, "Error decoding garbage blob bytes");
+ }
+
+ while (true) {
+ uint32_t custom_field_tag = 0;
+ if (!GetVarint32(input, &custom_field_tag)) {
+ return Status::Corruption(class_name, "Error decoding custom field tag");
+ }
+
+ if (custom_field_tag == kEndMarker) {
+ break;
+ }
+
+ if (custom_field_tag & kForwardIncompatibleMask) {
+ return Status::Corruption(
+ class_name, "Forward incompatible custom field encountered");
+ }
+
+ Slice custom_field_value;
+ if (!GetLengthPrefixedSlice(input, &custom_field_value)) {
+ return Status::Corruption(class_name,
+ "Error decoding custom field value");
+ }
+ }
+
+ return Status::OK();
+}
+
+std::string BlobFileGarbage::DebugString() const {
+ std::ostringstream oss;
+
+ oss << *this;
+
+ return oss.str();
+}
+
+std::string BlobFileGarbage::DebugJSON() const {
+ JSONWriter jw;
+
+ jw << *this;
+
+ jw.EndObject();
+
+ return jw.Get();
+}
+
+bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) {
+ return lhs.GetBlobFileNumber() == rhs.GetBlobFileNumber() &&
+ lhs.GetGarbageBlobCount() == rhs.GetGarbageBlobCount() &&
+ lhs.GetGarbageBlobBytes() == rhs.GetGarbageBlobBytes();
+}
+
+bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs) {
+ return !(lhs == rhs);
+}
+
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileGarbage& blob_file_garbage) {
+ os << "blob_file_number: " << blob_file_garbage.GetBlobFileNumber()
+ << " garbage_blob_count: " << blob_file_garbage.GetGarbageBlobCount()
+ << " garbage_blob_bytes: " << blob_file_garbage.GetGarbageBlobBytes();
+
+ return os;
+}
+
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileGarbage& blob_file_garbage) {
+ jw << "BlobFileNumber" << blob_file_garbage.GetBlobFileNumber()
+ << "GarbageBlobCount" << blob_file_garbage.GetGarbageBlobCount()
+ << "GarbageBlobBytes" << blob_file_garbage.GetGarbageBlobBytes();
+
+ return jw;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage.h b/src/rocksdb/db/blob/blob_file_garbage.h
new file mode 100644
index 000000000..6dc14ddca
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage.h
@@ -0,0 +1,57 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <string>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+class BlobFileGarbage {
+ public:
+ BlobFileGarbage() = default;
+
+ BlobFileGarbage(uint64_t blob_file_number, uint64_t garbage_blob_count,
+ uint64_t garbage_blob_bytes)
+ : blob_file_number_(blob_file_number),
+ garbage_blob_count_(garbage_blob_count),
+ garbage_blob_bytes_(garbage_blob_bytes) {}
+
+ uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+ uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+ uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+ void EncodeTo(std::string* output) const;
+ Status DecodeFrom(Slice* input);
+
+ std::string DebugString() const;
+ std::string DebugJSON() const;
+
+ private:
+ enum CustomFieldTags : uint32_t;
+
+ uint64_t blob_file_number_ = kInvalidBlobFileNumber;
+ uint64_t garbage_blob_count_ = 0;
+ uint64_t garbage_blob_bytes_ = 0;
+};
+
+bool operator==(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs);
+bool operator!=(const BlobFileGarbage& lhs, const BlobFileGarbage& rhs);
+
+std::ostream& operator<<(std::ostream& os,
+ const BlobFileGarbage& blob_file_garbage);
+JSONWriter& operator<<(JSONWriter& jw,
+ const BlobFileGarbage& blob_file_garbage);
+
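+// Editorial note: minimal round-trip sketch for the interface above; it is not
+// part of the original change, and the values are arbitrary. These records are
+// persisted in the manifest, so the encoding must stay stable.
+//
+//   BlobFileGarbage garbage(/* blob_file_number */ 123,
+//                           /* garbage_blob_count */ 1,
+//                           /* garbage_blob_bytes */ 9876);
+//
+//   std::string encoded;
+//   garbage.EncodeTo(&encoded);
+//
+//   BlobFileGarbage decoded;
+//   Slice input(encoded);
+//   const Status s = decoded.DecodeFrom(&input);
+//   assert(s.ok() && decoded == garbage);
+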
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_garbage_test.cc b/src/rocksdb/db/blob/blob_file_garbage_test.cc
new file mode 100644
index 000000000..292a8b38a
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_garbage_test.cc
@@ -0,0 +1,174 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_garbage.h"
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileGarbageTest : public testing::Test {
+ public:
+ static void TestEncodeDecode(const BlobFileGarbage& blob_file_garbage) {
+ std::string encoded;
+ blob_file_garbage.EncodeTo(&encoded);
+
+ BlobFileGarbage decoded;
+ Slice input(encoded);
+ ASSERT_OK(decoded.DecodeFrom(&input));
+
+ ASSERT_EQ(blob_file_garbage, decoded);
+ }
+};
+
+TEST_F(BlobFileGarbageTest, Empty) {
+ BlobFileGarbage blob_file_garbage;
+
+ ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), kInvalidBlobFileNumber);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), 0);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), 0);
+
+ TestEncodeDecode(blob_file_garbage);
+}
+
+TEST_F(BlobFileGarbageTest, NonEmpty) {
+ constexpr uint64_t blob_file_number = 123;
+ constexpr uint64_t garbage_blob_count = 1;
+ constexpr uint64_t garbage_blob_bytes = 9876;
+
+ BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ ASSERT_EQ(blob_file_garbage.GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobCount(), garbage_blob_count);
+ ASSERT_EQ(blob_file_garbage.GetGarbageBlobBytes(), garbage_blob_bytes);
+
+ TestEncodeDecode(blob_file_garbage);
+}
+
+TEST_F(BlobFileGarbageTest, DecodeErrors) {
+ std::string str;
+ Slice slice(str);
+
+ BlobFileGarbage blob_file_garbage;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "blob file number"));
+ }
+
+ constexpr uint64_t blob_file_number = 123;
+ PutVarint64(&str, blob_file_number);
+ slice = str;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "garbage blob count"));
+ }
+
+ constexpr uint64_t garbage_blob_count = 4567;
+ PutVarint64(&str, garbage_blob_count);
+ slice = str;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "garbage blob bytes"));
+ }
+
+ constexpr uint64_t garbage_blob_bytes = 12345678;
+ PutVarint64(&str, garbage_blob_bytes);
+ slice = str;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field tag"));
+ }
+
+ constexpr uint32_t custom_tag = 2;
+ PutVarint32(&str, custom_tag);
+ slice = str;
+
+ {
+ const Status s = blob_file_garbage.DecodeFrom(&slice);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "custom field value"));
+ }
+}
+
+TEST_F(BlobFileGarbageTest, ForwardCompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_compatible_tag = 2;
+ PutVarint32(output, forward_compatible_tag);
+
+ PutLengthPrefixedSlice(output, "deadbeef");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 678;
+ constexpr uint64_t garbage_blob_count = 9999;
+ constexpr uint64_t garbage_blob_bytes = 100000000;
+
+ BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ TestEncodeDecode(blob_file_garbage);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileGarbageTest, ForwardIncompatibleCustomField) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileGarbage::EncodeTo::CustomFields", [&](void* arg) {
+ std::string* output = static_cast<std::string*>(arg);
+
+ constexpr uint32_t forward_incompatible_tag = (1 << 6) + 1;
+ PutVarint32(output, forward_incompatible_tag);
+
+ PutLengthPrefixedSlice(output, "foobar");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t blob_file_number = 456;
+ constexpr uint64_t garbage_blob_count = 100;
+ constexpr uint64_t garbage_blob_bytes = 2000000;
+
+ BlobFileGarbage blob_file_garbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ std::string encoded;
+ blob_file_garbage.EncodeTo(&encoded);
+
+ BlobFileGarbage decoded_blob_file_addition;
+ Slice input(encoded);
+ const Status s = decoded_blob_file_addition.DecodeFrom(&input);
+
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Forward incompatible"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_file_meta.cc b/src/rocksdb/db/blob/blob_file_meta.cc
new file mode 100644
index 000000000..4913137e5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_meta.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_meta.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t SharedBlobFileMetaData::GetBlobFileSize() const {
+ return BlobLogHeader::kSize + total_blob_bytes_ + BlobLogFooter::kSize;
+}
+
+std::string SharedBlobFileMetaData::DebugString() const {
+ std::ostringstream oss;
+ oss << (*this);
+
+ return oss.str();
+}
+
+std::ostream& operator<<(std::ostream& os,
+ const SharedBlobFileMetaData& shared_meta) {
+ os << "blob_file_number: " << shared_meta.GetBlobFileNumber()
+ << " total_blob_count: " << shared_meta.GetTotalBlobCount()
+ << " total_blob_bytes: " << shared_meta.GetTotalBlobBytes()
+ << " checksum_method: " << shared_meta.GetChecksumMethod()
+ << " checksum_value: "
+ << Slice(shared_meta.GetChecksumValue()).ToString(/* hex */ true);
+
+ return os;
+}
+
+std::string BlobFileMetaData::DebugString() const {
+ std::ostringstream oss;
+ oss << (*this);
+
+ return oss.str();
+}
+
+std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta) {
+ const auto& shared_meta = meta.GetSharedMeta();
+ assert(shared_meta);
+ os << (*shared_meta);
+
+ os << " linked_ssts: {";
+ for (uint64_t file_number : meta.GetLinkedSsts()) {
+ os << ' ' << file_number;
+ }
+ os << " }";
+
+ os << " garbage_blob_count: " << meta.GetGarbageBlobCount()
+ << " garbage_blob_bytes: " << meta.GetGarbageBlobBytes();
+
+ return os;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_meta.h b/src/rocksdb/db/blob/blob_file_meta.h
new file mode 100644
index 000000000..d7c8a1243
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_meta.h
@@ -0,0 +1,170 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <unordered_set>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SharedBlobFileMetaData represents the immutable part of blob files' metadata,
+// like the blob file number, total number and size of blobs, or checksum
+// method and value. There is supposed to be one object of this class per blob
+// file (shared across all versions that include the blob file in question);
+// hence, the type is neither copyable nor movable. A blob file can be marked
+// obsolete when the corresponding SharedBlobFileMetaData object is destroyed.
+
+class SharedBlobFileMetaData {
+ public:
+ static std::shared_ptr<SharedBlobFileMetaData> Create(
+ uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value) {
+ return std::shared_ptr<SharedBlobFileMetaData>(new SharedBlobFileMetaData(
+ blob_file_number, total_blob_count, total_blob_bytes,
+ std::move(checksum_method), std::move(checksum_value)));
+ }
+
+ template <typename Deleter>
+ static std::shared_ptr<SharedBlobFileMetaData> Create(
+ uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value, Deleter deleter) {
+ return std::shared_ptr<SharedBlobFileMetaData>(
+ new SharedBlobFileMetaData(blob_file_number, total_blob_count,
+ total_blob_bytes, std::move(checksum_method),
+ std::move(checksum_value)),
+ deleter);
+ }
+
+ SharedBlobFileMetaData(const SharedBlobFileMetaData&) = delete;
+ SharedBlobFileMetaData& operator=(const SharedBlobFileMetaData&) = delete;
+
+ SharedBlobFileMetaData(SharedBlobFileMetaData&&) = delete;
+ SharedBlobFileMetaData& operator=(SharedBlobFileMetaData&&) = delete;
+
+ uint64_t GetBlobFileSize() const;
+ uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+ uint64_t GetTotalBlobCount() const { return total_blob_count_; }
+ uint64_t GetTotalBlobBytes() const { return total_blob_bytes_; }
+ const std::string& GetChecksumMethod() const { return checksum_method_; }
+ const std::string& GetChecksumValue() const { return checksum_value_; }
+
+ std::string DebugString() const;
+
+ private:
+ SharedBlobFileMetaData(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value)
+ : blob_file_number_(blob_file_number),
+ total_blob_count_(total_blob_count),
+ total_blob_bytes_(total_blob_bytes),
+ checksum_method_(std::move(checksum_method)),
+ checksum_value_(std::move(checksum_value)) {
+ assert(checksum_method_.empty() == checksum_value_.empty());
+ }
+
+ uint64_t blob_file_number_;
+ uint64_t total_blob_count_;
+ uint64_t total_blob_bytes_;
+ std::string checksum_method_;
+ std::string checksum_value_;
+};
+
+std::ostream& operator<<(std::ostream& os,
+ const SharedBlobFileMetaData& shared_meta);
+
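+// Editorial note: illustrative sketch only; it is not part of the original
+// change. A custom deleter can be passed to Create() so that the blob file is
+// marked obsolete once the last version referencing it is gone;
+// MarkBlobFileObsolete is a placeholder for that cleanup.
+//
+//   auto shared_meta = SharedBlobFileMetaData::Create(
+//       /* blob_file_number */ 123, /* total_blob_count */ 10,
+//       /* total_blob_bytes */ 4096, /* checksum_method */ std::string(),
+//       /* checksum_value */ std::string(),
+//       [](SharedBlobFileMetaData* meta) {
+//         MarkBlobFileObsolete(meta->GetBlobFileNumber());
+//         delete meta;
+//       });
+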
+// BlobFileMetaData contains the part of the metadata for blob files that can
+// vary across versions, like the amount of garbage in the blob file. In
+// addition, BlobFileMetaData objects point to and share the ownership of the
+// SharedBlobFileMetaData object for the corresponding blob file. Similarly to
+// SharedBlobFileMetaData, BlobFileMetaData objects are neither copyable nor
+// movable. They are meant to be jointly owned by the versions in which the
+// blob file has the same (immutable *and* mutable) state.
+
+class BlobFileMetaData {
+ public:
+ using LinkedSsts = std::unordered_set<uint64_t>;
+
+ static std::shared_ptr<BlobFileMetaData> Create(
+ std::shared_ptr<SharedBlobFileMetaData> shared_meta,
+ LinkedSsts linked_ssts, uint64_t garbage_blob_count,
+ uint64_t garbage_blob_bytes) {
+ return std::shared_ptr<BlobFileMetaData>(
+ new BlobFileMetaData(std::move(shared_meta), std::move(linked_ssts),
+ garbage_blob_count, garbage_blob_bytes));
+ }
+
+ BlobFileMetaData(const BlobFileMetaData&) = delete;
+ BlobFileMetaData& operator=(const BlobFileMetaData&) = delete;
+
+ BlobFileMetaData(BlobFileMetaData&&) = delete;
+ BlobFileMetaData& operator=(BlobFileMetaData&&) = delete;
+
+ const std::shared_ptr<SharedBlobFileMetaData>& GetSharedMeta() const {
+ return shared_meta_;
+ }
+
+ uint64_t GetBlobFileSize() const {
+ assert(shared_meta_);
+ return shared_meta_->GetBlobFileSize();
+ }
+
+ uint64_t GetBlobFileNumber() const {
+ assert(shared_meta_);
+ return shared_meta_->GetBlobFileNumber();
+ }
+ uint64_t GetTotalBlobCount() const {
+ assert(shared_meta_);
+ return shared_meta_->GetTotalBlobCount();
+ }
+ uint64_t GetTotalBlobBytes() const {
+ assert(shared_meta_);
+ return shared_meta_->GetTotalBlobBytes();
+ }
+ const std::string& GetChecksumMethod() const {
+ assert(shared_meta_);
+ return shared_meta_->GetChecksumMethod();
+ }
+ const std::string& GetChecksumValue() const {
+ assert(shared_meta_);
+ return shared_meta_->GetChecksumValue();
+ }
+
+ const LinkedSsts& GetLinkedSsts() const { return linked_ssts_; }
+
+ uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+ uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+ std::string DebugString() const;
+
+ private:
+ BlobFileMetaData(std::shared_ptr<SharedBlobFileMetaData> shared_meta,
+ LinkedSsts linked_ssts, uint64_t garbage_blob_count,
+ uint64_t garbage_blob_bytes)
+ : shared_meta_(std::move(shared_meta)),
+ linked_ssts_(std::move(linked_ssts)),
+ garbage_blob_count_(garbage_blob_count),
+ garbage_blob_bytes_(garbage_blob_bytes) {
+ assert(shared_meta_);
+ assert(garbage_blob_count_ <= shared_meta_->GetTotalBlobCount());
+ assert(garbage_blob_bytes_ <= shared_meta_->GetTotalBlobBytes());
+ }
+
+ std::shared_ptr<SharedBlobFileMetaData> shared_meta_;
+ LinkedSsts linked_ssts_;
+ uint64_t garbage_blob_count_;
+ uint64_t garbage_blob_bytes_;
+};
+
+std::ostream& operator<<(std::ostream& os, const BlobFileMetaData& meta);
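+
+// Editorial note: illustrative sketch only (not part of the original change),
+// building on the SharedBlobFileMetaData example above. A version-specific
+// BlobFileMetaData shares ownership of the immutable metadata and records the
+// SSTs currently linked to the blob file plus the garbage accumulated so far.
+//
+//   BlobFileMetaData::LinkedSsts linked_ssts{7, 8};
+//   auto meta = BlobFileMetaData::Create(shared_meta, std::move(linked_ssts),
+//                                        /* garbage_blob_count */ 2,
+//                                        /* garbage_blob_bytes */ 512);
+//   assert(meta->GetBlobFileNumber() == 123);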
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader.cc b/src/rocksdb/db/blob/blob_file_reader.cc
new file mode 100644
index 000000000..a4eabb605
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader.cc
@@ -0,0 +1,610 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_reader.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_log_format.h"
+#include "file/file_prefetch_buffer.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "options/cf_options.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "table/multiget_context.h"
+#include "test_util/sync_point.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobFileReader::Create(
+ const ImmutableOptions& immutable_options, const FileOptions& file_options,
+ uint32_t column_family_id, HistogramImpl* blob_file_read_hist,
+ uint64_t blob_file_number, const std::shared_ptr<IOTracer>& io_tracer,
+ std::unique_ptr<BlobFileReader>* blob_file_reader) {
+ assert(blob_file_reader);
+ assert(!*blob_file_reader);
+
+ uint64_t file_size = 0;
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+
+ {
+ const Status s =
+ OpenFile(immutable_options, file_options, blob_file_read_hist,
+ blob_file_number, io_tracer, &file_size, &file_reader);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ assert(file_reader);
+
+ Statistics* const statistics = immutable_options.stats;
+
+ CompressionType compression_type = kNoCompression;
+
+ {
+ const Status s = ReadHeader(file_reader.get(), column_family_id, statistics,
+ &compression_type);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ {
+ const Status s = ReadFooter(file_reader.get(), file_size, statistics);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ blob_file_reader->reset(
+ new BlobFileReader(std::move(file_reader), file_size, compression_type,
+ immutable_options.clock, statistics));
+
+ return Status::OK();
+}
+
+Status BlobFileReader::OpenFile(
+ const ImmutableOptions& immutable_options, const FileOptions& file_opts,
+ HistogramImpl* blob_file_read_hist, uint64_t blob_file_number,
+ const std::shared_ptr<IOTracer>& io_tracer, uint64_t* file_size,
+ std::unique_ptr<RandomAccessFileReader>* file_reader) {
+ assert(file_size);
+ assert(file_reader);
+
+ const auto& cf_paths = immutable_options.cf_paths;
+ assert(!cf_paths.empty());
+
+ const std::string blob_file_path =
+ BlobFileName(cf_paths.front().path, blob_file_number);
+
+ FileSystem* const fs = immutable_options.fs.get();
+ assert(fs);
+
+ constexpr IODebugContext* dbg = nullptr;
+
+ {
+ TEST_SYNC_POINT("BlobFileReader::OpenFile:GetFileSize");
+
+ const Status s =
+ fs->GetFileSize(blob_file_path, IOOptions(), file_size, dbg);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (*file_size < BlobLogHeader::kSize + BlobLogFooter::kSize) {
+ return Status::Corruption("Malformed blob file");
+ }
+
+ std::unique_ptr<FSRandomAccessFile> file;
+
+ {
+ TEST_SYNC_POINT("BlobFileReader::OpenFile:NewRandomAccessFile");
+
+ const Status s =
+ fs->NewRandomAccessFile(blob_file_path, file_opts, &file, dbg);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ assert(file);
+
+ if (immutable_options.advise_random_on_open) {
+ file->Hint(FSRandomAccessFile::kRandom);
+ }
+
+ file_reader->reset(new RandomAccessFileReader(
+ std::move(file), blob_file_path, immutable_options.clock, io_tracer,
+ immutable_options.stats, BLOB_DB_BLOB_FILE_READ_MICROS,
+ blob_file_read_hist, immutable_options.rate_limiter.get(),
+ immutable_options.listeners));
+
+ return Status::OK();
+}
+
+Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
+ uint32_t column_family_id,
+ Statistics* statistics,
+ CompressionType* compression_type) {
+ assert(file_reader);
+ assert(compression_type);
+
+ Slice header_slice;
+ Buffer buf;
+ AlignedBuf aligned_buf;
+
+ {
+ TEST_SYNC_POINT("BlobFileReader::ReadHeader:ReadFromFile");
+
+ constexpr uint64_t read_offset = 0;
+ constexpr size_t read_size = BlobLogHeader::kSize;
+
+ // TODO: rate limit reading headers from blob files.
+ const Status s = ReadFromFile(file_reader, read_offset, read_size,
+ statistics, &header_slice, &buf, &aligned_buf,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!s.ok()) {
+ return s;
+ }
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadHeader:TamperWithResult",
+ &header_slice);
+ }
+
+ BlobLogHeader header;
+
+ {
+ const Status s = header.DecodeFrom(header_slice);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ constexpr ExpirationRange no_expiration_range;
+
+ if (header.has_ttl || header.expiration_range != no_expiration_range) {
+ return Status::Corruption("Unexpected TTL blob file");
+ }
+
+ if (header.column_family_id != column_family_id) {
+ return Status::Corruption("Column family ID mismatch");
+ }
+
+ *compression_type = header.compression;
+
+ return Status::OK();
+}
+
+Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
+ uint64_t file_size, Statistics* statistics) {
+ assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize);
+ assert(file_reader);
+
+ Slice footer_slice;
+ Buffer buf;
+ AlignedBuf aligned_buf;
+
+ {
+ TEST_SYNC_POINT("BlobFileReader::ReadFooter:ReadFromFile");
+
+ const uint64_t read_offset = file_size - BlobLogFooter::kSize;
+ constexpr size_t read_size = BlobLogFooter::kSize;
+
+ // TODO: rate limit reading footers from blob files.
+ const Status s = ReadFromFile(file_reader, read_offset, read_size,
+ statistics, &footer_slice, &buf, &aligned_buf,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!s.ok()) {
+ return s;
+ }
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileReader::ReadFooter:TamperWithResult",
+ &footer_slice);
+ }
+
+ BlobLogFooter footer;
+
+ {
+ const Status s = footer.DecodeFrom(footer_slice);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ constexpr ExpirationRange no_expiration_range;
+
+ if (footer.expiration_range != no_expiration_range) {
+ return Status::Corruption("Unexpected TTL blob file");
+ }
+
+ return Status::OK();
+}
+
+Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
+ uint64_t read_offset, size_t read_size,
+ Statistics* statistics, Slice* slice,
+ Buffer* buf, AlignedBuf* aligned_buf,
+ Env::IOPriority rate_limiter_priority) {
+ assert(slice);
+ assert(buf);
+ assert(aligned_buf);
+
+ assert(file_reader);
+
+ RecordTick(statistics, BLOB_DB_BLOB_FILE_BYTES_READ, read_size);
+
+ Status s;
+
+ if (file_reader->use_direct_io()) {
+ constexpr char* scratch = nullptr;
+
+ s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch,
+ aligned_buf, rate_limiter_priority);
+ } else {
+ buf->reset(new char[read_size]);
+ constexpr AlignedBuf* aligned_scratch = nullptr;
+
+ s = file_reader->Read(IOOptions(), read_offset, read_size, slice,
+ buf->get(), aligned_scratch, rate_limiter_priority);
+ }
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (slice->size() != read_size) {
+ return Status::Corruption("Failed to read data from blob file");
+ }
+
+ return Status::OK();
+}
+
+BlobFileReader::BlobFileReader(
+ std::unique_ptr<RandomAccessFileReader>&& file_reader, uint64_t file_size,
+ CompressionType compression_type, SystemClock* clock,
+ Statistics* statistics)
+ : file_reader_(std::move(file_reader)),
+ file_size_(file_size),
+ compression_type_(compression_type),
+ clock_(clock),
+ statistics_(statistics) {
+ assert(file_reader_);
+}
+
+BlobFileReader::~BlobFileReader() = default;
+
+Status BlobFileReader::GetBlob(
+ const ReadOptions& read_options, const Slice& user_key, uint64_t offset,
+ uint64_t value_size, CompressionType compression_type,
+ FilePrefetchBuffer* prefetch_buffer, MemoryAllocator* allocator,
+ std::unique_ptr<BlobContents>* result, uint64_t* bytes_read) const {
+ assert(result);
+
+ const uint64_t key_size = user_key.size();
+
+ if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) {
+ return Status::Corruption("Invalid blob offset");
+ }
+
+ if (compression_type != compression_type_) {
+ return Status::Corruption("Compression type mismatch when reading blob");
+ }
+
+  // Note: if verify_checksums is set, we read the entire blob record so that
+  // the checksum can be verified; otherwise, we only read the blob itself.
+  // Since the offset in BlobIndex actually points to the blob value, we need
+  // to make an adjustment in the former case.
+ const uint64_t adjustment =
+ read_options.verify_checksums
+ ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size)
+ : 0;
+ assert(offset >= adjustment);
+
+ const uint64_t record_offset = offset - adjustment;
+ const uint64_t record_size = value_size + adjustment;
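+
+  // Editorial note (illustrative): assuming the adjustment equals the record
+  // header size plus key_size, a read of a 10-byte blob with a 4-byte key and
+  // verify_checksums enabled fetches kHeaderSize + 4 + 10 bytes starting
+  // kHeaderSize + 4 bytes before the blob value; without checksum verification
+  // it fetches exactly 10 bytes at `offset`.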
+
+ Slice record_slice;
+ Buffer buf;
+ AlignedBuf aligned_buf;
+
+ bool prefetched = false;
+
+ if (prefetch_buffer) {
+ Status s;
+ constexpr bool for_compaction = true;
+
+ prefetched = prefetch_buffer->TryReadFromCache(
+ IOOptions(), file_reader_.get(), record_offset,
+ static_cast<size_t>(record_size), &record_slice, &s,
+ read_options.rate_limiter_priority, for_compaction);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (!prefetched) {
+ TEST_SYNC_POINT("BlobFileReader::GetBlob:ReadFromFile");
+ PERF_COUNTER_ADD(blob_read_count, 1);
+ PERF_COUNTER_ADD(blob_read_byte, record_size);
+ PERF_TIMER_GUARD(blob_read_time);
+ const Status s = ReadFromFile(file_reader_.get(), record_offset,
+ static_cast<size_t>(record_size), statistics_,
+ &record_slice, &buf, &aligned_buf,
+ read_options.rate_limiter_priority);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ TEST_SYNC_POINT_CALLBACK("BlobFileReader::GetBlob:TamperWithResult",
+ &record_slice);
+
+ if (read_options.verify_checksums) {
+ const Status s = VerifyBlob(record_slice, user_key, value_size);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ const Slice value_slice(record_slice.data() + adjustment, value_size);
+
+ {
+ const Status s = UncompressBlobIfNeeded(
+ value_slice, compression_type, allocator, clock_, statistics_, result);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (bytes_read) {
+ *bytes_read = record_size;
+ }
+
+ return Status::OK();
+}
+
+void BlobFileReader::MultiGetBlob(
+ const ReadOptions& read_options, MemoryAllocator* allocator,
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>&
+ blob_reqs,
+ uint64_t* bytes_read) const {
+ const size_t num_blobs = blob_reqs.size();
+ assert(num_blobs > 0);
+ assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE);
+
+#ifndef NDEBUG
+ for (size_t i = 0; i < num_blobs - 1; ++i) {
+ assert(blob_reqs[i].first->offset <= blob_reqs[i + 1].first->offset);
+ }
+#endif // !NDEBUG
+
+ std::vector<FSReadRequest> read_reqs;
+ autovector<uint64_t> adjustments;
+ uint64_t total_len = 0;
+ read_reqs.reserve(num_blobs);
+ for (size_t i = 0; i < num_blobs; ++i) {
+ BlobReadRequest* const req = blob_reqs[i].first;
+ assert(req);
+ assert(req->user_key);
+ assert(req->status);
+
+ const size_t key_size = req->user_key->size();
+ const uint64_t offset = req->offset;
+ const uint64_t value_size = req->len;
+
+ if (!IsValidBlobOffset(offset, key_size, value_size, file_size_)) {
+ *req->status = Status::Corruption("Invalid blob offset");
+ continue;
+ }
+ if (req->compression != compression_type_) {
+ *req->status =
+ Status::Corruption("Compression type mismatch when reading a blob");
+ continue;
+ }
+
+ const uint64_t adjustment =
+ read_options.verify_checksums
+ ? BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size)
+ : 0;
+ assert(req->offset >= adjustment);
+ adjustments.push_back(adjustment);
+
+ FSReadRequest read_req = {};
+ read_req.offset = req->offset - adjustment;
+ read_req.len = req->len + adjustment;
+ read_reqs.emplace_back(read_req);
+ total_len += read_req.len;
+ }
+
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len);
+
+ Buffer buf;
+ AlignedBuf aligned_buf;
+
+ Status s;
+ bool direct_io = file_reader_->use_direct_io();
+ if (direct_io) {
+ for (size_t i = 0; i < read_reqs.size(); ++i) {
+ read_reqs[i].scratch = nullptr;
+ }
+ } else {
+ buf.reset(new char[total_len]);
+ std::ptrdiff_t pos = 0;
+ for (size_t i = 0; i < read_reqs.size(); ++i) {
+ read_reqs[i].scratch = buf.get() + pos;
+ pos += read_reqs[i].len;
+ }
+ }
+ TEST_SYNC_POINT("BlobFileReader::MultiGetBlob:ReadFromFile");
+ PERF_COUNTER_ADD(blob_read_count, num_blobs);
+ PERF_COUNTER_ADD(blob_read_byte, total_len);
+ s = file_reader_->MultiRead(IOOptions(), read_reqs.data(), read_reqs.size(),
+ direct_io ? &aligned_buf : nullptr,
+ read_options.rate_limiter_priority);
+ if (!s.ok()) {
+ for (auto& req : read_reqs) {
+ req.status.PermitUncheckedError();
+ }
+ for (auto& blob_req : blob_reqs) {
+ BlobReadRequest* const req = blob_req.first;
+ assert(req);
+ assert(req->status);
+
+ if (!req->status->IsCorruption()) {
+ // Avoid overwriting corruption status.
+ *req->status = s;
+ }
+ }
+ return;
+ }
+
+ assert(s.ok());
+
+ uint64_t total_bytes = 0;
+ for (size_t i = 0, j = 0; i < num_blobs; ++i) {
+ BlobReadRequest* const req = blob_reqs[i].first;
+ assert(req);
+ assert(req->user_key);
+ assert(req->status);
+
+ if (!req->status->ok()) {
+ continue;
+ }
+
+ assert(j < read_reqs.size());
+ auto& read_req = read_reqs[j++];
+ const auto& record_slice = read_req.result;
+ if (read_req.status.ok() && record_slice.size() != read_req.len) {
+ read_req.status =
+ IOStatus::Corruption("Failed to read data from blob file");
+ }
+
+ *req->status = read_req.status;
+ if (!req->status->ok()) {
+ continue;
+ }
+
+ // Verify checksums if enabled
+ if (read_options.verify_checksums) {
+ *req->status = VerifyBlob(record_slice, *req->user_key, req->len);
+ if (!req->status->ok()) {
+ continue;
+ }
+ }
+
+ // Uncompress blob if needed
+ Slice value_slice(record_slice.data() + adjustments[i], req->len);
+ *req->status =
+ UncompressBlobIfNeeded(value_slice, compression_type_, allocator,
+ clock_, statistics_, &blob_reqs[i].second);
+ if (req->status->ok()) {
+ total_bytes += record_slice.size();
+ }
+ }
+
+ if (bytes_read) {
+ *bytes_read = total_bytes;
+ }
+}
+
+Status BlobFileReader::VerifyBlob(const Slice& record_slice,
+ const Slice& user_key, uint64_t value_size) {
+ PERF_TIMER_GUARD(blob_checksum_time);
+
+ BlobLogRecord record;
+
+ const Slice header_slice(record_slice.data(), BlobLogRecord::kHeaderSize);
+
+ {
+ const Status s = record.DecodeHeaderFrom(header_slice);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (record.key_size != user_key.size()) {
+ return Status::Corruption("Key size mismatch when reading blob");
+ }
+
+ if (record.value_size != value_size) {
+ return Status::Corruption("Value size mismatch when reading blob");
+ }
+
+ record.key =
+ Slice(record_slice.data() + BlobLogRecord::kHeaderSize, record.key_size);
+ if (record.key != user_key) {
+ return Status::Corruption("Key mismatch when reading blob");
+ }
+
+ record.value = Slice(record.key.data() + record.key_size, value_size);
+
+ {
+ TEST_SYNC_POINT_CALLBACK("BlobFileReader::VerifyBlob:CheckBlobCRC",
+ &record);
+
+ const Status s = record.CheckBlobCRC();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status BlobFileReader::UncompressBlobIfNeeded(
+ const Slice& value_slice, CompressionType compression_type,
+ MemoryAllocator* allocator, SystemClock* clock, Statistics* statistics,
+ std::unique_ptr<BlobContents>* result) {
+ assert(result);
+
+ if (compression_type == kNoCompression) {
+ CacheAllocationPtr allocation =
+ AllocateBlock(value_slice.size(), allocator);
+ memcpy(allocation.get(), value_slice.data(), value_slice.size());
+
+ *result = BlobContents::Create(std::move(allocation), value_slice.size());
+
+ return Status::OK();
+ }
+
+ UncompressionContext context(compression_type);
+ UncompressionInfo info(context, UncompressionDict::GetEmptyDict(),
+ compression_type);
+
+ size_t uncompressed_size = 0;
+ constexpr uint32_t compression_format_version = 2;
+
+ CacheAllocationPtr output;
+
+ {
+ PERF_TIMER_GUARD(blob_decompress_time);
+ StopWatch stop_watch(clock, statistics, BLOB_DB_DECOMPRESSION_MICROS);
+ output = UncompressData(info, value_slice.data(), value_slice.size(),
+ &uncompressed_size, compression_format_version,
+ allocator);
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", &output);
+
+ if (!output) {
+ return Status::Corruption("Unable to uncompress blob");
+ }
+
+ *result = BlobContents::Create(std::move(output), uncompressed_size);
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader.h b/src/rocksdb/db/blob/blob_file_reader.h
new file mode 100644
index 000000000..75b756da1
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader.h
@@ -0,0 +1,108 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+#include "db/blob/blob_read_request.h"
+#include "file/random_access_file_reader.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Status;
+struct ImmutableOptions;
+struct FileOptions;
+class HistogramImpl;
+struct ReadOptions;
+class Slice;
+class FilePrefetchBuffer;
+class BlobContents;
+class Statistics;
+
+class BlobFileReader {
+ public:
+ static Status Create(const ImmutableOptions& immutable_options,
+ const FileOptions& file_options,
+ uint32_t column_family_id,
+ HistogramImpl* blob_file_read_hist,
+ uint64_t blob_file_number,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ std::unique_ptr<BlobFileReader>* reader);
+
+ BlobFileReader(const BlobFileReader&) = delete;
+ BlobFileReader& operator=(const BlobFileReader&) = delete;
+
+ ~BlobFileReader();
+
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ uint64_t offset, uint64_t value_size,
+ CompressionType compression_type,
+ FilePrefetchBuffer* prefetch_buffer,
+ MemoryAllocator* allocator,
+ std::unique_ptr<BlobContents>* result,
+ uint64_t* bytes_read) const;
+
+  // Offsets must be sorted in ascending order by the caller.
+ void MultiGetBlob(
+ const ReadOptions& read_options, MemoryAllocator* allocator,
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>&
+ blob_reqs,
+ uint64_t* bytes_read) const;
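+  //
+  // Illustrative caller-side sketch (editorial addition, not part of the
+  // original change). The keys and statuses are owned by the caller and must
+  // outlive the call; key0/key1, offset0/offset1 and size0/size1 are
+  // placeholders with offset0 <= offset1.
+  //
+  //   std::array<Status, 2> statuses;
+  //   std::array<BlobReadRequest, 2> requests;
+  //   requests[0] = BlobReadRequest(key0, offset0, size0, compression,
+  //                                 nullptr, &statuses[0]);
+  //   requests[1] = BlobReadRequest(key1, offset1, size1, compression,
+  //                                 nullptr, &statuses[1]);
+  //
+  //   autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+  //       blob_reqs;
+  //   blob_reqs.emplace_back(&requests[0], std::unique_ptr<BlobContents>());
+  //   blob_reqs.emplace_back(&requests[1], std::unique_ptr<BlobContents>());
+  //
+  //   uint64_t bytes_read = 0;
+  //   reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);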
+
+ CompressionType GetCompressionType() const { return compression_type_; }
+
+ uint64_t GetFileSize() const { return file_size_; }
+
+ private:
+ BlobFileReader(std::unique_ptr<RandomAccessFileReader>&& file_reader,
+ uint64_t file_size, CompressionType compression_type,
+ SystemClock* clock, Statistics* statistics);
+
+ static Status OpenFile(const ImmutableOptions& immutable_options,
+ const FileOptions& file_opts,
+ HistogramImpl* blob_file_read_hist,
+ uint64_t blob_file_number,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ uint64_t* file_size,
+ std::unique_ptr<RandomAccessFileReader>* file_reader);
+
+ static Status ReadHeader(const RandomAccessFileReader* file_reader,
+ uint32_t column_family_id, Statistics* statistics,
+ CompressionType* compression_type);
+
+ static Status ReadFooter(const RandomAccessFileReader* file_reader,
+ uint64_t file_size, Statistics* statistics);
+
+ using Buffer = std::unique_ptr<char[]>;
+
+ static Status ReadFromFile(const RandomAccessFileReader* file_reader,
+ uint64_t read_offset, size_t read_size,
+ Statistics* statistics, Slice* slice, Buffer* buf,
+ AlignedBuf* aligned_buf,
+ Env::IOPriority rate_limiter_priority);
+
+ static Status VerifyBlob(const Slice& record_slice, const Slice& user_key,
+ uint64_t value_size);
+
+ static Status UncompressBlobIfNeeded(const Slice& value_slice,
+ CompressionType compression_type,
+ MemoryAllocator* allocator,
+ SystemClock* clock,
+ Statistics* statistics,
+ std::unique_ptr<BlobContents>* result);
+
+ std::unique_ptr<RandomAccessFileReader> file_reader_;
+ uint64_t file_size_;
+ CompressionType compression_type_;
+ SystemClock* clock_;
+ Statistics* statistics_;
+};
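+
+// Editorial note: end-to-end sketch (not part of the original change) showing
+// how a reader is typically created and a single blob fetched; error handling
+// is omitted, and immutable_options, column_family_id, blob_file_number,
+// user_key, blob_offset and blob_size are placeholders.
+//
+//   std::unique_ptr<BlobFileReader> reader;
+//   Status s = BlobFileReader::Create(
+//       immutable_options, FileOptions(), column_family_id,
+//       /* blob_file_read_hist */ nullptr, blob_file_number,
+//       /* io_tracer */ nullptr, &reader);
+//
+//   std::unique_ptr<BlobContents> value;
+//   uint64_t bytes_read = 0;
+//   s = reader->GetBlob(ReadOptions(), user_key, blob_offset, blob_size,
+//                       reader->GetCompressionType(),
+//                       /* prefetch_buffer */ nullptr,
+//                       /* allocator */ nullptr, &value, &bytes_read);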
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_file_reader_test.cc b/src/rocksdb/db/blob/blob_file_reader_test.cc
new file mode 100644
index 000000000..03458e2b5
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_file_reader_test.cc
@@ -0,0 +1,1024 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_file_reader.h"
+
+#include <cassert>
+#include <string>
+
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/compression.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with `num` blobs in it.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+ uint32_t column_family_id, bool has_ttl,
+ const ExpirationRange& expiration_range_header,
+ const ExpirationRange& expiration_range_footer,
+ uint64_t blob_file_number, const std::vector<Slice>& keys,
+ const std::vector<Slice>& blobs, CompressionType compression,
+ std::vector<uint64_t>& blob_offsets,
+ std::vector<uint64_t>& blob_sizes) {
+ assert(!immutable_options.cf_paths.empty());
+ size_t num = keys.size();
+ assert(num == blobs.size());
+ assert(num == blob_offsets.size());
+ assert(num == blob_sizes.size());
+
+ const std::string blob_file_path =
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+ FileOptions()));
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+ constexpr Statistics* statistics = nullptr;
+ constexpr bool use_fsync = false;
+ constexpr bool do_flush = false;
+
+ BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+ statistics, blob_file_number, use_fsync,
+ do_flush);
+
+ BlobLogHeader header(column_family_id, compression, has_ttl,
+ expiration_range_header);
+
+ ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+ std::vector<std::string> compressed_blobs(num);
+ std::vector<Slice> blobs_to_write(num);
+ if (kNoCompression == compression) {
+ for (size_t i = 0; i < num; ++i) {
+ blobs_to_write[i] = blobs[i];
+ blob_sizes[i] = blobs[i].size();
+ }
+ } else {
+ CompressionOptions opts;
+ CompressionContext context(compression);
+ constexpr uint64_t sample_for_compression = 0;
+ CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+ compression, sample_for_compression);
+
+ constexpr uint32_t compression_format_version = 2;
+
+ for (size_t i = 0; i < num; ++i) {
+ ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
+ &compressed_blobs[i]));
+ blobs_to_write[i] = compressed_blobs[i];
+ blob_sizes[i] = compressed_blobs[i].size();
+ }
+ }
+
+ for (size_t i = 0; i < num; ++i) {
+ uint64_t key_offset = 0;
+ ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset,
+ &blob_offsets[i]));
+ }
+
+ BlobLogFooter footer;
+ footer.blob_count = num;
+ footer.expiration_range = expiration_range_footer;
+
+ std::string checksum_method;
+ std::string checksum_value;
+ ASSERT_OK(
+ blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+// Creates a test blob file with a single blob in it. Note: this overload
+// makes it possible to exercise corner cases by letting the caller specify
+// the contents of the blob file's header and footer fields.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+ uint32_t column_family_id, bool has_ttl,
+ const ExpirationRange& expiration_range_header,
+ const ExpirationRange& expiration_range_footer,
+ uint64_t blob_file_number, const Slice& key,
+ const Slice& blob, CompressionType compression,
+ uint64_t* blob_offset, uint64_t* blob_size) {
+ std::vector<Slice> keys{key};
+ std::vector<Slice> blobs{blob};
+ std::vector<uint64_t> blob_offsets{0};
+ std::vector<uint64_t> blob_sizes{0};
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range_header, expiration_range_footer,
+ blob_file_number, keys, blobs, compression, blob_offsets,
+ blob_sizes);
+ if (blob_offset) {
+ *blob_offset = blob_offsets[0];
+ }
+ if (blob_size) {
+ *blob_size = blob_sizes[0];
+ }
+}
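+
+// Editorial note: illustrative call of the single-blob overload above; it is
+// not part of the original change, and the literal arguments are arbitrary.
+// The returned offset/size can then be fed to BlobFileReader::GetBlob.
+//
+//   uint64_t blob_offset = 0;
+//   uint64_t blob_size = 0;
+//   WriteBlobFile(immutable_options, /* column_family_id */ 1,
+//                 /* has_ttl */ false, ExpirationRange(), ExpirationRange(),
+//                 /* blob_file_number */ 1, "key", "blob", kNoCompression,
+//                 &blob_offset, &blob_size);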
+
+} // anonymous namespace
+
+class BlobFileReaderTest : public testing::Test {
+ protected:
+ BlobFileReaderTest() { mock_env_.reset(MockEnv::Create(Env::Default())); }
+ std::unique_ptr<Env> mock_env_;
+};
+
+TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_CreateReaderAndGetBlob"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr size_t num_blobs = 3;
+ const std::vector<std::string> key_strs = {"key1", "key2", "key3"};
+ const std::vector<std::string> blob_strs = {"blob1", "blob2", "blob3"};
+
+ const std::vector<Slice> keys = {key_strs[0], key_strs[1], key_strs[2]};
+ const std::vector<Slice> blobs = {blob_strs[0], blob_strs[1], blob_strs[2]};
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, keys, blobs, kNoCompression,
+ blob_offsets, blob_sizes);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_OK(BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader));
+
+ // Make sure the blob can be retrieved with and without checksum verification
+ ReadOptions read_options;
+ read_options.verify_checksums = false;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_OK(reader->GetBlob(read_options, keys[0], blob_offsets[0],
+ blob_sizes[0], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blobs[0]);
+ ASSERT_EQ(bytes_read, blob_sizes[0]);
+
+ // MultiGetBlob
+ bytes_read = 0;
+ size_t total_size = 0;
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<BlobReadRequest, num_blobs> requests_buf;
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+ blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ requests_buf[i] =
+ BlobReadRequest(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, nullptr, &statuses_buf[i]);
+ blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+ }
+
+ reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ const auto& result = blob_reqs[i].second;
+
+ ASSERT_OK(statuses_buf[i]);
+ ASSERT_NE(result, nullptr);
+ ASSERT_EQ(result->data(), blobs[i]);
+ total_size += blob_sizes[i];
+ }
+ ASSERT_EQ(bytes_read, total_size);
+ }
+
+ read_options.verify_checksums = true;
+
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_OK(reader->GetBlob(read_options, keys[1], blob_offsets[1],
+ blob_sizes[1], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blobs[1]);
+
+ const uint64_t key_size = keys[1].size();
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+ blob_sizes[1]);
+ }
+
+ // Invalid offset (too close to start of file)
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, keys[0], blob_offsets[0] - 1,
+ blob_sizes[0], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ // Invalid offset (too close to end of file)
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, keys[2], blob_offsets[2] + 1,
+ blob_sizes[2], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ // Incorrect compression type
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, keys[0], blob_offsets[0],
+ blob_sizes[0], kZSTD, prefetch_buffer, allocator,
+ &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ // Incorrect key size
+ {
+ constexpr char shorter_key[] = "k";
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
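+    // The offset below is adjusted to compensate for the shorter key, so the
+    // record header is still read from its actual location; the reader then
+    // detects the key size mismatch and reports corruption.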
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, shorter_key,
+ blob_offsets[0] -
+ (keys[0].size() - sizeof(shorter_key) + 1),
+ blob_sizes[0], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ // MultiGetBlob
+ autovector<std::reference_wrapper<const Slice>> key_refs;
+ for (const auto& key_ref : keys) {
+ key_refs.emplace_back(std::cref(key_ref));
+ }
+ Slice shorter_key_slice(shorter_key, sizeof(shorter_key) - 1);
+ key_refs[1] = std::cref(shorter_key_slice);
+
+ autovector<uint64_t> offsets{
+ blob_offsets[0],
+ blob_offsets[1] - (keys[1].size() - key_refs[1].get().size()),
+ blob_offsets[2]};
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<BlobReadRequest, num_blobs> requests_buf;
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+ blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ requests_buf[i] =
+ BlobReadRequest(key_refs[i], offsets[i], blob_sizes[i],
+ kNoCompression, nullptr, &statuses_buf[i]);
+ blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+ }
+
+ reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (i == 1) {
+ ASSERT_TRUE(statuses_buf[i].IsCorruption());
+ } else {
+ ASSERT_OK(statuses_buf[i]);
+ }
+ }
+ }
+
+ // Incorrect key
+ {
+ constexpr char incorrect_key[] = "foo1";
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, incorrect_key, blob_offsets[0],
+ blob_sizes[0], kNoCompression, prefetch_buffer,
+ allocator, &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ // MultiGetBlob
+ autovector<std::reference_wrapper<const Slice>> key_refs;
+ for (const auto& key_ref : keys) {
+ key_refs.emplace_back(std::cref(key_ref));
+ }
+ Slice wrong_key_slice(incorrect_key, sizeof(incorrect_key) - 1);
+ key_refs[2] = std::cref(wrong_key_slice);
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<BlobReadRequest, num_blobs> requests_buf;
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+ blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ requests_buf[i] =
+ BlobReadRequest(key_refs[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, nullptr, &statuses_buf[i]);
+ blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+ }
+
+ reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (i == num_blobs - 1) {
+ ASSERT_TRUE(statuses_buf[i].IsCorruption());
+ } else {
+ ASSERT_OK(statuses_buf[i]);
+ }
+ }
+ }
+
+ // Incorrect value size
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(read_options, keys[1], blob_offsets[1],
+ blob_sizes[1] + 1, kNoCompression,
+ prefetch_buffer, allocator, &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ // MultiGetBlob
+ autovector<std::reference_wrapper<const Slice>> key_refs;
+ for (const auto& key_ref : keys) {
+ key_refs.emplace_back(std::cref(key_ref));
+ }
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<BlobReadRequest, num_blobs> requests_buf;
+
+ requests_buf[0] =
+ BlobReadRequest(key_refs[0], blob_offsets[0], blob_sizes[0],
+ kNoCompression, nullptr, &statuses_buf[0]);
+ requests_buf[1] =
+ BlobReadRequest(key_refs[1], blob_offsets[1], blob_sizes[1] + 1,
+ kNoCompression, nullptr, &statuses_buf[1]);
+ requests_buf[2] =
+ BlobReadRequest(key_refs[2], blob_offsets[2], blob_sizes[2],
+ kNoCompression, nullptr, &statuses_buf[2]);
+
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+ blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ blob_reqs.emplace_back(&requests_buf[i], std::unique_ptr<BlobContents>());
+ }
+
+ reader->MultiGetBlob(read_options, allocator, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (i != 1) {
+ ASSERT_OK(statuses_buf[i]);
+ } else {
+ ASSERT_TRUE(statuses_buf[i].IsCorruption());
+ }
+ }
+ }
+}
+
+TEST_F(BlobFileReaderTest, Malformed) {
+ // Write a blob file consisting of nothing but a header, and make sure we
+ // detect the error when we open it for reading
+
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Malformed"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr uint64_t blob_file_number = 1;
+
+ {
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+
+ const std::string blob_file_path =
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+ FileOptions()));
+
+ std::unique_ptr<WritableFileWriter> file_writer(
+ new WritableFileWriter(std::move(file), blob_file_path, FileOptions(),
+ immutable_options.clock));
+
+ constexpr Statistics* statistics = nullptr;
+ constexpr bool use_fsync = false;
+ constexpr bool do_flush = false;
+
+ BlobLogWriter blob_log_writer(std::move(file_writer),
+ immutable_options.clock, statistics,
+ blob_file_number, use_fsync, do_flush);
+
+ BlobLogHeader header(column_family_id, kNoCompression, has_ttl,
+ expiration_range);
+
+ ASSERT_OK(blob_log_writer.WriteHeader(header));
+ }
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, TTL) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_TTL"), 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = true;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_ExpirationRangeInHeader"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ const ExpirationRange expiration_range_header(
+ 1, 2); // can be made constexpr when we adopt C++14
+ constexpr ExpirationRange expiration_range_footer;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range_header, expiration_range_footer,
+ blob_file_number, key, blob, kNoCompression, &blob_offset,
+ &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_ExpirationRangeInFooter"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range_header;
+ const ExpirationRange expiration_range_footer(
+ 1, 2); // can be made constexpr when we adopt C++14
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range_header, expiration_range_footer,
+ blob_file_number, key, blob, kNoCompression, &blob_offset,
+ &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/,
+ &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, IncorrectColumnFamily) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_IncorrectColumnFamily"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ constexpr uint32_t incorrect_column_family_id = 2;
+
+ ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
+ incorrect_column_family_id,
+ blob_file_read_hist, blob_file_number,
+ nullptr /*IOTracer*/, &reader)
+ .IsCorruption());
+}
+
+TEST_F(BlobFileReaderTest, BlobCRCError) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_BlobCRCError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_OK(BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) {
+ BlobLogRecord* const record = static_cast<BlobLogRecord*>(arg);
+ assert(record);
+
+ record->blob_crc = 0xfaceb00c;
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kNoCompression, prefetch_buffer, allocator, &value,
+ &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(BlobFileReaderTest, Compression) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(), "BlobFileReaderTest_Compression"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob,
+ kSnappyCompression, &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_OK(BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader));
+
+ // Make sure the blob can be retrieved with and without checksum verification
+ ReadOptions read_options;
+ read_options.verify_checksums = false;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
+ kSnappyCompression, prefetch_buffer, allocator,
+ &value, &bytes_read));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blob);
+ ASSERT_EQ(bytes_read, blob_size);
+ }
+
+ read_options.verify_checksums = true;
+
+ {
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_OK(reader->GetBlob(read_options, key, blob_offset, blob_size,
+ kSnappyCompression, prefetch_buffer, allocator,
+ &value, &bytes_read));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blob);
+
+ constexpr uint64_t key_size = sizeof(key) - 1;
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+ blob_size);
+ }
+}
+
+TEST_F(BlobFileReaderTest, UncompressionError) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderTest_UncompressionError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob,
+ kSnappyCompression, &blob_offset, &blob_size);
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ ASSERT_OK(BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) {
+ CacheAllocationPtr* const output =
+ static_cast<CacheAllocationPtr*>(arg);
+ assert(output);
+
+ output->reset();
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kSnappyCompression, prefetch_buffer, allocator,
+ &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class BlobFileReaderIOErrorTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ BlobFileReaderIOErrorTest() : sync_point_(GetParam()) {
+ mock_env_.reset(MockEnv::Create(Env::Default()));
+ fault_injection_env_.reset(new FaultInjectionTestEnv(mock_env_.get()));
+ }
+
+ std::unique_ptr<Env> mock_env_;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderIOErrorTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::OpenFile:GetFileSize",
+ "BlobFileReader::OpenFile:NewRandomAccessFile",
+ "BlobFileReader::ReadHeader:ReadFromFile",
+ "BlobFileReader::ReadFooter:ReadFromFile",
+ "BlobFileReader::GetBlob:ReadFromFile"}));
+
+TEST_P(BlobFileReaderIOErrorTest, IOError) {
+ // Simulates an I/O error during the specified step
+
+ Options options;
+ options.env = fault_injection_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(fault_injection_env_.get(),
+ "BlobFileReaderIOErrorTest_IOError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ const Status s = BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader);
+
+ const bool fail_during_create =
+ (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile");
+
+ if (fail_during_create) {
+ ASSERT_TRUE(s.IsIOError());
+ } else {
+ ASSERT_OK(s);
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kNoCompression, prefetch_buffer, allocator,
+ &value, &bytes_read)
+ .IsIOError());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class BlobFileReaderDecodingErrorTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ BlobFileReaderDecodingErrorTest() : sync_point_(GetParam()) {
+ mock_env_.reset(MockEnv::Create(Env::Default()));
+ }
+
+ std::unique_ptr<Env> mock_env_;
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(BlobFileReaderTest, BlobFileReaderDecodingErrorTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::ReadHeader:TamperWithResult",
+ "BlobFileReader::ReadFooter:TamperWithResult",
+ "BlobFileReader::GetBlob:TamperWithResult"}));
+
+TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) {
+ Options options;
+ options.env = mock_env_.get();
+ options.cf_paths.emplace_back(
+ test::PerThreadDBPath(mock_env_.get(),
+ "BlobFileReaderDecodingErrorTest_DecodingError"),
+ 0);
+ options.enable_blob_files = true;
+
+ ImmutableOptions immutable_options(options);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ uint64_t blob_offset = 0;
+ uint64_t blob_size = 0;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, key, blob, kNoCompression,
+ &blob_offset, &blob_size);
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [](void* arg) {
+ Slice* const slice = static_cast<Slice*>(arg);
+ assert(slice);
+ assert(!slice->empty());
+
+ slice->remove_prefix(1);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileReader> reader;
+
+ const Status s = BlobFileReader::Create(
+ immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
+ blob_file_number, nullptr /*IOTracer*/, &reader);
+
+ const bool fail_during_create =
+ sync_point_ != "BlobFileReader::GetBlob:TamperWithResult";
+
+ if (fail_during_create) {
+ ASSERT_TRUE(s.IsCorruption());
+ } else {
+ ASSERT_OK(s);
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr MemoryAllocator* allocator = nullptr;
+
+ std::unique_ptr<BlobContents> value;
+ uint64_t bytes_read = 0;
+
+ ASSERT_TRUE(reader
+ ->GetBlob(ReadOptions(), key, blob_offset, blob_size,
+ kNoCompression, prefetch_buffer, allocator,
+ &value, &bytes_read)
+ .IsCorruption());
+ ASSERT_EQ(value, nullptr);
+ ASSERT_EQ(bytes_read, 0);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_garbage_meter.cc b/src/rocksdb/db/blob/blob_garbage_meter.cc
new file mode 100644
index 000000000..d328d7ff4
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter.cc
@@ -0,0 +1,100 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_garbage_meter.h"
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status BlobGarbageMeter::ProcessInFlow(const Slice& key, const Slice& value) {
+ uint64_t blob_file_number = kInvalidBlobFileNumber;
+ uint64_t bytes = 0;
+
+ const Status s = Parse(key, value, &blob_file_number, &bytes);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (blob_file_number == kInvalidBlobFileNumber) {
+ return Status::OK();
+ }
+
+ flows_[blob_file_number].AddInFlow(bytes);
+
+ return Status::OK();
+}
+
+Status BlobGarbageMeter::ProcessOutFlow(const Slice& key, const Slice& value) {
+ uint64_t blob_file_number = kInvalidBlobFileNumber;
+ uint64_t bytes = 0;
+
+ const Status s = Parse(key, value, &blob_file_number, &bytes);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (blob_file_number == kInvalidBlobFileNumber) {
+ return Status::OK();
+ }
+
+  // Note: in order to measure the amount of additional garbage, we only need
+  // to track the outflow for preexisting files, i.e. those that also had
+  // inflow. (Newly written files would only have outflow.)
+ auto it = flows_.find(blob_file_number);
+ if (it == flows_.end()) {
+ return Status::OK();
+ }
+
+ it->second.AddOutFlow(bytes);
+
+ return Status::OK();
+}
+
+Status BlobGarbageMeter::Parse(const Slice& key, const Slice& value,
+ uint64_t* blob_file_number, uint64_t* bytes) {
+ assert(blob_file_number);
+ assert(*blob_file_number == kInvalidBlobFileNumber);
+ assert(bytes);
+ assert(*bytes == 0);
+
+ ParsedInternalKey ikey;
+
+ {
+ constexpr bool log_err_key = false;
+ const Status s = ParseInternalKey(key, &ikey, log_err_key);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (ikey.type != kTypeBlobIndex) {
+ return Status::OK();
+ }
+
+ BlobIndex blob_index;
+
+ {
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (blob_index.IsInlined() || blob_index.HasTTL()) {
+ return Status::Corruption("Unexpected TTL/inlined blob index");
+ }
+
+ *blob_file_number = blob_index.file_number();
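+  // Charge the blob's full on-disk footprint: the blob itself plus the record
+  // header and key that precede it in the blob file.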
+ *bytes =
+ blob_index.size() +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(ikey.user_key.size());
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_garbage_meter.h b/src/rocksdb/db/blob/blob_garbage_meter.h
new file mode 100644
index 000000000..a6c04b0b2
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter.h
@@ -0,0 +1,102 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <unordered_map>
+
+#include "db/blob/blob_constants.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// A class that can be used to compute the amount of additional garbage
+// generated by a compaction. It parses the keys and blob references in the
+// input and output of a compaction, and aggregates the "inflow" and "outflow"
+// on a per-blob file basis. The amount of additional garbage for any given blob
+// file can then be computed by subtracting the outflow from the inflow.
+class BlobGarbageMeter {
+ public:
+ // A class to store the number and total size of blobs on a per-blob file
+ // basis.
+ class BlobStats {
+ public:
+ void Add(uint64_t bytes) {
+ ++count_;
+ bytes_ += bytes;
+ }
+ void Add(uint64_t count, uint64_t bytes) {
+ count_ += count;
+ bytes_ += bytes;
+ }
+
+ uint64_t GetCount() const { return count_; }
+ uint64_t GetBytes() const { return bytes_; }
+
+ private:
+ uint64_t count_ = 0;
+ uint64_t bytes_ = 0;
+ };
+
+ // A class to keep track of the "inflow" and the "outflow" and to compute the
+ // amount of additional garbage for a given blob file.
+ class BlobInOutFlow {
+ public:
+ void AddInFlow(uint64_t bytes) {
+ in_flow_.Add(bytes);
+ assert(IsValid());
+ }
+ void AddOutFlow(uint64_t bytes) {
+ out_flow_.Add(bytes);
+ assert(IsValid());
+ }
+
+ const BlobStats& GetInFlow() const { return in_flow_; }
+ const BlobStats& GetOutFlow() const { return out_flow_; }
+
+ bool IsValid() const {
+ return in_flow_.GetCount() >= out_flow_.GetCount() &&
+ in_flow_.GetBytes() >= out_flow_.GetBytes();
+ }
+ bool HasGarbage() const {
+ assert(IsValid());
+ return in_flow_.GetCount() > out_flow_.GetCount();
+ }
+ uint64_t GetGarbageCount() const {
+ assert(IsValid());
+ assert(HasGarbage());
+ return in_flow_.GetCount() - out_flow_.GetCount();
+ }
+ uint64_t GetGarbageBytes() const {
+ assert(IsValid());
+ assert(HasGarbage());
+ return in_flow_.GetBytes() - out_flow_.GetBytes();
+ }
+
+ private:
+ BlobStats in_flow_;
+ BlobStats out_flow_;
+ };
+
+ Status ProcessInFlow(const Slice& key, const Slice& value);
+ Status ProcessOutFlow(const Slice& key, const Slice& value);
+
+ const std::unordered_map<uint64_t, BlobInOutFlow>& flows() const {
+ return flows_;
+ }
+
+ private:
+ static Status Parse(const Slice& key, const Slice& value,
+ uint64_t* blob_file_number, uint64_t* bytes);
+
+ std::unordered_map<uint64_t, BlobInOutFlow> flows_;
+};
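+
+// Illustrative usage sketch (not prescriptive; where ProcessInFlow and
+// ProcessOutFlow are called is up to the caller, e.g. a compaction):
+//
+//   BlobGarbageMeter meter;
+//   // ... call meter.ProcessInFlow(key, value) for every input key/value and
+//   // meter.ProcessOutFlow(key, value) for every output key/value ...
+//   for (const auto& pair : meter.flows()) {
+//     const BlobGarbageMeter::BlobInOutFlow& flow = pair.second;
+//     if (flow.HasGarbage()) {
+//       // pair.first is the blob file number; GetGarbageCount() and
+//       // GetGarbageBytes() give the additional garbage for that file.
+//     }
+//   }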
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_garbage_meter_test.cc b/src/rocksdb/db/blob/blob_garbage_meter_test.cc
new file mode 100644
index 000000000..ba53f06f1
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_garbage_meter_test.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_garbage_meter.h"
+
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/dbformat.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(BlobGarbageMeterTest, MeasureGarbage) {
+ BlobGarbageMeter blob_garbage_meter;
+
+ struct BlobDescriptor {
+ std::string user_key;
+ uint64_t blob_file_number;
+ uint64_t offset;
+ uint64_t size;
+ CompressionType compression_type;
+ bool has_in_flow;
+ bool has_out_flow;
+
+ uint64_t GetExpectedBytes() const {
+ return size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(user_key.size());
+ }
+ };
+
+ // Note: blob file 4 has the same inflow and outflow and hence no additional
+ // garbage. Blob file 5 has less outflow than inflow and thus it does have
+ // additional garbage. Blob file 6 is a newly written file (i.e. no inflow,
+ // only outflow) and is thus not tracked by the meter.
+ std::vector<BlobDescriptor> blobs{
+ {"key", 4, 1234, 555, kLZ4Compression, true, true},
+ {"other_key", 4, 6789, 101010, kLZ4Compression, true, true},
+ {"yet_another_key", 5, 22222, 3456, kLZ4Compression, true, true},
+ {"foo_key", 5, 77777, 8888, kLZ4Compression, true, true},
+ {"bar_key", 5, 999999, 1212, kLZ4Compression, true, false},
+ {"baz_key", 5, 1234567, 890, kLZ4Compression, true, false},
+ {"new_key", 6, 7777, 9999, kNoCompression, false, true}};
+
+ for (const auto& blob : blobs) {
+ constexpr SequenceNumber seq = 123;
+ const InternalKey key(blob.user_key, seq, kTypeBlobIndex);
+ const Slice key_slice = key.Encode();
+
+ std::string value;
+ BlobIndex::EncodeBlob(&value, blob.blob_file_number, blob.offset, blob.size,
+ blob.compression_type);
+ const Slice value_slice(value);
+
+ if (blob.has_in_flow) {
+ ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ }
+ if (blob.has_out_flow) {
+ ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+ }
+ }
+
+ const auto& flows = blob_garbage_meter.flows();
+ ASSERT_EQ(flows.size(), 2);
+
+ {
+ const auto it = flows.find(4);
+ ASSERT_NE(it, flows.end());
+
+ const auto& flow = it->second;
+
+ constexpr uint64_t expected_count = 2;
+ const uint64_t expected_bytes =
+ blobs[0].GetExpectedBytes() + blobs[1].GetExpectedBytes();
+
+ const auto& in = flow.GetInFlow();
+ ASSERT_EQ(in.GetCount(), expected_count);
+ ASSERT_EQ(in.GetBytes(), expected_bytes);
+
+ const auto& out = flow.GetOutFlow();
+ ASSERT_EQ(out.GetCount(), expected_count);
+ ASSERT_EQ(out.GetBytes(), expected_bytes);
+
+ ASSERT_TRUE(flow.IsValid());
+ ASSERT_FALSE(flow.HasGarbage());
+ }
+
+ {
+ const auto it = flows.find(5);
+ ASSERT_NE(it, flows.end());
+
+ const auto& flow = it->second;
+
+ const auto& in = flow.GetInFlow();
+
+ constexpr uint64_t expected_in_count = 4;
+ const uint64_t expected_in_bytes =
+ blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes() +
+ blobs[4].GetExpectedBytes() + blobs[5].GetExpectedBytes();
+
+ ASSERT_EQ(in.GetCount(), expected_in_count);
+ ASSERT_EQ(in.GetBytes(), expected_in_bytes);
+
+ const auto& out = flow.GetOutFlow();
+
+ constexpr uint64_t expected_out_count = 2;
+ const uint64_t expected_out_bytes =
+ blobs[2].GetExpectedBytes() + blobs[3].GetExpectedBytes();
+
+ ASSERT_EQ(out.GetCount(), expected_out_count);
+ ASSERT_EQ(out.GetBytes(), expected_out_bytes);
+
+ ASSERT_TRUE(flow.IsValid());
+ ASSERT_TRUE(flow.HasGarbage());
+ ASSERT_EQ(flow.GetGarbageCount(), expected_in_count - expected_out_count);
+ ASSERT_EQ(flow.GetGarbageBytes(), expected_in_bytes - expected_out_bytes);
+ }
+}
+
+TEST(BlobGarbageMeterTest, PlainValue) {
+ constexpr char user_key[] = "user_key";
+ constexpr SequenceNumber seq = 123;
+
+ const InternalKey key(user_key, seq, kTypeValue);
+ const Slice key_slice = key.Encode();
+
+ constexpr char value[] = "value";
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_OK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_OK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+ ASSERT_TRUE(blob_garbage_meter.flows().empty());
+}
+
+TEST(BlobGarbageMeterTest, CorruptInternalKey) {
+ constexpr char corrupt_key[] = "i_am_corrupt";
+ const Slice key_slice(corrupt_key);
+
+ constexpr char value[] = "value";
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+TEST(BlobGarbageMeterTest, CorruptBlobIndex) {
+ constexpr char user_key[] = "user_key";
+ constexpr SequenceNumber seq = 123;
+
+ const InternalKey key(user_key, seq, kTypeBlobIndex);
+ const Slice key_slice = key.Encode();
+
+ constexpr char value[] = "i_am_not_a_blob_index";
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+TEST(BlobGarbageMeterTest, InlinedTTLBlobIndex) {
+ constexpr char user_key[] = "user_key";
+ constexpr SequenceNumber seq = 123;
+
+ const InternalKey key(user_key, seq, kTypeBlobIndex);
+ const Slice key_slice = key.Encode();
+
+ constexpr uint64_t expiration = 1234567890;
+ constexpr char inlined_value[] = "inlined";
+
+ std::string value;
+ BlobIndex::EncodeInlinedTTL(&value, expiration, inlined_value);
+
+ const Slice value_slice(value);
+
+ BlobGarbageMeter blob_garbage_meter;
+
+ ASSERT_NOK(blob_garbage_meter.ProcessInFlow(key_slice, value_slice));
+ ASSERT_NOK(blob_garbage_meter.ProcessOutFlow(key_slice, value_slice));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/blob_index.h b/src/rocksdb/db/blob/blob_index.h
new file mode 100644
index 000000000..e9944d784
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_index.h
@@ -0,0 +1,187 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <sstream>
+#include <string>
+
+#include "rocksdb/compression_type.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// BlobIndex is a pointer to a blob together with the blob's metadata. The
+// index is stored in the base DB as ValueType::kTypeBlobIndex.
+// There are three types of blob index:
+//
+// kInlinedTTL:
+// +------+------------+---------------+
+// | type | expiration | value |
+// +------+------------+---------------+
+// | char | varint64 | variable size |
+// +------+------------+---------------+
+//
+// kBlob:
+// +------+-------------+----------+----------+-------------+
+// | type | file number | offset | size | compression |
+// +------+-------------+----------+----------+-------------+
+// | char | varint64 | varint64 | varint64 | char |
+// +------+-------------+----------+----------+-------------+
+//
+// kBlobTTL:
+// +------+------------+-------------+----------+----------+-------------+
+// | type | expiration | file number | offset | size | compression |
+// +------+------------+-------------+----------+----------+-------------+
+// | char | varint64 | varint64 | varint64 | varint64 | char |
+// +------+------------+-------------+----------+----------+-------------+
+//
+// There isn't a kInlined (without TTL) type since we can store it as a plain
+// value (i.e. ValueType::kTypeValue).
+class BlobIndex {
+ public:
+ enum class Type : unsigned char {
+ kInlinedTTL = 0,
+ kBlob = 1,
+ kBlobTTL = 2,
+ kUnknown = 3,
+ };
+
+ BlobIndex() : type_(Type::kUnknown) {}
+
+ BlobIndex(const BlobIndex&) = default;
+ BlobIndex& operator=(const BlobIndex&) = default;
+
+ bool IsInlined() const { return type_ == Type::kInlinedTTL; }
+
+ bool HasTTL() const {
+ return type_ == Type::kInlinedTTL || type_ == Type::kBlobTTL;
+ }
+
+ uint64_t expiration() const {
+ assert(HasTTL());
+ return expiration_;
+ }
+
+ const Slice& value() const {
+ assert(IsInlined());
+ return value_;
+ }
+
+ uint64_t file_number() const {
+ assert(!IsInlined());
+ return file_number_;
+ }
+
+ uint64_t offset() const {
+ assert(!IsInlined());
+ return offset_;
+ }
+
+ uint64_t size() const {
+ assert(!IsInlined());
+ return size_;
+ }
+
+ CompressionType compression() const {
+ assert(!IsInlined());
+ return compression_;
+ }
+
+ Status DecodeFrom(Slice slice) {
+ const char* kErrorMessage = "Error while decoding blob index";
+ assert(slice.size() > 0);
+ type_ = static_cast<Type>(*slice.data());
+ if (type_ >= Type::kUnknown) {
+ return Status::Corruption(kErrorMessage,
+ "Unknown blob index type: " +
+ std::to_string(static_cast<char>(type_)));
+ }
+ slice = Slice(slice.data() + 1, slice.size() - 1);
+ if (HasTTL()) {
+ if (!GetVarint64(&slice, &expiration_)) {
+ return Status::Corruption(kErrorMessage, "Corrupted expiration");
+ }
+ }
+ if (IsInlined()) {
+ value_ = slice;
+ } else {
+ if (GetVarint64(&slice, &file_number_) && GetVarint64(&slice, &offset_) &&
+ GetVarint64(&slice, &size_) && slice.size() == 1) {
+ compression_ = static_cast<CompressionType>(*slice.data());
+ } else {
+ return Status::Corruption(kErrorMessage, "Corrupted blob offset");
+ }
+ }
+ return Status::OK();
+ }
+
+ std::string DebugString(bool output_hex) const {
+ std::ostringstream oss;
+
+ if (IsInlined()) {
+ oss << "[inlined blob] value:" << value_.ToString(output_hex);
+ } else {
+ oss << "[blob ref] file:" << file_number_ << " offset:" << offset_
+ << " size:" << size_
+ << " compression: " << CompressionTypeToString(compression_);
+ }
+
+ if (HasTTL()) {
+ oss << " exp:" << expiration_;
+ }
+
+ return oss.str();
+ }
+
+ static void EncodeInlinedTTL(std::string* dst, uint64_t expiration,
+ const Slice& value) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(1 + kMaxVarint64Length + value.size());
+ dst->push_back(static_cast<char>(Type::kInlinedTTL));
+ PutVarint64(dst, expiration);
+ dst->append(value.data(), value.size());
+ }
+
+ static void EncodeBlob(std::string* dst, uint64_t file_number,
+ uint64_t offset, uint64_t size,
+ CompressionType compression) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(kMaxVarint64Length * 3 + 2);
+ dst->push_back(static_cast<char>(Type::kBlob));
+ PutVarint64(dst, file_number);
+ PutVarint64(dst, offset);
+ PutVarint64(dst, size);
+ dst->push_back(static_cast<char>(compression));
+ }
+
+ static void EncodeBlobTTL(std::string* dst, uint64_t expiration,
+ uint64_t file_number, uint64_t offset,
+ uint64_t size, CompressionType compression) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(kMaxVarint64Length * 4 + 2);
+ dst->push_back(static_cast<char>(Type::kBlobTTL));
+ PutVarint64(dst, expiration);
+ PutVarint64(dst, file_number);
+ PutVarint64(dst, offset);
+ PutVarint64(dst, size);
+ dst->push_back(static_cast<char>(compression));
+ }
+
+ private:
+ Type type_ = Type::kUnknown;
+ uint64_t expiration_ = 0;
+ Slice value_;
+ uint64_t file_number_ = 0;
+ uint64_t offset_ = 0;
+ uint64_t size_ = 0;
+ CompressionType compression_ = kNoCompression;
+};
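+
+// Illustrative round-trip sketch (the numeric values are arbitrary examples):
+//
+//   std::string encoded;
+//   BlobIndex::EncodeBlob(&encoded, /* file_number */ 4, /* offset */ 1234,
+//                         /* size */ 555, kLZ4Compression);
+//
+//   BlobIndex decoded;
+//   const Status s = decoded.DecodeFrom(encoded);
+//   assert(s.ok());
+//   assert(!decoded.IsInlined() && !decoded.HasTTL());
+//   assert(decoded.file_number() == 4 && decoded.offset() == 1234 &&
+//          decoded.size() == 555);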
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_format.cc b/src/rocksdb/db/blob/blob_log_format.cc
new file mode 100644
index 000000000..8e26281e3
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_format.cc
@@ -0,0 +1,143 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "db/blob/blob_log_format.h"
+
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void BlobLogHeader::EncodeTo(std::string* dst) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(BlobLogHeader::kSize);
+ PutFixed32(dst, kMagicNumber);
+ PutFixed32(dst, version);
+ PutFixed32(dst, column_family_id);
+ unsigned char flags = (has_ttl ? 1 : 0);
+ dst->push_back(flags);
+ dst->push_back(compression);
+ PutFixed64(dst, expiration_range.first);
+ PutFixed64(dst, expiration_range.second);
+}
+
+Status BlobLogHeader::DecodeFrom(Slice src) {
+ const char* kErrorMessage = "Error while decoding blob log header";
+ if (src.size() != BlobLogHeader::kSize) {
+ return Status::Corruption(kErrorMessage,
+ "Unexpected blob file header size");
+ }
+ uint32_t magic_number;
+ unsigned char flags;
+ if (!GetFixed32(&src, &magic_number) || !GetFixed32(&src, &version) ||
+ !GetFixed32(&src, &column_family_id)) {
+ return Status::Corruption(
+ kErrorMessage,
+ "Error decoding magic number, version and column family id");
+ }
+ if (magic_number != kMagicNumber) {
+ return Status::Corruption(kErrorMessage, "Magic number mismatch");
+ }
+ if (version != kVersion1) {
+ return Status::Corruption(kErrorMessage, "Unknown header version");
+ }
+ flags = src.data()[0];
+ compression = static_cast<CompressionType>(src.data()[1]);
+ has_ttl = (flags & 1) == 1;
+ src.remove_prefix(2);
+ if (!GetFixed64(&src, &expiration_range.first) ||
+ !GetFixed64(&src, &expiration_range.second)) {
+ return Status::Corruption(kErrorMessage, "Error decoding expiration range");
+ }
+ return Status::OK();
+}
+
+void BlobLogFooter::EncodeTo(std::string* dst) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(BlobLogFooter::kSize);
+ PutFixed32(dst, kMagicNumber);
+ PutFixed64(dst, blob_count);
+ PutFixed64(dst, expiration_range.first);
+ PutFixed64(dst, expiration_range.second);
+ crc = crc32c::Value(dst->c_str(), dst->size());
+ crc = crc32c::Mask(crc);
+ PutFixed32(dst, crc);
+}
+
+Status BlobLogFooter::DecodeFrom(Slice src) {
+ const char* kErrorMessage = "Error while decoding blob log footer";
+ if (src.size() != BlobLogFooter::kSize) {
+ return Status::Corruption(kErrorMessage,
+ "Unexpected blob file footer size");
+ }
+ uint32_t src_crc = 0;
+ src_crc = crc32c::Value(src.data(), BlobLogFooter::kSize - sizeof(uint32_t));
+ src_crc = crc32c::Mask(src_crc);
+ uint32_t magic_number = 0;
+ if (!GetFixed32(&src, &magic_number) || !GetFixed64(&src, &blob_count) ||
+ !GetFixed64(&src, &expiration_range.first) ||
+ !GetFixed64(&src, &expiration_range.second) || !GetFixed32(&src, &crc)) {
+ return Status::Corruption(kErrorMessage, "Error decoding content");
+ }
+ if (magic_number != kMagicNumber) {
+ return Status::Corruption(kErrorMessage, "Magic number mismatch");
+ }
+ if (src_crc != crc) {
+ return Status::Corruption(kErrorMessage, "CRC mismatch");
+ }
+ return Status::OK();
+}
+
+void BlobLogRecord::EncodeHeaderTo(std::string* dst) {
+ assert(dst != nullptr);
+ dst->clear();
+ dst->reserve(BlobLogRecord::kHeaderSize + key.size() + value.size());
+ PutFixed64(dst, key.size());
+ PutFixed64(dst, value.size());
+ PutFixed64(dst, expiration);
+ header_crc = crc32c::Value(dst->c_str(), dst->size());
+ header_crc = crc32c::Mask(header_crc);
+ PutFixed32(dst, header_crc);
+ blob_crc = crc32c::Value(key.data(), key.size());
+ blob_crc = crc32c::Extend(blob_crc, value.data(), value.size());
+ blob_crc = crc32c::Mask(blob_crc);
+ PutFixed32(dst, blob_crc);
+}
+
+Status BlobLogRecord::DecodeHeaderFrom(Slice src) {
+ const char* kErrorMessage = "Error while decoding blob record";
+ if (src.size() != BlobLogRecord::kHeaderSize) {
+ return Status::Corruption(kErrorMessage,
+ "Unexpected blob record header size");
+ }
+ uint32_t src_crc = 0;
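+  // The header CRC covers only the key size, value size and expiration fields,
+  // i.e. everything except the two trailing CRC fields (2 * 4 = 8 bytes).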
+ src_crc = crc32c::Value(src.data(), BlobLogRecord::kHeaderSize - 8);
+ src_crc = crc32c::Mask(src_crc);
+ if (!GetFixed64(&src, &key_size) || !GetFixed64(&src, &value_size) ||
+ !GetFixed64(&src, &expiration) || !GetFixed32(&src, &header_crc) ||
+ !GetFixed32(&src, &blob_crc)) {
+ return Status::Corruption(kErrorMessage, "Error decoding content");
+ }
+ if (src_crc != header_crc) {
+ return Status::Corruption(kErrorMessage, "Header CRC mismatch");
+ }
+ return Status::OK();
+}
+
+Status BlobLogRecord::CheckBlobCRC() const {
+ uint32_t expected_crc = 0;
+ expected_crc = crc32c::Value(key.data(), key.size());
+ expected_crc = crc32c::Extend(expected_crc, value.data(), value.size());
+ expected_crc = crc32c::Mask(expected_crc);
+ if (expected_crc != blob_crc) {
+ return Status::Corruption("Blob CRC mismatch");
+ }
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_format.h b/src/rocksdb/db/blob/blob_log_format.h
new file mode 100644
index 000000000..607db2367
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_format.h
@@ -0,0 +1,164 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Log format information shared by reader and writer.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint32_t kMagicNumber = 2395959; // 0x00248f37
+constexpr uint32_t kVersion1 = 1;
+
+using ExpirationRange = std::pair<uint64_t, uint64_t>;
+
+// clang-format off
+
+// Format of blob log file header (30 bytes):
+//
+// +--------------+---------+---------+-------+-------------+-------------------+
+// | magic number | version | cf id | flags | compression | expiration range |
+// +--------------+---------+---------+-------+-------------+-------------------+
+// | Fixed32 | Fixed32 | Fixed32 | char | char | Fixed64 Fixed64 |
+// +--------------+---------+---------+-------+-------------+-------------------+
+//
+// List of flags:
+// has_ttl: Whether the file contains TTL data.
+//
+// The expiration range in the header is a rough range based on
+// blob_db_options.ttl_range_secs.
+
+// clang-format on
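+//
+// (Sanity check: the fixed fields add up to 4 + 4 + 4 + 1 + 1 + 8 + 8 = 30
+// bytes, matching BlobLogHeader::kSize below.)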
+
+struct BlobLogHeader {
+ static constexpr size_t kSize = 30;
+
+ BlobLogHeader() = default;
+ BlobLogHeader(uint32_t _column_family_id, CompressionType _compression,
+ bool _has_ttl, const ExpirationRange& _expiration_range)
+ : column_family_id(_column_family_id),
+ compression(_compression),
+ has_ttl(_has_ttl),
+ expiration_range(_expiration_range) {}
+
+ uint32_t version = kVersion1;
+ uint32_t column_family_id = 0;
+ CompressionType compression = kNoCompression;
+ bool has_ttl = false;
+ ExpirationRange expiration_range;
+
+ void EncodeTo(std::string* dst);
+
+ Status DecodeFrom(Slice slice);
+};
+
+// clang-format off
+
+// Format of blob log file footer (32 bytes):
+//
+// +--------------+------------+-------------------+------------+
+// | magic number | blob count | expiration range | footer CRC |
+// +--------------+------------+-------------------+------------+
+// | Fixed32 | Fixed64 | Fixed64 + Fixed64 | Fixed32 |
+// +--------------+------------+-------------------+------------+
+//
+// The footer is only present when the blob file has been properly closed.
+//
+// Unlike the same field in the file header, the expiration range in the footer
+// is the exact range spanning the smallest and largest expirations of the data
+// in this file.
+
+// clang-format on
+
+struct BlobLogFooter {
+ static constexpr size_t kSize = 32;
+
+ uint64_t blob_count = 0;
+ ExpirationRange expiration_range = std::make_pair(0, 0);
+ uint32_t crc = 0;
+
+ void EncodeTo(std::string* dst);
+
+ Status DecodeFrom(Slice slice);
+};
+
+// clang-format off
+
+// Blob record format (32 bytes header + key + value):
+//
+// +------------+--------------+------------+------------+----------+---------+-----------+
+// | key length | value length | expiration | header CRC | blob CRC | key | value |
+// +------------+--------------+------------+------------+----------+---------+-----------+
+// | Fixed64 | Fixed64 | Fixed64 | Fixed32 | Fixed32 | key len | value len |
+// +------------+--------------+------------+------------+----------+---------+-----------+
+//
+// If the file has has_ttl = false, the expiration field is always 0 and the
+// blob has no expiration.
+//
+// Also note that if compression is used, the value field holds the compressed
+// value, and the value length is the length of the compressed value.
+//
+// Header CRC is the checksum of (key_len + val_len + expiration), while
+// blob CRC is the checksum of (key + value).
+//
+// We could use variable length encoding (Varint64) to save more space, but it
+// would make the reader more complicated.
+
+// clang-format on
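+//
+// (Sanity check: the fixed header fields add up to 8 + 8 + 8 + 4 + 4 = 32
+// bytes, matching BlobLogRecord::kHeaderSize below; key and value follow the
+// header.)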
+
+struct BlobLogRecord {
+  // The header includes the fields up to and including the blob CRC.
+ static constexpr size_t kHeaderSize = 32;
+
+ // Note that the offset field of BlobIndex actually points to the blob value
+ // as opposed to the start of the blob record. The following method can
+ // be used to calculate the adjustment needed to read the blob record header.
+ static constexpr uint64_t CalculateAdjustmentForRecordHeader(
+ uint64_t key_size) {
+ return key_size + kHeaderSize;
+ }
+
+ uint64_t key_size = 0;
+ uint64_t value_size = 0;
+ uint64_t expiration = 0;
+ uint32_t header_crc = 0;
+ uint32_t blob_crc = 0;
+ Slice key;
+ Slice value;
+ std::unique_ptr<char[]> key_buf;
+ std::unique_ptr<char[]> value_buf;
+
+ uint64_t record_size() const { return kHeaderSize + key_size + value_size; }
+
+ void EncodeHeaderTo(std::string* dst);
+
+ Status DecodeHeaderFrom(Slice src);
+
+ Status CheckBlobCRC() const;
+};
+
+// Checks whether a blob offset is potentially valid or not.
+inline bool IsValidBlobOffset(uint64_t value_offset, uint64_t key_size,
+ uint64_t value_size, uint64_t file_size) {
+ if (value_offset <
+ BlobLogHeader::kSize + BlobLogRecord::kHeaderSize + key_size) {
+ return false;
+ }
+
+ if (value_offset + value_size + BlobLogFooter::kSize > file_size) {
+ return false;
+ }
+
+ return true;
+}
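+
+// For example, with the fixed sizes above (30-byte file header, 32-byte record
+// header, 32-byte file footer), a blob with a 3-byte key cannot start before
+// offset 30 + 32 + 3 = 65, and there must be at least 32 bytes left after the
+// blob for the footer.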
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.cc b/src/rocksdb/db/blob/blob_log_sequential_reader.cc
new file mode 100644
index 000000000..778725189
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_sequential_reader.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "db/blob/blob_log_sequential_reader.h"
+
+#include "file/random_access_file_reader.h"
+#include "monitoring/statistics.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobLogSequentialReader::BlobLogSequentialReader(
+ std::unique_ptr<RandomAccessFileReader>&& file_reader, SystemClock* clock,
+ Statistics* statistics)
+ : file_(std::move(file_reader)),
+ clock_(clock),
+ statistics_(statistics),
+ next_byte_(0) {}
+
+BlobLogSequentialReader::~BlobLogSequentialReader() = default;
+
+Status BlobLogSequentialReader::ReadSlice(uint64_t size, Slice* slice,
+ char* buf) {
+ assert(slice);
+ assert(file_);
+
+ StopWatch read_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
+ // TODO: rate limit `BlobLogSequentialReader` reads (it appears unused?)
+ Status s =
+ file_->Read(IOOptions(), next_byte_, static_cast<size_t>(size), slice,
+ buf, nullptr, Env::IO_TOTAL /* rate_limiter_priority */);
+ next_byte_ += size;
+ if (!s.ok()) {
+ return s;
+ }
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, slice->size());
+ if (slice->size() != size) {
+ return Status::Corruption("EOF reached while reading record");
+ }
+ return s;
+}
+
+Status BlobLogSequentialReader::ReadHeader(BlobLogHeader* header) {
+ assert(header);
+ assert(next_byte_ == 0);
+
+ static_assert(BlobLogHeader::kSize <= sizeof(header_buf_),
+ "Buffer is smaller than BlobLogHeader::kSize");
+
+ Status s = ReadSlice(BlobLogHeader::kSize, &buffer_, header_buf_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (buffer_.size() != BlobLogHeader::kSize) {
+ return Status::Corruption("EOF reached before file header");
+ }
+
+ return header->DecodeFrom(buffer_);
+}
+
+Status BlobLogSequentialReader::ReadRecord(BlobLogRecord* record,
+ ReadLevel level,
+ uint64_t* blob_offset) {
+ assert(record);
+ static_assert(BlobLogRecord::kHeaderSize <= sizeof(header_buf_),
+ "Buffer is smaller than BlobLogRecord::kHeaderSize");
+
+ Status s = ReadSlice(BlobLogRecord::kHeaderSize, &buffer_, header_buf_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (buffer_.size() != BlobLogRecord::kHeaderSize) {
+ return Status::Corruption("EOF reached before record header");
+ }
+
+ s = record->DecodeHeaderFrom(buffer_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ uint64_t kb_size = record->key_size + record->value_size;
+ if (blob_offset != nullptr) {
+ *blob_offset = next_byte_ + record->key_size;
+ }
+
+ switch (level) {
+ case kReadHeader:
+ next_byte_ += kb_size;
+ break;
+
+ case kReadHeaderKey:
+ record->key_buf.reset(new char[record->key_size]);
+ s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
+ next_byte_ += record->value_size;
+ break;
+
+ case kReadHeaderKeyBlob:
+ record->key_buf.reset(new char[record->key_size]);
+ s = ReadSlice(record->key_size, &record->key, record->key_buf.get());
+ if (s.ok()) {
+ record->value_buf.reset(new char[record->value_size]);
+ s = ReadSlice(record->value_size, &record->value,
+ record->value_buf.get());
+ }
+ if (s.ok()) {
+ s = record->CheckBlobCRC();
+ }
+ break;
+ }
+ return s;
+}
+
+Status BlobLogSequentialReader::ReadFooter(BlobLogFooter* footer) {
+ assert(footer);
+ static_assert(BlobLogFooter::kSize <= sizeof(header_buf_),
+ "Buffer is smaller than BlobLogFooter::kSize");
+
+ Status s = ReadSlice(BlobLogFooter::kSize, &buffer_, header_buf_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (buffer_.size() != BlobLogFooter::kSize) {
+ return Status::Corruption("EOF reached before file footer");
+ }
+
+ return footer->DecodeFrom(buffer_);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/blob_log_sequential_reader.h b/src/rocksdb/db/blob/blob_log_sequential_reader.h
new file mode 100644
index 000000000..98afa8518
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_sequential_reader.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <memory>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+
+#define MAX_HEADER_SIZE(a, b, c) (a > b ? (a > c ? a : c) : (b > c ? b : c))
+
+namespace ROCKSDB_NAMESPACE {
+
+class RandomAccessFileReader;
+class Env;
+class Statistics;
+class Status;
+class SystemClock;
+
+/**
+ * BlobLogSequentialReader is a general purpose log stream reader
+ * implementation. The actual job of reading from the device is implemented by
+ * the RandomAccessFileReader interface.
+ *
+ * Please see BlobLogWriter for details on the file and record layout.
+ */
+
+class BlobLogSequentialReader {
+ public:
+ enum ReadLevel {
+ kReadHeader,
+ kReadHeaderKey,
+ kReadHeaderKeyBlob,
+ };
+
+ // Create a reader that will return log records from "*file_reader".
+ BlobLogSequentialReader(std::unique_ptr<RandomAccessFileReader>&& file_reader,
+ SystemClock* clock, Statistics* statistics);
+
+ // No copying allowed
+ BlobLogSequentialReader(const BlobLogSequentialReader&) = delete;
+ BlobLogSequentialReader& operator=(const BlobLogSequentialReader&) = delete;
+
+ ~BlobLogSequentialReader();
+
+ Status ReadHeader(BlobLogHeader* header);
+
+  // Read the next record into *record. Returns OK if the record was read
+  // successfully, and a non-OK status (e.g. Corruption upon hitting the end of
+  // the input) otherwise. The contents filled in *record will only be valid
+  // until the next mutating operation on this reader.
+  // If blob_offset is non-null, the offset of the blob is returned through it.
+ Status ReadRecord(BlobLogRecord* record, ReadLevel level = kReadHeader,
+ uint64_t* blob_offset = nullptr);
+
+ Status ReadFooter(BlobLogFooter* footer);
+
+ void ResetNextByte() { next_byte_ = 0; }
+
+ uint64_t GetNextByte() const { return next_byte_; }
+
+ private:
+ Status ReadSlice(uint64_t size, Slice* slice, char* buf);
+
+ const std::unique_ptr<RandomAccessFileReader> file_;
+ SystemClock* clock_;
+
+ Statistics* statistics_;
+
+ Slice buffer_;
+ char header_buf_[MAX_HEADER_SIZE(BlobLogHeader::kSize, BlobLogFooter::kSize,
+ BlobLogRecord::kHeaderSize)];
+
+ // which byte to read next
+ uint64_t next_byte_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#undef MAX_HEADER_SIZE
\ No newline at end of file
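A minimal usage sketch for the reader declared above, assuming the caller has already constructed a BlobLogSequentialReader over a blob file and knows the record count (for example, from the blob file's metadata). DumpBlobFile and num_records are illustrative names, not part of this patch.

Status DumpBlobFile(BlobLogSequentialReader* reader, uint64_t num_records) {
  BlobLogHeader header;
  Status s = reader->ReadHeader(&header);

  for (uint64_t i = 0; s.ok() && i < num_records; ++i) {
    BlobLogRecord record;
    uint64_t blob_offset = 0;
    // kReadHeaderKeyBlob reads the record header, the key and the blob
    // itself, and verifies the blob checksum.
    s = reader->ReadRecord(&record,
                           BlobLogSequentialReader::kReadHeaderKeyBlob,
                           &blob_offset);
  }

  if (s.ok()) {
    BlobLogFooter footer;
    s = reader->ReadFooter(&footer);
  }
  return s;
}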
diff --git a/src/rocksdb/db/blob/blob_log_writer.cc b/src/rocksdb/db/blob/blob_log_writer.cc
new file mode 100644
index 000000000..9dbac7f25
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_writer.cc
@@ -0,0 +1,178 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_log_writer.h"
+
+#include <cstdint>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobLogWriter::BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest,
+ SystemClock* clock, Statistics* statistics,
+ uint64_t log_number, bool use_fs, bool do_flush,
+ uint64_t boffset)
+ : dest_(std::move(dest)),
+ clock_(clock),
+ statistics_(statistics),
+ log_number_(log_number),
+ block_offset_(boffset),
+ use_fsync_(use_fs),
+ do_flush_(do_flush),
+ last_elem_type_(kEtNone) {}
+
+BlobLogWriter::~BlobLogWriter() = default;
+
+Status BlobLogWriter::Sync() {
+ TEST_SYNC_POINT("BlobLogWriter::Sync");
+
+ StopWatch sync_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_SYNC_MICROS);
+ Status s = dest_->Sync(use_fsync_);
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_SYNCED);
+ return s;
+}
+
+Status BlobLogWriter::WriteHeader(BlobLogHeader& header) {
+ assert(block_offset_ == 0);
+ assert(last_elem_type_ == kEtNone);
+ std::string str;
+ header.EncodeTo(&str);
+
+ Status s = dest_->Append(Slice(str));
+ if (s.ok()) {
+ block_offset_ += str.size();
+ if (do_flush_) {
+ s = dest_->Flush();
+ }
+ }
+ last_elem_type_ = kEtFileHdr;
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+ BlobLogHeader::kSize);
+ return s;
+}
+
+Status BlobLogWriter::AppendFooter(BlobLogFooter& footer,
+ std::string* checksum_method,
+ std::string* checksum_value) {
+ assert(block_offset_ != 0);
+ assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
+
+ std::string str;
+ footer.EncodeTo(&str);
+
+ Status s;
+ if (dest_->seen_error()) {
+ s.PermitUncheckedError();
+ return Status::IOError("Seen Error. Skip closing.");
+ } else {
+ s = dest_->Append(Slice(str));
+ if (s.ok()) {
+ block_offset_ += str.size();
+
+ s = Sync();
+
+ if (s.ok()) {
+ s = dest_->Close();
+
+ if (s.ok()) {
+ assert(!!checksum_method == !!checksum_value);
+
+ if (checksum_method) {
+ assert(checksum_method->empty());
+
+ std::string method = dest_->GetFileChecksumFuncName();
+ if (method != kUnknownFileChecksumFuncName) {
+ *checksum_method = std::move(method);
+ }
+ }
+ if (checksum_value) {
+ assert(checksum_value->empty());
+
+ std::string value = dest_->GetFileChecksum();
+ if (value != kUnknownFileChecksum) {
+ *checksum_value = std::move(value);
+ }
+ }
+ }
+ }
+ }
+
+ dest_.reset();
+ }
+
+ last_elem_type_ = kEtFileFooter;
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+ BlobLogFooter::kSize);
+ return s;
+}
+
+Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val,
+ uint64_t expiration, uint64_t* key_offset,
+ uint64_t* blob_offset) {
+ assert(block_offset_ != 0);
+ assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
+
+ std::string buf;
+ ConstructBlobHeader(&buf, key, val, expiration);
+
+ Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+ return s;
+}
+
+Status BlobLogWriter::AddRecord(const Slice& key, const Slice& val,
+ uint64_t* key_offset, uint64_t* blob_offset) {
+ assert(block_offset_ != 0);
+ assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtRecord);
+
+ std::string buf;
+ ConstructBlobHeader(&buf, key, val, 0);
+
+ Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+ return s;
+}
+
+void BlobLogWriter::ConstructBlobHeader(std::string* buf, const Slice& key,
+ const Slice& val, uint64_t expiration) {
+ BlobLogRecord record;
+ record.key = key;
+ record.value = val;
+ record.expiration = expiration;
+ record.EncodeHeaderTo(buf);
+}
+
+Status BlobLogWriter::EmitPhysicalRecord(const std::string& headerbuf,
+ const Slice& key, const Slice& val,
+ uint64_t* key_offset,
+ uint64_t* blob_offset) {
+ StopWatch write_sw(clock_, statistics_, BLOB_DB_BLOB_FILE_WRITE_MICROS);
+ Status s = dest_->Append(Slice(headerbuf));
+ if (s.ok()) {
+ s = dest_->Append(key);
+ }
+ if (s.ok()) {
+ s = dest_->Append(val);
+ }
+ if (do_flush_ && s.ok()) {
+ s = dest_->Flush();
+ }
+
+ *key_offset = block_offset_ + BlobLogRecord::kHeaderSize;
+ *blob_offset = *key_offset + key.size();
+ block_offset_ = *blob_offset + val.size();
+ last_elem_type_ = kEtRecord;
+ RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_WRITTEN,
+ BlobLogRecord::kHeaderSize + key.size() + val.size());
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
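The last lines of EmitPhysicalRecord above encode the physical record layout: the key starts right after the fixed-size record header, and the blob right after the key. A standalone sketch of that arithmetic; ComputeRecordOffsets is an illustrative name, and record_header_size stands in for BlobLogRecord::kHeaderSize from blob_log_format.h.

#include <cstddef>
#include <cstdint>

struct RecordOffsets {
  uint64_t key_offset;
  uint64_t blob_offset;
  uint64_t next_record_offset;
};

RecordOffsets ComputeRecordOffsets(uint64_t record_start, size_t key_size,
                                   size_t value_size,
                                   uint64_t record_header_size) {
  RecordOffsets offsets;
  // The record header is written first, then the key, then the blob value.
  offsets.key_offset = record_start + record_header_size;
  offsets.blob_offset = offsets.key_offset + key_size;
  offsets.next_record_offset = offsets.blob_offset + value_size;
  return offsets;
}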
diff --git a/src/rocksdb/db/blob/blob_log_writer.h b/src/rocksdb/db/blob/blob_log_writer.h
new file mode 100644
index 000000000..c1f9f31ad
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_log_writer.h
@@ -0,0 +1,83 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "db/blob/blob_log_format.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFileWriter;
+class SystemClock;
+/**
+ * BlobLogWriter is the blob log stream writer. It provides an append-only
+ * abstraction for writing blob data.
+ *
+ * Look at blob_log_format.h to see the details of the record formats.
+ */
+
+class BlobLogWriter {
+ public:
+ // Create a writer that will append data to "*dest".
+ // "*dest" must be initially empty.
+ // "*dest" must remain live while this BlobLogWriter is in use.
+ BlobLogWriter(std::unique_ptr<WritableFileWriter>&& dest, SystemClock* clock,
+ Statistics* statistics, uint64_t log_number, bool use_fsync,
+ bool do_flush, uint64_t boffset = 0);
+ // No copying allowed
+ BlobLogWriter(const BlobLogWriter&) = delete;
+ BlobLogWriter& operator=(const BlobLogWriter&) = delete;
+
+ ~BlobLogWriter();
+
+ static void ConstructBlobHeader(std::string* buf, const Slice& key,
+ const Slice& val, uint64_t expiration);
+
+ Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
+ uint64_t* blob_offset);
+
+ Status AddRecord(const Slice& key, const Slice& val, uint64_t expiration,
+ uint64_t* key_offset, uint64_t* blob_offset);
+
+ Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key,
+ const Slice& val, uint64_t* key_offset,
+ uint64_t* blob_offset);
+
+ Status AppendFooter(BlobLogFooter& footer, std::string* checksum_method,
+ std::string* checksum_value);
+
+ Status WriteHeader(BlobLogHeader& header);
+
+ WritableFileWriter* file() { return dest_.get(); }
+
+ const WritableFileWriter* file() const { return dest_.get(); }
+
+ uint64_t get_log_number() const { return log_number_; }
+
+ Status Sync();
+
+ private:
+ std::unique_ptr<WritableFileWriter> dest_;
+ SystemClock* clock_;
+ Statistics* statistics_;
+ uint64_t log_number_;
+ uint64_t block_offset_; // Current offset in block
+ bool use_fsync_;
+ bool do_flush_;
+
+ public:
+ enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFileFooter };
+ ElemType last_elem_type_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
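A condensed usage sketch of the writer declared above; the test helper WriteBlobFile in blob_source_test.cc later in this patch follows the same header/records/footer sequence. WriteSmallBlobFile is an illustrative name, and the WritableFileWriter, clock and key/value pairs are assumed to be set up by the caller with the usual headers available.

Status WriteSmallBlobFile(std::unique_ptr<WritableFileWriter>&& file,
                          SystemClock* clock, uint64_t log_number,
                          const std::vector<std::pair<Slice, Slice>>& kvs) {
  BlobLogWriter writer(std::move(file), clock, /* statistics */ nullptr,
                       log_number, /* use_fsync */ false, /* do_flush */ true);

  BlobLogHeader header(/* column_family_id */ 0, kNoCompression,
                       /* has_ttl */ false, ExpirationRange());
  Status s = writer.WriteHeader(header);

  for (const auto& kv : kvs) {
    if (!s.ok()) {
      break;
    }
    uint64_t key_offset = 0;
    uint64_t blob_offset = 0;
    s = writer.AddRecord(kv.first, kv.second, &key_offset, &blob_offset);
  }

  if (s.ok()) {
    BlobLogFooter footer;
    footer.blob_count = kvs.size();
    // AppendFooter also syncs and closes the file, and reports the file
    // checksum if one was computed.
    std::string checksum_method;
    std::string checksum_value;
    s = writer.AppendFooter(footer, &checksum_method, &checksum_value);
  }
  return s;
}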
diff --git a/src/rocksdb/db/blob/blob_read_request.h b/src/rocksdb/db/blob/blob_read_request.h
new file mode 100644
index 000000000..f9668ca2e
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_read_request.h
@@ -0,0 +1,58 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+
+#include "rocksdb/compression_type.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A blob read request structure for use in BlobSource::MultiGetBlob and
+// BlobFileReader::MultiGetBlob.
+struct BlobReadRequest {
+ // User key to lookup the paired blob
+ const Slice* user_key = nullptr;
+
+ // File offset in bytes
+ uint64_t offset = 0;
+
+ // Length to read in bytes
+ size_t len = 0;
+
+ // Blob compression type
+ CompressionType compression = kNoCompression;
+
+ // Output parameter set by MultiGetBlob() to point to the data buffer, and
+ // the number of valid bytes
+ PinnableSlice* result = nullptr;
+
+ // Status of read
+ Status* status = nullptr;
+
+ BlobReadRequest(const Slice& _user_key, uint64_t _offset, size_t _len,
+ CompressionType _compression, PinnableSlice* _result,
+ Status* _status)
+ : user_key(&_user_key),
+ offset(_offset),
+ len(_len),
+ compression(_compression),
+ result(_result),
+ status(_status) {}
+
+ BlobReadRequest() = default;
+ BlobReadRequest(const BlobReadRequest& other) = default;
+ BlobReadRequest& operator=(const BlobReadRequest& other) = default;
+};
+
+using BlobFileReadRequests =
+ std::tuple<uint64_t /* file_number */, uint64_t /* file_size */,
+ autovector<BlobReadRequest>>;
+
+} // namespace ROCKSDB_NAMESPACE
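A small sketch of how callers assemble these types for BlobSource::MultiGetBlob: per-blob requests go into an autovector, which is then bundled with the owning file's number and size into a BlobFileReadRequests tuple. MakeSingleBlobBatch is an illustrative name; the output PinnableSlice and Status must outlive the call.

void MakeSingleBlobBatch(const Slice& user_key, uint64_t file_number,
                         uint64_t file_size, uint64_t offset, size_t len,
                         PinnableSlice* result, Status* status,
                         autovector<BlobFileReadRequests>* batches) {
  autovector<BlobReadRequest> reqs;
  reqs.emplace_back(user_key, offset, len, kNoCompression, result, status);

  // BlobFileReadRequests is the (file_number, file_size, requests) tuple
  // defined above.
  batches->emplace_back(file_number, file_size, reqs);
}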
diff --git a/src/rocksdb/db/blob/blob_source.cc b/src/rocksdb/db/blob/blob_source.cc
new file mode 100644
index 000000000..bfade2507
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source.cc
@@ -0,0 +1,488 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_source.h"
+
+#include <cassert>
+#include <string>
+
+#include "cache/cache_reservation_manager.h"
+#include "cache/charged_cache.h"
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_log_format.h"
+#include "monitoring/statistics.h"
+#include "options/cf_options.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+BlobSource::BlobSource(const ImmutableOptions* immutable_options,
+ const std::string& db_id,
+ const std::string& db_session_id,
+ BlobFileCache* blob_file_cache)
+ : db_id_(db_id),
+ db_session_id_(db_session_id),
+ statistics_(immutable_options->statistics.get()),
+ blob_file_cache_(blob_file_cache),
+ blob_cache_(immutable_options->blob_cache),
+ lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) {
+#ifndef ROCKSDB_LITE
+ auto bbto =
+ immutable_options->table_factory->GetOptions<BlockBasedTableOptions>();
+ if (bbto &&
+ bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache)
+ .charged == CacheEntryRoleOptions::Decision::kEnabled) {
+ blob_cache_ = std::make_shared<ChargedCache>(immutable_options->blob_cache,
+ bbto->block_cache);
+ }
+#endif // ROCKSDB_LITE
+}
+
+BlobSource::~BlobSource() = default;
+
+Status BlobSource::GetBlobFromCache(
+ const Slice& cache_key, CacheHandleGuard<BlobContents>* cached_blob) const {
+ assert(blob_cache_);
+ assert(!cache_key.empty());
+ assert(cached_blob);
+ assert(cached_blob->IsEmpty());
+
+ Cache::Handle* cache_handle = nullptr;
+ cache_handle = GetEntryFromCache(cache_key);
+ if (cache_handle != nullptr) {
+ *cached_blob =
+ CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+ assert(cached_blob->GetValue());
+
+ PERF_COUNTER_ADD(blob_cache_hit_count, 1);
+ RecordTick(statistics_, BLOB_DB_CACHE_HIT);
+ RecordTick(statistics_, BLOB_DB_CACHE_BYTES_READ,
+ cached_blob->GetValue()->size());
+
+ return Status::OK();
+ }
+
+ RecordTick(statistics_, BLOB_DB_CACHE_MISS);
+
+ return Status::NotFound("Blob not found in cache");
+}
+
+Status BlobSource::PutBlobIntoCache(
+ const Slice& cache_key, std::unique_ptr<BlobContents>* blob,
+ CacheHandleGuard<BlobContents>* cached_blob) const {
+ assert(blob_cache_);
+ assert(!cache_key.empty());
+ assert(blob);
+ assert(*blob);
+ assert(cached_blob);
+ assert(cached_blob->IsEmpty());
+
+ Cache::Handle* cache_handle = nullptr;
+ const Status s = InsertEntryIntoCache(cache_key, blob->get(),
+ (*blob)->ApproximateMemoryUsage(),
+ &cache_handle, Cache::Priority::BOTTOM);
+ if (s.ok()) {
+ blob->release();
+
+ assert(cache_handle != nullptr);
+ *cached_blob =
+ CacheHandleGuard<BlobContents>(blob_cache_.get(), cache_handle);
+
+ assert(cached_blob->GetValue());
+
+ RecordTick(statistics_, BLOB_DB_CACHE_ADD);
+ RecordTick(statistics_, BLOB_DB_CACHE_BYTES_WRITE,
+ cached_blob->GetValue()->size());
+
+ } else {
+ RecordTick(statistics_, BLOB_DB_CACHE_ADD_FAILURES);
+ }
+
+ return s;
+}
+
+Cache::Handle* BlobSource::GetEntryFromCache(const Slice& key) const {
+ Cache::Handle* cache_handle = nullptr;
+
+ if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
+ Cache::CreateCallback create_cb =
+ [allocator = blob_cache_->memory_allocator()](
+ const void* buf, size_t size, void** out_obj,
+ size_t* charge) -> Status {
+ return BlobContents::CreateCallback(AllocateBlock(size, allocator), buf,
+ size, out_obj, charge);
+ };
+
+ cache_handle = blob_cache_->Lookup(key, BlobContents::GetCacheItemHelper(),
+ create_cb, Cache::Priority::BOTTOM,
+ true /* wait_for_cache */, statistics_);
+ } else {
+ cache_handle = blob_cache_->Lookup(key, statistics_);
+ }
+
+ return cache_handle;
+}
+
+void BlobSource::PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
+ PinnableSlice* value) {
+ assert(cached_blob);
+ assert(cached_blob->GetValue());
+ assert(value);
+
+ // To avoid copying the cached blob into the buffer provided by the
+ // application, we can simply transfer ownership of the cache handle to
+ // the target PinnableSlice. This has the potential to save a lot of
+ // CPU, especially with large blob values.
+
+ value->Reset();
+
+ constexpr Cleanable* cleanable = nullptr;
+ value->PinSlice(cached_blob->GetValue()->data(), cleanable);
+
+ cached_blob->TransferTo(value);
+}
+
+void BlobSource::PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
+ PinnableSlice* value) {
+ assert(owned_blob);
+ assert(*owned_blob);
+ assert(value);
+
+ BlobContents* const blob = owned_blob->release();
+ assert(blob);
+
+ value->Reset();
+ value->PinSlice(
+ blob->data(),
+ [](void* arg1, void* /* arg2 */) {
+ delete static_cast<BlobContents*>(arg1);
+ },
+ blob, nullptr);
+}
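The same ownership-transfer idea used by PinOwnedBlob works for any heap-allocated buffer: instead of copying bytes into the PinnableSlice, hand it a pointer plus a cleanup callback that frees the buffer when the slice is reset or destroyed. A hypothetical sketch; PinOwnedString is not part of this patch.

void PinOwnedString(std::unique_ptr<std::string>* owned,
                    PinnableSlice* value) {
  assert(owned);
  assert(*owned);
  assert(value);

  // Take ownership away from the unique_ptr; the cleanup callback below is
  // now responsible for deleting the string.
  std::string* const str = owned->release();

  value->Reset();
  value->PinSlice(
      Slice(*str),
      [](void* arg1, void* /* arg2 */) {
        delete static_cast<std::string*>(arg1);
      },
      str, nullptr);
}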
+
+Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value,
+ size_t charge,
+ Cache::Handle** cache_handle,
+ Cache::Priority priority) const {
+ Status s;
+
+ Cache::CacheItemHelper* const cache_item_helper =
+ BlobContents::GetCacheItemHelper();
+ assert(cache_item_helper);
+
+ if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
+ s = blob_cache_->Insert(key, value, cache_item_helper, charge, cache_handle,
+ priority);
+ } else {
+ s = blob_cache_->Insert(key, value, charge, cache_item_helper->del_cb,
+ cache_handle, priority);
+ }
+
+ return s;
+}
+
+Status BlobSource::GetBlob(const ReadOptions& read_options,
+ const Slice& user_key, uint64_t file_number,
+ uint64_t offset, uint64_t file_size,
+ uint64_t value_size,
+ CompressionType compression_type,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* value, uint64_t* bytes_read) {
+ assert(value);
+
+ Status s;
+
+ const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
+
+ CacheHandleGuard<BlobContents> blob_handle;
+
+ // First, try to get the blob from the cache
+ //
+ // If blob cache is enabled, we'll try to read from it.
+ if (blob_cache_) {
+ Slice key = cache_key.AsSlice();
+ s = GetBlobFromCache(key, &blob_handle);
+ if (s.ok()) {
+ PinCachedBlob(&blob_handle, value);
+
+ // For consistency, the size of on-disk (possibly compressed) blob record
+ // is assigned to bytes_read.
+ uint64_t adjustment =
+ read_options.verify_checksums
+ ? BlobLogRecord::CalculateAdjustmentForRecordHeader(
+ user_key.size())
+ : 0;
+ assert(offset >= adjustment);
+
+ uint64_t record_size = value_size + adjustment;
+ if (bytes_read) {
+ *bytes_read = record_size;
+ }
+ return s;
+ }
+ }
+
+ assert(blob_handle.IsEmpty());
+
+ const bool no_io = read_options.read_tier == kBlockCacheTier;
+ if (no_io) {
+ s = Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
+ return s;
+ }
+
+ // Can't find the blob from the cache. Since I/O is allowed, read from the
+ // file.
+ std::unique_ptr<BlobContents> blob_contents;
+
+ {
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
+ s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
+ if (!s.ok()) {
+ return s;
+ }
+
+ assert(blob_file_reader.GetValue());
+
+ if (compression_type != blob_file_reader.GetValue()->GetCompressionType()) {
+ return Status::Corruption("Compression type mismatch when reading blob");
+ }
+
+ MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
+ ? blob_cache_->memory_allocator()
+ : nullptr;
+
+ uint64_t read_size = 0;
+ s = blob_file_reader.GetValue()->GetBlob(
+ read_options, user_key, offset, value_size, compression_type,
+ prefetch_buffer, allocator, &blob_contents, &read_size);
+ if (!s.ok()) {
+ return s;
+ }
+ if (bytes_read) {
+ *bytes_read = read_size;
+ }
+ }
+
+ if (blob_cache_ && read_options.fill_cache) {
+ // If filling cache is allowed and a cache is configured, try to put the
+ // blob to the cache.
+ Slice key = cache_key.AsSlice();
+ s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
+ if (!s.ok()) {
+ return s;
+ }
+
+ PinCachedBlob(&blob_handle, value);
+ } else {
+ PinOwnedBlob(&blob_contents, value);
+ }
+
+ assert(s.ok());
+ return s;
+}
+
+void BlobSource::MultiGetBlob(const ReadOptions& read_options,
+ autovector<BlobFileReadRequests>& blob_reqs,
+ uint64_t* bytes_read) {
+ assert(blob_reqs.size() > 0);
+
+ uint64_t total_bytes_read = 0;
+ uint64_t bytes_read_in_file = 0;
+
+ for (auto& [file_number, file_size, blob_reqs_in_file] : blob_reqs) {
+ // sort blob_reqs_in_file by file offset.
+ std::sort(
+ blob_reqs_in_file.begin(), blob_reqs_in_file.end(),
+ [](const BlobReadRequest& lhs, const BlobReadRequest& rhs) -> bool {
+ return lhs.offset < rhs.offset;
+ });
+
+ MultiGetBlobFromOneFile(read_options, file_number, file_size,
+ blob_reqs_in_file, &bytes_read_in_file);
+
+ total_bytes_read += bytes_read_in_file;
+ }
+
+ if (bytes_read) {
+ *bytes_read = total_bytes_read;
+ }
+}
+
+void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
+ uint64_t file_number,
+ uint64_t /*file_size*/,
+ autovector<BlobReadRequest>& blob_reqs,
+ uint64_t* bytes_read) {
+ const size_t num_blobs = blob_reqs.size();
+ assert(num_blobs > 0);
+ assert(num_blobs <= MultiGetContext::MAX_BATCH_SIZE);
+
+#ifndef NDEBUG
+ for (size_t i = 0; i < num_blobs - 1; ++i) {
+ assert(blob_reqs[i].offset <= blob_reqs[i + 1].offset);
+ }
+#endif // !NDEBUG
+
+ using Mask = uint64_t;
+ Mask cache_hit_mask = 0;
+
+ uint64_t total_bytes = 0;
+ const OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+
+ if (blob_cache_) {
+ size_t cached_blob_count = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ auto& req = blob_reqs[i];
+
+ CacheHandleGuard<BlobContents> blob_handle;
+ const CacheKey cache_key = base_cache_key.WithOffset(req.offset);
+ const Slice key = cache_key.AsSlice();
+
+ const Status s = GetBlobFromCache(key, &blob_handle);
+
+ if (s.ok()) {
+ assert(req.status);
+ *req.status = s;
+
+ PinCachedBlob(&blob_handle, req.result);
+
+ // Update the counter for the number of valid blobs read from the cache.
+ ++cached_blob_count;
+
+ // For consistency, the size of each on-disk (possibly compressed) blob
+ // record is accumulated to total_bytes.
+ uint64_t adjustment =
+ read_options.verify_checksums
+ ? BlobLogRecord::CalculateAdjustmentForRecordHeader(
+ req.user_key->size())
+ : 0;
+ assert(req.offset >= adjustment);
+ total_bytes += req.len + adjustment;
+ cache_hit_mask |= (Mask{1} << i); // cache hit
+ }
+ }
+
+ // All blobs were read from the cache.
+ if (cached_blob_count == num_blobs) {
+ if (bytes_read) {
+ *bytes_read = total_bytes;
+ }
+ return;
+ }
+ }
+
+ const bool no_io = read_options.read_tier == kBlockCacheTier;
+ if (no_io) {
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (!(cache_hit_mask & (Mask{1} << i))) {
+ BlobReadRequest& req = blob_reqs[i];
+ assert(req.status);
+
+ *req.status =
+ Status::Incomplete("Cannot read blob(s): no disk I/O allowed");
+ }
+ }
+ return;
+ }
+
+ {
+ // Find the rest of blobs from the file since I/O is allowed.
+ autovector<std::pair<BlobReadRequest*, std::unique_ptr<BlobContents>>>
+ _blob_reqs;
+ uint64_t _bytes_read = 0;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (!(cache_hit_mask & (Mask{1} << i))) {
+ _blob_reqs.emplace_back(&blob_reqs[i], std::unique_ptr<BlobContents>());
+ }
+ }
+
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
+ Status s =
+ blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
+ if (!s.ok()) {
+ for (size_t i = 0; i < _blob_reqs.size(); ++i) {
+ BlobReadRequest* const req = _blob_reqs[i].first;
+ assert(req);
+ assert(req->status);
+
+ *req->status = s;
+ }
+ return;
+ }
+
+ assert(blob_file_reader.GetValue());
+
+ MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
+ ? blob_cache_->memory_allocator()
+ : nullptr;
+
+ blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator,
+ _blob_reqs, &_bytes_read);
+
+ if (blob_cache_ && read_options.fill_cache) {
+ // If filling cache is allowed and a cache is configured, try to put
+ // the blob(s) to the cache.
+ for (auto& [req, blob_contents] : _blob_reqs) {
+ assert(req);
+
+ if (req->status->ok()) {
+ CacheHandleGuard<BlobContents> blob_handle;
+ const CacheKey cache_key = base_cache_key.WithOffset(req->offset);
+ const Slice key = cache_key.AsSlice();
+ s = PutBlobIntoCache(key, &blob_contents, &blob_handle);
+ if (!s.ok()) {
+ *req->status = s;
+ } else {
+ PinCachedBlob(&blob_handle, req->result);
+ }
+ }
+ }
+ } else {
+ for (auto& [req, blob_contents] : _blob_reqs) {
+ assert(req);
+
+ if (req->status->ok()) {
+ PinOwnedBlob(&blob_contents, req->result);
+ }
+ }
+ }
+
+ total_bytes += _bytes_read;
+ if (bytes_read) {
+ *bytes_read = total_bytes;
+ }
+ }
+}
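The function above remembers which requests were served from the cache in a 64-bit mask, one bit per request (the assert on MultiGetContext::MAX_BATCH_SIZE keeps the batch small enough for this to be safe), so the follow-up I/O pass only visits requests whose bit is still clear. A standalone illustration with the hypothetical helper MissedIndexes, assuming at most 64 entries.

#include <cstdint>
#include <vector>

std::vector<size_t> MissedIndexes(const std::vector<bool>& hits) {
  // Assumes hits.size() <= 64 so every index fits in the mask.
  using Mask = uint64_t;
  Mask hit_mask = 0;
  for (size_t i = 0; i < hits.size(); ++i) {
    if (hits[i]) {
      hit_mask |= (Mask{1} << i);
    }
  }

  std::vector<size_t> misses;
  for (size_t i = 0; i < hits.size(); ++i) {
    if (!(hit_mask & (Mask{1} << i))) {
      misses.push_back(i);
    }
  }
  return misses;
}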
+
+bool BlobSource::TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
+ uint64_t offset, size_t* charge) const {
+ const CacheKey cache_key = GetCacheKey(file_number, file_size, offset);
+ const Slice key = cache_key.AsSlice();
+
+ CacheHandleGuard<BlobContents> blob_handle;
+ const Status s = GetBlobFromCache(key, &blob_handle);
+
+ if (s.ok() && blob_handle.GetValue() != nullptr) {
+ if (charge) {
+ const Cache* const cache = blob_handle.GetCache();
+ assert(cache);
+
+ Cache::Handle* const handle = blob_handle.GetCacheHandle();
+ assert(handle);
+
+ *charge = cache->GetUsage(handle);
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+} // namespace ROCKSDB_NAMESPACE
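For intuition on how cached blobs are identified: each cache entry is keyed by a per-file base key (derived from the DB id, session id and blob file number; see GetCacheKey in blob_source.h) combined with the blob's offset within that file. A deliberately simplified analogue that keys a plain std::map by the (file_number, offset) pair; SimpleBlobCacheKey is a made-up name, not the real key format.

#include <cstdint>
#include <map>
#include <string>
#include <utility>

using SimpleBlobCacheKey =
    std::pair<uint64_t /* file_number */, uint64_t /* offset */>;

int main() {
  std::map<SimpleBlobCacheKey, std::string> cache;
  cache[{1 /* file_number */, 30 /* offset */}] = "blob0";

  // The same (file, offset) pair hits; any other pair misses.
  const bool hit = cache.count({1, 30}) == 1;
  const bool miss = cache.count({1, 31}) == 0;
  return (hit && miss) ? 0 : 1;
}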
diff --git a/src/rocksdb/db/blob/blob_source.h b/src/rocksdb/db/blob/blob_source.h
new file mode 100644
index 000000000..2ed296eeb
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source.h
@@ -0,0 +1,153 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cinttypes>
+#include <memory>
+
+#include "cache/cache_helpers.h"
+#include "cache/cache_key.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_read_request.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/rocksdb_namespace.h"
+#include "table/block_based/cachable_entry.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ImmutableOptions;
+class Status;
+class FilePrefetchBuffer;
+class Slice;
+class BlobContents;
+
+// BlobSource is a class that provides universal access to blobs, regardless of
+// whether they are in the blob cache, secondary cache, or (remote) storage.
+// Depending on user settings, it fetches blobs from the multi-tier cache and
+// storage with minimal cost.
+class BlobSource {
+ public:
+ BlobSource(const ImmutableOptions* immutable_options,
+ const std::string& db_id, const std::string& db_session_id,
+ BlobFileCache* blob_file_cache);
+
+ BlobSource(const BlobSource&) = delete;
+ BlobSource& operator=(const BlobSource&) = delete;
+
+ ~BlobSource();
+
+ // Read a blob from the underlying cache or one blob file.
+ //
+ // If successful, returns ok and sets "*value" to the newly retrieved
+ // uncompressed blob. If there was an error while fetching the blob, sets
+ // "*value" to empty and returns a non-ok status.
+ //
+ // Note: For consistency, whether the blob is found in the cache or on disk,
+ // sets "*bytes_read" to the size of on-disk (possibly compressed) blob
+ // record.
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ uint64_t file_number, uint64_t offset, uint64_t file_size,
+ uint64_t value_size, CompressionType compression_type,
+ FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+ uint64_t* bytes_read);
+
+ // Read multiple blobs from the underlying cache or blob file(s).
+ //
+ // If successful, returns ok and sets "result" in the elements of "blob_reqs"
+ // to the newly retrieved uncompressed blobs. If there was an error while
+ // fetching one of the blobs, sets its "result" to empty and sets its
+ // corresponding "status" to a non-ok status.
+ //
+ // Note:
+ // - The main difference between this function and MultiGetBlobFromOneFile is
+ // that this function can read multiple blobs from multiple blob files.
+ //
+ // - For consistency, whether the blob is found in the cache or on disk, sets
+ // "*bytes_read" to the total size of on-disk (possibly compressed) blob
+ // records.
+ void MultiGetBlob(const ReadOptions& read_options,
+ autovector<BlobFileReadRequests>& blob_reqs,
+ uint64_t* bytes_read);
+
+ // Read multiple blobs from the underlying cache or one blob file.
+ //
+ // If successful, returns ok and sets "result" in the elements of "blob_reqs"
+ // to the newly retrieved uncompressed blobs. If there was an error while
+ // fetching one of the blobs, sets its "result" to empty and sets its
+ // corresponding "status" to a non-ok status.
+ //
+ // Note:
+ // - The main difference between this function and MultiGetBlob is that this
+ // function is only used for the case where the demanded blobs are stored in
+ // one blob file. MultiGetBlob will call this function multiple times if the
+ // demanded blobs are stored in multiple blob files.
+ //
+ // - For consistency, whether the blob is found in the cache or on disk, sets
+ // "*bytes_read" to the total size of on-disk (possibly compressed) blob
+ // records.
+ void MultiGetBlobFromOneFile(const ReadOptions& read_options,
+ uint64_t file_number, uint64_t file_size,
+ autovector<BlobReadRequest>& blob_reqs,
+ uint64_t* bytes_read);
+
+ inline Status GetBlobFileReader(
+ uint64_t blob_file_number,
+ CacheHandleGuard<BlobFileReader>* blob_file_reader) {
+ return blob_file_cache_->GetBlobFileReader(blob_file_number,
+ blob_file_reader);
+ }
+
+ inline Cache* GetBlobCache() const { return blob_cache_.get(); }
+
+ bool TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
+ uint64_t offset, size_t* charge = nullptr) const;
+
+ private:
+ Status GetBlobFromCache(const Slice& cache_key,
+ CacheHandleGuard<BlobContents>* cached_blob) const;
+
+ Status PutBlobIntoCache(const Slice& cache_key,
+ std::unique_ptr<BlobContents>* blob,
+ CacheHandleGuard<BlobContents>* cached_blob) const;
+
+ static void PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
+ PinnableSlice* value);
+
+ static void PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
+ PinnableSlice* value);
+
+ Cache::Handle* GetEntryFromCache(const Slice& key) const;
+
+ Status InsertEntryIntoCache(const Slice& key, BlobContents* value,
+ size_t charge, Cache::Handle** cache_handle,
+ Cache::Priority priority) const;
+
+ inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/,
+ uint64_t offset) const {
+ OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+ return base_cache_key.WithOffset(offset);
+ }
+
+ const std::string& db_id_;
+ const std::string& db_session_id_;
+
+ Statistics* statistics_;
+
+ // A cache to store blob file readers.
+ BlobFileCache* blob_file_cache_;
+
+ // A cache to store uncompressed blobs.
+ std::shared_ptr<Cache> blob_cache_;
+
+ // Controls how the cache tiers will be used. Currently, RocksDB supports the
+ // block/blob cache (volatile tier) and the secondary cache (this tier isn't
+ // strictly speaking a non-volatile tier since the compressed cache in this
+ // tier is in volatile memory).
+ const CacheTier lowest_used_cache_tier_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
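An illustrative caller-side sketch for the GetBlob declaration above: the file number, offset, sizes and compression type normally come from the blob index stored alongside the key. ReadOneBlob is a hypothetical helper name, not part of this patch.

Status ReadOneBlob(BlobSource* blob_source, const ReadOptions& read_options,
                   const Slice& user_key, uint64_t file_number,
                   uint64_t offset, uint64_t file_size, uint64_t value_size,
                   CompressionType compression_type, PinnableSlice* value) {
  uint64_t bytes_read = 0;
  // On success, *value holds the uncompressed blob and bytes_read the size of
  // the on-disk record.
  return blob_source->GetBlob(read_options, user_key, file_number, offset,
                              file_size, value_size, compression_type,
                              /* prefetch_buffer */ nullptr, value,
                              &bytes_read);
}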
diff --git a/src/rocksdb/db/blob/blob_source_test.cc b/src/rocksdb/db/blob/blob_source_test.cc
new file mode 100644
index 000000000..a85ed8646
--- /dev/null
+++ b/src/rocksdb/db/blob/blob_source_test.cc
@@ -0,0 +1,1624 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_source.h"
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "cache/charged_cache.h"
+#include "cache/compressed_secondary_cache.h"
+#include "db/blob/blob_contents.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_log_writer.h"
+#include "db/db_test_util.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "options/cf_options.h"
+#include "rocksdb/options.h"
+#include "util/compression.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Creates a test blob file with `num` blobs in it.
+void WriteBlobFile(const ImmutableOptions& immutable_options,
+ uint32_t column_family_id, bool has_ttl,
+ const ExpirationRange& expiration_range_header,
+ const ExpirationRange& expiration_range_footer,
+ uint64_t blob_file_number, const std::vector<Slice>& keys,
+ const std::vector<Slice>& blobs, CompressionType compression,
+ std::vector<uint64_t>& blob_offsets,
+ std::vector<uint64_t>& blob_sizes) {
+ assert(!immutable_options.cf_paths.empty());
+ size_t num = keys.size();
+ assert(num == blobs.size());
+ assert(num == blob_offsets.size());
+ assert(num == blob_sizes.size());
+
+ const std::string blob_file_path =
+ BlobFileName(immutable_options.cf_paths.front().path, blob_file_number);
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(NewWritableFile(immutable_options.fs.get(), blob_file_path, &file,
+ FileOptions()));
+
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), blob_file_path, FileOptions(), immutable_options.clock));
+
+ constexpr Statistics* statistics = nullptr;
+ constexpr bool use_fsync = false;
+ constexpr bool do_flush = false;
+
+ BlobLogWriter blob_log_writer(std::move(file_writer), immutable_options.clock,
+ statistics, blob_file_number, use_fsync,
+ do_flush);
+
+ BlobLogHeader header(column_family_id, compression, has_ttl,
+ expiration_range_header);
+
+ ASSERT_OK(blob_log_writer.WriteHeader(header));
+
+ std::vector<std::string> compressed_blobs(num);
+ std::vector<Slice> blobs_to_write(num);
+ if (kNoCompression == compression) {
+ for (size_t i = 0; i < num; ++i) {
+ blobs_to_write[i] = blobs[i];
+ blob_sizes[i] = blobs[i].size();
+ }
+ } else {
+ CompressionOptions opts;
+ CompressionContext context(compression);
+ constexpr uint64_t sample_for_compression = 0;
+ CompressionInfo info(opts, context, CompressionDict::GetEmptyDict(),
+ compression, sample_for_compression);
+
+ constexpr uint32_t compression_format_version = 2;
+
+ for (size_t i = 0; i < num; ++i) {
+ ASSERT_TRUE(CompressData(blobs[i], info, compression_format_version,
+ &compressed_blobs[i]));
+ blobs_to_write[i] = compressed_blobs[i];
+ blob_sizes[i] = compressed_blobs[i].size();
+ }
+ }
+
+ for (size_t i = 0; i < num; ++i) {
+ uint64_t key_offset = 0;
+ ASSERT_OK(blob_log_writer.AddRecord(keys[i], blobs_to_write[i], &key_offset,
+ &blob_offsets[i]));
+ }
+
+ BlobLogFooter footer;
+ footer.blob_count = num;
+ footer.expiration_range = expiration_range_footer;
+
+ std::string checksum_method;
+ std::string checksum_value;
+ ASSERT_OK(
+ blob_log_writer.AppendFooter(footer, &checksum_method, &checksum_value));
+}
+
+} // anonymous namespace
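The tests below derive the expected physical file size directly from the on-disk layout: a fixed-size header, one record header plus key plus blob per pair, and a fixed-size footer. A small helper capturing the same arithmetic; ExpectedBlobFileSize is an illustrative name only.

uint64_t ExpectedBlobFileSize(const std::vector<Slice>& keys,
                              const std::vector<Slice>& blobs) {
  assert(keys.size() == blobs.size());
  uint64_t file_size = BlobLogHeader::kSize;
  for (size_t i = 0; i < keys.size(); ++i) {
    // Each record is a fixed-size record header followed by the key and the
    // (possibly compressed) blob.
    file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
  }
  return file_size + BlobLogFooter::kSize;
}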
+
+class BlobSourceTest : public DBTestBase {
+ public:
+ explicit BlobSourceTest()
+ : DBTestBase("blob_source_test", /*env_do_fsync=*/true) {
+ options_.env = env_;
+ options_.enable_blob_files = true;
+ options_.create_if_missing = true;
+
+ LRUCacheOptions co;
+ co.capacity = 8 << 20;
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ co.high_pri_pool_ratio = 0.2;
+ co.low_pri_pool_ratio = 0.2;
+ options_.blob_cache = NewLRUCache(co);
+ options_.lowest_used_cache_tier = CacheTier::kVolatileTier;
+
+ assert(db_->GetDbIdentity(db_id_).ok());
+ assert(db_->GetDbSessionId(db_session_id_).ok());
+ }
+
+ Options options_;
+ std::string db_id_;
+ std::string db_session_id_;
+};
+
+TEST_F(BlobSourceTest, GetBlobsFromCache) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_GetBlobsFromCache"), 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr size_t num_blobs = 16;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ uint64_t file_size = BlobLogHeader::kSize;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+ }
+ file_size += BlobLogFooter::kSize;
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, keys, blobs, kNoCompression,
+ blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 1024;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+
+ {
+ // GetBlob
+ std::vector<PinnableSlice> values(keys.size());
+ uint64_t bytes_read = 0;
+ uint64_t blob_bytes = 0;
+ uint64_t total_bytes = 0;
+
+ read_options.fill_cache = false;
+ get_perf_context()->Reset();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ total_bytes += bytes_read;
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+
+ read_options.fill_cache = true;
+ blob_bytes = 0;
+ total_bytes = 0;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ blob_bytes += blob_sizes[i];
+ total_bytes += bytes_read;
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, i + 1);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, i + 1);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, num_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, total_bytes);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), blob_bytes);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+ blob_bytes);
+
+ read_options.fill_cache = true;
+ total_bytes = 0;
+ blob_bytes = 0;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ total_bytes += bytes_read; // on-disk blob record size
+ blob_bytes += blob_sizes[i]; // cached blob value size
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // without i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // without i/o
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_bytes * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+
+ // Cache-only GetBlob
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ total_bytes = 0;
+ blob_bytes = 0;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read));
+ ASSERT_EQ(values[i], blobs[i]);
+ ASSERT_TRUE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ total_bytes += bytes_read;
+ blob_bytes += blob_sizes[i];
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 3);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // without i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // without i/o
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_bytes * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ options_.blob_cache->EraseUnRefEntries();
+
+ {
+ // Cache-only GetBlob
+ std::vector<PinnableSlice> values(keys.size());
+ uint64_t bytes_read = 0;
+
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ read_options.fill_cache = true;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_TRUE(blob_source
+ .GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read)
+ .IsIncomplete());
+ ASSERT_TRUE(values[i].empty());
+ ASSERT_FALSE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read, 0);
+
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ {
+ // GetBlob from non-existing file
+ std::vector<PinnableSlice> values(keys.size());
+ uint64_t bytes_read = 0;
+ uint64_t file_number = 100; // non-existing file
+
+ read_options.read_tier = ReadTier::kReadAllTier;
+ read_options.fill_cache = true;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_TRUE(blob_source
+ .GetBlob(read_options, keys[i], file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer, &values[i],
+ &bytes_read)
+ .IsIOError());
+ ASSERT_TRUE(values[i].empty());
+ ASSERT_FALSE(values[i].IsPinned());
+ ASSERT_EQ(bytes_read, 0);
+
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Retrieved the blob cache num_blobs * 3 times via TEST_BlobInCache,
+ // GetBlob, and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 3);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+}
+
+TEST_F(BlobSourceTest, GetCompressedBlobs) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ const CompressionType compression = kSnappyCompression;
+
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_GetCompressedBlobs"), 0);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr size_t num_blobs = 256;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ }
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ constexpr size_t capacity = 1024;
+ auto backing_cache = NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, nullptr /*HistogramImpl*/, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ uint64_t bytes_read = 0;
+ std::vector<PinnableSlice> values(keys.size());
+
+ {
+ // Snappy Compression
+ const uint64_t file_number = 1;
+
+ read_options.read_tier = ReadTier::kReadAllTier;
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range, expiration_range, file_number, keys, blobs,
+ compression, blob_offsets, blob_sizes);
+
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
+ ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader));
+ ASSERT_NE(blob_file_reader.GetValue(), nullptr);
+
+ const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize();
+ ASSERT_EQ(blob_file_reader.GetValue()->GetCompressionType(), compression);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_NE(blobs[i].size() /*uncompressed size*/,
+ blob_sizes[i] /*compressed size*/);
+ }
+
+ read_options.fill_cache = true;
+ read_options.read_tier = ReadTier::kReadAllTier;
+ get_perf_context()->Reset();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ compression, nullptr /*prefetch_buffer*/,
+ &values[i], &bytes_read));
+ ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/);
+ ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/);
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ ASSERT_GE((int)get_perf_context()->blob_decompress_time, 0);
+
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ get_perf_context()->Reset();
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+
+ // Compressed blob size is passed in GetBlob
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ compression, nullptr /*prefetch_buffer*/,
+ &values[i], &bytes_read));
+ ASSERT_EQ(values[i], blobs[i] /*uncompressed blob*/);
+ ASSERT_NE(values[i].size(), blob_sizes[i] /*compressed size*/);
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+ }
+}
+
+TEST_F(BlobSourceTest, MultiGetBlobsFromMultiFiles) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromMultiFiles"),
+ 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_files = 2;
+ constexpr size_t num_blobs = 32;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ uint64_t file_size = BlobLogHeader::kSize;
+ uint64_t blob_value_bytes = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ blob_value_bytes += blobs[i].size();
+ file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+ }
+ file_size += BlobLogFooter::kSize;
+ const uint64_t blob_records_bytes =
+ file_size - BlobLogHeader::kSize - BlobLogFooter::kSize;
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ {
+ // Write key/blob pairs to multiple blob files.
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ WriteBlobFile(immutable_options, column_family_id, has_ttl,
+ expiration_range, expiration_range, file_number, keys,
+ blobs, kNoCompression, blob_offsets, blob_sizes);
+ }
+ }
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ uint64_t bytes_read = 0;
+
+ {
+ // MultiGetBlob
+ read_options.fill_cache = true;
+ read_options.read_tier = ReadTier::kReadAllTier;
+
+ autovector<BlobFileReadRequests> blob_reqs;
+ std::array<autovector<BlobReadRequest>, blob_files> blob_reqs_in_file;
+ std::array<PinnableSlice, num_blobs * blob_files> value_buf;
+ std::array<Status, num_blobs * blob_files> statuses_buf;
+
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ for (size_t j = 0; j < num_blobs; ++j) {
+ blob_reqs_in_file[i].emplace_back(
+ keys[j], blob_offsets[j], blob_sizes[j], kNoCompression,
+ &value_buf[i * num_blobs + j], &statuses_buf[i * num_blobs + j]);
+ }
+ blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file[i]);
+ }
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ for (size_t j = 0; j < num_blobs; ++j) {
+ ASSERT_OK(statuses_buf[i * num_blobs + j]);
+ ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]);
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[j]));
+ }
+ }
+
+ // Retrieved all blobs from 2 blob files twice via MultiGetBlob and
+ // TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count,
+ num_blobs * blob_files);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count,
+ num_blobs * blob_files); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+ blob_records_bytes * blob_files); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS),
+ num_blobs * blob_files); // MultiGetBlob
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT),
+ num_blobs * blob_files); // TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD),
+ num_blobs * blob_files); // MultiGetBlob
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_value_bytes * blob_files); // TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+ blob_value_bytes * blob_files); // MultiGetBlob
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ autovector<BlobReadRequest> fake_blob_reqs_in_file;
+ std::array<PinnableSlice, num_blobs> fake_value_buf;
+ std::array<Status, num_blobs> fake_statuses_buf;
+
+ const uint64_t fake_file_number = 100;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ fake_blob_reqs_in_file.emplace_back(
+ keys[i], blob_offsets[i], blob_sizes[i], kNoCompression,
+ &fake_value_buf[i], &fake_statuses_buf[i]);
+ }
+
+ // Add a fake multi-get blob request.
+ blob_reqs.emplace_back(fake_file_number, file_size, fake_blob_reqs_in_file);
+
+ blob_source.MultiGetBlob(read_options, blob_reqs, &bytes_read);
+
+ // Check the real blob read requests.
+ for (size_t i = 0; i < blob_files; ++i) {
+ const uint64_t file_number = i + 1;
+ for (size_t j = 0; j < num_blobs; ++j) {
+ ASSERT_OK(statuses_buf[i * num_blobs + j]);
+ ASSERT_EQ(value_buf[i * num_blobs + j], blobs[j]);
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[j]));
+ }
+ }
+
+ // Check the fake blob request.
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(fake_statuses_buf[i].IsIOError());
+ ASSERT_TRUE(fake_value_buf[i].empty());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(fake_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Retrieved all blobs from 3 blob files (including the fake one) twice
+ // via MultiGetBlob and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count,
+ num_blobs * blob_files * 2);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count,
+ 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+ 0); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ // Fake blob requests: MultiGetBlob and TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ // Real blob requests: MultiGetBlob and TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT),
+ num_blobs * blob_files * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ // Real blob requests: MultiGetBlob and TEST_BlobInCache
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_value_bytes * blob_files * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+}
+
+TEST_F(BlobSourceTest, MultiGetBlobsFromCache) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(env_, "BlobSourceTest_MultiGetBlobsFromCache"), 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t blob_file_number = 1;
+ constexpr size_t num_blobs = 16;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> blob_strs;
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ key_strs.push_back("key" + std::to_string(i));
+ blob_strs.push_back("blob" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys;
+ std::vector<Slice> blobs;
+
+ uint64_t file_size = BlobLogHeader::kSize;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ keys.push_back({key_strs[i]});
+ blobs.push_back({blob_strs[i]});
+ file_size += BlobLogRecord::kHeaderSize + keys[i].size() + blobs[i].size();
+ }
+ file_size += BlobLogFooter::kSize;
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, blob_file_number, keys, blobs, kNoCompression,
+ blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache =
+ NewLRUCache(capacity); // Blob file cache
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ column_family_id, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+
+ {
+ // MultiGetBlobFromOneFile
+ uint64_t bytes_read = 0;
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<PinnableSlice, num_blobs> value_buf;
+ autovector<BlobReadRequest> blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; i += 2) { // even index
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ read_options.fill_cache = true;
+ read_options.read_tier = ReadTier::kReadAllTier;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ // Get half of blobs
+ blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ uint64_t fs_read_bytes = 0;
+ uint64_t ca_read_bytes = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ if (i % 2 == 0) {
+ ASSERT_OK(statuses_buf[i]);
+ ASSERT_EQ(value_buf[i], blobs[i]);
+ ASSERT_TRUE(value_buf[i].IsPinned());
+ fs_read_bytes +=
+ blob_sizes[i] + keys[i].size() + BlobLogRecord::kHeaderSize;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ ca_read_bytes += blob_sizes[i];
+ } else {
+ statuses_buf[i].PermitUncheckedError();
+ ASSERT_TRUE(value_buf[i].empty());
+ ASSERT_FALSE(value_buf[i].IsPinned());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+ }
+
+ constexpr int num_even_blobs = num_blobs / 2;
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_even_blobs);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count,
+ num_even_blobs); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte,
+ fs_read_bytes); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_even_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), num_even_blobs);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ ca_read_bytes);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE),
+ ca_read_bytes);
+
+ // Get the rest of blobs
+ for (size_t i = 1; i < num_blobs; i += 2) { // odd index
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[i], blob_file_number,
+ blob_offsets[i], file_size, blob_sizes[i],
+ kNoCompression, prefetch_buffer,
+ &value_buf[i], &bytes_read));
+ ASSERT_EQ(value_buf[i], blobs[i]);
+ ASSERT_TRUE(value_buf[i].IsPinned());
+ ASSERT_EQ(bytes_read,
+ BlobLogRecord::kHeaderSize + keys[i].size() + blob_sizes[i]);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ // Cache-only MultiGetBlobFromOneFile
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_reqs.clear();
+ for (size_t i = 0; i < num_blobs; ++i) {
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ }
+
+ blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ uint64_t blob_bytes = 0;
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_OK(statuses_buf[i]);
+ ASSERT_EQ(value_buf[i], blobs[i]);
+ ASSERT_TRUE(value_buf[i].IsPinned());
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ blob_bytes += blob_sizes[i];
+ }
+
+    // Accessed the blob cache num_blobs * 2 times in total via
+    // MultiGetBlobFromOneFile and TEST_BlobInCache.
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, num_blobs * 2);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o
+ ASSERT_GE((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ),
+ blob_bytes * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ options_.blob_cache->EraseUnRefEntries();
+
+ {
+ // Cache-only MultiGetBlobFromOneFile
+ uint64_t bytes_read = 0;
+ read_options.read_tier = ReadTier::kBlockCacheTier;
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<PinnableSlice, num_blobs> value_buf;
+ autovector<BlobReadRequest> blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; i++) {
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_source.MultiGetBlobFromOneFile(read_options, blob_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(statuses_buf[i].IsIncomplete());
+ ASSERT_TRUE(value_buf[i].empty());
+ ASSERT_FALSE(value_buf[i].IsPinned());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(blob_file_number, file_size,
+ blob_offsets[i]));
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+
+ {
+ // MultiGetBlobFromOneFile from non-existing file
+ uint64_t bytes_read = 0;
+ uint64_t non_existing_file_number = 100;
+ read_options.read_tier = ReadTier::kReadAllTier;
+
+ std::array<Status, num_blobs> statuses_buf;
+ std::array<PinnableSlice, num_blobs> value_buf;
+ autovector<BlobReadRequest> blob_reqs;
+
+ for (size_t i = 0; i < num_blobs; i++) {
+ blob_reqs.emplace_back(keys[i], blob_offsets[i], blob_sizes[i],
+ kNoCompression, &value_buf[i], &statuses_buf[i]);
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number,
+ file_size, blob_offsets[i]));
+ }
+
+ get_perf_context()->Reset();
+ statistics->Reset().PermitUncheckedError();
+
+ blob_source.MultiGetBlobFromOneFile(read_options, non_existing_file_number,
+ file_size, blob_reqs, &bytes_read);
+
+ for (size_t i = 0; i < num_blobs; ++i) {
+ ASSERT_TRUE(statuses_buf[i].IsIOError());
+ ASSERT_TRUE(value_buf[i].empty());
+ ASSERT_FALSE(value_buf[i].IsPinned());
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(non_existing_file_number,
+ file_size, blob_offsets[i]));
+ }
+
+ ASSERT_EQ((int)get_perf_context()->blob_cache_hit_count, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_read_count, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_read_byte, 0); // blocking i/o
+ ASSERT_EQ((int)get_perf_context()->blob_checksum_time, 0);
+ ASSERT_EQ((int)get_perf_context()->blob_decompress_time, 0);
+
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_MISS), num_blobs * 2);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_READ), 0);
+ ASSERT_EQ(statistics->getTickerCount(BLOB_DB_CACHE_BYTES_WRITE), 0);
+ }
+}
+
+class BlobSecondaryCacheTest : public DBTestBase {
+ public:
+ explicit BlobSecondaryCacheTest()
+ : DBTestBase("blob_secondary_cache_test", /*env_do_fsync=*/true) {
+ options_.env = env_;
+ options_.enable_blob_files = true;
+ options_.create_if_missing = true;
+
+    // Use a small primary cache capacity so that entries get evicted, which
+    // lets us test that the secondary cache is used properly.
+ lru_cache_opts_.capacity = 1024;
+ lru_cache_opts_.num_shard_bits = 0;
+ lru_cache_opts_.strict_capacity_limit = true;
+ lru_cache_opts_.metadata_charge_policy = kDontChargeCacheMetadata;
+ lru_cache_opts_.high_pri_pool_ratio = 0.2;
+ lru_cache_opts_.low_pri_pool_ratio = 0.2;
+
+ secondary_cache_opts_.capacity = 8 << 20; // 8 MB
+ secondary_cache_opts_.num_shard_bits = 0;
+ secondary_cache_opts_.metadata_charge_policy =
+ kDefaultCacheMetadataChargePolicy;
+
+ // Read blobs from the secondary cache if they are not in the primary cache
+ options_.lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier;
+
+ assert(db_->GetDbIdentity(db_id_).ok());
+ assert(db_->GetDbSessionId(db_session_id_).ok());
+ }
+
+ Options options_;
+
+ LRUCacheOptions lru_cache_opts_;
+ CompressedSecondaryCacheOptions secondary_cache_opts_;
+
+ std::string db_id_;
+ std::string db_session_id_;
+};
+
+TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ secondary_cache_opts_.compression_type = kSnappyCompression;
+ lru_cache_opts_.secondary_cache =
+ NewCompressedSecondaryCache(secondary_cache_opts_);
+ options_.blob_cache = NewLRUCache(lru_cache_opts_);
+
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(
+ env_, "BlobSecondaryCacheTest_GetBlobsFromSecondaryCache"),
+ 0);
+
+ options_.statistics = CreateDBStatistics();
+ Statistics* statistics = options_.statistics.get();
+ assert(statistics);
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr uint32_t column_family_id = 1;
+ constexpr bool has_ttl = false;
+ constexpr ExpirationRange expiration_range;
+ constexpr uint64_t file_number = 1;
+
+ Random rnd(301);
+
+ std::vector<std::string> key_strs{"key0", "key1"};
+ std::vector<std::string> blob_strs{rnd.RandomString(512),
+ rnd.RandomString(768)};
+
+ std::vector<Slice> keys{key_strs[0], key_strs[1]};
+ std::vector<Slice> blobs{blob_strs[0], blob_strs[1]};
+
+ std::vector<uint64_t> blob_offsets(keys.size());
+ std::vector<uint64_t> blob_sizes(keys.size());
+
+ WriteBlobFile(immutable_options, column_family_id, has_ttl, expiration_range,
+ expiration_range, file_number, keys, blobs, kNoCompression,
+ blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 1024;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache(new BlobFileCache(
+ backing_cache.get(), &immutable_options, &file_options, column_family_id,
+ blob_file_read_hist, nullptr /*IOTracer*/));
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ CacheHandleGuard<BlobFileReader> file_reader;
+ ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader));
+ ASSERT_NE(file_reader.GetValue(), nullptr);
+ const uint64_t file_size = file_reader.GetValue()->GetFileSize();
+ ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression);
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ auto blob_cache = options_.blob_cache;
+ auto secondary_cache = lru_cache_opts_.secondary_cache;
+
+ Cache::CreateCallback create_cb = [](const void* buf, size_t size,
+ void** out_obj,
+ size_t* charge) -> Status {
+ CacheAllocationPtr allocation(new char[size]);
+
+ return BlobContents::CreateCallback(std::move(allocation), buf, size,
+ out_obj, charge);
+ };
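+  // Note on the create callback above: on a secondary cache hit, the
+  // compressed secondary cache hands back the stored bytes and relies on
+  // this callback to allocate a buffer and rebuild the in-memory
+  // BlobContents object (and its cache charge) for the caller.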
+
+ {
+ // GetBlob
+ std::vector<PinnableSlice> values(keys.size());
+
+ read_options.fill_cache = true;
+ get_perf_context()->Reset();
+
+ // key0 should be filled to the primary cache from the blob file.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number,
+ blob_offsets[0], file_size, blob_sizes[0],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[0], nullptr /* bytes_read */));
+ // Release cache handle
+ values[0].Reset();
+
+ // key0 should be evicted and key0's dummy item is inserted into secondary
+ // cache. key1 should be filled to the primary cache from the blob file.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number,
+ blob_offsets[1], file_size, blob_sizes[1],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[1], nullptr /* bytes_read */));
+
+ // Release cache handle
+ values[1].Reset();
+
+ // key0 should be filled to the primary cache from the blob file. key1
+ // should be evicted and key1's dummy item is inserted into secondary cache.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[0], file_number,
+ blob_offsets[0], file_size, blob_sizes[0],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[0], nullptr /* bytes_read */));
+ ASSERT_EQ(values[0], blobs[0]);
+ ASSERT_TRUE(
+ blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[0]));
+
+ // Release cache handle
+ values[0].Reset();
+
+ // key0 should be evicted and is inserted into secondary cache.
+ // key1 should be filled to the primary cache from the blob file.
+ ASSERT_OK(blob_source.GetBlob(read_options, keys[1], file_number,
+ blob_offsets[1], file_size, blob_sizes[1],
+ kNoCompression, nullptr /* prefetch_buffer */,
+ &values[1], nullptr /* bytes_read */));
+ ASSERT_EQ(values[1], blobs[1]);
+ ASSERT_TRUE(
+ blob_source.TEST_BlobInCache(file_number, file_size, blob_offsets[1]));
+
+ // Release cache handle
+ values[1].Reset();
+
+ OffsetableCacheKey base_cache_key(db_id_, db_session_id_, file_number);
+
+ // blob_cache here only looks at the primary cache since we didn't provide
+ // the cache item helper for the secondary cache. However, since key0 is
+ // demoted to the secondary cache, we shouldn't be able to find it in the
+ // primary cache.
+ {
+ CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[0]);
+ const Slice key0 = cache_key.AsSlice();
+ auto handle0 = blob_cache->Lookup(key0, statistics);
+ ASSERT_EQ(handle0, nullptr);
+
+ // key0's item should be in the secondary cache.
+ bool is_in_sec_cache = false;
+ auto sec_handle0 =
+ secondary_cache->Lookup(key0, create_cb, true,
+ /*advise_erase=*/true, is_in_sec_cache);
+ ASSERT_FALSE(is_in_sec_cache);
+ ASSERT_NE(sec_handle0, nullptr);
+ ASSERT_TRUE(sec_handle0->IsReady());
+ auto value = static_cast<BlobContents*>(sec_handle0->Value());
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blobs[0]);
+ delete value;
+
+      // key0 doesn't exist in the blob cache, although key0's dummy
+      // item exists in the secondary cache.
+ ASSERT_FALSE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[0]));
+ }
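+
+    // A small illustrative check of the sizing assumption behind the
+    // eviction behavior above: with strict_capacity_limit and a 1 KiB
+    // primary cache, the two blobs (512 and 768 bytes) cannot both be
+    // resident at once, so each GetBlob necessarily demotes the previously
+    // cached blob.
+    ASSERT_TRUE(lru_cache_opts_.strict_capacity_limit);
+    ASSERT_GT(blobs[0].size() + blobs[1].size(), lru_cache_opts_.capacity);
+    ASSERT_LT(blobs[0].size() + blobs[1].size(),
+              2 * lru_cache_opts_.capacity);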
+
+    // key1 should exist in the primary cache. key1's dummy item exists
+    // in the secondary cache.
+ {
+ CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[1]);
+ const Slice key1 = cache_key.AsSlice();
+ auto handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_NE(handle1, nullptr);
+ blob_cache->Release(handle1);
+
+ bool is_in_sec_cache = false;
+ auto sec_handle1 =
+ secondary_cache->Lookup(key1, create_cb, true,
+ /*advise_erase=*/true, is_in_sec_cache);
+ ASSERT_FALSE(is_in_sec_cache);
+ ASSERT_EQ(sec_handle1, nullptr);
+
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[1]));
+ }
+
+ {
+ // fetch key0 from the blob file to the primary cache.
+ // key1 is evicted and inserted into the secondary cache.
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys[0], file_number, blob_offsets[0], file_size,
+ blob_sizes[0], kNoCompression, nullptr /* prefetch_buffer */,
+ &values[0], nullptr /* bytes_read */));
+ ASSERT_EQ(values[0], blobs[0]);
+
+ // Release cache handle
+ values[0].Reset();
+
+ // key0 should be in the primary cache.
+ CacheKey cache_key0 = base_cache_key.WithOffset(blob_offsets[0]);
+ const Slice key0 = cache_key0.AsSlice();
+ auto handle0 = blob_cache->Lookup(key0, statistics);
+ ASSERT_NE(handle0, nullptr);
+ auto value = static_cast<BlobContents*>(blob_cache->Value(handle0));
+ ASSERT_NE(value, nullptr);
+ ASSERT_EQ(value->data(), blobs[0]);
+ blob_cache->Release(handle0);
+
+ // key1 is not in the primary cache and is in the secondary cache.
+ CacheKey cache_key1 = base_cache_key.WithOffset(blob_offsets[1]);
+ const Slice key1 = cache_key1.AsSlice();
+ auto handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_EQ(handle1, nullptr);
+
+ // erase key0 from the primary cache.
+ blob_cache->Erase(key0);
+ handle0 = blob_cache->Lookup(key0, statistics);
+ ASSERT_EQ(handle0, nullptr);
+
+      // key1's promotion should succeed because the primary cache is empty.
+      // We didn't call the secondary cache's Lookup() here, because it would
+      // remove the key without being able to promote it to the primary cache.
+      // Instead, we use the end-to-end blob source API to read key1. Inside
+      // TEST_BlobInCache, key1's dummy item is inserted into the primary
+      // cache and a standalone handle is checked by GetValue().
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[1]));
+
+ // key1's dummy handle is in the primary cache and key1's item is still
+ // in the secondary cache. So, the primary cache's Lookup() without
+ // secondary cache support cannot see it. (NOTE: The dummy handle used
+ // to be a leaky abstraction but not anymore.)
+ handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_EQ(handle1, nullptr);
+
+ // But after another access, it is promoted to primary cache
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
+ blob_offsets[1]));
+
+ // And Lookup() can find it (without secondary cache support)
+ handle1 = blob_cache->Lookup(key1, statistics);
+ ASSERT_NE(handle1, nullptr);
+ ASSERT_NE(blob_cache->Value(handle1), nullptr);
+ blob_cache->Release(handle1);
+ }
+ }
+}
+
+class BlobSourceCacheReservationTest : public DBTestBase {
+ public:
+ explicit BlobSourceCacheReservationTest()
+ : DBTestBase("blob_source_cache_reservation_test",
+ /*env_do_fsync=*/true) {
+ options_.env = env_;
+ options_.enable_blob_files = true;
+ options_.create_if_missing = true;
+
+ LRUCacheOptions co;
+ co.capacity = kCacheCapacity;
+ co.num_shard_bits = kNumShardBits;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+
+ co.high_pri_pool_ratio = 0.0;
+ co.low_pri_pool_ratio = 0.0;
+ std::shared_ptr<Cache> blob_cache = NewLRUCache(co);
+
+ co.high_pri_pool_ratio = 0.5;
+ co.low_pri_pool_ratio = 0.5;
+ std::shared_ptr<Cache> block_cache = NewLRUCache(co);
+
+ options_.blob_cache = blob_cache;
+ options_.lowest_used_cache_tier = CacheTier::kVolatileTier;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = block_cache;
+ block_based_options.cache_usage_options.options_overrides.insert(
+ {CacheEntryRole::kBlobCache,
+ {/* charged = */ CacheEntryRoleOptions::Decision::kEnabled}});
+ options_.table_factory.reset(
+ NewBlockBasedTableFactory(block_based_options));
+
+ assert(db_->GetDbIdentity(db_id_).ok());
+ assert(db_->GetDbSessionId(db_session_id_).ok());
+ }
+
+ void GenerateKeysAndBlobs() {
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ key_strs_.push_back("key" + std::to_string(i));
+ blob_strs_.push_back("blob" + std::to_string(i));
+ }
+
+ blob_file_size_ = BlobLogHeader::kSize;
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ keys_.push_back({key_strs_[i]});
+ blobs_.push_back({blob_strs_[i]});
+ blob_file_size_ +=
+ BlobLogRecord::kHeaderSize + keys_[i].size() + blobs_[i].size();
+ }
+ blob_file_size_ += BlobLogFooter::kSize;
+ }
+
+ static constexpr std::size_t kSizeDummyEntry = CacheReservationManagerImpl<
+ CacheEntryRole::kBlobCache>::GetDummyEntrySize();
+ static constexpr std::size_t kCacheCapacity = 1 * kSizeDummyEntry;
+ static constexpr int kNumShardBits = 0; // 2^0 shard
+
+ static constexpr uint32_t kColumnFamilyId = 1;
+ static constexpr bool kHasTTL = false;
+ static constexpr uint64_t kBlobFileNumber = 1;
+ static constexpr size_t kNumBlobs = 16;
+
+ std::vector<Slice> keys_;
+ std::vector<Slice> blobs_;
+ std::vector<std::string> key_strs_;
+ std::vector<std::string> blob_strs_;
+ uint64_t blob_file_size_;
+
+ Options options_;
+ std::string db_id_;
+ std::string db_session_id_;
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(
+ env_, "BlobSourceCacheReservationTest_SimpleCacheReservation"),
+ 0);
+
+ GenerateKeysAndBlobs();
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+
+ constexpr ExpirationRange expiration_range;
+
+ std::vector<uint64_t> blob_offsets(keys_.size());
+ std::vector<uint64_t> blob_sizes(keys_.size());
+
+ WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range,
+ expiration_range, kBlobFileNumber, keys_, blobs_,
+ kNoCompression, blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ConcurrentCacheReservationManager* cache_res_mgr =
+ static_cast<ChargedCache*>(blob_source.GetBlobCache())
+ ->TEST_GetCacheReservationManager();
+ ASSERT_NE(cache_res_mgr, nullptr);
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ {
+ read_options.fill_cache = false;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0);
+ }
+ }
+
+ {
+ read_options.fill_cache = true;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+    // kNumBlobs is 16 and the blobs are tiny, so the total blob cache usage
+    // stays below a single dummy entry. Therefore, the cache reservation
+    // manager only reserves one dummy entry here; the arithmetic is
+    // sanity-checked right after this block.
+ uint64_t blob_bytes = 0;
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+
+ size_t charge = 0;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
+ blob_offsets[i], &charge));
+
+ blob_bytes += charge;
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+ options_.blob_cache->GetUsage());
+ }
+ }
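+
+  // A brief sketch of the reservation arithmetic exercised above: the
+  // reservation is made in whole dummy-entry units, so right after the fills
+  // above, the reserved size is a multiple of kSizeDummyEntry and covers the
+  // memory actually used by the cached blobs.
+  ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize() % kSizeDummyEntry,
+            std::size_t{0});
+  ASSERT_LE(cache_res_mgr->GetTotalMemoryUsed(),
+            cache_res_mgr->GetTotalReservedCacheSize());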
+
+ {
+ OffsetableCacheKey base_cache_key(db_id_, db_session_id_, kBlobFileNumber);
+ size_t blob_bytes = options_.blob_cache->GetUsage();
+
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ size_t charge = 0;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
+ blob_offsets[i], &charge));
+
+ CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[i]);
+      // We don't call options_.blob_cache->Erase() here; the cache wrapper's
+      // (i.e. ChargedCache's) Erase() method must be called instead so that
+      // the tracked cache usage is updated after the entry is erased.
+ blob_source.GetBlobCache()->Erase(cache_key.AsSlice());
+ if (i == kNumBlobs - 1) {
+ // All the blobs got removed from the cache. cache_res_mgr should not
+ // reserve any space for them.
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+ } else {
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+ }
+ blob_bytes -= charge;
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+ options_.blob_cache->GetUsage());
+ }
+ }
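+
+  // End-state sketch of the invariant above: once every blob has been erased
+  // through the ChargedCache wrapper, both the reservation and the tracked
+  // memory usage are back to zero.
+  ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), std::size_t{0});
+  ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), std::size_t{0});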
+}
+
+TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
+ options_.cf_paths.emplace_back(
+ test::PerThreadDBPath(
+ env_,
+ "BlobSourceCacheReservationTest_IncreaseCacheReservationOnFullCache"),
+ 0);
+
+ GenerateKeysAndBlobs();
+
+ DestroyAndReopen(options_);
+
+ ImmutableOptions immutable_options(options_);
+ constexpr size_t blob_size = kSizeDummyEntry / (kNumBlobs / 2);
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ blob_file_size_ -= blobs_[i].size(); // old blob size
+ blob_strs_[i].resize(blob_size, '@');
+ blobs_[i] = Slice(blob_strs_[i]);
+ blob_file_size_ += blobs_[i].size(); // new blob size
+ }
+
+ std::vector<uint64_t> blob_offsets(keys_.size());
+ std::vector<uint64_t> blob_sizes(keys_.size());
+
+ constexpr ExpirationRange expiration_range;
+ WriteBlobFile(immutable_options, kColumnFamilyId, kHasTTL, expiration_range,
+ expiration_range, kBlobFileNumber, keys_, blobs_,
+ kNoCompression, blob_offsets, blob_sizes);
+
+ constexpr size_t capacity = 10;
+ std::shared_ptr<Cache> backing_cache = NewLRUCache(capacity);
+
+ FileOptions file_options;
+ constexpr HistogramImpl* blob_file_read_hist = nullptr;
+
+ std::unique_ptr<BlobFileCache> blob_file_cache =
+ std::make_unique<BlobFileCache>(
+ backing_cache.get(), &immutable_options, &file_options,
+ kColumnFamilyId, blob_file_read_hist, nullptr /*IOTracer*/);
+
+ BlobSource blob_source(&immutable_options, db_id_, db_session_id_,
+ blob_file_cache.get());
+
+ ConcurrentCacheReservationManager* cache_res_mgr =
+ static_cast<ChargedCache*>(blob_source.GetBlobCache())
+ ->TEST_GetCacheReservationManager();
+ ASSERT_NE(cache_res_mgr, nullptr);
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+
+ {
+ read_options.fill_cache = false;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), 0);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), 0);
+ }
+ }
+
+ {
+ read_options.fill_cache = true;
+
+ std::vector<PinnableSlice> values(keys_.size());
+
+    // Since we resized each blob to be kSizeDummyEntry / (kNumBlobs / 2), we
+    // can't fit all the blobs in the cache at the same time, which means we
+    // should observe cache evictions once we reach the cache's capacity.
+    // Due to the overhead of the cache and the BlobContents objects, as well
+    // as jemalloc bin sizes, this happens after inserting seven blobs. The
+    // sizing arithmetic is spelled out in the sketch right after this block.
+ uint64_t blob_bytes = 0;
+ for (size_t i = 0; i < kNumBlobs; ++i) {
+ ASSERT_OK(blob_source.GetBlob(
+ read_options, keys_[i], kBlobFileNumber, blob_offsets[i],
+ blob_file_size_, blob_sizes[i], kNoCompression,
+ nullptr /* prefetch_buffer */, &values[i], nullptr /* bytes_read */));
+
+ // Release cache handle
+ values[i].Reset();
+
+ if (i < kNumBlobs / 2 - 1) {
+ size_t charge = 0;
+ ASSERT_TRUE(blob_source.TEST_BlobInCache(
+ kBlobFileNumber, blob_file_size_, blob_offsets[i], &charge));
+
+ blob_bytes += charge;
+ }
+
+ ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
+ ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
+ options_.blob_cache->GetUsage());
+ }
+ }
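+
+  // The sizing assumption behind the eviction behavior above, spelled out as
+  // a compile-time sketch: half of the resized blobs already account for
+  // (essentially) the entire cache capacity, i.e. kCacheCapacity ==
+  // 1 * kSizeDummyEntry, so later inserts can only succeed by evicting
+  // earlier entries.
+  static_assert((kNumBlobs / 2) * blob_size <= kCacheCapacity &&
+                    (kNumBlobs / 2) * (blob_size + 1) > kCacheCapacity,
+                "half of the resized blobs essentially fill the cache");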
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_basic_test.cc b/src/rocksdb/db/blob/db_blob_basic_test.cc
new file mode 100644
index 000000000..e6832a2ae
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_basic_test.cc
@@ -0,0 +1,1789 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <array>
+#include <sstream>
+#include <string>
+
+#include "cache/compressed_secondary_cache.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlobBasicTest : public DBTestBase {
+ protected:
+ DBBlobBasicTest()
+ : DBTestBase("db_blob_basic_test", /* env_do_fsync */ false) {}
+};
+
+TEST_F(DBBlobBasicTest, GetBlob) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get(key), blob_value);
+
+ // Try again with no I/O allowed. The table and the necessary blocks should
+ // already be in their respective caches; however, the blob itself can only be
+ // read from the blob file, so the read should return Incomplete.
+ ReadOptions read_options;
+ read_options.read_tier = kBlockCacheTier;
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)
+ .IsIncomplete());
+}
+
+TEST_F(DBBlobBasicTest, GetBlobFromCache) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+
+ read_options.fill_cache = false;
+
+ {
+ PinnableSlice result;
+
+ read_options.read_tier = kReadAllTier;
+ ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+ ASSERT_EQ(result, blob_value);
+
+ result.Reset();
+ read_options.read_tier = kBlockCacheTier;
+
+ // Try again with no I/O allowed. Since we didn't re-fill the cache, the
+ // blob itself can only be read from the blob file, so the read should
+ // return Incomplete.
+ ASSERT_TRUE(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result)
+ .IsIncomplete());
+ ASSERT_TRUE(result.empty());
+ }
+
+ read_options.fill_cache = true;
+
+ {
+ PinnableSlice result;
+
+ read_options.read_tier = kReadAllTier;
+ ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+ ASSERT_EQ(result, blob_value);
+
+ result.Reset();
+ read_options.read_tier = kBlockCacheTier;
+
+ // Try again with no I/O allowed. The table and the necessary blocks/blobs
+ // should already be in their respective caches.
+ ASSERT_OK(db_->Get(read_options, db_->DefaultColumnFamily(), key, &result));
+ ASSERT_EQ(result, blob_value);
+ }
+}
+
+TEST_F(DBBlobBasicTest, IterateBlobsFromCache) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ options.statistics = CreateDBStatistics();
+
+ Reopen(options);
+
+ int num_blobs = 5;
+ std::vector<std::string> keys;
+ std::vector<std::string> blobs;
+
+ for (int i = 0; i < num_blobs; ++i) {
+ keys.push_back("key" + std::to_string(i));
+ blobs.push_back("blob" + std::to_string(i));
+ ASSERT_OK(Put(keys[i], blobs[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+
+ {
+ read_options.fill_cache = false;
+ read_options.read_tier = kReadAllTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ int i = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key().ToString(), keys[i]);
+ ASSERT_EQ(iter->value().ToString(), blobs[i]);
+ ++i;
+ }
+ ASSERT_EQ(i, num_blobs);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+ }
+
+ {
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ // Try again with no I/O allowed. Since we didn't re-fill the cache,
+ // the blob itself can only be read from the blob file, so iter->Valid()
+ // should be false.
+ iter->SeekToFirst();
+ ASSERT_NOK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+ }
+
+ {
+ read_options.fill_cache = true;
+ read_options.read_tier = kReadAllTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ // Read blobs from the file and refill the cache.
+ int i = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key().ToString(), keys[i]);
+ ASSERT_EQ(iter->value().ToString(), blobs[i]);
+ ++i;
+ }
+ ASSERT_EQ(i, num_blobs);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD),
+ num_blobs);
+ }
+
+ {
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ // Try again with no I/O allowed. The table and the necessary blocks/blobs
+ // should already be in their respective caches.
+ int i = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key().ToString(), keys[i]);
+ ASSERT_EQ(iter->value().ToString(), blobs[i]);
+ ++i;
+ }
+ ASSERT_EQ(i, num_blobs);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 0);
+ }
+}
+
+TEST_F(DBBlobBasicTest, IterateBlobsFromCachePinning) {
+ constexpr size_t min_blob_size = 6;
+
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions cache_options;
+ cache_options.capacity = 2048;
+ cache_options.num_shard_bits = 0;
+ cache_options.metadata_charge_policy = kDontChargeCacheMetadata;
+
+ options.blob_cache = NewLRUCache(cache_options);
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+
+ Reopen(options);
+
+ // Put then iterate over three key-values. The second value is below the size
+ // limit and is thus stored inline; the other two are stored separately as
+ // blobs. We expect to have something pinned in the cache iff we are
+ // positioned on a blob.
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "long_value";
+ static_assert(sizeof(first_value) - 1 >= min_blob_size,
+ "first_value too short to be stored as blob");
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "short";
+ static_assert(sizeof(second_value) - 1 < min_blob_size,
+ "second_value too long to be inlined");
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "other_long_value";
+ static_assert(sizeof(third_value) - 1 >= min_blob_size,
+ "third_value too short to be stored as blob");
+
+ ASSERT_OK(Put(third_key, third_value));
+
+ ASSERT_OK(Flush());
+
+ {
+ ReadOptions read_options;
+ read_options.fill_cache = true;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_value);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+
+ {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_value);
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+ }
+
+ {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ read_options.read_tier = kBlockCacheTier;
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_value);
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value);
+ ASSERT_GT(options.blob_cache->GetPinnedUsage(), 0);
+
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(options.blob_cache->GetPinnedUsage(), 0);
+ }
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlobs) {
+ constexpr size_t min_blob_size = 6;
+
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+
+ Reopen(options);
+
+ // Put then retrieve three key-values. The first value is below the size limit
+ // and is thus stored inline; the other two are stored separately as blobs.
+ constexpr size_t num_keys = 3;
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "short";
+ static_assert(sizeof(first_value) - 1 < min_blob_size,
+ "first_value too long to be inlined");
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "long_value";
+ static_assert(sizeof(second_value) - 1 >= min_blob_size,
+ "second_value too short to be stored as blob");
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "other_long_value";
+ static_assert(sizeof(third_value) - 1 >= min_blob_size,
+ "third_value too short to be stored as blob");
+
+ ASSERT_OK(Put(third_key, third_value));
+
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+
+ std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+
+ // Try again with no I/O allowed. The table and the necessary blocks should
+ // already be in their respective caches. The first (inlined) value should be
+ // successfully read; however, the two blob values could only be read from the
+ // blob file, so for those the read should return Incomplete.
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_TRUE(statuses[1].IsIncomplete());
+
+ ASSERT_TRUE(statuses[2].IsIncomplete());
+ }
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ constexpr size_t min_blob_size = 6;
+ options.min_blob_size = min_blob_size;
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ DestroyAndReopen(options);
+
+ // Put then retrieve three key-values. The first value is below the size limit
+ // and is thus stored inline; the other two are stored separately as blobs.
+ constexpr size_t num_keys = 3;
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "short";
+ static_assert(sizeof(first_value) - 1 < min_blob_size,
+ "first_value too long to be inlined");
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "long_value";
+ static_assert(sizeof(second_value) - 1 >= min_blob_size,
+ "second_value too short to be stored as blob");
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "other_long_value";
+ static_assert(sizeof(third_value) - 1 >= min_blob_size,
+ "third_value too short to be stored as blob");
+
+ ASSERT_OK(Put(third_key, third_value));
+
+ ASSERT_OK(Flush());
+
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+
+ std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+
+ // Try again with no I/O allowed. The first (inlined) value should be
+ // successfully read; however, the two blob values could only be read from the
+ // blob file, so for those the read should return Incomplete.
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_TRUE(statuses[1].IsIncomplete());
+
+ ASSERT_TRUE(statuses[2].IsIncomplete());
+ }
+
+ // Fill the cache when reading blobs from the blob file.
+ read_options.read_tier = kReadAllTier;
+ read_options.fill_cache = true;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+
+ // Try again with no I/O allowed. All blobs should be successfully read from
+ // the cache.
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], second_value);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) {
+ Options options = GetDefaultOptions();
+
+ // First, create an external SST file ["b"].
+ const std::string file_path = dbname_ + "/test.sst";
+ {
+ SstFileWriter sst_file_writer(EnvOptions(), GetDefaultOptions());
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ ASSERT_OK(sst_file_writer.Put("b", "b_value"));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+
+ options.enable_blob_files = true;
+ options.min_blob_size = 1000;
+ options.use_direct_reads = true;
+ options.allow_ingest_behind = true;
+
+  // Open the DB with a fixed-prefix SST partitioner so that compaction cuts a
+  // new table file whenever the 1-byte prefix of the key changes.
+ constexpr size_t key_len = 1;
+ options.sst_partitioner_factory =
+ NewSstPartitionerFixedPrefixFactory(key_len);
+
+ Status s = TryReopen(options);
+ if (s.IsInvalidArgument()) {
+ ROCKSDB_GTEST_SKIP("This test requires direct IO support");
+ return;
+ }
+ ASSERT_OK(s);
+
+ constexpr size_t num_keys = 3;
+ constexpr size_t blob_size = 3000;
+
+ constexpr char first_key[] = "a";
+ const std::string first_blob(blob_size, 'a');
+ ASSERT_OK(Put(first_key, first_blob));
+
+ constexpr char second_key[] = "b";
+ const std::string second_blob(2 * blob_size, 'b');
+ ASSERT_OK(Put(second_key, second_blob));
+
+ constexpr char third_key[] = "d";
+ const std::string third_blob(blob_size, 'd');
+ ASSERT_OK(Put(third_key, third_blob));
+
+  // first_blob, second_blob and third_blob are stored in the same blob file.
+ // SST Blob file
+ // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'|
+ // | | | ^ ^ ^
+ // | | | | | |
+ // | | +---------|-------|--------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ ASSERT_OK(Flush());
+
+ constexpr char fourth_key[] = "c";
+ const std::string fourth_blob(blob_size, 'c');
+ ASSERT_OK(Put(fourth_key, fourth_blob));
+  // fourth_blob is stored in a separate blob file.
+ // SST Blob file SST Blob file
+ // L0 ["a", "b", "d"] |'aaaa', 'bbbb', 'dddd'| ["c"] |'cccc'|
+ // | | | ^ ^ ^ | ^
+ // | | | | | | | |
+ // | | +---------|-------|--------+ +-------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+
+ // Due to the above sst partitioner, we get 4 L1 files. The blob files are
+ // unchanged.
+ // |'aaaa', 'bbbb', 'dddd'| |'cccc'|
+ // ^ ^ ^ ^
+ // | | | |
+ // L0 | | | |
+ // L1 ["a"] ["b"] ["c"] | | ["d"] |
+ // | | | | | |
+ // | | +---------|-------|---------------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+
+ {
+ // Ingest the external SST file into bottommost level.
+ std::vector<std::string> ext_files{file_path};
+ IngestExternalFileOptions opts;
+ opts.ingest_behind = true;
+ ASSERT_OK(
+ db_->IngestExternalFile(db_->DefaultColumnFamily(), ext_files, opts));
+ }
+
+ // Now the database becomes as follows.
+ // |'aaaa', 'bbbb', 'dddd'| |'cccc'|
+ // ^ ^ ^ ^
+ // | | | |
+ // L0 | | | |
+ // L1 ["a"] ["b"] ["c"] | | ["d"] |
+ // | | | | | |
+ // | | +---------|-------|---------------+
+ // | +-----------------|-------+
+ // +-------------------------+
+ //
+ // L6 ["b"]
+
+ {
+ // Compact ["b"] to bottommost level.
+ Slice begin = Slice(second_key);
+ Slice end = Slice(second_key);
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, &begin, &end));
+ }
+
+ // |'aaaa', 'bbbb', 'dddd'| |'cccc'|
+ // ^ ^ ^ ^
+ // | | | |
+ // L0 | | | |
+ // L1 ["a"] ["c"] | | ["d"] |
+ // | | | | |
+ // | +---------|-------|---------------+
+ // | +-----------------|-------+
+ // +-------|-----------------+
+ // |
+ // L6 ["b"]
+ ASSERT_EQ(3, NumTableFilesAtLevel(/*level=*/1));
+ ASSERT_EQ(1, NumTableFilesAtLevel(/*level=*/6));
+
+ bool called = false;
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "RandomAccessFileReader::MultiRead:AlignedReqs", [&](void* arg) {
+ auto* aligned_reqs = static_cast<std::vector<FSReadRequest>*>(arg);
+ assert(aligned_reqs);
+ ASSERT_EQ(1, aligned_reqs->size());
+ called = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::array<Slice, num_keys> keys{{first_key, third_key, second_key}};
+
+ {
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+    // MultiGet(), when constructing the KeyContexts, will process the keys in
+    // the following order: a, d, b. The reason is that ["a"] and ["d"] are in
+    // L1, while ["b"] resides in L6.
+    // Consequently, the original FSReadRequest list prepared by
+    // Version::MultiGetBlob() will be for "a", "d" and "b", and it is
+    // unsorted, as follows:
+ //
+ // ["a", offset=30, len=3033],
+ // ["d", offset=9096, len=3033],
+ // ["b", offset=3063, len=6033]
+ //
+ // If we do not sort them before calling MultiRead() in DirectIO, then the
+ // underlying IO merging logic will yield two requests.
+ //
+ // [offset=0, len=4096] (for "a")
+ // [offset=0, len=12288] (result of merging the request for "d" and "b")
+ //
+ // We need to sort them in Version::MultiGetBlob() so that the underlying
+ // IO merging logic in DirectIO mode works as expected. The correct
+ // behavior will be one aligned request:
+ //
+ // [offset=0, len=12288]
+
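+    // A quick sketch of the arithmetic behind the single aligned request
+    // (illustrative only): sorted by offset, the three blob records form one
+    // contiguous span starting at BlobLogHeader::kSize, small enough to be
+    // served by a single 12288-byte (3 x 4 KiB) aligned read.
+    {
+      const uint64_t record_a =
+          BlobLogRecord::kHeaderSize + key_len + blob_size;
+      const uint64_t record_b =
+          BlobLogRecord::kHeaderSize + key_len + 2 * blob_size;
+      const uint64_t span_end = BlobLogHeader::kSize + 2 * record_a + record_b;
+      ASSERT_LE(span_end, uint64_t{3} * 4096);
+    }
+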
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(called);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_blob);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], third_blob);
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], second_blob);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 2 << 20; // 2MB
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ Reopen(options);
+
+ constexpr size_t kNumBlobFiles = 3;
+ constexpr size_t kNumBlobsPerFile = 3;
+ constexpr size_t kNumKeys = kNumBlobsPerFile * kNumBlobFiles;
+
+ std::vector<std::string> key_strs;
+ std::vector<std::string> value_strs;
+ for (size_t i = 0; i < kNumBlobFiles; ++i) {
+ for (size_t j = 0; j < kNumBlobsPerFile; ++j) {
+ std::string key = "key" + std::to_string(i) + "_" + std::to_string(j);
+ std::string value =
+ "value_as_blob" + std::to_string(i) + "_" + std::to_string(j);
+ ASSERT_OK(Put(key, value));
+ key_strs.push_back(key);
+ value_strs.push_back(value);
+ }
+ ASSERT_OK(Flush());
+ }
+ assert(key_strs.size() == kNumKeys);
+ std::array<Slice, kNumKeys> keys;
+ for (size_t i = 0; i < keys.size(); ++i) {
+ keys[i] = key_strs[i];
+ }
+
+ ReadOptions read_options;
+ read_options.read_tier = kReadAllTier;
+ read_options.fill_cache = false;
+
+ {
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(value_strs[i], values[i]);
+ }
+ }
+
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_TRUE(statuses[i].IsIncomplete());
+ ASSERT_TRUE(values[i].empty());
+ }
+ }
+
+ read_options.read_tier = kReadAllTier;
+ read_options.fill_cache = true;
+
+ {
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(value_strs[i], values[i]);
+ }
+ }
+
+ read_options.read_tier = kBlockCacheTier;
+
+ {
+ std::array<PinnableSlice, kNumKeys> values;
+ std::array<Status, kNumKeys> statuses;
+ db_->MultiGet(read_options, db_->DefaultColumnFamily(), kNumKeys, &keys[0],
+ &values[0], &statuses[0]);
+
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(value_strs[i], values[i]);
+ }
+ }
+}
+
+TEST_F(DBBlobBasicTest, GetBlob_CorruptIndex) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ ASSERT_OK(Put(key, blob));
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "Version::Get::TamperWithBlobIndex", [](void* arg) {
+ Slice* const blob_index = static_cast<Slice*>(arg);
+ assert(blob_index);
+ assert(!blob_index->empty());
+ blob_index->remove_prefix(1);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsCorruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlob_CorruptIndex) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+
+ DestroyAndReopen(options);
+
+ constexpr size_t kNumOfKeys = 3;
+ std::array<std::string, kNumOfKeys> key_strs;
+ std::array<std::string, kNumOfKeys> value_strs;
+ std::array<Slice, kNumOfKeys + 1> keys;
+ for (size_t i = 0; i < kNumOfKeys; ++i) {
+ key_strs[i] = "foo" + std::to_string(i);
+ value_strs[i] = "blob_value" + std::to_string(i);
+ ASSERT_OK(Put(key_strs[i], value_strs[i]));
+ keys[i] = key_strs[i];
+ }
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+ ASSERT_OK(Put(key, blob));
+ keys[kNumOfKeys] = key;
+
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "Version::MultiGet::TamperWithBlobIndex", [&key](void* arg) {
+ KeyContext* const key_context = static_cast<KeyContext*>(arg);
+ assert(key_context);
+ assert(key_context->key);
+
+ if (*(key_context->key) == key) {
+ Slice* const blob_index = key_context->value;
+ assert(blob_index);
+ assert(!blob_index->empty());
+ blob_index->remove_prefix(1);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::array<PinnableSlice, kNumOfKeys + 1> values;
+ std::array<Status, kNumOfKeys + 1> statuses;
+ db_->MultiGet(ReadOptions(), dbfull()->DefaultColumnFamily(), kNumOfKeys + 1,
+ keys.data(), values.data(), statuses.data(),
+ /*sorted_input=*/false);
+ for (size_t i = 0; i < kNumOfKeys + 1; ++i) {
+ if (i != kNumOfKeys) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ("blob_value" + std::to_string(i), values[i]);
+ } else {
+ ASSERT_TRUE(statuses[i].IsCorruption());
+ }
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBBlobBasicTest, MultiGetBlob_ExceedSoftLimit) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr size_t kNumOfKeys = 3;
+ std::array<std::string, kNumOfKeys> key_bufs;
+ std::array<std::string, kNumOfKeys> value_bufs;
+ std::array<Slice, kNumOfKeys> keys;
+ for (size_t i = 0; i < kNumOfKeys; ++i) {
+ key_bufs[i] = "foo" + std::to_string(i);
+ value_bufs[i] = "blob_value" + std::to_string(i);
+ ASSERT_OK(Put(key_bufs[i], value_bufs[i]));
+ keys[i] = key_bufs[i];
+ }
+ ASSERT_OK(Flush());
+
+ std::array<PinnableSlice, kNumOfKeys> values;
+ std::array<Status, kNumOfKeys> statuses;
+ ReadOptions read_opts;
+ read_opts.value_size_soft_limit = 1;
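+ // Every blob exceeds the one-byte soft limit, so all lookups should abort.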
+ db_->MultiGet(read_opts, dbfull()->DefaultColumnFamily(), kNumOfKeys,
+ keys.data(), values.data(), statuses.data(),
+ /*sorted_input=*/true);
+ for (const auto& s : statuses) {
+ ASSERT_TRUE(s.IsAborted());
+ }
+}
+
+TEST_F(DBBlobBasicTest, GetBlob_InlinedTTLIndex) {
+ constexpr uint64_t min_blob_size = 10;
+
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "short";
+ static_assert(sizeof(blob) - 1 < min_blob_size,
+ "Blob too long to be inlined");
+
+ // Fake an inlined TTL blob index.
+ std::string blob_index;
+
+ constexpr uint64_t expiration = 1234567890;
+
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Flush());
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsCorruption());
+}
+
+TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+
+ // Fake a blob index referencing a non-existent blob file.
+ std::string blob_index;
+
+ constexpr uint64_t blob_file_number = 1000;
+ constexpr uint64_t offset = 1234;
+ constexpr uint64_t size = 5678;
+
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Flush());
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsCorruption());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, GenerateIOTracing) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::string trace_file = dbname_ + "/io_trace_file";
+
+ Reopen(options);
+ {
+ // Create IO trace file
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(
+ NewFileTraceWriter(env_, EnvOptions(), trace_file, &trace_writer));
+ ASSERT_OK(db_->StartIOTrace(TraceOptions(), std::move(trace_writer)));
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(Get(key), blob_value);
+
+ ASSERT_OK(db_->EndIOTrace());
+ ASSERT_OK(env_->FileExists(trace_file));
+ }
+ {
+ // Parse trace file to check file operations related to blob files are
+ // recorded.
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(
+ NewFileTraceReader(env_, EnvOptions(), trace_file, &trace_reader));
+ IOTraceReader reader(std::move(trace_reader));
+
+ IOTraceHeader header;
+ ASSERT_OK(reader.ReadHeader(&header));
+ ASSERT_EQ(kMajorVersion, static_cast<int>(header.rocksdb_major_version));
+ ASSERT_EQ(kMinorVersion, static_cast<int>(header.rocksdb_minor_version));
+
+ // Read records.
+ int blob_files_op_count = 0;
+ Status status;
+ while (true) {
+ IOTraceRecord record;
+ status = reader.ReadIOOp(&record);
+ if (!status.ok()) {
+ break;
+ }
+ if (record.file_name.find("blob") != std::string::npos) {
+ blob_files_op_count++;
+ }
+ }
+ // Assuming blob files will have Append, Close and then Read operations.
+ ASSERT_GT(blob_files_op_count, 2);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ Reopen(options);
+
+ ASSERT_OK(dbfull()->DisableFileDeletions());
+ constexpr int kNumTableFiles = 2;
+ for (int i = 0; i < kNumTableFiles; ++i) {
+ for (char ch = 'a'; ch != 'c'; ++ch) {
+ std::string key(1, ch);
+ ASSERT_OK(Put(key, "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ Close();
+
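+ // Locate and delete the newest blob file; best-efforts recovery should
+ // then fall back to the last version that does not reference it.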
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ std::string blob_file_path;
+ uint64_t max_blob_file_num = kInvalidBlobFileNumber;
+ for (const auto& fname : files) {
+ uint64_t file_num = 0;
+ FileType type;
+ if (ParseFileName(fname, &file_num, /*info_log_name_prefix=*/"", &type) &&
+ type == kBlobFile) {
+ if (file_num > max_blob_file_num) {
+ max_blob_file_num = file_num;
+ blob_file_path = dbname_ + "/" + fname;
+ }
+ }
+ }
+ ASSERT_OK(env_->DeleteFile(blob_file_path));
+
+ options.best_efforts_recovery = true;
+ Reopen(options);
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), "a", &value));
+ ASSERT_EQ("value" + std::to_string(kNumTableFiles - 2), value);
+}
+
+TEST_F(DBBlobBasicTest, GetMergeBlobWithPut) {
+ Options options = GetDefaultOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("Key1", "v1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key1", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key1", "v3"));
+ ASSERT_OK(Flush());
+
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), "Key1", &value));
+ ASSERT_EQ(Get("Key1"), "v1,v2,v3");
+}
+
+TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) {
+ constexpr size_t num_keys = 3;
+
+ Options options = GetDefaultOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("Key0", "v0_0"));
+ ASSERT_OK(Put("Key1", "v1_0"));
+ ASSERT_OK(Put("Key2", "v2_0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key0", "v0_1"));
+ ASSERT_OK(Merge("Key1", "v1_1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("Key0", "v0_2"));
+ ASSERT_OK(Flush());
+
+ std::array<Slice, num_keys> keys{{"Key0", "Key1", "Key2"}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], "v0_0,v0_1,v0_2");
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_EQ(values[1], "v1_0,v1_1");
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], "v2_0");
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, Properties) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key1[] = "key1";
+ constexpr size_t key1_size = sizeof(key1) - 1;
+
+ constexpr char key2[] = "key2";
+ constexpr size_t key2_size = sizeof(key2) - 1;
+
+ constexpr char key3[] = "key3";
+ constexpr size_t key3_size = sizeof(key3) - 1;
+
+ constexpr char blob[] = "00000000000000";
+ constexpr size_t blob_size = sizeof(blob) - 1;
+
+ constexpr char longer_blob[] = "00000000000000000000";
+ constexpr size_t longer_blob_size = sizeof(longer_blob) - 1;
+
+ ASSERT_OK(Put(key1, blob));
+ ASSERT_OK(Put(key2, longer_blob));
+ ASSERT_OK(Flush());
+
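+ // Expected blob file size: file header, one record per key/value pair
+ // (record header adjustment plus blob payload), and the file footer.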
+ constexpr size_t first_blob_file_expected_size =
+ BlobLogHeader::kSize +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key1_size) + blob_size +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) +
+ longer_blob_size + BlobLogFooter::kSize;
+
+ ASSERT_OK(Put(key3, blob));
+ ASSERT_OK(Flush());
+
+ constexpr size_t second_blob_file_expected_size =
+ BlobLogHeader::kSize +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key3_size) + blob_size +
+ BlobLogFooter::kSize;
+
+ constexpr size_t total_expected_size =
+ first_blob_file_expected_size + second_blob_file_expected_size;
+
+ // Number of blob files
+ uint64_t num_blob_files = 0;
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kNumBlobFiles, &num_blob_files));
+ ASSERT_EQ(num_blob_files, 2);
+
+ // Total size of live blob files
+ uint64_t live_blob_file_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileSize,
+ &live_blob_file_size));
+ ASSERT_EQ(live_blob_file_size, total_expected_size);
+
+ // Total amount of garbage in live blob files
+ {
+ uint64_t live_blob_file_garbage_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize,
+ &live_blob_file_garbage_size));
+ ASSERT_EQ(live_blob_file_garbage_size, 0);
+ }
+
+ // Total size of all blob files across all versions
+ // Note: this should be the same as above since we only have one
+ // version at this point.
+ uint64_t total_blob_file_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize,
+ &total_blob_file_size));
+ ASSERT_EQ(total_blob_file_size, total_expected_size);
+
+ // Delete key2 to create some garbage
+ ASSERT_OK(Delete(key2));
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ constexpr size_t expected_garbage_size =
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key2_size) +
+ longer_blob_size;
+
+ constexpr double expected_space_amp =
+ static_cast<double>(total_expected_size) /
+ (total_expected_size - expected_garbage_size);
+
+ // Blob file stats
+ std::string blob_stats;
+ ASSERT_TRUE(db_->GetProperty(DB::Properties::kBlobStats, &blob_stats));
+
+ std::ostringstream oss;
+ oss << "Number of blob files: 2\nTotal size of blob files: "
+ << total_expected_size
+ << "\nTotal size of garbage in blob files: " << expected_garbage_size
+ << "\nBlob file space amplification: " << expected_space_amp << '\n';
+
+ ASSERT_EQ(blob_stats, oss.str());
+
+ // Total amount of garbage in live blob files
+ {
+ uint64_t live_blob_file_garbage_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kLiveBlobFileGarbageSize,
+ &live_blob_file_garbage_size));
+ ASSERT_EQ(live_blob_file_garbage_size, expected_garbage_size);
+ }
+}
+
+TEST_F(DBBlobBasicTest, PropertiesMultiVersion) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key1[] = "key1";
+ constexpr char key2[] = "key2";
+ constexpr char key3[] = "key3";
+
+ constexpr size_t key_size = sizeof(key1) - 1;
+ static_assert(sizeof(key2) - 1 == key_size, "unexpected size: key2");
+ static_assert(sizeof(key3) - 1 == key_size, "unexpected size: key3");
+
+ constexpr char blob[] = "0000000000";
+ constexpr size_t blob_size = sizeof(blob) - 1;
+
+ ASSERT_OK(Put(key1, blob));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(key2, blob));
+ ASSERT_OK(Flush());
+
+ // Create an iterator to keep the current version alive
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ ASSERT_OK(iter->status());
+
+ // Note: the Delete and subsequent compaction results in the first blob file
+ // not making it to the final version. (It is still part of the previous
+ // version kept alive by the iterator though.) On the other hand, the Put
+ // results in a third blob file.
+ ASSERT_OK(Delete(key1));
+ ASSERT_OK(Put(key3, blob));
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ // Total size of all blob files across all versions: between the two versions,
+ // we should have three blob files of the same size with one blob each.
+ // The version kept alive by the iterator contains the first and the second
+ // blob file, while the final version contains the second and the third blob
+ // file. (The second blob file is thus shared by the two versions but should
+ // be counted only once.)
+ uint64_t total_blob_file_size = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kTotalBlobFileSize,
+ &total_blob_file_size));
+ ASSERT_EQ(total_blob_file_size,
+ 3 * (BlobLogHeader::kSize +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
+ blob_size + BlobLogFooter::kSize));
+}
+#endif // !ROCKSDB_LITE
+
+class DBBlobBasicIOErrorTest : public DBBlobBasicTest,
+ public testing::WithParamInterface<std::string> {
+ protected:
+ DBBlobBasicIOErrorTest() : sync_point_(GetParam()) {
+ fault_injection_env_.reset(new FaultInjectionTestEnv(env_));
+ }
+ ~DBBlobBasicIOErrorTest() { Close(); }
+
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env_;
+ std::string sync_point_;
+};
+
+class DBBlobBasicIOErrorMultiGetTest : public DBBlobBasicIOErrorTest {
+ public:
+ DBBlobBasicIOErrorMultiGetTest() : DBBlobBasicIOErrorTest() {}
+};
+
+INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::OpenFile:NewRandomAccessFile",
+ "BlobFileReader::GetBlob:ReadFromFile"}));
+
+INSTANTIATE_TEST_CASE_P(DBBlobBasicTest, DBBlobBasicIOErrorMultiGetTest,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileReader::OpenFile:NewRandomAccessFile",
+ "BlobFileReader::MultiGetBlob:ReadFromFile"}));
+
+TEST_P(DBBlobBasicIOErrorTest, GetBlob_IOError) {
+ Options options;
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob_value[] = "blob_value";
+
+ ASSERT_OK(Put(key, blob_value));
+
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ PinnableSlice result;
+ ASSERT_TRUE(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), key, &result)
+ .IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBBlobBasicIOErrorMultiGetTest, MultiGetBlobs_IOError) {
+ Options options = GetDefaultOptions();
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr size_t num_keys = 2;
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+
+ ASSERT_OK(Put(second_key, second_value));
+
+ ASSERT_OK(Flush());
+
+ std::array<Slice, num_keys> keys{{first_key, second_key}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys, &keys[0],
+ &values[0], &statuses[0]);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(statuses[0].IsIOError());
+ ASSERT_TRUE(statuses[1].IsIOError());
+}
+
+TEST_P(DBBlobBasicIOErrorMultiGetTest, MultipleBlobFiles) {
+ Options options = GetDefaultOptions();
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ constexpr size_t num_keys = 2;
+
+ constexpr char key1[] = "key1";
+ constexpr char value1[] = "blob1";
+
+ ASSERT_OK(Put(key1, value1));
+ ASSERT_OK(Flush());
+
+ constexpr char key2[] = "key2";
+ constexpr char value2[] = "blob2";
+
+ ASSERT_OK(Put(key2, value2));
+ ASSERT_OK(Flush());
+
+ std::array<Slice, num_keys> keys{{key1, key2}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
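+ // Let the read from the first blob file succeed, then inject an I/O error
+ // when the second blob file is accessed.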
+ bool first_blob_file = true;
+ SyncPoint::GetInstance()->SetCallBack(
+ sync_point_, [&first_blob_file, this](void* /* arg */) {
+ if (first_blob_file) {
+ first_blob_file = false;
+ return;
+ }
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+ keys.data(), values.data(), statuses.data());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(value1, values[0]);
+ ASSERT_TRUE(statuses[1].IsIOError());
+}
+
+namespace {
+
+class ReadBlobCompactionFilter : public CompactionFilter {
+ public:
+ ReadBlobCompactionFilter() = default;
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.read.blob";
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType value_type,
+ const Slice& existing_value, std::string* new_value,
+ std::string* /*skip_until*/) const override {
+ if (value_type != CompactionFilter::ValueType::kValue) {
+ return CompactionFilter::Decision::kKeep;
+ }
+ assert(new_value);
+ new_value->assign(existing_value.data(), existing_value.size());
+ return CompactionFilter::Decision::kChangeValue;
+ }
+};
+
+} // anonymous namespace
+
+TEST_P(DBBlobBasicIOErrorTest, CompactionFilterReadBlob_IOError) {
+ Options options = GetDefaultOptions();
+ options.env = fault_injection_env_.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ReadBlobCompactionFilter);
+ options.compaction_filter = compaction_filter_guard.get();
+
+ DestroyAndReopen(options);
+ constexpr char key[] = "foo";
+ constexpr char blob_value[] = "foo_blob_value";
+ ASSERT_OK(Put(key, blob_value));
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* /* arg */) {
+ fault_injection_env_->SetFilesystemActive(false,
+ Status::IOError(sync_point_));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 1 << 25;
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ DestroyAndReopen(options);
+
+ constexpr size_t kNumBlobs = 10;
+ constexpr size_t kValueSize = 100;
+
+ std::string value(kValueSize, 'a');
+
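+ // With kFlushOnly prepopulation, each flush should insert the two newly
+ // written blobs into the cache, so the Gets below are all cache hits.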
+ for (size_t i = 1; i <= kNumBlobs; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
+ ASSERT_OK(Flush());
+ ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_MISS));
+ ASSERT_EQ(i * 2, options.statistics->getTickerCount(BLOB_DB_CACHE_HIT));
+ }
+
+ // Verify compaction not counted
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ EXPECT_EQ(kNumBlobs * 2,
+ options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) {
+ Options options = GetDefaultOptions();
+
+ LRUCacheOptions co;
+ co.capacity = 1 << 25;
+ co.num_shard_bits = 2;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto backing_cache = NewLRUCache(co);
+
+ options.blob_cache = backing_cache;
+
+ BlockBasedTableOptions block_based_options;
+ block_based_options.no_block_cache = false;
+ block_based_options.block_cache = backing_cache;
+ block_based_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(block_based_options));
+
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ DestroyAndReopen(options);
+
+ constexpr size_t kNumBlobs = 10;
+ constexpr size_t kValueSize = 100;
+
+ std::string value(kValueSize, 'a');
+
+ for (size_t i = 1; i <= 5; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
+ ASSERT_OK(Flush());
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+ ASSERT_EQ(0,
+ options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS));
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT));
+ }
+
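+ // Dynamically disable prepopulation; subsequent flushes should no longer
+ // warm the cache, so the first read of each blob is a miss.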
+ ASSERT_OK(dbfull()->SetOptions({{"prepopulate_blob_cache", "kDisable"}}));
+
+ for (size_t i = 6; i <= kNumBlobs; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Put(std::to_string(i + kNumBlobs), value)); // Add some overlap
+ ASSERT_OK(Flush());
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(value, Get(std::to_string(i + kNumBlobs)));
+ ASSERT_EQ(2, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD));
+ ASSERT_EQ(2,
+ options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT));
+ }
+
+ // Verify compaction not counted
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ EXPECT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) {
+ CompressedSecondaryCacheOptions secondary_cache_opts;
+ secondary_cache_opts.capacity = 1 << 20;
+ secondary_cache_opts.num_shard_bits = 0;
+ secondary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+ secondary_cache_opts.compression_type = kNoCompression;
+
+ LRUCacheOptions primary_cache_opts;
+ primary_cache_opts.capacity = 1024;
+ primary_cache_opts.num_shard_bits = 0;
+ primary_cache_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+ primary_cache_opts.secondary_cache =
+ NewCompressedSecondaryCache(secondary_cache_opts);
+
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.statistics = CreateDBStatistics();
+ options.enable_blob_files = true;
+ options.blob_cache = NewLRUCache(primary_cache_opts);
+ options.prepopulate_blob_cache = PrepopulateBlobCache::kFlushOnly;
+
+ DestroyAndReopen(options);
+
+ // Note: only one of the two blobs fits in the primary cache at any given time.
+ constexpr char first_key[] = "foo";
+ constexpr size_t first_blob_size = 512;
+ const std::string first_blob(first_blob_size, 'a');
+
+ constexpr char second_key[] = "bar";
+ constexpr size_t second_blob_size = 768;
+ const std::string second_blob(second_blob_size, 'b');
+
+ // First blob is inserted into primary cache during flush.
+ ASSERT_OK(Put(first_key, first_blob));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1);
+
+ // Second blob is inserted into primary cache during flush,
+ // First blob is evicted but only a dummy handle is inserted into secondary
+ // cache.
+ ASSERT_OK(Put(second_key, second_blob));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_ADD), 1);
+
+ // First blob is inserted into primary cache.
+ // Second blob is evicted but only a dummy handle is inserted into secondary
+ // cache.
+ ASSERT_EQ(Get(first_key), first_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 0);
+ // Second blob is inserted into primary cache,
+ // First blob is evicted and is inserted into secondary cache.
+ ASSERT_EQ(Get(second_key), second_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 0);
+
+ // First blob's dummy item is inserted into primary cache b/c of lookup.
+ // Second blob is still in primary cache.
+ ASSERT_EQ(Get(first_key), first_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 1);
+
+ // First blob's item is inserted into primary cache b/c of lookup.
+ // Second blob is evicted and inserted into secondary cache.
+ ASSERT_EQ(Get(first_key), first_blob);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_MISS), 0);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(BLOB_DB_CACHE_HIT), 1);
+ ASSERT_EQ(options.statistics->getAndResetTickerCount(SECONDARY_CACHE_HITS),
+ 1);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_compaction_test.cc b/src/rocksdb/db/blob/db_blob_compaction_test.cc
new file mode 100644
index 000000000..f3fe3c03b
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_compaction_test.cc
@@ -0,0 +1,913 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlobCompactionTest : public DBTestBase {
+ public:
+ explicit DBBlobCompactionTest()
+ : DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {}
+
+#ifndef ROCKSDB_LITE
+ const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ return internal_stats->TEST_GetCompactionStats();
+ }
+#endif // ROCKSDB_LITE
+};
+
+namespace {
+
+class FilterByKeyLength : public CompactionFilter {
+ public:
+ explicit FilterByKeyLength(size_t len) : length_threshold_(len) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.by.key.length";
+ }
+ CompactionFilter::Decision FilterBlobByKey(
+ int /*level*/, const Slice& key, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ if (key.size() < length_threshold_) {
+ return CompactionFilter::Decision::kRemove;
+ }
+ return CompactionFilter::Decision::kKeep;
+ }
+
+ private:
+ size_t length_threshold_;
+};
+
+class FilterByValueLength : public CompactionFilter {
+ public:
+ explicit FilterByValueLength(size_t len) : length_threshold_(len) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.by.value.length";
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ if (existing_value.size() < length_threshold_) {
+ return CompactionFilter::Decision::kRemove;
+ }
+ return CompactionFilter::Decision::kKeep;
+ }
+
+ private:
+ size_t length_threshold_;
+};
+
+class BadBlobCompactionFilter : public CompactionFilter {
+ public:
+ explicit BadBlobCompactionFilter(std::string prefix,
+ CompactionFilter::Decision filter_by_key,
+ CompactionFilter::Decision filter_v2)
+ : prefix_(std::move(prefix)),
+ filter_blob_by_key_(filter_by_key),
+ filter_v2_(filter_v2) {}
+ const char* Name() const override { return "rocksdb.compaction.filter.bad"; }
+ CompactionFilter::Decision FilterBlobByKey(
+ int /*level*/, const Slice& key, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ if (key.size() >= prefix_.size() &&
+ 0 == strncmp(prefix_.data(), key.data(), prefix_.size())) {
+ return CompactionFilter::Decision::kUndetermined;
+ }
+ return filter_blob_by_key_;
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return filter_v2_;
+ }
+
+ private:
+ const std::string prefix_;
+ const CompactionFilter::Decision filter_blob_by_key_;
+ const CompactionFilter::Decision filter_v2_;
+};
+
+class ValueBlindWriteFilter : public CompactionFilter {
+ public:
+ explicit ValueBlindWriteFilter(std::string new_val)
+ : new_value_(std::move(new_val)) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.blind.write";
+ }
+ CompactionFilter::Decision FilterBlobByKey(
+ int level, const Slice& key, std::string* new_value,
+ std::string* skip_until) const override;
+
+ private:
+ const std::string new_value_;
+};
+
+CompactionFilter::Decision ValueBlindWriteFilter::FilterBlobByKey(
+ int /*level*/, const Slice& /*key*/, std::string* new_value,
+ std::string* /*skip_until*/) const {
+ assert(new_value);
+ new_value->assign(new_value_);
+ return CompactionFilter::Decision::kChangeValue;
+}
+
+class ValueMutationFilter : public CompactionFilter {
+ public:
+ explicit ValueMutationFilter(std::string padding)
+ : padding_(std::move(padding)) {}
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.value.mutation";
+ }
+ CompactionFilter::Decision FilterV2(int level, const Slice& key,
+ ValueType value_type,
+ const Slice& existing_value,
+ std::string* new_value,
+ std::string* skip_until) const override;
+
+ private:
+ const std::string padding_;
+};
+
+CompactionFilter::Decision ValueMutationFilter::FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType value_type,
+ const Slice& existing_value, std::string* new_value,
+ std::string* /*skip_until*/) const {
+ assert(CompactionFilter::ValueType::kBlobIndex != value_type);
+ if (CompactionFilter::ValueType::kValue != value_type) {
+ return CompactionFilter::Decision::kKeep;
+ }
+ assert(new_value);
+ new_value->assign(existing_value.data(), existing_value.size());
+ new_value->append(padding_);
+ return CompactionFilter::Decision::kChangeValue;
+}
+
+class AlwaysKeepFilter : public CompactionFilter {
+ public:
+ explicit AlwaysKeepFilter() = default;
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.always.keep";
+ }
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& /*key*/, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return CompactionFilter::Decision::kKeep;
+ }
+};
+
+class SkipUntilFilter : public CompactionFilter {
+ public:
+ explicit SkipUntilFilter(std::string skip_until)
+ : skip_until_(std::move(skip_until)) {}
+
+ const char* Name() const override {
+ return "rocksdb.compaction.filter.skip.until";
+ }
+
+ CompactionFilter::Decision FilterV2(int /* level */, const Slice& /* key */,
+ ValueType /* value_type */,
+ const Slice& /* existing_value */,
+ std::string* /* new_value */,
+ std::string* skip_until) const override {
+ assert(skip_until);
+ *skip_until = skip_until_;
+
+ return CompactionFilter::Decision::kRemoveAndSkipUntil;
+ }
+
+ private:
+ std::string skip_until_;
+};
+
+} // anonymous namespace
+
+class DBBlobBadCompactionFilterTest
+ : public DBBlobCompactionTest,
+ public testing::WithParamInterface<
+ std::tuple<std::string, CompactionFilter::Decision,
+ CompactionFilter::Decision>> {
+ public:
+ explicit DBBlobBadCompactionFilterTest()
+ : compaction_filter_guard_(new BadBlobCompactionFilter(
+ std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()))) {}
+
+ protected:
+ std::unique_ptr<CompactionFilter> compaction_filter_guard_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ BadCompactionFilter, DBBlobBadCompactionFilterTest,
+ testing::Combine(
+ testing::Values("a"),
+ testing::Values(CompactionFilter::Decision::kChangeBlobIndex,
+ CompactionFilter::Decision::kIOError),
+ testing::Values(CompactionFilter::Decision::kUndetermined,
+ CompactionFilter::Decision::kChangeBlobIndex,
+ CompactionFilter::Decision::kIOError)));
+
+TEST_F(DBBlobCompactionTest, FilterByKeyLength) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ constexpr size_t kKeyLength = 2;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new FilterByKeyLength(kKeyLength));
+ options.compaction_filter = compaction_filter_guard.get();
+
+ constexpr char short_key[] = "a";
+ constexpr char long_key[] = "abc";
+ constexpr char blob_value[] = "value";
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put(short_key, blob_value));
+ ASSERT_OK(Put(long_key, blob_value));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), short_key, &value).IsNotFound());
+ value.clear();
+ ASSERT_OK(db_->Get(ReadOptions(), long_key, &value));
+ ASSERT_EQ("value", value);
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter decides between kKeep and kRemove solely based on key;
+ // this involves neither reading nor writing blobs
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, FilterByValueLength) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 5;
+ options.create_if_missing = true;
+ constexpr size_t kValueLength = 5;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new FilterByValueLength(kValueLength));
+ options.compaction_filter = compaction_filter_guard.get();
+
+ const std::vector<std::string> short_value_keys = {"a", "e", "j"};
+ constexpr char short_value[] = "val";
+ const std::vector<std::string> long_value_keys = {"b", "f", "k"};
+ constexpr char long_value[] = "valuevalue";
+
+ DestroyAndReopen(options);
+ for (size_t i = 0; i < short_value_keys.size(); ++i) {
+ ASSERT_OK(Put(short_value_keys[i], short_value));
+ }
+ for (size_t i = 0; i < long_value_keys.size(); ++i) {
+ ASSERT_OK(Put(long_value_keys[i], long_value));
+ }
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ std::string value;
+ for (size_t i = 0; i < short_value_keys.size(); ++i) {
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), short_value_keys[i], &value).IsNotFound());
+ value.clear();
+ }
+ for (size_t i = 0; i < long_value_keys.size(); ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), long_value_keys[i], &value));
+ ASSERT_EQ(long_value, value);
+ }
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter decides between kKeep and kRemove based on value;
+ // this involves reading but not writing blobs
+ ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) {
+ Options options = GetDefaultOptions();
+
+ options.enable_blob_files = true;
+ options.min_blob_size = 1000;
+ options.blob_file_starting_level = 5;
+ options.create_if_missing = true;
+
+ // Open DB with fixed-prefix sst-partitioner so that compaction will cut
+ // new table file when encountering a new key whose 1-byte prefix changes.
+ constexpr size_t key_len = 1;
+ options.sst_partitioner_factory =
+ NewSstPartitionerFixedPrefixFactory(key_len);
+
+ ASSERT_OK(TryReopen(options));
+
+ constexpr size_t blob_size = 3000;
+
+ constexpr char first_key[] = "a";
+ const std::string first_blob(blob_size, 'a');
+ ASSERT_OK(Put(first_key, first_blob));
+
+ constexpr char second_key[] = "b";
+ const std::string second_blob(2 * blob_size, 'b');
+ ASSERT_OK(Put(second_key, second_blob));
+
+ constexpr char third_key[] = "d";
+ const std::string third_blob(blob_size, 'd');
+ ASSERT_OK(Put(third_key, third_blob));
+
+ ASSERT_OK(Flush());
+
+ constexpr char fourth_key[] = "c";
+ const std::string fourth_blob(blob_size, 'c');
+ ASSERT_OK(Put(fourth_key, fourth_blob));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(0, GetBlobFileNumbers().size());
+ ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1));
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+
+ // No blob file should be created since blob_file_starting_level is 5.
+ ASSERT_EQ(0, GetBlobFileNumbers().size());
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+
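+ // Lower blob_file_starting_level to 1 and repeat; since the compaction's
+ // output level now reaches the starting level, a blob file is written.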
+ {
+ options.blob_file_starting_level = 1;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(first_key, first_blob));
+ ASSERT_OK(Put(second_key, second_blob));
+ ASSERT_OK(Put(third_key, third_blob));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(fourth_key, fourth_blob));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(0, GetBlobFileNumbers().size());
+ ASSERT_EQ(2, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/1));
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ // The compaction's output level equals blob_file_starting_level.
+ ASSERT_EQ(1, GetBlobFileNumbers().size());
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0));
+ ASSERT_EQ(4, NumTableFilesAtLevel(/*level=*/1));
+ }
+
+ Close();
+}
+#endif
+
+TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ constexpr char new_blob_value[] = "new_blob_value";
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueBlindWriteFilter(new_blob_value));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ const std::vector<std::string> keys = {"a", "b", "c"};
+ const std::vector<std::string> values = {"a_value", "b_value", "c_value"};
+ assert(keys.size() == values.size());
+ for (size_t i = 0; i < keys.size(); ++i) {
+ ASSERT_OK(Put(keys[i], values[i]));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ for (const auto& key : keys) {
+ ASSERT_EQ(new_blob_value, Get(key));
+ }
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter unconditionally changes value in FilterBlobByKey;
+ // this involves writing but not reading blobs
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, SkipUntilFilter) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new SkipUntilFilter("z"));
+ options.compaction_filter = compaction_filter_guard.get();
+
+ Reopen(options);
+
+ const std::vector<std::string> keys{"a", "b", "c"};
+ const std::vector<std::string> values{"a_value", "b_value", "c_value"};
+ assert(keys.size() == values.size());
+
+ for (size_t i = 0; i < keys.size(); ++i) {
+ ASSERT_OK(Put(keys[i], values[i]));
+ }
+
+ ASSERT_OK(Flush());
+
+ int process_in_flow_called = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobCountingIterator::UpdateAndCountBlobIfNeeded:ProcessInFlow",
+ [&process_in_flow_called](void* /* arg */) { ++process_in_flow_called; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr,
+ /* end */ nullptr));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ for (const auto& key : keys) {
+ ASSERT_EQ(Get(key), "NOT_FOUND");
+ }
+
+ // Make sure SkipUntil was performed using iteration rather than Seek
+ ASSERT_EQ(process_in_flow_called, keys.size());
+
+ Close();
+}
+
+TEST_P(DBBlobBadCompactionFilterTest, BadDecisionFromCompactionFilter) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ options.compaction_filter = compaction_filter_guard_.get();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("b", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsNotSupported());
+ Close();
+
+ DestroyAndReopen(options);
+ std::string key(std::get<0>(GetParam()));
+ ASSERT_OK(Put(key, "value"));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsNotSupported());
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilter_InlinedTTLIndex) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter(""));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+ // Fake an inlined TTL blob index.
+ std::string blob_index;
+ constexpr uint64_t expiration = 1234567890;
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsCorruption());
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilter) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ constexpr char padding[] = "_delta";
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter(padding));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ const std::vector<std::pair<std::string, std::string>> kvs = {
+ {"a", "a_value"}, {"b", "b_value"}, {"c", "c_value"}};
+ for (const auto& kv : kvs) {
+ ASSERT_OK(Put(kv.first, kv.second));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ for (const auto& kv : kvs) {
+ ASSERT_EQ(kv.second + std::string(padding), Get(kv.first));
+ }
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter changes the value using the previous value in FilterV2;
+ // this involves reading and writing blobs
+ ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CorruptedBlobIndex) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter(""));
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+
+ constexpr char key[] = "key";
+ constexpr char blob[] = "blob";
+
+ ASSERT_OK(Put(key, blob));
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex",
+ [](void* arg) {
+ Slice* const blob_index = static_cast<Slice*>(arg);
+ assert(blob_index);
+ assert(!blob_index->empty());
+ blob_index->remove_prefix(1);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr)
+ .IsCorruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new AlwaysKeepFilter());
+ options.compaction_filter = compaction_filter_guard.get();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "foo_value"));
+ ASSERT_OK(Flush());
+ std::vector<uint64_t> blob_files = GetBlobFileNumbers();
+ ASSERT_EQ(1, blob_files.size());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ ASSERT_EQ(blob_files, GetBlobFileNumbers());
+
+#ifndef ROCKSDB_LITE
+ const auto& compaction_stats = GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ // Filter decides to keep the existing value in FilterV2;
+ // this involves reading but not writing blobs
+ ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+#endif // ROCKSDB_LITE
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, TrackGarbage) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+
+ Reopen(options);
+
+ // First table+blob file pair: 4 blobs with different keys
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ constexpr char fourth_key[] = "fourth_key";
+ constexpr char fourth_value[] = "fourth_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+ ASSERT_OK(Put(second_key, second_value));
+ ASSERT_OK(Put(third_key, third_value));
+ ASSERT_OK(Put(fourth_key, fourth_value));
+ ASSERT_OK(Flush());
+
+ // Second table+blob file pair: overwrite 2 existing keys
+ constexpr char new_first_value[] = "new_first_value";
+ constexpr char new_second_value[] = "new_second_value";
+
+ ASSERT_OK(Put(first_key, new_first_value));
+ ASSERT_OK(Put(second_key, new_second_value));
+ ASSERT_OK(Flush());
+
+ // Compact them together. The first blob file should have 2 garbage blobs
+ // corresponding to the 2 overwritten keys.
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const current = cfd->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 2);
+
+ {
+ const auto& meta = blob_files.front();
+ assert(meta);
+
+ constexpr uint64_t first_expected_bytes =
+ sizeof(first_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) -
+ 1);
+ constexpr uint64_t second_expected_bytes =
+ sizeof(second_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) -
+ 1);
+ constexpr uint64_t third_expected_bytes =
+ sizeof(third_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(third_key) -
+ 1);
+ constexpr uint64_t fourth_expected_bytes =
+ sizeof(fourth_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(fourth_key) -
+ 1);
+
+ ASSERT_EQ(meta->GetTotalBlobCount(), 4);
+ ASSERT_EQ(meta->GetTotalBlobBytes(),
+ first_expected_bytes + second_expected_bytes +
+ third_expected_bytes + fourth_expected_bytes);
+ ASSERT_EQ(meta->GetGarbageBlobCount(), 2);
+ ASSERT_EQ(meta->GetGarbageBlobBytes(),
+ first_expected_bytes + second_expected_bytes);
+ }
+
+ {
+ const auto& meta = blob_files.back();
+ assert(meta);
+
+ constexpr uint64_t new_first_expected_bytes =
+ sizeof(new_first_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(first_key) -
+ 1);
+ constexpr uint64_t new_second_expected_bytes =
+ sizeof(new_second_value) - 1 +
+ BlobLogRecord::CalculateAdjustmentForRecordHeader(sizeof(second_key) -
+ 1);
+
+ ASSERT_EQ(meta->GetTotalBlobCount(), 2);
+ ASSERT_EQ(meta->GetTotalBlobBytes(),
+ new_first_expected_bytes + new_second_expected_bytes);
+ ASSERT_EQ(meta->GetGarbageBlobCount(), 0);
+ ASSERT_EQ(meta->GetGarbageBlobBytes(), 0);
+ }
+}
+
+TEST_F(DBBlobCompactionTest, MergeBlobWithBase) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+ ASSERT_OK(Put("Key1", "v1_1"));
+ ASSERT_OK(Put("Key2", "v2_1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Merge("Key1", "v1_2"));
+ ASSERT_OK(Merge("Key2", "v2_2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Merge("Key1", "v1_3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ ASSERT_EQ(Get("Key1"), "v1_1,v1_2,v1_3");
+ ASSERT_EQ(Get("Key2"), "v2_1,v2_2");
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadGarbageCollection) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.blob_compaction_readahead_size = 1 << 10;
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("key", "pie"));
+ ASSERT_OK(Put("foo", "baz"));
+ ASSERT_OK(Flush());
+
+ size_t num_non_prefetch_reads = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::GetBlob:ReadFromFile",
+ [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(Get("key"), "pie");
+ ASSERT_EQ(Get("foo"), "baz");
+ ASSERT_EQ(num_non_prefetch_reads, 0);
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadFilter) {
+ Options options = GetDefaultOptions();
+
+ std::unique_ptr<CompactionFilter> compaction_filter_guard(
+ new ValueMutationFilter("pie"));
+
+ options.compaction_filter = compaction_filter_guard.get();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.blob_compaction_readahead_size = 1 << 10;
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ size_t num_non_prefetch_reads = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::GetBlob:ReadFromFile",
+ [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(Get("key"), "limepie");
+ ASSERT_EQ(Get("foo"), "barpie");
+ ASSERT_EQ(num_non_prefetch_reads, 0);
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionReadaheadMerge) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.blob_compaction_readahead_size = 1 << 10;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Merge("key", "pie"));
+ ASSERT_OK(Merge("foo", "baz"));
+ ASSERT_OK(Flush());
+
+ size_t num_non_prefetch_reads = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileReader::GetBlob:ReadFromFile",
+ [&num_non_prefetch_reads](void* /* arg */) { ++num_non_prefetch_reads; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(Get("key"), "lime,pie");
+ ASSERT_EQ(Get("foo"), "bar,baz");
+ ASSERT_EQ(num_non_prefetch_reads, 0);
+
+ Close();
+}
+
+TEST_F(DBBlobCompactionTest, CompactionDoNotFillCache) {
+ Options options = GetDefaultOptions();
+
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+
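+  // Use a dedicated blob cache so the test can verify below that compaction
+  // does not insert anything into it (see the BLOB_DB_CACHE_ADD check).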
+ LRUCacheOptions cache_options;
+ cache_options.capacity = 1 << 20;
+ cache_options.metadata_charge_policy = kDontChargeCacheMetadata;
+
+ options.blob_cache = NewLRUCache(cache_options);
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "lime"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("key", "pie"));
+ ASSERT_OK(Put("foo", "baz"));
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ ASSERT_EQ(options.statistics->getTickerCount(BLOB_DB_CACHE_ADD), 0);
+
+ Close();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_corruption_test.cc b/src/rocksdb/db/blob/db_blob_corruption_test.cc
new file mode 100644
index 000000000..7ac7ce3fc
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_corruption_test.cc
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlobCorruptionTest : public DBTestBase {
+ protected:
+ DBBlobCorruptionTest()
+ : DBTestBase("db_blob_corruption_test", /* env_do_fsync */ false) {}
+
+ void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+ // Pick file to corrupt
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ std::string fname;
+ uint64_t picked_number = kInvalidBlobFileNumber;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) && type == filetype &&
+ number > picked_number) { // Pick latest file
+ fname = dbname_ + "/" + filenames[i];
+ picked_number = number;
+ }
+ }
+ ASSERT_TRUE(!fname.empty()) << filetype;
+ ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt));
+ }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) {
+ Options options = GetDefaultOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.create_if_missing = true;
+ options.file_checksum_gen_factory =
+ ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory();
+ Reopen(options);
+
+ ASSERT_OK(Put(Slice("key_1"), Slice("blob_value_1")));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Slice("key_2"), Slice("blob_value_2")));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+ Close();
+
+ Corrupt(kBlobFile, 0, 2);
+
+ ASSERT_OK(TryReopen(options));
+
+ int count{0};
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) {
+ const Status* s = static_cast<Status*>(arg);
+ ASSERT_NE(s, nullptr);
+ ++count;
+ ASSERT_NOK(*s);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption());
+ ASSERT_EQ(1, count);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/db_blob_index_test.cc b/src/rocksdb/db/blob/db_blob_index_test.cc
new file mode 100644
index 000000000..64c550894
--- /dev/null
+++ b/src/rocksdb/db/blob/db_blob_index_test.cc
@@ -0,0 +1,602 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// kTypeBlobIndex is a value type used only by BlobDB. The base rocksdb
+// should accept this value type on writes, and report the value as not
+// supported on reads unless the caller explicitly requests it. The base
+// rocksdb does not understand the format of the actual blob index (the
+// value).
+class DBBlobIndexTest : public DBTestBase {
+ public:
+ enum Tier {
+ kMemtable = 0,
+ kImmutableMemtables = 1,
+ kL0SstFile = 2,
+ kLnSstFile = 3,
+ };
+ const std::vector<Tier> kAllTiers = {Tier::kMemtable,
+ Tier::kImmutableMemtables,
+ Tier::kL0SstFile, Tier::kLnSstFile};
+
+ DBBlobIndexTest() : DBTestBase("db_blob_index_test", /*env_do_fsync=*/true) {}
+
+ ColumnFamilyHandle* cfh() { return dbfull()->DefaultColumnFamily(); }
+
+ ColumnFamilyData* cfd() {
+ return static_cast_with_check<ColumnFamilyHandleImpl>(cfh())->cfd();
+ }
+
+ Status PutBlobIndex(WriteBatch* batch, const Slice& key,
+ const Slice& blob_index) {
+ return WriteBatchInternal::PutBlobIndex(batch, cfd()->GetID(), key,
+ blob_index);
+ }
+
+ Status Write(WriteBatch* batch) {
+ return dbfull()->Write(WriteOptions(), batch);
+ }
+
+ std::string GetImpl(const Slice& key, bool* is_blob_index = nullptr,
+ const Snapshot* snapshot = nullptr) {
+ ReadOptions read_options;
+ read_options.snapshot = snapshot;
+ PinnableSlice value;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = cfh();
+ get_impl_options.value = &value;
+ get_impl_options.is_blob_index = is_blob_index;
+ auto s = dbfull()->GetImpl(read_options, key, get_impl_options);
+ if (s.IsNotFound()) {
+ return "NOT_FOUND";
+ }
+ if (s.IsCorruption()) {
+ return "CORRUPTION";
+ }
+ if (s.IsNotSupported()) {
+ return "NOT_SUPPORTED";
+ }
+ if (!s.ok()) {
+ return s.ToString();
+ }
+ return value.ToString();
+ }
+
+ std::string GetBlobIndex(const Slice& key,
+ const Snapshot* snapshot = nullptr) {
+ bool is_blob_index = false;
+ std::string value = GetImpl(key, &is_blob_index, snapshot);
+ if (!is_blob_index) {
+ return "NOT_BLOB";
+ }
+ return value;
+ }
+
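+  // Returns an iterator that exposes raw blob indexes (expose_blob_index)
+  // instead of resolving them.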
+ ArenaWrappedDBIter* GetBlobIterator() {
+ return dbfull()->NewIteratorImpl(
+ ReadOptions(), cfd(), dbfull()->GetLatestSequenceNumber(),
+ nullptr /*read_callback*/, true /*expose_blob_index*/);
+ }
+
+ Options GetTestOptions() {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.num_levels = 2;
+ options.disable_auto_compactions = true;
+ // Disable auto flushes.
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 10;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ return options;
+ }
+
+ void MoveDataTo(Tier tier) {
+ switch (tier) {
+ case Tier::kMemtable:
+ break;
+ case Tier::kImmutableMemtables:
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ break;
+ case Tier::kL0SstFile:
+ ASSERT_OK(Flush());
+ break;
+ case Tier::kLnSstFile:
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "dummy"));
+ ASSERT_OK(Put("z", "dummy"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ break;
+ }
+ }
+};
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. We should be able to write kTypeBlobIndex to memtables and
+// SST files.
+TEST_F(DBBlobIndexTest, Write) {
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+
+ std::vector<std::pair<std::string, std::string>> key_values;
+
+ constexpr size_t num_key_values = 5;
+
+ key_values.reserve(num_key_values);
+
+ for (size_t i = 1; i <= num_key_values; ++i) {
+ std::string key = "key" + std::to_string(i);
+
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210,
+ "blob" + std::to_string(i));
+
+ key_values.emplace_back(std::move(key), std::move(blob_index));
+ }
+
+ for (const auto& key_value : key_values) {
+ WriteBatch batch;
+ ASSERT_OK(PutBlobIndex(&batch, key_value.first, key_value.second));
+ ASSERT_OK(Write(&batch));
+ }
+
+ MoveDataTo(tier);
+
+ for (const auto& key_value : key_values) {
+ ASSERT_EQ(GetBlobIndex(key_value.first), key_value.second);
+ }
+ }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. Get should be able to return the blob index if
+// is_blob_index is provided; otherwise it should return Status::NotSupported
+// (when reading from the memtable) or Status::Corruption (when reading from
+// an SST file). Reading from an SST file returns Corruption because we cannot
+// differentiate between an application accidentally opening the base DB of a
+// stacked BlobDB and actual corruption when using the integrated BlobDB.
+TEST_F(DBBlobIndexTest, Get) {
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
+
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("key", "value"));
+ ASSERT_OK(PutBlobIndex(&batch, "blob_key", blob_index));
+ ASSERT_OK(Write(&batch));
+
+ MoveDataTo(tier);
+
+ // Verify normal value
+ bool is_blob_index = false;
+ PinnableSlice value;
+ ASSERT_EQ("value", Get("key"));
+ ASSERT_EQ("value", GetImpl("key"));
+ ASSERT_EQ("value", GetImpl("key", &is_blob_index));
+ ASSERT_FALSE(is_blob_index);
+
+ // Verify blob index
+ if (tier <= kImmutableMemtables) {
+ ASSERT_TRUE(Get("blob_key", &value).IsNotSupported());
+ ASSERT_EQ("NOT_SUPPORTED", GetImpl("blob_key"));
+ } else {
+ ASSERT_TRUE(Get("blob_key", &value).IsCorruption());
+ ASSERT_EQ("CORRUPTION", GetImpl("blob_key"));
+ }
+ ASSERT_EQ(blob_index, GetImpl("blob_key", &is_blob_index));
+ ASSERT_TRUE(is_blob_index);
+ }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. Get should NOT return Status::NotSupported or
+// Status::Corruption once the blob index has been overwritten with a normal
+// value. See the test case above for more details.
+TEST_F(DBBlobIndexTest, Updated) {
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 9876543210, "blob");
+
+ for (auto tier : kAllTiers) {
+ DestroyAndReopen(GetTestOptions());
+ WriteBatch batch;
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(PutBlobIndex(&batch, "key" + std::to_string(i), blob_index));
+ }
+ ASSERT_OK(Write(&batch));
+    // Prevent the blob values from being purged.
+ const Snapshot* snapshot = dbfull()->GetSnapshot();
+ ASSERT_OK(Put("key1", "new_value"));
+ ASSERT_OK(Merge("key2", "a"));
+ ASSERT_OK(Merge("key2", "b"));
+ ASSERT_OK(Merge("key2", "c"));
+ ASSERT_OK(Delete("key3"));
+ ASSERT_OK(SingleDelete("key4"));
+ ASSERT_OK(Delete("key5"));
+ ASSERT_OK(Merge("key5", "a"));
+ ASSERT_OK(Merge("key5", "b"));
+ ASSERT_OK(Merge("key5", "c"));
+ ASSERT_OK(dbfull()->DeleteRange(WriteOptions(), cfh(), "key6", "key9"));
+ MoveDataTo(tier);
+ for (int i = 0; i < 10; i++) {
+ ASSERT_EQ(blob_index, GetBlobIndex("key" + std::to_string(i), snapshot));
+ }
+ ASSERT_EQ("new_value", Get("key1"));
+ if (tier <= kImmutableMemtables) {
+ ASSERT_EQ("NOT_SUPPORTED", GetImpl("key2"));
+ } else {
+ ASSERT_EQ("CORRUPTION", GetImpl("key2"));
+ }
+ ASSERT_EQ("NOT_FOUND", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+ ASSERT_EQ("a,b,c", GetImpl("key5"));
+ for (int i = 6; i < 9; i++) {
+ ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i)));
+ }
+ ASSERT_EQ(blob_index, GetBlobIndex("key9"));
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+}
+
+// Note: the following test case pertains to the StackableDB-based BlobDB
+// implementation. When a blob iterator is used, it should set the
+// expose_blob_index flag for the underlying DBIter, and retrieve/return the
+// corresponding blob value. If a regular DBIter is created (i.e.
+// expose_blob_index is not set), it should return Status::Corruption.
+TEST_F(DBBlobIndexTest, Iterate) {
+ const std::vector<std::vector<ValueType>> data = {
+ /*00*/ {kTypeValue},
+ /*01*/ {kTypeBlobIndex},
+ /*02*/ {kTypeValue},
+ /*03*/ {kTypeBlobIndex, kTypeValue},
+ /*04*/ {kTypeValue},
+ /*05*/ {kTypeValue, kTypeBlobIndex},
+ /*06*/ {kTypeValue},
+ /*07*/ {kTypeDeletion, kTypeBlobIndex},
+ /*08*/ {kTypeValue},
+ /*09*/ {kTypeSingleDeletion, kTypeBlobIndex},
+ /*10*/ {kTypeValue},
+ /*11*/ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeBlobIndex},
+ /*12*/ {kTypeValue},
+ /*13*/
+ {kTypeMerge, kTypeMerge, kTypeMerge, kTypeDeletion, kTypeBlobIndex},
+ /*14*/ {kTypeValue},
+ /*15*/ {kTypeBlobIndex},
+ /*16*/ {kTypeValue},
+ };
+
+ auto get_key = [](int index) {
+ char buf[20];
+ snprintf(buf, sizeof(buf), "%02d", index);
+ return "key" + std::string(buf);
+ };
+
+ auto get_value = [&](int index, int version) {
+ return get_key(index) + "_value" + std::to_string(version);
+ };
+
+ auto check_iterator = [&](Iterator* iterator, Status::Code expected_status,
+ const Slice& expected_value) {
+ ASSERT_EQ(expected_status, iterator->status().code());
+ if (expected_status == Status::kOk) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ(expected_value, iterator->value());
+ } else {
+ ASSERT_FALSE(iterator->Valid());
+ }
+ };
+
+ auto create_normal_iterator = [&]() -> Iterator* {
+ return dbfull()->NewIterator(ReadOptions());
+ };
+
+ auto create_blob_iterator = [&]() -> Iterator* { return GetBlobIterator(); };
+
+ auto check_is_blob = [&](bool is_blob) {
+ return [is_blob](Iterator* iterator) {
+ ASSERT_EQ(is_blob,
+ reinterpret_cast<ArenaWrappedDBIter*>(iterator)->IsBlob());
+ };
+ };
+
+ auto verify = [&](int index, Status::Code expected_status,
+ const Slice& forward_value, const Slice& backward_value,
+ std::function<Iterator*()> create_iterator,
+ std::function<void(Iterator*)> extra_check = nullptr) {
+ // Seek
+ auto* iterator = create_iterator();
+ ASSERT_OK(iterator->status());
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index));
+ check_iterator(iterator, expected_status, forward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // Next
+ iterator = create_iterator();
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index - 1));
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_OK(iterator->status());
+ iterator->Next();
+ check_iterator(iterator, expected_status, forward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // SeekForPrev
+ iterator = create_iterator();
+ ASSERT_OK(iterator->status());
+ ASSERT_OK(iterator->Refresh());
+ iterator->SeekForPrev(get_key(index));
+ check_iterator(iterator, expected_status, backward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+
+ // Prev
+ iterator = create_iterator();
+ iterator->Seek(get_key(index + 1));
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_OK(iterator->status());
+ iterator->Prev();
+ check_iterator(iterator, expected_status, backward_value);
+ if (extra_check) {
+ extra_check(iterator);
+ }
+ delete iterator;
+ };
+
+ for (auto tier : {Tier::kMemtable} /*kAllTiers*/) {
+    // Prevent the values from being purged.
+ std::vector<const Snapshot*> snapshots;
+ DestroyAndReopen(GetTestOptions());
+
+ // fill data
+ for (int i = 0; i < static_cast<int>(data.size()); i++) {
+ for (int j = static_cast<int>(data[i].size()) - 1; j >= 0; j--) {
+ std::string key = get_key(i);
+ std::string value = get_value(i, j);
+ WriteBatch batch;
+ switch (data[i][j]) {
+ case kTypeValue:
+ ASSERT_OK(Put(key, value));
+ break;
+ case kTypeDeletion:
+ ASSERT_OK(Delete(key));
+ break;
+ case kTypeSingleDeletion:
+ ASSERT_OK(SingleDelete(key));
+ break;
+ case kTypeMerge:
+ ASSERT_OK(Merge(key, value));
+ break;
+ case kTypeBlobIndex:
+ ASSERT_OK(PutBlobIndex(&batch, key, value));
+ ASSERT_OK(Write(&batch));
+ break;
+ default:
+ FAIL();
+      }
+ }
+ snapshots.push_back(dbfull()->GetSnapshot());
+ }
+ ASSERT_OK(
+ dbfull()->DeleteRange(WriteOptions(), cfh(), get_key(15), get_key(16)));
+ snapshots.push_back(dbfull()->GetSnapshot());
+ MoveDataTo(tier);
+
+ // Normal iterator
+ verify(1, Status::kCorruption, "", "", create_normal_iterator);
+ verify(3, Status::kCorruption, "", "", create_normal_iterator);
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_normal_iterator);
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_normal_iterator);
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_normal_iterator);
+ verify(11, Status::kCorruption, "", "", create_normal_iterator);
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_normal_iterator);
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_normal_iterator);
+
+ // Iterator with blob support
+ verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_blob_iterator, check_is_blob(false));
+ if (tier <= kImmutableMemtables) {
+ verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+ } else {
+ verify(11, Status::kCorruption, "", "", create_blob_iterator);
+ }
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_blob_iterator, check_is_blob(false));
+
+#ifndef ROCKSDB_LITE
+ // Iterator with blob support and using seek.
+ ASSERT_OK(dbfull()->SetOptions(
+ cfh(), {{"max_sequential_skip_in_iterations", "0"}}));
+ verify(1, Status::kOk, get_value(1, 0), get_value(1, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(3, Status::kOk, get_value(3, 0), get_value(3, 0),
+ create_blob_iterator, check_is_blob(true));
+ verify(5, Status::kOk, get_value(5, 0), get_value(5, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(7, Status::kOk, get_value(8, 0), get_value(6, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(9, Status::kOk, get_value(10, 0), get_value(8, 0),
+ create_blob_iterator, check_is_blob(false));
+ if (tier <= kImmutableMemtables) {
+ verify(11, Status::kNotSupported, "", "", create_blob_iterator);
+ } else {
+ verify(11, Status::kCorruption, "", "", create_blob_iterator);
+ }
+ verify(13, Status::kOk,
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ get_value(13, 2) + "," + get_value(13, 1) + "," + get_value(13, 0),
+ create_blob_iterator, check_is_blob(false));
+ verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
+ create_blob_iterator, check_is_blob(false));
+#endif // !ROCKSDB_LITE
+
+ for (auto* snapshot : snapshots) {
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+ }
+}
+
+TEST_F(DBBlobIndexTest, IntegratedBlobIterate) {
+ const std::vector<std::vector<std::string>> data = {
+ /*00*/ {"Put"},
+ /*01*/ {"Put", "Merge", "Merge", "Merge"},
+ /*02*/ {"Put"}};
+
+ auto get_key = [](size_t index) { return ("key" + std::to_string(index)); };
+
+ auto get_value = [&](size_t index, size_t version) {
+ return get_key(index) + "_value" + std::to_string(version);
+ };
+
+ auto check_iterator = [&](Iterator* iterator, Status expected_status,
+ const Slice& expected_value) {
+ ASSERT_EQ(expected_status, iterator->status());
+ if (expected_status.ok()) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ(expected_value, iterator->value());
+ } else {
+ ASSERT_FALSE(iterator->Valid());
+ }
+ };
+
+ auto verify = [&](size_t index, Status expected_status,
+ const Slice& expected_value) {
+ // Seek
+ {
+ Iterator* iterator = db_->NewIterator(ReadOptions());
+ std::unique_ptr<Iterator> iterator_guard(iterator);
+ ASSERT_OK(iterator->status());
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index));
+ check_iterator(iterator, expected_status, expected_value);
+ }
+ // Next
+ {
+ Iterator* iterator = db_->NewIterator(ReadOptions());
+ std::unique_ptr<Iterator> iterator_guard(iterator);
+ ASSERT_OK(iterator->Refresh());
+ iterator->Seek(get_key(index - 1));
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_OK(iterator->status());
+ iterator->Next();
+ check_iterator(iterator, expected_status, expected_value);
+ }
+ // SeekForPrev
+ {
+ Iterator* iterator = db_->NewIterator(ReadOptions());
+ std::unique_ptr<Iterator> iterator_guard(iterator);
+ ASSERT_OK(iterator->status());
+ ASSERT_OK(iterator->Refresh());
+ iterator->SeekForPrev(get_key(index));
+ check_iterator(iterator, expected_status, expected_value);
+ }
+ // Prev
+ {
+ Iterator* iterator = db_->NewIterator(ReadOptions());
+ std::unique_ptr<Iterator> iterator_guard(iterator);
+ iterator->Seek(get_key(index + 1));
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_OK(iterator->status());
+ iterator->Prev();
+ check_iterator(iterator, expected_status, expected_value);
+ }
+ };
+
+ Options options = GetTestOptions();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ DestroyAndReopen(options);
+
+ // fill data
+ for (size_t i = 0; i < data.size(); i++) {
+ for (size_t j = 0; j < data[i].size(); j++) {
+ std::string key = get_key(i);
+ std::string value = get_value(i, j);
+ if (data[i][j] == "Put") {
+ ASSERT_OK(Put(key, value));
+ ASSERT_OK(Flush());
+ } else if (data[i][j] == "Merge") {
+ ASSERT_OK(Merge(key, value));
+ ASSERT_OK(Flush());
+ }
+ }
+ }
+
+ std::string expected_value = get_value(1, 0) + "," + get_value(1, 1) + "," +
+ get_value(1, 2) + "," + get_value(1, 3);
+ Status expected_status;
+ verify(1, expected_status, expected_value);
+
+#ifndef ROCKSDB_LITE
+ // Test DBIter::FindValueForCurrentKeyUsingSeek flow.
+ ASSERT_OK(dbfull()->SetOptions(cfh(),
+ {{"max_sequential_skip_in_iterations", "0"}}));
+ verify(1, expected_status, expected_value);
+#endif // !ROCKSDB_LITE
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.cc b/src/rocksdb/db/blob/prefetch_buffer_collection.cc
new file mode 100644
index 000000000..079576f51
--- /dev/null
+++ b/src/rocksdb/db/blob/prefetch_buffer_collection.cc
@@ -0,0 +1,21 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/prefetch_buffer_collection.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FilePrefetchBuffer* PrefetchBufferCollection::GetOrCreatePrefetchBuffer(
+ uint64_t file_number) {
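+  // unordered_map::operator[] default-constructs a null entry on first
+  // access, so the buffer for a given file is created lazily exactly once.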
+ auto& prefetch_buffer = prefetch_buffers_[file_number];
+ if (!prefetch_buffer) {
+ prefetch_buffer.reset(
+ new FilePrefetchBuffer(readahead_size_, readahead_size_));
+ }
+
+ return prefetch_buffer.get();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/blob/prefetch_buffer_collection.h b/src/rocksdb/db/blob/prefetch_buffer_collection.h
new file mode 100644
index 000000000..b973eddc0
--- /dev/null
+++ b/src/rocksdb/db/blob/prefetch_buffer_collection.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+
+#include "file/file_prefetch_buffer.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A class that owns a collection of FilePrefetchBuffers using the file number
+// as key. Used for implementing compaction readahead for blob files. Designed
+// to be accessed by a single thread only: every (sub)compaction needs its own
+// buffers since they are guaranteed to read different blobs from different
+// positions even when reading the same file.
+class PrefetchBufferCollection {
+ public:
+ explicit PrefetchBufferCollection(uint64_t readahead_size)
+ : readahead_size_(readahead_size) {
+ assert(readahead_size_ > 0);
+ }
+
+ FilePrefetchBuffer* GetOrCreatePrefetchBuffer(uint64_t file_number);
+
+ private:
+ uint64_t readahead_size_;
+ std::unordered_map<uint64_t, std::unique_ptr<FilePrefetchBuffer>>
+ prefetch_buffers_; // maps file number to prefetch buffer
+};
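+
+// A minimal usage sketch, assuming one collection per (sub)compaction; the
+// helper name below is hypothetical and exists only to illustrate the call
+// pattern.
+inline FilePrefetchBuffer* ExampleGetBlobPrefetchBuffer(
+    PrefetchBufferCollection* buffers, uint64_t blob_file_number) {
+  assert(buffers);
+  // The first call for a given blob file number creates the buffer; later
+  // calls return the same instance.
+  return buffers->GetOrCreatePrefetchBuffer(blob_file_number);
+}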
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/builder.cc b/src/rocksdb/db/builder.cc
new file mode 100644
index 000000000..9283ffd64
--- /dev/null
+++ b/src/rocksdb/db/builder.cc
@@ -0,0 +1,434 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/builder.h"
+
+#include <algorithm>
+#include <deque>
+#include <vector>
+
+#include "db/blob/blob_file_builder.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/event_helpers.h"
+#include "db/internal_stats.h"
+#include "db/merge_helper.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TableFactory;
+
+TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
+ WritableFileWriter* file) {
+ assert((tboptions.column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+ tboptions.column_family_name.empty());
+ return tboptions.ioptions.table_factory->NewTableBuilder(tboptions, file);
+}
+
+Status BuildTable(
+ const std::string& dbname, VersionSet* versions,
+ const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
+ const FileOptions& file_options, TableCache* table_cache,
+ InternalIterator* iter,
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters,
+ FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
+ std::vector<SequenceNumber> snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker,
+ bool paranoid_file_checks, InternalStats* internal_stats,
+ IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCreationReason blob_creation_reason,
+ const SeqnoToTimeMapping& seqno_to_time_mapping, EventLogger* event_logger,
+ int job_id, const Env::IOPriority io_priority,
+ TableProperties* table_properties, Env::WriteLifeTimeHint write_hint,
+ const std::string* full_history_ts_low,
+ BlobFileCompletionCallback* blob_callback, uint64_t* num_input_entries,
+ uint64_t* memtable_payload_bytes, uint64_t* memtable_garbage_bytes) {
+ assert((tboptions.column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) ==
+ tboptions.column_family_name.empty());
+ auto& mutable_cf_options = tboptions.moptions;
+ auto& ioptions = tboptions.ioptions;
+  // Granularity (in bytes) at which flush IO stats are reported.
+ const size_t kReportFlushIOStatsEvery = 1048576;
+ OutputValidator output_validator(
+ tboptions.internal_comparator,
+ /*enable_order_check=*/
+ mutable_cf_options.check_flush_compaction_key_order,
+ /*enable_hash=*/paranoid_file_checks);
+ Status s;
+ meta->fd.file_size = 0;
+ iter->SeekToFirst();
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
+ new CompactionRangeDelAggregator(&tboptions.internal_comparator,
+ snapshots, full_history_ts_low));
+ uint64_t num_unfragmented_tombstones = 0;
+ uint64_t total_tombstone_payload_bytes = 0;
+ for (auto& range_del_iter : range_del_iters) {
+ num_unfragmented_tombstones +=
+ range_del_iter->num_unfragmented_tombstones();
+ total_tombstone_payload_bytes +=
+ range_del_iter->total_tombstone_payload_bytes();
+ range_del_agg->AddTombstones(std::move(range_del_iter));
+ }
+
+ std::string fname = TableFileName(ioptions.cf_paths, meta->fd.GetNumber(),
+ meta->fd.GetPathId());
+ std::vector<std::string> blob_file_paths;
+ std::string file_checksum = kUnknownFileChecksum;
+ std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+#ifndef ROCKSDB_LITE
+ EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname,
+ tboptions.column_family_name,
+ fname, job_id, tboptions.reason);
+#endif // !ROCKSDB_LITE
+ Env* env = db_options.env;
+ assert(env);
+ FileSystem* fs = db_options.fs.get();
+ assert(fs);
+
+ TableProperties tp;
+ bool table_file_created = false;
+ if (iter->Valid() || !range_del_agg->IsEmpty()) {
+ std::unique_ptr<CompactionFilter> compaction_filter;
+ if (ioptions.compaction_filter_factory != nullptr &&
+ ioptions.compaction_filter_factory->ShouldFilterTableFileCreation(
+ tboptions.reason)) {
+ CompactionFilter::Context context;
+ context.is_full_compaction = false;
+ context.is_manual_compaction = false;
+ context.column_family_id = tboptions.column_family_id;
+ context.reason = tboptions.reason;
+ compaction_filter =
+ ioptions.compaction_filter_factory->CreateCompactionFilter(context);
+ if (compaction_filter != nullptr &&
+ !compaction_filter->IgnoreSnapshots()) {
+ s.PermitUncheckedError();
+ return Status::NotSupported(
+ "CompactionFilter::IgnoreSnapshots() = false is not supported "
+ "anymore.");
+ }
+ }
+
+ TableBuilder* builder;
+ std::unique_ptr<WritableFileWriter> file_writer;
+ {
+ std::unique_ptr<FSWritableFile> file;
+#ifndef NDEBUG
+ bool use_direct_writes = file_options.use_direct_writes;
+ TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes);
+#endif // !NDEBUG
+ IOStatus io_s = NewWritableFile(fs, fname, &file, file_options);
+ assert(s.ok());
+ s = io_s;
+ if (io_status->ok()) {
+ *io_status = io_s;
+ }
+ if (!s.ok()) {
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger, ioptions.listeners, dbname,
+ tboptions.column_family_name, fname, job_id, meta->fd,
+ kInvalidBlobFileNumber, tp, tboptions.reason, s, file_checksum,
+ file_checksum_func_name);
+ return s;
+ }
+
+ table_file_created = true;
+ FileTypeSet tmp_set = ioptions.checksum_handoff_file_types;
+ file->SetIOPriority(io_priority);
+ file->SetWriteLifeTimeHint(write_hint);
+ file_writer.reset(new WritableFileWriter(
+ std::move(file), fname, file_options, ioptions.clock, io_tracer,
+ ioptions.stats, ioptions.listeners,
+ ioptions.file_checksum_gen_factory.get(),
+ tmp_set.Contains(FileType::kTableFile), false));
+
+ builder = NewTableBuilder(tboptions, file_writer.get());
+ }
+
+ MergeHelper merge(
+ env, tboptions.internal_comparator.user_comparator(),
+ ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger,
+ true /* internal key corruption is not ok */,
+ snapshots.empty() ? 0 : snapshots.back(), snapshot_checker);
+
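+    // Write values to blob files only if blob files are enabled, the output
+    // level is at or above blob_file_starting_level, and the caller supplied
+    // a vector to record the new blob file additions.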
+ std::unique_ptr<BlobFileBuilder> blob_file_builder(
+ (mutable_cf_options.enable_blob_files &&
+ tboptions.level_at_creation >=
+ mutable_cf_options.blob_file_starting_level &&
+ blob_file_additions)
+ ? new BlobFileBuilder(
+ versions, fs, &ioptions, &mutable_cf_options, &file_options,
+ tboptions.db_id, tboptions.db_session_id, job_id,
+ tboptions.column_family_id, tboptions.column_family_name,
+ io_priority, write_hint, io_tracer, blob_callback,
+ blob_creation_reason, &blob_file_paths, blob_file_additions)
+ : nullptr);
+
+ const std::atomic<bool> kManualCompactionCanceledFalse{false};
+ CompactionIterator c_iter(
+ iter, tboptions.internal_comparator.user_comparator(), &merge,
+ kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot,
+ job_snapshot, snapshot_checker, env,
+ ShouldReportDetailedTime(env, ioptions.stats),
+ true /* internal key corruption is not ok */, range_del_agg.get(),
+ blob_file_builder.get(), ioptions.allow_data_in_errors,
+ ioptions.enforce_single_del_contracts,
+ /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
+ /*compaction=*/nullptr, compaction_filter.get(),
+ /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low);
+
+ c_iter.SeekToFirst();
+ for (; c_iter.Valid(); c_iter.Next()) {
+ const Slice& key = c_iter.key();
+ const Slice& value = c_iter.value();
+ const ParsedInternalKey& ikey = c_iter.ikey();
+      // Generate a rolling 64-bit hash of the key and value.
+      // Note: here "key" is the internal key, which combines the user key,
+      // the sequence number and the value type.
+ s = output_validator.Add(key, value);
+ if (!s.ok()) {
+ break;
+ }
+ builder->Add(key, value);
+
+ s = meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
+ if (!s.ok()) {
+ break;
+ }
+
+ // TODO(noetzli): Update stats after flush, too.
+ if (io_priority == Env::IO_HIGH &&
+ IOSTATS(bytes_written) >= kReportFlushIOStatsEvery) {
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+ }
+ }
+ if (!s.ok()) {
+ c_iter.status().PermitUncheckedError();
+ } else if (!c_iter.status().ok()) {
+ s = c_iter.status();
+ }
+
+ if (s.ok()) {
+ auto range_del_it = range_del_agg->NewIterator();
+ for (range_del_it->SeekToFirst(); range_del_it->Valid();
+ range_del_it->Next()) {
+ auto tombstone = range_del_it->Tombstone();
+ auto kv = tombstone.Serialize();
+ builder->Add(kv.first.Encode(), kv.second);
+ meta->UpdateBoundariesForRange(kv.first, tombstone.SerializeEndKey(),
+ tombstone.seq_,
+ tboptions.internal_comparator);
+ }
+ }
+
+ TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
+ const bool empty = builder->IsEmpty();
+ if (num_input_entries != nullptr) {
+ *num_input_entries =
+ c_iter.num_input_entry_scanned() + num_unfragmented_tombstones;
+ }
+ if (!s.ok() || empty) {
+ builder->Abandon();
+ } else {
+ std::string seqno_time_mapping_str;
+ seqno_to_time_mapping.Encode(
+ seqno_time_mapping_str, meta->fd.smallest_seqno,
+ meta->fd.largest_seqno, meta->file_creation_time);
+ builder->SetSeqnoTimeTableProperties(
+ seqno_time_mapping_str,
+ ioptions.compaction_style == CompactionStyle::kCompactionStyleFIFO
+ ? meta->file_creation_time
+ : meta->oldest_ancester_time);
+ s = builder->Finish();
+ }
+ if (io_status->ok()) {
+ *io_status = builder->io_status();
+ }
+
+ if (s.ok() && !empty) {
+ uint64_t file_size = builder->FileSize();
+ meta->fd.file_size = file_size;
+ meta->marked_for_compaction = builder->NeedCompact();
+ assert(meta->fd.GetFileSize() > 0);
+ tp = builder
+ ->GetTableProperties(); // refresh now that builder is finished
+ if (memtable_payload_bytes != nullptr &&
+ memtable_garbage_bytes != nullptr) {
+ const CompactionIterationStats& ci_stats = c_iter.iter_stats();
+ uint64_t total_payload_bytes = ci_stats.total_input_raw_key_bytes +
+ ci_stats.total_input_raw_value_bytes +
+ total_tombstone_payload_bytes;
+ uint64_t total_payload_bytes_written =
+ (tp.raw_key_size + tp.raw_value_size);
+ // Prevent underflow, which may still happen at this point
+ // since we only support inserts, deletes, and deleteRanges.
+ if (total_payload_bytes_written <= total_payload_bytes) {
+ *memtable_payload_bytes = total_payload_bytes;
+ *memtable_garbage_bytes =
+ total_payload_bytes - total_payload_bytes_written;
+ } else {
+ *memtable_payload_bytes = 0;
+ *memtable_garbage_bytes = 0;
+ }
+ }
+ if (table_properties) {
+ *table_properties = tp;
+ }
+ }
+ delete builder;
+
+ // Finish and check for file errors
+ TEST_SYNC_POINT("BuildTable:BeforeSyncTable");
+ if (s.ok() && !empty) {
+ StopWatch sw(ioptions.clock, ioptions.stats, TABLE_SYNC_MICROS);
+ *io_status = file_writer->Sync(ioptions.use_fsync);
+ }
+ TEST_SYNC_POINT("BuildTable:BeforeCloseTableFile");
+ if (s.ok() && io_status->ok() && !empty) {
+ *io_status = file_writer->Close();
+ }
+ if (s.ok() && io_status->ok() && !empty) {
+ // Add the checksum information to file metadata.
+ meta->file_checksum = file_writer->GetFileChecksum();
+ meta->file_checksum_func_name = file_writer->GetFileChecksumFuncName();
+ file_checksum = meta->file_checksum;
+ file_checksum_func_name = meta->file_checksum_func_name;
+ // Set unique_id only if db_id and db_session_id exist
+ if (!tboptions.db_id.empty() && !tboptions.db_session_id.empty()) {
+ if (!GetSstInternalUniqueId(tboptions.db_id, tboptions.db_session_id,
+ meta->fd.GetNumber(), &(meta->unique_id))
+ .ok()) {
+        // If we failed to get a unique id, just set it to null.
+ meta->unique_id = kNullUniqueId64x2;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ s = *io_status;
+ }
+
+ if (blob_file_builder) {
+ if (s.ok()) {
+ s = blob_file_builder->Finish();
+ } else {
+ blob_file_builder->Abandon(s);
+ }
+ blob_file_builder.reset();
+ }
+
+  // TODO: also check the IO status when creating the Iterator.
+
+ TEST_SYNC_POINT("BuildTable:BeforeOutputValidation");
+ if (s.ok() && !empty) {
+    // Verify that the table is usable.
+    // We set for_compaction to false and do not use
+    // OptimizeForCompactionTableRead here because this is a special case run
+    // right after the table is built. Regardless of whether
+    // use_direct_io_for_flush_and_compaction is set, the goal is to cache the
+    // table here for subsequent user reads.
+ ReadOptions read_options;
+ std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
+ read_options, file_options, tboptions.internal_comparator, *meta,
+ nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor,
+ nullptr,
+ (internal_stats == nullptr) ? nullptr
+ : internal_stats->GetFileReadHist(0),
+ TableReaderCaller::kFlush, /*arena=*/nullptr,
+ /*skip_filter=*/false, tboptions.level_at_creation,
+ MaxFileSizeForL0MetaPin(mutable_cf_options),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key*/ nullptr,
+ /*allow_unprepared_value*/ false));
+ s = it->status();
+ if (s.ok() && paranoid_file_checks) {
+ OutputValidator file_validator(tboptions.internal_comparator,
+ /*enable_order_check=*/true,
+ /*enable_hash=*/true);
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+        // Generate a rolling 64-bit hash of the key and value.
+ file_validator.Add(it->key(), it->value()).PermitUncheckedError();
+ }
+ s = it->status();
+ if (s.ok() && !output_validator.CompareValidator(file_validator)) {
+ s = Status::Corruption("Paranoid checksums do not match");
+ }
+ }
+ }
+ }
+
+ // Check for input iterator errors
+ if (!iter->status().ok()) {
+ s = iter->status();
+ }
+
+ if (!s.ok() || meta->fd.GetFileSize() == 0) {
+ TEST_SYNC_POINT("BuildTable:BeforeDeleteFile");
+
+ constexpr IODebugContext* dbg = nullptr;
+
+ if (table_file_created) {
+ Status ignored = fs->DeleteFile(fname, IOOptions(), dbg);
+ ignored.PermitUncheckedError();
+ }
+
+ assert(blob_file_additions || blob_file_paths.empty());
+
+ if (blob_file_additions) {
+ for (const std::string& blob_file_path : blob_file_paths) {
+ Status ignored = DeleteDBFile(&db_options, blob_file_path, dbname,
+ /*force_bg=*/false, /*force_fg=*/false);
+ ignored.PermitUncheckedError();
+ TEST_SYNC_POINT("BuildTable::AfterDeleteFile");
+ }
+ }
+ }
+
+ Status status_for_listener = s;
+ if (meta->fd.GetFileSize() == 0) {
+ fname = "(nil)";
+ if (s.ok()) {
+ status_for_listener = Status::Aborted("Empty SST file not kept");
+ }
+ }
+ // Output to event logger and fire events.
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger, ioptions.listeners, dbname, tboptions.column_family_name,
+ fname, job_id, meta->fd, meta->oldest_blob_file_number, tp,
+ tboptions.reason, status_for_listener, file_checksum,
+ file_checksum_func_name);
+
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/builder.h b/src/rocksdb/db/builder.h
new file mode 100644
index 000000000..a028fd2ba
--- /dev/null
+++ b/src/rocksdb/db/builder.h
@@ -0,0 +1,77 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/range_tombstone_fragmenter.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/table_properties_collector.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/types.h"
+#include "table/scoped_arena_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FileMetaData;
+
+class VersionSet;
+class BlobFileAddition;
+class SnapshotChecker;
+class TableCache;
+class TableBuilder;
+class WritableFileWriter;
+class InternalStats;
+class BlobFileCompletionCallback;
+
+// Convenience function for NewTableBuilder on the embedded table_factory.
+TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
+ WritableFileWriter* file);
+
+// Build a Table file from the contents of *iter. The generated file
+// will be named according to the number specified in meta. On success, the
+// rest of *meta will be filled with metadata about the generated table.
+// If no data is present in *iter, meta->file_size will be set to
+// zero, and no Table file will be produced.
+//
+// @param column_family_name Name of the column family that is also identified
+// by column_family_id, or empty string if unknown.
+extern Status BuildTable(
+ const std::string& dbname, VersionSet* versions,
+ const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
+ const FileOptions& file_options, TableCache* table_cache,
+ InternalIterator* iter,
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters,
+ FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
+ std::vector<SequenceNumber> snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, SnapshotChecker* snapshot_checker,
+ bool paranoid_file_checks, InternalStats* internal_stats,
+ IOStatus* io_status, const std::shared_ptr<IOTracer>& io_tracer,
+ BlobFileCreationReason blob_creation_reason,
+ const SeqnoToTimeMapping& seqno_to_time_mapping,
+ EventLogger* event_logger = nullptr, int job_id = 0,
+ const Env::IOPriority io_priority = Env::IO_HIGH,
+ TableProperties* table_properties = nullptr,
+ Env::WriteLifeTimeHint write_hint = Env::WLTH_NOT_SET,
+ const std::string* full_history_ts_low = nullptr,
+ BlobFileCompletionCallback* blob_callback = nullptr,
+ uint64_t* num_input_entries = nullptr,
+ uint64_t* memtable_payload_bytes = nullptr,
+ uint64_t* memtable_garbage_bytes = nullptr);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/c.cc b/src/rocksdb/db/c.cc
new file mode 100644
index 000000000..a7e4360c6
--- /dev/null
+++ b/src/rocksdb/db/c.cc
@@ -0,0 +1,6390 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/c.h"
+
+#include <cstdlib>
+#include <map>
+#include <unordered_set>
+#include <vector>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/universal_compaction.h"
+#include "rocksdb/utilities/backup_engine.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/utilities/memory_util.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/options_util.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "rocksdb/utilities/transaction.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_batch.h"
+#include "utilities/merge_operators.h"
+
+using ROCKSDB_NAMESPACE::BackupEngine;
+using ROCKSDB_NAMESPACE::BackupEngineOptions;
+using ROCKSDB_NAMESPACE::BackupID;
+using ROCKSDB_NAMESPACE::BackupInfo;
+using ROCKSDB_NAMESPACE::BatchResult;
+using ROCKSDB_NAMESPACE::BlockBasedTableOptions;
+using ROCKSDB_NAMESPACE::BottommostLevelCompaction;
+using ROCKSDB_NAMESPACE::BytewiseComparator;
+using ROCKSDB_NAMESPACE::Cache;
+using ROCKSDB_NAMESPACE::Checkpoint;
+using ROCKSDB_NAMESPACE::ColumnFamilyDescriptor;
+using ROCKSDB_NAMESPACE::ColumnFamilyHandle;
+using ROCKSDB_NAMESPACE::ColumnFamilyMetaData;
+using ROCKSDB_NAMESPACE::ColumnFamilyOptions;
+using ROCKSDB_NAMESPACE::CompactionFilter;
+using ROCKSDB_NAMESPACE::CompactionFilterFactory;
+using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
+using ROCKSDB_NAMESPACE::CompactRangeOptions;
+using ROCKSDB_NAMESPACE::Comparator;
+using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::CuckooTableOptions;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DBOptions;
+using ROCKSDB_NAMESPACE::DbPath;
+using ROCKSDB_NAMESPACE::Env;
+using ROCKSDB_NAMESPACE::EnvOptions;
+using ROCKSDB_NAMESPACE::FileLock;
+using ROCKSDB_NAMESPACE::FilterPolicy;
+using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::InfoLogLevel;
+using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
+using ROCKSDB_NAMESPACE::Iterator;
+using ROCKSDB_NAMESPACE::LevelMetaData;
+using ROCKSDB_NAMESPACE::LiveFileMetaData;
+using ROCKSDB_NAMESPACE::Logger;
+using ROCKSDB_NAMESPACE::LRUCacheOptions;
+using ROCKSDB_NAMESPACE::MemoryAllocator;
+using ROCKSDB_NAMESPACE::MemoryUtil;
+using ROCKSDB_NAMESPACE::MergeOperator;
+using ROCKSDB_NAMESPACE::NewBloomFilterPolicy;
+using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory;
+using ROCKSDB_NAMESPACE::NewGenericRateLimiter;
+using ROCKSDB_NAMESPACE::NewLRUCache;
+using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy;
+using ROCKSDB_NAMESPACE::OptimisticTransactionDB;
+using ROCKSDB_NAMESPACE::OptimisticTransactionOptions;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::PerfContext;
+using ROCKSDB_NAMESPACE::PerfLevel;
+using ROCKSDB_NAMESPACE::PinnableSlice;
+using ROCKSDB_NAMESPACE::PrepopulateBlobCache;
+using ROCKSDB_NAMESPACE::RandomAccessFile;
+using ROCKSDB_NAMESPACE::Range;
+using ROCKSDB_NAMESPACE::RateLimiter;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::RestoreOptions;
+using ROCKSDB_NAMESPACE::SequentialFile;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::SliceParts;
+using ROCKSDB_NAMESPACE::SliceTransform;
+using ROCKSDB_NAMESPACE::Snapshot;
+using ROCKSDB_NAMESPACE::SstFileMetaData;
+using ROCKSDB_NAMESPACE::SstFileWriter;
+using ROCKSDB_NAMESPACE::Status;
+using ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory;
+using ROCKSDB_NAMESPACE::Transaction;
+using ROCKSDB_NAMESPACE::TransactionDB;
+using ROCKSDB_NAMESPACE::TransactionDBOptions;
+using ROCKSDB_NAMESPACE::TransactionLogIterator;
+using ROCKSDB_NAMESPACE::TransactionOptions;
+using ROCKSDB_NAMESPACE::WALRecoveryMode;
+using ROCKSDB_NAMESPACE::WritableFile;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteBatchWithIndex;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+using std::unordered_set;
+using std::vector;
+
+extern "C" {
+
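+// Each opaque C handle below is a thin struct that wraps the corresponding
+// C++ object (or a smart pointer to it) in a single "rep" member.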
+struct rocksdb_t {
+ DB* rep;
+};
+struct rocksdb_backup_engine_t {
+ BackupEngine* rep;
+};
+struct rocksdb_backup_engine_info_t {
+ std::vector<BackupInfo> rep;
+};
+struct rocksdb_restore_options_t {
+ RestoreOptions rep;
+};
+struct rocksdb_iterator_t {
+ Iterator* rep;
+};
+struct rocksdb_writebatch_t {
+ WriteBatch rep;
+};
+struct rocksdb_writebatch_wi_t {
+ WriteBatchWithIndex* rep;
+};
+struct rocksdb_snapshot_t {
+ const Snapshot* rep;
+};
+struct rocksdb_flushoptions_t {
+ FlushOptions rep;
+};
+struct rocksdb_fifo_compaction_options_t {
+ CompactionOptionsFIFO rep;
+};
+struct rocksdb_readoptions_t {
+ ReadOptions rep;
+  // Backing storage for the Slice pointers held inside ReadOptions; these
+  // must remain valid for as long as the ReadOptions is in use.
+ Slice upper_bound;
+ Slice lower_bound;
+ Slice timestamp;
+ Slice iter_start_ts;
+};
+struct rocksdb_writeoptions_t {
+ WriteOptions rep;
+};
+struct rocksdb_options_t {
+ Options rep;
+};
+struct rocksdb_compactoptions_t {
+ CompactRangeOptions rep;
+ Slice full_history_ts_low;
+};
+struct rocksdb_block_based_table_options_t {
+ BlockBasedTableOptions rep;
+};
+struct rocksdb_cuckoo_table_options_t {
+ CuckooTableOptions rep;
+};
+struct rocksdb_seqfile_t {
+ SequentialFile* rep;
+};
+struct rocksdb_randomfile_t {
+ RandomAccessFile* rep;
+};
+struct rocksdb_writablefile_t {
+ WritableFile* rep;
+};
+struct rocksdb_wal_iterator_t {
+ TransactionLogIterator* rep;
+};
+struct rocksdb_wal_readoptions_t {
+ TransactionLogIterator::ReadOptions rep;
+};
+struct rocksdb_filelock_t {
+ FileLock* rep;
+};
+struct rocksdb_logger_t {
+ std::shared_ptr<Logger> rep;
+};
+struct rocksdb_lru_cache_options_t {
+ LRUCacheOptions rep;
+};
+struct rocksdb_memory_allocator_t {
+ std::shared_ptr<MemoryAllocator> rep;
+};
+struct rocksdb_cache_t {
+ std::shared_ptr<Cache> rep;
+};
+struct rocksdb_livefiles_t {
+ std::vector<LiveFileMetaData> rep;
+};
+struct rocksdb_column_family_handle_t {
+ ColumnFamilyHandle* rep;
+};
+struct rocksdb_column_family_metadata_t {
+ ColumnFamilyMetaData rep;
+};
+struct rocksdb_level_metadata_t {
+ const LevelMetaData* rep;
+};
+struct rocksdb_sst_file_metadata_t {
+ const SstFileMetaData* rep;
+};
+struct rocksdb_envoptions_t {
+ EnvOptions rep;
+};
+struct rocksdb_ingestexternalfileoptions_t {
+ IngestExternalFileOptions rep;
+};
+struct rocksdb_sstfilewriter_t {
+ SstFileWriter* rep;
+};
+struct rocksdb_ratelimiter_t {
+ std::shared_ptr<RateLimiter> rep;
+};
+struct rocksdb_perfcontext_t {
+ PerfContext* rep;
+};
+struct rocksdb_pinnableslice_t {
+ PinnableSlice rep;
+};
+struct rocksdb_transactiondb_options_t {
+ TransactionDBOptions rep;
+};
+struct rocksdb_transactiondb_t {
+ TransactionDB* rep;
+};
+struct rocksdb_transaction_options_t {
+ TransactionOptions rep;
+};
+struct rocksdb_transaction_t {
+ Transaction* rep;
+};
+struct rocksdb_backup_engine_options_t {
+ BackupEngineOptions rep;
+};
+struct rocksdb_checkpoint_t {
+ Checkpoint* rep;
+};
+struct rocksdb_optimistictransactiondb_t {
+ OptimisticTransactionDB* rep;
+};
+struct rocksdb_optimistictransaction_options_t {
+ OptimisticTransactionOptions rep;
+};
+
+struct rocksdb_compactionfiltercontext_t {
+ CompactionFilter::Context rep;
+};
+
+struct rocksdb_compactionfilter_t : public CompactionFilter {
+ void* state_;
+ void (*destructor_)(void*);
+ unsigned char (*filter_)(void*, int level, const char* key, size_t key_length,
+ const char* existing_value, size_t value_length,
+ char** new_value, size_t* new_value_length,
+ unsigned char* value_changed);
+ const char* (*name_)(void*);
+ unsigned char ignore_snapshots_;
+
+ ~rocksdb_compactionfilter_t() override { (*destructor_)(state_); }
+
+ bool Filter(int level, const Slice& key, const Slice& existing_value,
+ std::string* new_value, bool* value_changed) const override {
+ char* c_new_value = nullptr;
+ size_t new_value_length = 0;
+ unsigned char c_value_changed = 0;
+ unsigned char result =
+ (*filter_)(state_, level, key.data(), key.size(), existing_value.data(),
+ existing_value.size(), &c_new_value, &new_value_length,
+ &c_value_changed);
+ if (c_value_changed) {
+ new_value->assign(c_new_value, new_value_length);
+ *value_changed = true;
+ }
+ return result;
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ bool IgnoreSnapshots() const override { return ignore_snapshots_; }
+};
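+
+// A minimal sketch of the C callbacks the wrapper above expects; the function
+// names here are hypothetical and purely illustrative. A real client would
+// pass callbacks like these to the C API's compaction filter constructor
+// (e.g. rocksdb_compactionfilter_create) together with an opaque state
+// pointer.
+unsigned char example_keep_all_filter(void* /*state*/, int /*level*/,
+                                      const char* /*key*/,
+                                      size_t /*key_length*/,
+                                      const char* /*existing_value*/,
+                                      size_t /*value_length*/,
+                                      char** /*new_value*/,
+                                      size_t* /*new_value_length*/,
+                                      unsigned char* value_changed) {
+  *value_changed = 0;  // keep the existing value unchanged
+  return 0;            // 0 means "do not drop this key during compaction"
+}
+const char* example_keep_all_filter_name(void* /*state*/) {
+  return "example_keep_all_filter";
+}
+void example_keep_all_filter_destructor(void* /*state*/) {}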
+
+struct rocksdb_compactionfilterfactory_t : public CompactionFilterFactory {
+ void* state_;
+ void (*destructor_)(void*);
+ rocksdb_compactionfilter_t* (*create_compaction_filter_)(
+ void*, rocksdb_compactionfiltercontext_t* context);
+ const char* (*name_)(void*);
+
+ ~rocksdb_compactionfilterfactory_t() override { (*destructor_)(state_); }
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ rocksdb_compactionfiltercontext_t ccontext;
+ ccontext.rep = context;
+ CompactionFilter* cf = (*create_compaction_filter_)(state_, &ccontext);
+ return std::unique_ptr<CompactionFilter>(cf);
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_comparator_t : public Comparator {
+ void* state_;
+ void (*destructor_)(void*);
+ int (*compare_)(void*, const char* a, size_t alen, const char* b,
+ size_t blen);
+ const char* (*name_)(void*);
+ int (*compare_ts_)(void*, const char* a_ts, size_t a_tslen, const char* b_ts,
+ size_t b_tslen);
+ int (*compare_without_ts_)(void*, const char* a, size_t alen,
+ unsigned char a_has_ts, const char* b, size_t blen,
+ unsigned char b_has_ts);
+
+ rocksdb_comparator_t() : Comparator() {}
+
+ rocksdb_comparator_t(size_t ts_size) : Comparator(ts_size) {}
+
+ ~rocksdb_comparator_t() override { (*destructor_)(state_); }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ return (*compare_)(state_, a.data(), a.size(), b.data(), b.size());
+ }
+
+ int CompareTimestamp(const Slice& a_ts, const Slice& b_ts) const override {
+ if (compare_ts_ == nullptr) {
+ return 0;
+ }
+ return (*compare_ts_)(state_, a_ts.data(), a_ts.size(), b_ts.data(),
+ b_ts.size());
+ }
+
+ int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+ bool b_has_ts) const override {
+ if (compare_without_ts_ == nullptr) {
+ return Compare(a, b);
+ }
+ return (*compare_without_ts_)(state_, a.data(), a.size(), a_has_ts,
+ b.data(), b.size(), b_has_ts);
+ }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ // No-ops since the C binding does not support key shortening methods.
+ void FindShortestSeparator(std::string*, const Slice&) const override {}
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
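+
+// A minimal sketch of the compare/name callbacks this wrapper forwards to.
+// The rocksdb_comparator_create() constructor is declared in the public C
+// header; the timestamp-aware callbacks may be left null, since the
+// overrides above fall back gracefully when they are unset:
+//
+//   static int bytewise_compare(void* /*state*/, const char* a, size_t alen,
+//                               const char* b, size_t blen) {
+//     size_t n = alen < blen ? alen : blen;
+//     int r = memcmp(a, b, n);
+//     if (r == 0) r = (alen < blen) ? -1 : (alen > blen) ? 1 : 0;
+//     return r;
+//   }
+//   static const char* cmp_name(void* /*state*/) { return "bytewise-sketch"; }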
+
+struct rocksdb_filterpolicy_t : public FilterPolicy {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+
+ ~rocksdb_filterpolicy_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+};
+
+struct rocksdb_mergeoperator_t : public MergeOperator {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+ char* (*full_merge_)(void*, const char* key, size_t key_length,
+ const char* existing_value, size_t existing_value_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length);
+ char* (*partial_merge_)(void*, const char* key, size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length);
+ void (*delete_value_)(void*, const char* value, size_t value_length);
+
+ ~rocksdb_mergeoperator_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ size_t n = merge_in.operand_list.size();
+ std::vector<const char*> operand_pointers(n);
+ std::vector<size_t> operand_sizes(n);
+ for (size_t i = 0; i < n; i++) {
+ Slice operand(merge_in.operand_list[i]);
+ operand_pointers[i] = operand.data();
+ operand_sizes[i] = operand.size();
+ }
+
+ const char* existing_value_data = nullptr;
+ size_t existing_value_len = 0;
+ if (merge_in.existing_value != nullptr) {
+ existing_value_data = merge_in.existing_value->data();
+ existing_value_len = merge_in.existing_value->size();
+ }
+
+ unsigned char success;
+ size_t new_value_len;
+ char* tmp_new_value = (*full_merge_)(
+ state_, merge_in.key.data(), merge_in.key.size(), existing_value_data,
+ existing_value_len, &operand_pointers[0], &operand_sizes[0],
+ static_cast<int>(n), &success, &new_value_len);
+ merge_out->new_value.assign(tmp_new_value, new_value_len);
+
+ if (delete_value_ != nullptr) {
+ (*delete_value_)(state_, tmp_new_value, new_value_len);
+ } else {
+ free(tmp_new_value);
+ }
+
+ return success;
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ size_t operand_count = operand_list.size();
+ std::vector<const char*> operand_pointers(operand_count);
+ std::vector<size_t> operand_sizes(operand_count);
+ for (size_t i = 0; i < operand_count; ++i) {
+ Slice operand(operand_list[i]);
+ operand_pointers[i] = operand.data();
+ operand_sizes[i] = operand.size();
+ }
+
+ unsigned char success;
+ size_t new_value_len;
+ char* tmp_new_value = (*partial_merge_)(
+ state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0],
+ static_cast<int>(operand_count), &success, &new_value_len);
+ new_value->assign(tmp_new_value, new_value_len);
+
+ if (delete_value_ != nullptr) {
+ (*delete_value_)(state_, tmp_new_value, new_value_len);
+ } else {
+ free(tmp_new_value);
+ }
+
+ return success;
+ }
+};
+
+struct rocksdb_dbpath_t {
+ DbPath rep;
+};
+
+struct rocksdb_env_t {
+ Env* rep;
+ bool is_default;
+};
+
+struct rocksdb_slicetransform_t : public SliceTransform {
+ void* state_;
+ void (*destructor_)(void*);
+ const char* (*name_)(void*);
+ char* (*transform_)(void*, const char* key, size_t length,
+ size_t* dst_length);
+ unsigned char (*in_domain_)(void*, const char* key, size_t length);
+ unsigned char (*in_range_)(void*, const char* key, size_t length);
+
+ ~rocksdb_slicetransform_t() override { (*destructor_)(state_); }
+
+ const char* Name() const override { return (*name_)(state_); }
+
+ Slice Transform(const Slice& src) const override {
+ size_t len;
+ char* dst = (*transform_)(state_, src.data(), src.size(), &len);
+ return Slice(dst, len);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ return (*in_domain_)(state_, src.data(), src.size());
+ }
+
+ bool InRange(const Slice& src) const override {
+ return (*in_range_)(state_, src.data(), src.size());
+ }
+};
+
+struct rocksdb_universal_compaction_options_t {
+ ROCKSDB_NAMESPACE::CompactionOptionsUniversal* rep;
+};
+
+// Store a non-OK Status as a heap-allocated (strdup'd) error string in
+// *errptr, replacing any previous message; returns true iff an error was
+// recorded.
+static bool SaveError(char** errptr, const Status& s) {
+ assert(errptr != nullptr);
+ if (s.ok()) {
+ return false;
+ } else if (*errptr == nullptr) {
+ *errptr = strdup(s.ToString().c_str());
+ } else {
+ // TODO(sanjay): Merge with existing error?
+ // This is a bug if *errptr is not created by malloc()
+ free(*errptr);
+ *errptr = strdup(s.ToString().c_str());
+ }
+ return true;
+}
+
+// Copy a std::string into a malloc'd buffer for return through the C API.
+// The result is NOT NUL-terminated; callers receive the length separately
+// and must release the buffer with free().
+static char* CopyString(const std::string& str) {
+ char* result = reinterpret_cast<char*>(malloc(sizeof(char) * str.size()));
+ memcpy(result, str.data(), sizeof(char) * str.size());
+ return result;
+}
+
+rocksdb_t* rocksdb_open(const rocksdb_options_t* options, const char* name,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr, DB::Open(options->rep, std::string(name), &db))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
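+
+// A hedged usage sketch for the open/put/get/close entry points defined in
+// this file. rocksdb_options_create(), rocksdb_writeoptions_create() and
+// rocksdb_readoptions_create() belong to the same C API but are defined
+// elsewhere; error strings and returned values are heap-allocated here and
+// must be released with free():
+//
+//   char* err = NULL;
+//   rocksdb_options_t* opts = rocksdb_options_create();
+//   rocksdb_options_set_create_if_missing(opts, 1);
+//   rocksdb_t* db = rocksdb_open(opts, "/tmp/testdb", &err);
+//   if (err == NULL) {
+//     rocksdb_writeoptions_t* wo = rocksdb_writeoptions_create();
+//     rocksdb_put(db, wo, "key", 3, "value", 5, &err);
+//     rocksdb_readoptions_t* ro = rocksdb_readoptions_create();
+//     size_t vlen = 0;
+//     char* val = rocksdb_get(db, ro, "key", 3, &vlen, &err);
+//     free(val);  // malloc'd by CopyString(); not NUL-terminated
+//     rocksdb_readoptions_destroy(ro);
+//     rocksdb_writeoptions_destroy(wo);
+//     rocksdb_close(db);
+//   }
+//   free(err);
+//   rocksdb_options_destroy(opts);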
+
+rocksdb_t* rocksdb_open_with_ttl(const rocksdb_options_t* options,
+ const char* name, int ttl, char** errptr) {
+ ROCKSDB_NAMESPACE::DBWithTTL* db;
+ if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
+ options->rep, std::string(name), &db, ttl))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only(const rocksdb_options_t* options,
+ const char* name,
+ unsigned char error_if_wal_file_exists,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr, DB::OpenForReadOnly(options->rep, std::string(name),
+ &db, error_if_wal_file_exists))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary(const rocksdb_options_t* options,
+ const char* name,
+ const char* secondary_path,
+ char** errptr) {
+ DB* db;
+ if (SaveError(errptr,
+ DB::OpenAsSecondary(options->rep, std::string(name),
+ std::string(secondary_path), &db))) {
+ return nullptr;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_backup_engine_t* rocksdb_backup_engine_open(
+ const rocksdb_options_t* options, const char* path, char** errptr) {
+ BackupEngine* be;
+ if (SaveError(errptr, BackupEngine::Open(
+ options->rep.env,
+ BackupEngineOptions(path, nullptr, true,
+ options->rep.info_log.get()),
+ &be))) {
+ return nullptr;
+ }
+ rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+ result->rep = be;
+ return result;
+}
+
+rocksdb_backup_engine_t* rocksdb_backup_engine_open_opts(
+ const rocksdb_backup_engine_options_t* options, rocksdb_env_t* env,
+ char** errptr) {
+ BackupEngine* be;
+ if (SaveError(errptr, BackupEngine::Open(options->rep, env->rep, &be))) {
+ return nullptr;
+ }
+ rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t;
+ result->rep = be;
+ return result;
+}
+
+void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be,
+ rocksdb_t* db, char** errptr) {
+ SaveError(errptr, be->rep->CreateNewBackup(db->rep));
+}
+
+void rocksdb_backup_engine_create_new_backup_flush(
+ rocksdb_backup_engine_t* be, rocksdb_t* db,
+ unsigned char flush_before_backup, char** errptr) {
+ SaveError(errptr, be->rep->CreateNewBackup(db->rep, flush_before_backup));
+}
+
+void rocksdb_backup_engine_purge_old_backups(rocksdb_backup_engine_t* be,
+ uint32_t num_backups_to_keep,
+ char** errptr) {
+ SaveError(errptr, be->rep->PurgeOldBackups(num_backups_to_keep));
+}
+
+rocksdb_restore_options_t* rocksdb_restore_options_create() {
+ return new rocksdb_restore_options_t;
+}
+
+void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt,
+ int v) {
+ opt->rep.keep_log_files = v;
+}
+
+void rocksdb_backup_engine_verify_backup(rocksdb_backup_engine_t* be,
+ uint32_t backup_id, char** errptr) {
+ SaveError(errptr, be->rep->VerifyBackup(static_cast<BackupID>(backup_id)));
+}
+
+void rocksdb_backup_engine_restore_db_from_latest_backup(
+ rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+ const rocksdb_restore_options_t* restore_options, char** errptr) {
+ SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir),
+ std::string(wal_dir),
+ restore_options->rep));
+}
+
+void rocksdb_backup_engine_restore_db_from_backup(
+ rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir,
+ const rocksdb_restore_options_t* restore_options, const uint32_t backup_id,
+ char** errptr) {
+ SaveError(errptr, be->rep->RestoreDBFromBackup(backup_id, std::string(db_dir),
+ std::string(wal_dir),
+ restore_options->rep));
+}
+
+const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info(
+ rocksdb_backup_engine_t* be) {
+ rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t;
+ be->rep->GetBackupInfo(&result->rep);
+ return result;
+}
+
+int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) {
+ return static_cast<int>(info->rep.size());
+}
+
+int64_t rocksdb_backup_engine_info_timestamp(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].timestamp;
+}
+
+uint32_t rocksdb_backup_engine_info_backup_id(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].backup_id;
+}
+
+uint64_t rocksdb_backup_engine_info_size(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].size;
+}
+
+uint32_t rocksdb_backup_engine_info_number_files(
+ const rocksdb_backup_engine_info_t* info, int index) {
+ return info->rep[index].number_files;
+}
+
+void rocksdb_backup_engine_info_destroy(
+ const rocksdb_backup_engine_info_t* info) {
+ delete info;
+}
+
+void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) {
+ delete be->rep;
+ delete be;
+}
+
+rocksdb_backup_engine_options_t* rocksdb_backup_engine_options_create(
+ const char* backup_dir) {
+ return new rocksdb_backup_engine_options_t{
+ BackupEngineOptions(std::string(backup_dir))};
+}
+
+void rocksdb_backup_engine_options_set_backup_dir(
+ rocksdb_backup_engine_options_t* options, const char* backup_dir) {
+ options->rep.backup_dir = std::string(backup_dir);
+}
+
+void rocksdb_backup_engine_options_set_env(
+ rocksdb_backup_engine_options_t* options, rocksdb_env_t* env) {
+ options->rep.backup_env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_backup_engine_options_set_share_table_files(
+ rocksdb_backup_engine_options_t* options, unsigned char val) {
+ options->rep.share_table_files = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_share_table_files(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.share_table_files;
+}
+
+void rocksdb_backup_engine_options_set_sync(
+ rocksdb_backup_engine_options_t* options, unsigned char val) {
+ options->rep.sync = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_sync(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.sync;
+}
+
+void rocksdb_backup_engine_options_set_destroy_old_data(
+ rocksdb_backup_engine_options_t* options, unsigned char val) {
+ options->rep.destroy_old_data = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_destroy_old_data(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.destroy_old_data;
+}
+
+void rocksdb_backup_engine_options_set_backup_log_files(
+ rocksdb_backup_engine_options_t* options, unsigned char val) {
+ options->rep.backup_log_files = val;
+}
+
+unsigned char rocksdb_backup_engine_options_get_backup_log_files(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.backup_log_files;
+}
+
+void rocksdb_backup_engine_options_set_backup_rate_limit(
+ rocksdb_backup_engine_options_t* options, uint64_t limit) {
+ options->rep.backup_rate_limit = limit;
+}
+
+uint64_t rocksdb_backup_engine_options_get_backup_rate_limit(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.backup_rate_limit;
+}
+
+void rocksdb_backup_engine_options_set_restore_rate_limit(
+ rocksdb_backup_engine_options_t* options, uint64_t limit) {
+ options->rep.restore_rate_limit = limit;
+}
+
+uint64_t rocksdb_backup_engine_options_get_restore_rate_limit(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.restore_rate_limit;
+}
+
+void rocksdb_backup_engine_options_set_max_background_operations(
+ rocksdb_backup_engine_options_t* options, int val) {
+ options->rep.max_background_operations = val;
+}
+
+int rocksdb_backup_engine_options_get_max_background_operations(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.max_background_operations;
+}
+
+void rocksdb_backup_engine_options_set_callback_trigger_interval_size(
+ rocksdb_backup_engine_options_t* options, uint64_t size) {
+ options->rep.callback_trigger_interval_size = size;
+}
+
+uint64_t rocksdb_backup_engine_options_get_callback_trigger_interval_size(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.callback_trigger_interval_size;
+}
+
+void rocksdb_backup_engine_options_set_max_valid_backups_to_open(
+ rocksdb_backup_engine_options_t* options, int val) {
+ options->rep.max_valid_backups_to_open = val;
+}
+
+int rocksdb_backup_engine_options_get_max_valid_backups_to_open(
+ rocksdb_backup_engine_options_t* options) {
+ return options->rep.max_valid_backups_to_open;
+}
+
+void rocksdb_backup_engine_options_set_share_files_with_checksum_naming(
+ rocksdb_backup_engine_options_t* options, int val) {
+ options->rep.share_files_with_checksum_naming =
+ static_cast<BackupEngineOptions::ShareFilesNaming>(val);
+}
+
+int rocksdb_backup_engine_options_get_share_files_with_checksum_naming(
+ rocksdb_backup_engine_options_t* options) {
+ return static_cast<int>(options->rep.share_files_with_checksum_naming);
+}
+
+void rocksdb_backup_engine_options_destroy(
+ rocksdb_backup_engine_options_t* options) {
+ delete options;
+}
+
+rocksdb_checkpoint_t* rocksdb_checkpoint_object_create(rocksdb_t* db,
+ char** errptr) {
+ Checkpoint* checkpoint;
+ if (SaveError(errptr, Checkpoint::Create(db->rep, &checkpoint))) {
+ return nullptr;
+ }
+ rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+ result->rep = checkpoint;
+ return result;
+}
+
+void rocksdb_checkpoint_create(rocksdb_checkpoint_t* checkpoint,
+ const char* checkpoint_dir,
+ uint64_t log_size_for_flush, char** errptr) {
+ SaveError(errptr, checkpoint->rep->CreateCheckpoint(
+ std::string(checkpoint_dir), log_size_for_flush));
+}
+
+void rocksdb_checkpoint_object_destroy(rocksdb_checkpoint_t* checkpoint) {
+ delete checkpoint->rep;
+ delete checkpoint;
+}
+
+void rocksdb_close(rocksdb_t* db) {
+ delete db->rep;
+ delete db;
+}
+
+void rocksdb_options_set_uint64add_merge_operator(rocksdb_options_t* opt) {
+ opt->rep.merge_operator =
+ ROCKSDB_NAMESPACE::MergeOperators::CreateUInt64AddOperator();
+}
+
+rocksdb_t* rocksdb_open_and_trim_history(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char* trim_ts,
+ size_t trim_tslen, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ std::string trim_ts_(trim_ts, trim_tslen);
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::OpenAndTrimHistory(
+ DBOptions(db_options->rep), std::string(name),
+ column_families, &handles, &db, trim_ts_))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::Open(DBOptions(db_options->rep), std::string(name),
+ column_families, &handles, &db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_column_families_with_ttl(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, const int* ttls,
+ char** errptr) {
+ std::vector<int32_t> ttls_vec;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ ttls_vec.push_back(ttls[i]);
+
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ ROCKSDB_NAMESPACE::DBWithTTL* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, ROCKSDB_NAMESPACE::DBWithTTL::Open(
+ DBOptions(db_options->rep), std::string(name),
+ column_families, &handles, &db, ttls_vec))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_for_read_only_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles,
+ unsigned char error_if_wal_file_exists, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr,
+ DB::OpenForReadOnly(DBOptions(db_options->rep),
+ std::string(name), column_families,
+ &handles, &db, error_if_wal_file_exists))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+rocksdb_t* rocksdb_open_as_secondary_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ const char* secondary_path, int num_column_families,
+ const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i != num_column_families; ++i) {
+ column_families.emplace_back(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep));
+ }
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, DB::OpenAsSecondary(DBOptions(db_options->rep),
+ std::string(name),
+ std::string(secondary_path),
+ column_families, &handles, &db))) {
+ return nullptr;
+ }
+ for (size_t i = 0; i != handles.size(); ++i) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = db;
+ return result;
+}
+
+char** rocksdb_list_column_families(const rocksdb_options_t* options,
+ const char* name, size_t* lencfs,
+ char** errptr) {
+ std::vector<std::string> fams;
+ SaveError(errptr, DB::ListColumnFamilies(DBOptions(options->rep),
+ std::string(name), &fams));
+
+ *lencfs = fams.size();
+ char** column_families =
+ static_cast<char**>(malloc(sizeof(char*) * fams.size()));
+ for (size_t i = 0; i < fams.size(); i++) {
+ column_families[i] = strdup(fams[i].c_str());
+ }
+ return column_families;
+}
+
+void rocksdb_list_column_families_destroy(char** list, size_t len) {
+ for (size_t i = 0; i < len; ++i) {
+ free(list[i]);
+ }
+ free(list);
+}
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family(
+ rocksdb_t* db, const rocksdb_options_t* column_family_options,
+ const char* column_family_name, char** errptr) {
+ rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+ SaveError(errptr, db->rep->CreateColumnFamily(
+ ColumnFamilyOptions(column_family_options->rep),
+ std::string(column_family_name), &(handle->rep)));
+ return handle;
+}
+
+rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl(
+ rocksdb_t* db, const rocksdb_options_t* column_family_options,
+ const char* column_family_name, int ttl, char** errptr) {
+ ROCKSDB_NAMESPACE::DBWithTTL* db_with_ttl =
+ static_cast<ROCKSDB_NAMESPACE::DBWithTTL*>(db->rep);
+ rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+ SaveError(errptr, db_with_ttl->CreateColumnFamilyWithTtl(
+ ColumnFamilyOptions(column_family_options->rep),
+ std::string(column_family_name), &(handle->rep), ttl));
+ return handle;
+}
+
+void rocksdb_drop_column_family(rocksdb_t* db,
+ rocksdb_column_family_handle_t* handle,
+ char** errptr) {
+ SaveError(errptr, db->rep->DropColumnFamily(handle->rep));
+}
+
+uint32_t rocksdb_column_family_handle_get_id(
+ rocksdb_column_family_handle_t* handle) {
+ return handle->rep->GetID();
+}
+
+char* rocksdb_column_family_handle_get_name(
+ rocksdb_column_family_handle_t* handle, size_t* name_len) {
+ auto name = handle->rep->GetName();
+ *name_len = name.size();
+ return CopyString(name);
+}
+
+void rocksdb_column_family_handle_destroy(
+ rocksdb_column_family_handle_t* handle) {
+ delete handle->rep;
+ delete handle;
+}
+
+void rocksdb_put(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_put_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, const char* val,
+ size_t vallen, char** errptr) {
+ SaveError(errptr, db->rep->Put(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_put_with_ts(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, const char* ts,
+ size_t tslen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, db->rep->Put(options->rep, Slice(key, keylen),
+ Slice(ts, tslen), Slice(val, vallen)));
+}
+
+void rocksdb_put_cf_with_ts(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, const char* ts,
+ size_t tslen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr,
+ db->rep->Put(options->rep, column_family->rep, Slice(key, keylen),
+ Slice(ts, tslen), Slice(val, vallen)));
+}
+
+void rocksdb_delete(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_delete_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+ Slice(key, keylen)));
+}
+
+void rocksdb_delete_with_ts(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, const char* ts,
+ size_t tslen, char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, Slice(key, keylen),
+ Slice(ts, tslen)));
+}
+
+void rocksdb_delete_cf_with_ts(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, const char* ts,
+ size_t tslen, char** errptr) {
+ SaveError(errptr, db->rep->Delete(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(ts, tslen)));
+}
+
+void rocksdb_singledelete(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, char** errptr) {
+ SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen)));
+}
+
+void rocksdb_singledelete_cf(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, char** errptr) {
+ SaveError(errptr, db->rep->SingleDelete(options->rep, column_family->rep,
+ Slice(key, keylen)));
+}
+
+void rocksdb_singledelete_with_ts(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen,
+ const char* ts, size_t tslen, char** errptr) {
+ SaveError(errptr, db->rep->SingleDelete(options->rep, Slice(key, keylen),
+ Slice(ts, tslen)));
+}
+
+void rocksdb_singledelete_cf_with_ts(
+ rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, const char* ts, size_t tslen, char** errptr) {
+ SaveError(errptr,
+ db->rep->SingleDelete(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(ts, tslen)));
+}
+
+void rocksdb_increase_full_history_ts_low(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* ts_low, size_t ts_lowlen, char** errptr) {
+ std::string ts(ts_low, ts_lowlen);
+ SaveError(errptr, db->rep->IncreaseFullHistoryTsLow(column_family->rep, ts));
+}
+
+char* rocksdb_get_full_history_ts_low(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ size_t* ts_len, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = db->rep->GetFullHistoryTsLow(column_family->rep, &tmp);
+ if (s.ok()) {
+ *ts_len = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *ts_len = 0;
+ SaveError(errptr, s);
+ }
+ return result;
+}
+
+void rocksdb_delete_range_cf(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len,
+ const char* end_key, size_t end_key_len,
+ char** errptr) {
+ SaveError(errptr, db->rep->DeleteRange(options->rep, column_family->rep,
+ Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len)));
+}
+
+void rocksdb_merge(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ const char* key, size_t keylen, const char* val,
+ size_t vallen, char** errptr) {
+ SaveError(errptr, db->rep->Merge(options->rep, Slice(key, keylen),
+ Slice(val, vallen)));
+}
+
+void rocksdb_merge_cf(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, const char* val,
+ size_t vallen, char** errptr) {
+ SaveError(errptr, db->rep->Merge(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_write(rocksdb_t* db, const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t* batch, char** errptr) {
+ SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
+char* rocksdb_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const char* key, size_t keylen, size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_get_cf(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, size_t* vallen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s =
+ db->rep->Get(options->rep, column_family->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_get_with_ts(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const char* key, size_t keylen, size_t* vallen,
+ char** ts, size_t* tslen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp_val;
+ std::string tmp_ts;
+ Status s = db->rep->Get(options->rep, Slice(key, keylen), &tmp_val, &tmp_ts);
+ if (s.ok()) {
+ *vallen = tmp_val.size();
+ result = CopyString(tmp_val);
+ *tslen = tmp_ts.size();
+ *ts = CopyString(tmp_ts);
+ } else {
+ *vallen = 0;
+ *tslen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_get_cf_with_ts(rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen, size_t* vallen,
+ char** ts, size_t* tslen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ std::string tmp_ts;
+ Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+ &tmp, &tmp_ts);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ *tslen = tmp_ts.size();
+ *ts = CopyString(tmp_ts);
+ } else {
+ *vallen = 0;
+ *tslen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+void rocksdb_multi_get(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses = db->rep->MultiGet(options->rep, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
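+
+// Caller-side sketch for consuming rocksdb_multi_get() results: every
+// non-null entry in values_list and errs is heap-allocated above (via
+// CopyString()/strdup()) and must be freed by the caller; a null value with
+// a null error simply means the key was not found:
+//
+//   for (size_t i = 0; i < num_keys; i++) {
+//     if (errs[i] != NULL) {
+//       /* per-key failure */
+//       free(errs[i]);
+//     } else if (values_list[i] != NULL) {
+//       /* values_list_sizes[i] bytes, not NUL-terminated */
+//       free(values_list[i]);
+//     }
+//   }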
+
+void rocksdb_multi_get_with_ts(rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ char** values_list, size_t* values_list_sizes,
+ char** timestamp_list,
+ size_t* timestamp_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<std::string> timestamps(num_keys);
+ std::vector<Status> statuses =
+ db->rep->MultiGet(options->rep, keys, &values, &timestamps);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ timestamp_list[i] = CopyString(timestamps[i]);
+ timestamp_list_sizes[i] = timestamps[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ timestamp_list[i] = nullptr;
+ timestamp_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_multi_get_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ std::vector<ColumnFamilyHandle*> cfs(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ cfs[i] = column_families[i]->rep;
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses =
+ db->rep->MultiGet(options->rep, cfs, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_multi_get_cf_with_ts(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** timestamps_list,
+ size_t* timestamps_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ std::vector<ColumnFamilyHandle*> cfs(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ cfs[i] = column_families[i]->rep;
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<std::string> timestamps(num_keys);
+ std::vector<Status> statuses =
+ db->rep->MultiGet(options->rep, cfs, keys, &values, &timestamps);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ timestamps_list[i] = CopyString(timestamps[i]);
+ timestamps_list_sizes[i] = timestamps[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ timestamps_list[i] = nullptr;
+ timestamps_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_batched_multi_get_cf(rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ rocksdb_pinnableslice_t** values, char** errs,
+ const bool sorted_input) {
+ Slice* key_slices = new Slice[num_keys];
+ PinnableSlice* value_slices = new PinnableSlice[num_keys];
+ Status* statuses = new Status[num_keys];
+ for (size_t i = 0; i < num_keys; ++i) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+
+ db->rep->MultiGet(options->rep, column_family->rep, num_keys, key_slices,
+ value_slices, statuses, sorted_input);
+
+ for (size_t i = 0; i < num_keys; ++i) {
+ if (statuses[i].ok()) {
+ values[i] = new (rocksdb_pinnableslice_t);
+ values[i]->rep = std::move(value_slices[i]);
+ errs[i] = nullptr;
+ } else {
+ values[i] = nullptr;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+
+ delete[] key_slices;
+ delete[] value_slices;
+ delete[] statuses;
+}
+
+unsigned char rocksdb_key_may_exist(rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t key_len,
+ char** value, size_t* val_len,
+ const char* timestamp, size_t timestamp_len,
+ unsigned char* value_found) {
+ std::string tmp;
+ std::string time;
+ if (timestamp) {
+ time.assign(timestamp, timestamp_len);
+ }
+ bool found = false;
+ const bool result = db->rep->KeyMayExist(options->rep, Slice(key, key_len),
+ &tmp, timestamp ? &time : nullptr,
+ value_found ? &found : nullptr);
+ if (value_found) {
+ *value_found = found;
+ if (found) {
+ *val_len = tmp.size();
+ *value = CopyString(tmp);
+ }
+ }
+ return result;
+}
+
+unsigned char rocksdb_key_may_exist_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t key_len, char** value, size_t* val_len, const char* timestamp,
+ size_t timestamp_len, unsigned char* value_found) {
+ std::string tmp;
+ std::string time;
+ if (timestamp) {
+ time.assign(timestamp, timestamp_len);
+ }
+ bool found = false;
+ const bool result = db->rep->KeyMayExist(
+ options->rep, column_family->rep, Slice(key, key_len), &tmp,
+ timestamp ? &time : nullptr, value_found ? &found : nullptr);
+ if (value_found) {
+ *value_found = found;
+ if (found) {
+ *val_len = tmp.size();
+ *value = CopyString(tmp);
+ }
+ }
+ return result;
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator(
+ rocksdb_t* db, const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = db->rep->NewIterator(options->rep);
+ return result;
+}
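+
+// A hedged iteration sketch using the iterator entry points defined later in
+// this file (rocksdb_iter_seek_to_first/valid/next/key/value/destroy). Here
+// db is assumed to be an open rocksdb_t* and ro a rocksdb_readoptions_t*;
+// the key/value pointers borrow from the iterator and are only valid until
+// the next move or destroy:
+//
+//   rocksdb_iterator_t* it = rocksdb_create_iterator(db, ro);
+//   for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
+//        rocksdb_iter_next(it)) {
+//     size_t klen, vlen;
+//     const char* k = rocksdb_iter_key(it, &klen);
+//     const char* v = rocksdb_iter_value(it, &vlen);
+//     /* use k[0..klen) and v[0..vlen) */
+//   }
+//   char* err = NULL;
+//   rocksdb_iter_get_error(it, &err);
+//   free(err);
+//   rocksdb_iter_destroy(it);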
+
+rocksdb_wal_iterator_t* rocksdb_get_updates_since(
+ rocksdb_t* db, uint64_t seq_number,
+ const rocksdb_wal_readoptions_t* options, char** errptr) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ TransactionLogIterator::ReadOptions ro;
+ if (options != nullptr) {
+ ro = options->rep;
+ }
+ if (SaveError(errptr, db->rep->GetUpdatesSince(seq_number, &iter, ro))) {
+ return nullptr;
+ }
+ rocksdb_wal_iterator_t* result = new rocksdb_wal_iterator_t;
+ result->rep = iter.release();
+ return result;
+}
+
+void rocksdb_wal_iter_next(rocksdb_wal_iterator_t* iter) { iter->rep->Next(); }
+
+unsigned char rocksdb_wal_iter_valid(const rocksdb_wal_iterator_t* iter) {
+ return iter->rep->Valid();
+}
+
+void rocksdb_wal_iter_status(const rocksdb_wal_iterator_t* iter,
+ char** errptr) {
+ SaveError(errptr, iter->rep->status());
+}
+
+void rocksdb_wal_iter_destroy(const rocksdb_wal_iterator_t* iter) {
+ delete iter->rep;
+ delete iter;
+}
+
+rocksdb_writebatch_t* rocksdb_wal_iter_get_batch(
+ const rocksdb_wal_iterator_t* iter, uint64_t* seq) {
+ rocksdb_writebatch_t* result = rocksdb_writebatch_create();
+ BatchResult wal_batch = iter->rep->GetBatch();
+ result->rep = std::move(*wal_batch.writeBatchPtr);
+ if (seq != nullptr) {
+ *seq = wal_batch.sequence;
+ }
+ return result;
+}
+
+uint64_t rocksdb_get_latest_sequence_number(rocksdb_t* db) {
+ return db->rep->GetLatestSequenceNumber();
+}
+
+rocksdb_iterator_t* rocksdb_create_iterator_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = db->rep->NewIterator(options->rep, column_family->rep);
+ return result;
+}
+
+void rocksdb_create_iterators(rocksdb_t* db, rocksdb_readoptions_t* opts,
+ rocksdb_column_family_handle_t** column_families,
+ rocksdb_iterator_t** iterators, size_t size,
+ char** errptr) {
+ std::vector<ColumnFamilyHandle*> column_families_vec;
+ for (size_t i = 0; i < size; i++) {
+ column_families_vec.push_back(column_families[i]->rep);
+ }
+
+ std::vector<Iterator*> res;
+ Status status = db->rep->NewIterators(opts->rep, column_families_vec, &res);
+ assert(res.size() == size);
+ if (SaveError(errptr, status)) {
+ return;
+ }
+
+ for (size_t i = 0; i < size; i++) {
+ iterators[i] = new rocksdb_iterator_t;
+ iterators[i]->rep = res[i];
+ }
+}
+
+const rocksdb_snapshot_t* rocksdb_create_snapshot(rocksdb_t* db) {
+ rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+ result->rep = db->rep->GetSnapshot();
+ return result;
+}
+
+void rocksdb_release_snapshot(rocksdb_t* db,
+ const rocksdb_snapshot_t* snapshot) {
+ db->rep->ReleaseSnapshot(snapshot->rep);
+ delete snapshot;
+}
+
+char* rocksdb_property_value(rocksdb_t* db, const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(Slice(propname), &tmp)) {
+ // We use strdup() since we expect human readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+int rocksdb_property_int(rocksdb_t* db, const char* propname,
+ uint64_t* out_val) {
+ if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+int rocksdb_property_int_cf(rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* propname, uint64_t* out_val) {
+ if (db->rep->GetIntProperty(column_family->rep, Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+char* rocksdb_property_value_cf(rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(column_family->rep, Slice(propname), &tmp)) {
+ // We use strdup() since we expect human readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+void rocksdb_approximate_sizes(rocksdb_t* db, int num_ranges,
+ const char* const* range_start_key,
+ const size_t* range_start_key_len,
+ const char* const* range_limit_key,
+ const size_t* range_limit_key_len,
+ uint64_t* sizes, char** errptr) {
+ Range* ranges = new Range[num_ranges];
+ for (int i = 0; i < num_ranges; i++) {
+ ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+ ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+ }
+ Status s = db->rep->GetApproximateSizes(ranges, num_ranges, sizes);
+ if (!s.ok()) {
+ SaveError(errptr, s);
+ }
+ delete[] ranges;
+}
+
+void rocksdb_approximate_sizes_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ int num_ranges, const char* const* range_start_key,
+ const size_t* range_start_key_len, const char* const* range_limit_key,
+ const size_t* range_limit_key_len, uint64_t* sizes, char** errptr) {
+ Range* ranges = new Range[num_ranges];
+ for (int i = 0; i < num_ranges; i++) {
+ ranges[i].start = Slice(range_start_key[i], range_start_key_len[i]);
+ ranges[i].limit = Slice(range_limit_key[i], range_limit_key_len[i]);
+ }
+ Status s = db->rep->GetApproximateSizes(column_family->rep, ranges,
+ num_ranges, sizes);
+ if (!s.ok()) {
+ SaveError(errptr, s);
+ }
+ delete[] ranges;
+}
+
+void rocksdb_delete_file(rocksdb_t* db, const char* name) {
+ db->rep->DeleteFile(name);
+}
+
+const rocksdb_livefiles_t* rocksdb_livefiles(rocksdb_t* db) {
+ rocksdb_livefiles_t* result = new rocksdb_livefiles_t;
+ db->rep->GetLiveFilesMetaData(&result->rep);
+ return result;
+}
+
+void rocksdb_compact_range(rocksdb_t* db, const char* start_key,
+ size_t start_key_len, const char* limit_key,
+ size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ CompactRangeOptions(),
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf(rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ CompactRangeOptions(), column_family->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_suggest_compact_range(rocksdb_t* db, const char* start_key,
+ size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ Status s = ROCKSDB_NAMESPACE::experimental::SuggestCompactRange(
+ db->rep,
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+ SaveError(errptr, s);
+}
+
+void rocksdb_suggest_compact_range_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ Status s = db->rep->SuggestCompactRange(
+ column_family->rep,
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+ SaveError(errptr, s);
+}
+
+void rocksdb_compact_range_opt(rocksdb_t* db, rocksdb_compactoptions_t* opt,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ opt->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_compact_range_cf_opt(rocksdb_t* db,
+ rocksdb_column_family_handle_t* column_family,
+ rocksdb_compactoptions_t* opt,
+ const char* start_key, size_t start_key_len,
+ const char* limit_key, size_t limit_key_len) {
+ Slice a, b;
+ db->rep->CompactRange(
+ opt->rep, column_family->rep,
+ // Pass nullptr Slice if corresponding "const char*" is nullptr
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr));
+}
+
+void rocksdb_flush(rocksdb_t* db, const rocksdb_flushoptions_t* options,
+ char** errptr) {
+ SaveError(errptr, db->rep->Flush(options->rep));
+}
+
+void rocksdb_flush_cf(rocksdb_t* db, const rocksdb_flushoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ char** errptr) {
+ SaveError(errptr, db->rep->Flush(options->rep, column_family->rep));
+}
+
+void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) {
+ SaveError(errptr, db->rep->FlushWAL(sync));
+}
+
+void rocksdb_disable_file_deletions(rocksdb_t* db, char** errptr) {
+ SaveError(errptr, db->rep->DisableFileDeletions());
+}
+
+void rocksdb_enable_file_deletions(rocksdb_t* db, unsigned char force,
+ char** errptr) {
+ SaveError(errptr, db->rep->EnableFileDeletions(force));
+}
+
+void rocksdb_destroy_db(const rocksdb_options_t* options, const char* name,
+ char** errptr) {
+ SaveError(errptr, DestroyDB(name, options->rep));
+}
+
+void rocksdb_repair_db(const rocksdb_options_t* options, const char* name,
+ char** errptr) {
+ SaveError(errptr, RepairDB(name, options->rep));
+}
+
+void rocksdb_iter_destroy(rocksdb_iterator_t* iter) {
+ delete iter->rep;
+ delete iter;
+}
+
+unsigned char rocksdb_iter_valid(const rocksdb_iterator_t* iter) {
+ return iter->rep->Valid();
+}
+
+void rocksdb_iter_seek_to_first(rocksdb_iterator_t* iter) {
+ iter->rep->SeekToFirst();
+}
+
+void rocksdb_iter_seek_to_last(rocksdb_iterator_t* iter) {
+ iter->rep->SeekToLast();
+}
+
+void rocksdb_iter_seek(rocksdb_iterator_t* iter, const char* k, size_t klen) {
+ iter->rep->Seek(Slice(k, klen));
+}
+
+void rocksdb_iter_seek_for_prev(rocksdb_iterator_t* iter, const char* k,
+ size_t klen) {
+ iter->rep->SeekForPrev(Slice(k, klen));
+}
+
+void rocksdb_iter_next(rocksdb_iterator_t* iter) { iter->rep->Next(); }
+
+void rocksdb_iter_prev(rocksdb_iterator_t* iter) { iter->rep->Prev(); }
+
+const char* rocksdb_iter_key(const rocksdb_iterator_t* iter, size_t* klen) {
+ Slice s = iter->rep->key();
+ *klen = s.size();
+ return s.data();
+}
+
+const char* rocksdb_iter_value(const rocksdb_iterator_t* iter, size_t* vlen) {
+ Slice s = iter->rep->value();
+ *vlen = s.size();
+ return s.data();
+}
+
+const char* rocksdb_iter_timestamp(const rocksdb_iterator_t* iter,
+ size_t* tslen) {
+ Slice s = iter->rep->timestamp();
+ *tslen = s.size();
+ return s.data();
+}
+
+void rocksdb_iter_get_error(const rocksdb_iterator_t* iter, char** errptr) {
+ SaveError(errptr, iter->rep->status());
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create() {
+ return new rocksdb_writebatch_t;
+}
+
+rocksdb_writebatch_t* rocksdb_writebatch_create_from(const char* rep,
+ size_t size) {
+ rocksdb_writebatch_t* b = new rocksdb_writebatch_t;
+ b->rep = WriteBatch(std::string(rep, size));
+ return b;
+}
+
+void rocksdb_writebatch_destroy(rocksdb_writebatch_t* b) { delete b; }
+
+void rocksdb_writebatch_clear(rocksdb_writebatch_t* b) { b->rep.Clear(); }
+
+int rocksdb_writebatch_count(rocksdb_writebatch_t* b) { return b->rep.Count(); }
+
+void rocksdb_writebatch_put(rocksdb_writebatch_t* b, const char* key,
+ size_t klen, const char* val, size_t vlen) {
+ b->rep.Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf(rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen) {
+ b->rep.Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_put_cf_with_ts(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* ts, size_t tslen, const char* val,
+ size_t vlen) {
+ b->rep.Put(column_family->rep, Slice(key, klen), Slice(ts, tslen),
+ Slice(val, vlen));
+}
+
+void rocksdb_writebatch_putv(rocksdb_writebatch_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Put(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_putv_cf(rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_merge(rocksdb_writebatch_t* b, const char* key,
+ size_t klen, const char* val, size_t vlen) {
+ b->rep.Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_merge_cf(rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen) {
+ b->rep.Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_mergev(rocksdb_writebatch_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Merge(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_mergev_cf(rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep.Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_delete(rocksdb_writebatch_t* b, const char* key,
+ size_t klen) {
+ b->rep.Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_singledelete(rocksdb_writebatch_t* b, const char* key,
+ size_t klen) {
+ b->rep.SingleDelete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_delete_cf(rocksdb_writebatch_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep.Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_delete_cf_with_ts(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* ts, size_t tslen) {
+ b->rep.Delete(column_family->rep, Slice(key, klen), Slice(ts, tslen));
+}
+
+void rocksdb_writebatch_singledelete_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep.SingleDelete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_singledelete_cf_with_ts(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* ts, size_t tslen) {
+ b->rep.SingleDelete(column_family->rep, Slice(key, klen), Slice(ts, tslen));
+}
+
+void rocksdb_writebatch_deletev(rocksdb_writebatch_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep.Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_deletev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep.Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_range(rocksdb_writebatch_t* b,
+ const char* start_key,
+ size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep.DeleteRange(Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_range_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep.DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_delete_rangev(rocksdb_writebatch_t* b, int num_keys,
+ const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes,
+ const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep.DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_delete_rangev_cf(
+ rocksdb_writebatch_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep.DeleteRange(column_family->rep,
+ SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_put_log_data(rocksdb_writebatch_t* b, const char* blob,
+ size_t len) {
+ b->rep.PutLogData(Slice(blob, len));
+}
+
+class H : public WriteBatch::Handler {
+ public:
+ void* state_;
+ void (*put_)(void*, const char* k, size_t klen, const char* v, size_t vlen);
+ void (*deleted_)(void*, const char* k, size_t klen);
+ void Put(const Slice& key, const Slice& value) override {
+ (*put_)(state_, key.data(), key.size(), value.data(), value.size());
+ }
+ void Delete(const Slice& key) override {
+ (*deleted_)(state_, key.data(), key.size());
+ }
+};
+
+void rocksdb_writebatch_iterate(rocksdb_writebatch_t* b, void* state,
+ void (*put)(void*, const char* k, size_t klen,
+ const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k,
+ size_t klen)) {
+ H handler;
+ handler.state_ = state;
+ handler.put_ = put;
+ handler.deleted_ = deleted;
+ b->rep.Iterate(&handler);
+}
+
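+// The handler above surfaces only Put and Delete records to the two C
+// callbacks; other record types are left to WriteBatch::Handler's default
+// behavior. Minimal usage sketch with hypothetical callback names (not part
+// of this file), assuming an existing batch handle `wb`:
+//
+//   static void on_put(void* state, const char* k, size_t klen, const char* v,
+//                      size_t vlen) { /* ... */ }
+//   static void on_delete(void* state, const char* k, size_t klen) { /* ... */ }
+//
+//   rocksdb_writebatch_iterate(wb, /*state=*/NULL, on_put, on_delete);
+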
+const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
+ *size = b->rep.GetDataSize();
+ return b->rep.Data().c_str();
+}
+
+void rocksdb_writebatch_set_save_point(rocksdb_writebatch_t* b) {
+ b->rep.SetSavePoint();
+}
+
+void rocksdb_writebatch_rollback_to_save_point(rocksdb_writebatch_t* b,
+ char** errptr) {
+ SaveError(errptr, b->rep.RollbackToSavePoint());
+}
+
+void rocksdb_writebatch_pop_save_point(rocksdb_writebatch_t* b, char** errptr) {
+ SaveError(errptr, b->rep.PopSavePoint());
+}
+
+rocksdb_writebatch_wi_t* rocksdb_writebatch_wi_create(
+ size_t reserved_bytes, unsigned char overwrite_key) {
+ rocksdb_writebatch_wi_t* b = new rocksdb_writebatch_wi_t;
+ b->rep = new WriteBatchWithIndex(BytewiseComparator(), reserved_bytes,
+ overwrite_key);
+ return b;
+}
+
+void rocksdb_writebatch_wi_destroy(rocksdb_writebatch_wi_t* b) {
+ if (b->rep) {
+ delete b->rep;
+ }
+ delete b;
+}
+
+void rocksdb_writebatch_wi_clear(rocksdb_writebatch_wi_t* b) {
+ b->rep->Clear();
+}
+
+int rocksdb_writebatch_wi_count(rocksdb_writebatch_wi_t* b) {
+ return b->rep->GetWriteBatch()->Count();
+}
+
+void rocksdb_writebatch_wi_put(rocksdb_writebatch_wi_t* b, const char* key,
+ size_t klen, const char* val, size_t vlen) {
+ b->rep->Put(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_put_cf(rocksdb_writebatch_wi_t* b,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen) {
+ b->rep->Put(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_putv(rocksdb_writebatch_wi_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Put(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_putv_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Put(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_merge(rocksdb_writebatch_wi_t* b, const char* key,
+ size_t klen, const char* val, size_t vlen) {
+ b->rep->Merge(Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_merge_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val, size_t vlen) {
+ b->rep->Merge(column_family->rep, Slice(key, klen), Slice(val, vlen));
+}
+
+void rocksdb_writebatch_wi_mergev(rocksdb_writebatch_wi_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes, int num_values,
+ const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Merge(SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_mergev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes,
+ int num_values, const char* const* values_list,
+ const size_t* values_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<Slice> value_slices(num_values);
+ for (int i = 0; i < num_values; i++) {
+ value_slices[i] = Slice(values_list[i], values_list_sizes[i]);
+ }
+ b->rep->Merge(column_family->rep, SliceParts(key_slices.data(), num_keys),
+ SliceParts(value_slices.data(), num_values));
+}
+
+void rocksdb_writebatch_wi_delete(rocksdb_writebatch_wi_t* b, const char* key,
+ size_t klen) {
+ b->rep->Delete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_singledelete(rocksdb_writebatch_wi_t* b,
+ const char* key, size_t klen) {
+ b->rep->SingleDelete(Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_delete_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep->Delete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_singledelete_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen) {
+ b->rep->SingleDelete(column_family->rep, Slice(key, klen));
+}
+
+void rocksdb_writebatch_wi_deletev(rocksdb_writebatch_wi_t* b, int num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep->Delete(SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_deletev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* keys_list, const size_t* keys_list_sizes) {
+ std::vector<Slice> key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ key_slices[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ b->rep->Delete(column_family->rep, SliceParts(key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_range(rocksdb_writebatch_wi_t* b,
+ const char* start_key,
+ size_t start_key_len,
+ const char* end_key,
+ size_t end_key_len) {
+ b->rep->DeleteRange(Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_range_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* end_key,
+ size_t end_key_len) {
+ b->rep->DeleteRange(column_family->rep, Slice(start_key, start_key_len),
+ Slice(end_key, end_key_len));
+}
+
+void rocksdb_writebatch_wi_delete_rangev(rocksdb_writebatch_wi_t* b,
+ int num_keys,
+ const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes,
+ const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep->DeleteRange(SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_delete_rangev_cf(
+ rocksdb_writebatch_wi_t* b, rocksdb_column_family_handle_t* column_family,
+ int num_keys, const char* const* start_keys_list,
+ const size_t* start_keys_list_sizes, const char* const* end_keys_list,
+ const size_t* end_keys_list_sizes) {
+ std::vector<Slice> start_key_slices(num_keys);
+ std::vector<Slice> end_key_slices(num_keys);
+ for (int i = 0; i < num_keys; i++) {
+ start_key_slices[i] = Slice(start_keys_list[i], start_keys_list_sizes[i]);
+ end_key_slices[i] = Slice(end_keys_list[i], end_keys_list_sizes[i]);
+ }
+ b->rep->DeleteRange(column_family->rep,
+ SliceParts(start_key_slices.data(), num_keys),
+ SliceParts(end_key_slices.data(), num_keys));
+}
+
+void rocksdb_writebatch_wi_put_log_data(rocksdb_writebatch_wi_t* b,
+ const char* blob, size_t len) {
+ b->rep->PutLogData(Slice(blob, len));
+}
+
+void rocksdb_writebatch_wi_iterate(
+ rocksdb_writebatch_wi_t* b, void* state,
+ void (*put)(void*, const char* k, size_t klen, const char* v, size_t vlen),
+ void (*deleted)(void*, const char* k, size_t klen)) {
+ H handler;
+ handler.state_ = state;
+ handler.put_ = put;
+ handler.deleted_ = deleted;
+ b->rep->GetWriteBatch()->Iterate(&handler);
+}
+
+const char* rocksdb_writebatch_wi_data(rocksdb_writebatch_wi_t* b,
+ size_t* size) {
+ WriteBatch* wb = b->rep->GetWriteBatch();
+ *size = wb->GetDataSize();
+ return wb->Data().c_str();
+}
+
+void rocksdb_writebatch_wi_set_save_point(rocksdb_writebatch_wi_t* b) {
+ b->rep->SetSavePoint();
+}
+
+void rocksdb_writebatch_wi_rollback_to_save_point(rocksdb_writebatch_wi_t* b,
+ char** errptr) {
+ SaveError(errptr, b->rep->RollbackToSavePoint());
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = wbwi->rep->NewIteratorWithBase(base_iterator->rep);
+ delete base_iterator;
+ return result;
+}
+
+rocksdb_iterator_t* rocksdb_writebatch_wi_create_iterator_with_base_cf(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_iterator_t* base_iterator,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep =
+ wbwi->rep->NewIteratorWithBase(column_family->rep, base_iterator->rep);
+ delete base_iterator;
+ return result;
+}
+
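+// Ownership note for the two functions above: `delete base_iterator` frees
+// only the C wrapper struct; the underlying Iterator is adopted by the merged
+// iterator returned from NewIteratorWithBase. Callers hand the base iterator
+// over and must not use or destroy it afterwards. Illustrative sketch
+// (handles assumed, not part of this file):
+//
+//   rocksdb_iterator_t* base = rocksdb_create_iterator(db, roptions);
+//   rocksdb_iterator_t* it =
+//       rocksdb_writebatch_wi_create_iterator_with_base(wbwi, base);
+//   /* iterate over and destroy `it` only; `base` must not be touched */
+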
+char* rocksdb_writebatch_wi_get_from_batch(rocksdb_writebatch_wi_t* wbwi,
+ const rocksdb_options_t* options,
+ const char* key, size_t keylen,
+ size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatch(options->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_cf(
+ rocksdb_writebatch_wi_t* wbwi, const rocksdb_options_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatch(column_family->rep, options->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+ const rocksdb_readoptions_t* options, const char* key, size_t keylen,
+ size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatchAndDB(db->rep, options->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+char* rocksdb_writebatch_wi_get_from_batch_and_db_cf(
+ rocksdb_writebatch_wi_t* wbwi, rocksdb_t* db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = wbwi->rep->GetFromBatchAndDB(
+ db->rep, options->rep, column_family->rep, Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+void rocksdb_write_writebatch_wi(rocksdb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_wi_t* wbwi, char** errptr) {
+ WriteBatch* wb = wbwi->rep->GetWriteBatch();
+ SaveError(errptr, db->rep->Write(options->rep, wb));
+}
+
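+// Minimal read-your-writes sketch for the WriteBatchWithIndex wrappers above,
+// assuming open `db`, `woptions` and `roptions` handles (illustrative only;
+// error handling elided, rocksdb_free is assumed to be defined elsewhere in
+// this file):
+//
+//   char* err = NULL;
+//   rocksdb_writebatch_wi_t* wbwi = rocksdb_writebatch_wi_create(0, 1);
+//   rocksdb_writebatch_wi_put(wbwi, "k", 1, "v", 1);
+//   size_t vlen = 0;
+//   char* val = rocksdb_writebatch_wi_get_from_batch_and_db(
+//       wbwi, db, roptions, "k", 1, &vlen, &err);  /* sees the pending put */
+//   rocksdb_free(val);
+//   rocksdb_write_writebatch_wi(db, woptions, wbwi, &err);  /* commit */
+//   rocksdb_writebatch_wi_destroy(wbwi);
+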
+void rocksdb_load_latest_options(
+ const char* db_path, rocksdb_env_t* env, bool ignore_unknown_options,
+ rocksdb_cache_t* cache, rocksdb_options_t** db_options,
+ size_t* num_column_families, char*** list_column_family_names,
+ rocksdb_options_t*** list_column_family_options, char** errptr) {
+ DBOptions db_opt;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ Status s = LoadLatestOptions(std::string(db_path), env->rep, &db_opt,
+ &cf_descs, ignore_unknown_options, &cache->rep);
+ if (s.ok()) {
+ char** cf_names = (char**)malloc(cf_descs.size() * sizeof(char*));
+ rocksdb_options_t** cf_options = (rocksdb_options_t**)malloc(
+ cf_descs.size() * sizeof(rocksdb_options_t*));
+ for (size_t i = 0; i < cf_descs.size(); ++i) {
+ cf_names[i] = strdup(cf_descs[i].name.c_str());
+ cf_options[i] = new rocksdb_options_t{
+ Options(DBOptions(), std::move(cf_descs[i].options))};
+ }
+ *num_column_families = cf_descs.size();
+ *db_options = new rocksdb_options_t{
+ Options(std::move(db_opt), ColumnFamilyOptions())};
+ *list_column_family_names = cf_names;
+ *list_column_family_options = cf_options;
+ } else {
+ *num_column_families = 0;
+ *db_options = nullptr;
+ *list_column_family_names = nullptr;
+ *list_column_family_options = nullptr;
+ SaveError(errptr, s);
+ }
+}
+
+void rocksdb_load_latest_options_destroy(
+ rocksdb_options_t* db_options, char** list_column_family_names,
+ rocksdb_options_t** list_column_family_options, size_t len) {
+ rocksdb_options_destroy(db_options);
+ if (list_column_family_names) {
+ for (size_t i = 0; i < len; ++i) {
+ free(list_column_family_names[i]);
+ }
+ free(list_column_family_names);
+ }
+ if (list_column_family_options) {
+ for (size_t i = 0; i < len; ++i) {
+ rocksdb_options_destroy(list_column_family_options[i]);
+ }
+ free(list_column_family_options);
+ }
+}
+
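+// The loader above heap-allocates every output (strdup'd names plus one
+// rocksdb_options_t per column family and one for the DB options), so the
+// companion destroy function must be called with the same length to release
+// them. Illustrative sketch, assuming existing `env` and `cache` handles:
+//
+//   char* err = NULL;
+//   rocksdb_options_t* db_opts = NULL;
+//   size_t cf_count = 0;
+//   char** cf_names = NULL;
+//   rocksdb_options_t** cf_opts = NULL;
+//   rocksdb_load_latest_options("/path/to/db", env, 0, cache, &db_opts,
+//                               &cf_count, &cf_names, &cf_opts, &err);
+//   /* ... open the DB with the loaded descriptors ... */
+//   rocksdb_load_latest_options_destroy(db_opts, cf_names, cf_opts, cf_count);
+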
+rocksdb_block_based_table_options_t* rocksdb_block_based_options_create() {
+ return new rocksdb_block_based_table_options_t;
+}
+
+void rocksdb_block_based_options_destroy(
+ rocksdb_block_based_table_options_t* options) {
+ delete options;
+}
+
+void rocksdb_block_based_options_set_checksum(
+ rocksdb_block_based_table_options_t* opt, char v) {
+ opt->rep.checksum = static_cast<ROCKSDB_NAMESPACE::ChecksumType>(v);
+}
+
+void rocksdb_block_based_options_set_block_size(
+ rocksdb_block_based_table_options_t* options, size_t block_size) {
+ options->rep.block_size = block_size;
+}
+
+void rocksdb_block_based_options_set_block_size_deviation(
+ rocksdb_block_based_table_options_t* options, int block_size_deviation) {
+ options->rep.block_size_deviation = block_size_deviation;
+}
+
+void rocksdb_block_based_options_set_block_restart_interval(
+ rocksdb_block_based_table_options_t* options, int block_restart_interval) {
+ options->rep.block_restart_interval = block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_index_block_restart_interval(
+ rocksdb_block_based_table_options_t* options,
+ int index_block_restart_interval) {
+ options->rep.index_block_restart_interval = index_block_restart_interval;
+}
+
+void rocksdb_block_based_options_set_metadata_block_size(
+ rocksdb_block_based_table_options_t* options,
+ uint64_t metadata_block_size) {
+ options->rep.metadata_block_size = metadata_block_size;
+}
+
+void rocksdb_block_based_options_set_partition_filters(
+ rocksdb_block_based_table_options_t* options,
+ unsigned char partition_filters) {
+ options->rep.partition_filters = partition_filters;
+}
+
+void rocksdb_block_based_options_set_use_delta_encoding(
+ rocksdb_block_based_table_options_t* options,
+ unsigned char use_delta_encoding) {
+ options->rep.use_delta_encoding = use_delta_encoding;
+}
+
+void rocksdb_block_based_options_set_filter_policy(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_filterpolicy_t* filter_policy) {
+ options->rep.filter_policy.reset(filter_policy);
+}
+
+void rocksdb_block_based_options_set_no_block_cache(
+ rocksdb_block_based_table_options_t* options,
+ unsigned char no_block_cache) {
+ options->rep.no_block_cache = no_block_cache;
+}
+
+void rocksdb_block_based_options_set_block_cache(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_cache_t* block_cache) {
+ if (block_cache) {
+ options->rep.block_cache = block_cache->rep;
+ }
+}
+
+void rocksdb_block_based_options_set_block_cache_compressed(
+ rocksdb_block_based_table_options_t* options,
+ rocksdb_cache_t* block_cache_compressed) {
+ if (block_cache_compressed) {
+ options->rep.block_cache_compressed = block_cache_compressed->rep;
+ }
+}
+
+void rocksdb_block_based_options_set_whole_key_filtering(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.whole_key_filtering = v;
+}
+
+void rocksdb_block_based_options_set_format_version(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.format_version = v;
+}
+
+void rocksdb_block_based_options_set_index_type(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.index_type = static_cast<BlockBasedTableOptions::IndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_index_type(
+ rocksdb_block_based_table_options_t* options, int v) {
+ options->rep.data_block_index_type =
+ static_cast<BlockBasedTableOptions::DataBlockIndexType>(v);
+}
+
+void rocksdb_block_based_options_set_data_block_hash_ratio(
+ rocksdb_block_based_table_options_t* options, double v) {
+ options->rep.data_block_hash_table_util_ratio = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.cache_index_and_filter_blocks = v;
+}
+
+void rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.cache_index_and_filter_blocks_with_high_priority = v;
+}
+
+void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.pin_l0_filter_and_index_blocks_in_cache = v;
+}
+
+void rocksdb_block_based_options_set_pin_top_level_index_and_filter(
+ rocksdb_block_based_table_options_t* options, unsigned char v) {
+ options->rep.pin_top_level_index_and_filter = v;
+}
+
+void rocksdb_options_set_block_based_table_factory(
+ rocksdb_options_t* opt,
+ rocksdb_block_based_table_options_t* table_options) {
+ if (table_options) {
+ opt->rep.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options->rep));
+ }
+}
+
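+// Typical wiring for the block-based table options above: the block cache is
+// shared (the caller keeps its rocksdb_cache_t handle), the filter policy is
+// adopted by the table options via reset(), and
+// rocksdb_options_set_block_based_table_factory copies the settings into a
+// new factory, so the table-options object may be destroyed afterwards.
+// Illustrative sketch, assuming an existing rocksdb_options_t* `opts`:
+//
+//   rocksdb_block_based_table_options_t* bbto =
+//       rocksdb_block_based_options_create();
+//   rocksdb_cache_t* cache = rocksdb_cache_create_lru(64 << 20);
+//   rocksdb_block_based_options_set_block_cache(bbto, cache);
+//   rocksdb_block_based_options_set_filter_policy(
+//       bbto, rocksdb_filterpolicy_create_bloom(10));
+//   rocksdb_options_set_block_based_table_factory(opts, bbto);
+//   rocksdb_block_based_options_destroy(bbto);
+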
+rocksdb_cuckoo_table_options_t* rocksdb_cuckoo_options_create() {
+ return new rocksdb_cuckoo_table_options_t;
+}
+
+void rocksdb_cuckoo_options_destroy(rocksdb_cuckoo_table_options_t* options) {
+ delete options;
+}
+
+void rocksdb_cuckoo_options_set_hash_ratio(
+ rocksdb_cuckoo_table_options_t* options, double v) {
+ options->rep.hash_table_ratio = v;
+}
+
+void rocksdb_cuckoo_options_set_max_search_depth(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+ options->rep.max_search_depth = v;
+}
+
+void rocksdb_cuckoo_options_set_cuckoo_block_size(
+ rocksdb_cuckoo_table_options_t* options, uint32_t v) {
+ options->rep.cuckoo_block_size = v;
+}
+
+void rocksdb_cuckoo_options_set_identity_as_first_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+ options->rep.identity_as_first_hash = v;
+}
+
+void rocksdb_cuckoo_options_set_use_module_hash(
+ rocksdb_cuckoo_table_options_t* options, unsigned char v) {
+ options->rep.use_module_hash = v;
+}
+
+void rocksdb_options_set_cuckoo_table_factory(
+ rocksdb_options_t* opt, rocksdb_cuckoo_table_options_t* table_options) {
+ if (table_options) {
+ opt->rep.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewCuckooTableFactory(table_options->rep));
+ }
+}
+
+void rocksdb_set_options(rocksdb_t* db, int count, const char* const keys[],
+ const char* const values[], char** errptr) {
+ std::unordered_map<std::string, std::string> options_map;
+ for (int i = 0; i < count; i++) options_map[keys[i]] = values[i];
+ SaveError(errptr, db->rep->SetOptions(options_map));
+}
+
+void rocksdb_set_options_cf(rocksdb_t* db,
+ rocksdb_column_family_handle_t* handle, int count,
+ const char* const keys[],
+ const char* const values[], char** errptr) {
+ std::unordered_map<std::string, std::string> options_map;
+ for (int i = 0; i < count; i++) options_map[keys[i]] = values[i];
+ SaveError(errptr, db->rep->SetOptions(handle->rep, options_map));
+}
+
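+// The two SetOptions wrappers above take parallel key/value string arrays,
+// build an options map, and forward it to DB::SetOptions. Illustrative sketch
+// for changing mutable options at runtime, assuming an open `db`:
+//
+//   const char* keys[] = {"disable_auto_compactions", "write_buffer_size"};
+//   const char* vals[] = {"true", "67108864"};
+//   char* err = NULL;
+//   rocksdb_set_options(db, 2, keys, vals, &err);
+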
+rocksdb_options_t* rocksdb_options_create() { return new rocksdb_options_t; }
+
+void rocksdb_options_destroy(rocksdb_options_t* options) { delete options; }
+
+rocksdb_options_t* rocksdb_options_create_copy(rocksdb_options_t* options) {
+ return new rocksdb_options_t(*options);
+}
+
+void rocksdb_options_increase_parallelism(rocksdb_options_t* opt,
+ int total_threads) {
+ opt->rep.IncreaseParallelism(total_threads);
+}
+
+void rocksdb_options_optimize_for_point_lookup(rocksdb_options_t* opt,
+ uint64_t block_cache_size_mb) {
+ opt->rep.OptimizeForPointLookup(block_cache_size_mb);
+}
+
+void rocksdb_options_optimize_level_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+ opt->rep.OptimizeLevelStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_optimize_universal_style_compaction(
+ rocksdb_options_t* opt, uint64_t memtable_memory_budget) {
+ opt->rep.OptimizeUniversalStyleCompaction(memtable_memory_budget);
+}
+
+void rocksdb_options_set_allow_ingest_behind(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.allow_ingest_behind = v;
+}
+
+unsigned char rocksdb_options_get_allow_ingest_behind(rocksdb_options_t* opt) {
+ return opt->rep.allow_ingest_behind;
+}
+
+void rocksdb_options_set_compaction_filter(rocksdb_options_t* opt,
+ rocksdb_compactionfilter_t* filter) {
+ opt->rep.compaction_filter = filter;
+}
+
+void rocksdb_options_set_compaction_filter_factory(
+ rocksdb_options_t* opt, rocksdb_compactionfilterfactory_t* factory) {
+ opt->rep.compaction_filter_factory =
+ std::shared_ptr<CompactionFilterFactory>(factory);
+}
+
+void rocksdb_options_compaction_readahead_size(rocksdb_options_t* opt,
+ size_t s) {
+ opt->rep.compaction_readahead_size = s;
+}
+
+size_t rocksdb_options_get_compaction_readahead_size(rocksdb_options_t* opt) {
+ return opt->rep.compaction_readahead_size;
+}
+
+void rocksdb_options_set_comparator(rocksdb_options_t* opt,
+ rocksdb_comparator_t* cmp) {
+ opt->rep.comparator = cmp;
+}
+
+void rocksdb_options_set_merge_operator(
+ rocksdb_options_t* opt, rocksdb_mergeoperator_t* merge_operator) {
+ opt->rep.merge_operator = std::shared_ptr<MergeOperator>(merge_operator);
+}
+
+void rocksdb_options_set_create_if_missing(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.create_if_missing = v;
+}
+
+unsigned char rocksdb_options_get_create_if_missing(rocksdb_options_t* opt) {
+ return opt->rep.create_if_missing;
+}
+
+void rocksdb_options_set_create_missing_column_families(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.create_missing_column_families = v;
+}
+
+unsigned char rocksdb_options_get_create_missing_column_families(
+ rocksdb_options_t* opt) {
+ return opt->rep.create_missing_column_families;
+}
+
+void rocksdb_options_set_error_if_exists(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.error_if_exists = v;
+}
+
+unsigned char rocksdb_options_get_error_if_exists(rocksdb_options_t* opt) {
+ return opt->rep.error_if_exists;
+}
+
+void rocksdb_options_set_paranoid_checks(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.paranoid_checks = v;
+}
+
+unsigned char rocksdb_options_get_paranoid_checks(rocksdb_options_t* opt) {
+ return opt->rep.paranoid_checks;
+}
+
+void rocksdb_options_set_db_paths(rocksdb_options_t* opt,
+ const rocksdb_dbpath_t** dbpath_values,
+ size_t num_paths) {
+ std::vector<DbPath> db_paths(num_paths);
+ for (size_t i = 0; i < num_paths; ++i) {
+ db_paths[i] = dbpath_values[i]->rep;
+ }
+ opt->rep.db_paths = db_paths;
+}
+
+void rocksdb_options_set_env(rocksdb_options_t* opt, rocksdb_env_t* env) {
+ opt->rep.env = (env ? env->rep : nullptr);
+}
+
+void rocksdb_options_set_info_log(rocksdb_options_t* opt, rocksdb_logger_t* l) {
+ if (l) {
+ opt->rep.info_log = l->rep;
+ }
+}
+
+void rocksdb_options_set_info_log_level(rocksdb_options_t* opt, int v) {
+ opt->rep.info_log_level = static_cast<InfoLogLevel>(v);
+}
+
+int rocksdb_options_get_info_log_level(rocksdb_options_t* opt) {
+ return static_cast<int>(opt->rep.info_log_level);
+}
+
+void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt,
+ size_t s) {
+ opt->rep.db_write_buffer_size = s;
+}
+
+size_t rocksdb_options_get_db_write_buffer_size(rocksdb_options_t* opt) {
+ return opt->rep.db_write_buffer_size;
+}
+
+void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, size_t s) {
+ opt->rep.write_buffer_size = s;
+}
+
+size_t rocksdb_options_get_write_buffer_size(rocksdb_options_t* opt) {
+ return opt->rep.write_buffer_size;
+}
+
+void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
+ opt->rep.max_open_files = n;
+}
+
+int rocksdb_options_get_max_open_files(rocksdb_options_t* opt) {
+ return opt->rep.max_open_files;
+}
+
+void rocksdb_options_set_max_file_opening_threads(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.max_file_opening_threads = n;
+}
+
+int rocksdb_options_get_max_file_opening_threads(rocksdb_options_t* opt) {
+ return opt->rep.max_file_opening_threads;
+}
+
+void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt,
+ uint64_t n) {
+ opt->rep.max_total_wal_size = n;
+}
+
+uint64_t rocksdb_options_get_max_total_wal_size(rocksdb_options_t* opt) {
+ return opt->rep.max_total_wal_size;
+}
+
+void rocksdb_options_set_target_file_size_base(rocksdb_options_t* opt,
+ uint64_t n) {
+ opt->rep.target_file_size_base = n;
+}
+
+uint64_t rocksdb_options_get_target_file_size_base(rocksdb_options_t* opt) {
+ return opt->rep.target_file_size_base;
+}
+
+void rocksdb_options_set_target_file_size_multiplier(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.target_file_size_multiplier = n;
+}
+
+int rocksdb_options_get_target_file_size_multiplier(rocksdb_options_t* opt) {
+ return opt->rep.target_file_size_multiplier;
+}
+
+void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt,
+ uint64_t n) {
+ opt->rep.max_bytes_for_level_base = n;
+}
+
+uint64_t rocksdb_options_get_max_bytes_for_level_base(rocksdb_options_t* opt) {
+ return opt->rep.max_bytes_for_level_base;
+}
+
+void rocksdb_options_set_level_compaction_dynamic_level_bytes(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.level_compaction_dynamic_level_bytes = v;
+}
+
+unsigned char rocksdb_options_get_level_compaction_dynamic_level_bytes(
+ rocksdb_options_t* opt) {
+ return opt->rep.level_compaction_dynamic_level_bytes;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier(rocksdb_options_t* opt,
+ double n) {
+ opt->rep.max_bytes_for_level_multiplier = n;
+}
+
+double rocksdb_options_get_max_bytes_for_level_multiplier(
+ rocksdb_options_t* opt) {
+ return opt->rep.max_bytes_for_level_multiplier;
+}
+
+void rocksdb_options_set_max_compaction_bytes(rocksdb_options_t* opt,
+ uint64_t n) {
+ opt->rep.max_compaction_bytes = n;
+}
+
+uint64_t rocksdb_options_get_max_compaction_bytes(rocksdb_options_t* opt) {
+ return opt->rep.max_compaction_bytes;
+}
+
+void rocksdb_options_set_max_bytes_for_level_multiplier_additional(
+ rocksdb_options_t* opt, int* level_values, size_t num_levels) {
+ opt->rep.max_bytes_for_level_multiplier_additional.resize(num_levels);
+ for (size_t i = 0; i < num_levels; ++i) {
+ opt->rep.max_bytes_for_level_multiplier_additional[i] = level_values[i];
+ }
+}
+
+void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
+ opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+}
+
+void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.skip_stats_update_on_db_open = val;
+}
+
+unsigned char rocksdb_options_get_skip_stats_update_on_db_open(
+ rocksdb_options_t* opt) {
+ return opt->rep.skip_stats_update_on_db_open;
+}
+
+void rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(
+ rocksdb_options_t* opt, unsigned char val) {
+ opt->rep.skip_checking_sst_file_sizes_on_db_open = val;
+}
+
+unsigned char rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(
+ rocksdb_options_t* opt) {
+ return opt->rep.skip_checking_sst_file_sizes_on_db_open;
+}
+
+/* Blob Options Settings */
+void rocksdb_options_set_enable_blob_files(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.enable_blob_files = val;
+}
+
+unsigned char rocksdb_options_get_enable_blob_files(rocksdb_options_t* opt) {
+ return opt->rep.enable_blob_files;
+}
+
+void rocksdb_options_set_min_blob_size(rocksdb_options_t* opt, uint64_t val) {
+ opt->rep.min_blob_size = val;
+}
+
+uint64_t rocksdb_options_get_min_blob_size(rocksdb_options_t* opt) {
+ return opt->rep.min_blob_size;
+}
+
+void rocksdb_options_set_blob_file_size(rocksdb_options_t* opt, uint64_t val) {
+ opt->rep.blob_file_size = val;
+}
+
+uint64_t rocksdb_options_get_blob_file_size(rocksdb_options_t* opt) {
+ return opt->rep.blob_file_size;
+}
+
+void rocksdb_options_set_blob_compression_type(rocksdb_options_t* opt,
+ int val) {
+ opt->rep.blob_compression_type = static_cast<CompressionType>(val);
+}
+
+int rocksdb_options_get_blob_compression_type(rocksdb_options_t* opt) {
+ return opt->rep.blob_compression_type;
+}
+
+void rocksdb_options_set_enable_blob_gc(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.enable_blob_garbage_collection = val;
+}
+
+unsigned char rocksdb_options_get_enable_blob_gc(rocksdb_options_t* opt) {
+ return opt->rep.enable_blob_garbage_collection;
+}
+
+void rocksdb_options_set_blob_gc_age_cutoff(rocksdb_options_t* opt,
+ double val) {
+ opt->rep.blob_garbage_collection_age_cutoff = val;
+}
+
+double rocksdb_options_get_blob_gc_age_cutoff(rocksdb_options_t* opt) {
+ return opt->rep.blob_garbage_collection_age_cutoff;
+}
+
+void rocksdb_options_set_blob_gc_force_threshold(rocksdb_options_t* opt,
+ double val) {
+ opt->rep.blob_garbage_collection_force_threshold = val;
+}
+
+double rocksdb_options_get_blob_gc_force_threshold(rocksdb_options_t* opt) {
+ return opt->rep.blob_garbage_collection_force_threshold;
+}
+
+void rocksdb_options_set_blob_compaction_readahead_size(rocksdb_options_t* opt,
+ uint64_t val) {
+ opt->rep.blob_compaction_readahead_size = val;
+}
+
+uint64_t rocksdb_options_get_blob_compaction_readahead_size(
+ rocksdb_options_t* opt) {
+ return opt->rep.blob_compaction_readahead_size;
+}
+
+void rocksdb_options_set_blob_file_starting_level(rocksdb_options_t* opt,
+ int val) {
+ opt->rep.blob_file_starting_level = val;
+}
+
+int rocksdb_options_get_blob_file_starting_level(rocksdb_options_t* opt) {
+ return opt->rep.blob_file_starting_level;
+}
+
+void rocksdb_options_set_blob_cache(rocksdb_options_t* opt,
+ rocksdb_cache_t* blob_cache) {
+ opt->rep.blob_cache = blob_cache->rep;
+}
+
+void rocksdb_options_set_prepopulate_blob_cache(rocksdb_options_t* opt, int t) {
+ opt->rep.prepopulate_blob_cache = static_cast<PrepopulateBlobCache>(t);
+}
+
+int rocksdb_options_get_prepopulate_blob_cache(rocksdb_options_t* opt) {
+ return static_cast<int>(opt->rep.prepopulate_blob_cache);
+}
+
+void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) {
+ opt->rep.num_levels = n;
+}
+
+int rocksdb_options_get_num_levels(rocksdb_options_t* opt) {
+ return opt->rep.num_levels;
+}
+
+void rocksdb_options_set_level0_file_num_compaction_trigger(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.level0_file_num_compaction_trigger = n;
+}
+
+int rocksdb_options_get_level0_file_num_compaction_trigger(
+ rocksdb_options_t* opt) {
+ return opt->rep.level0_file_num_compaction_trigger;
+}
+
+void rocksdb_options_set_level0_slowdown_writes_trigger(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.level0_slowdown_writes_trigger = n;
+}
+
+int rocksdb_options_get_level0_slowdown_writes_trigger(rocksdb_options_t* opt) {
+ return opt->rep.level0_slowdown_writes_trigger;
+}
+
+void rocksdb_options_set_level0_stop_writes_trigger(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.level0_stop_writes_trigger = n;
+}
+
+int rocksdb_options_get_level0_stop_writes_trigger(rocksdb_options_t* opt) {
+ return opt->rep.level0_stop_writes_trigger;
+}
+
+void rocksdb_options_set_wal_recovery_mode(rocksdb_options_t* opt, int mode) {
+ opt->rep.wal_recovery_mode = static_cast<WALRecoveryMode>(mode);
+}
+
+int rocksdb_options_get_wal_recovery_mode(rocksdb_options_t* opt) {
+ return static_cast<int>(opt->rep.wal_recovery_mode);
+}
+
+void rocksdb_options_set_compression(rocksdb_options_t* opt, int t) {
+ opt->rep.compression = static_cast<CompressionType>(t);
+}
+
+int rocksdb_options_get_compression(rocksdb_options_t* opt) {
+ return opt->rep.compression;
+}
+
+void rocksdb_options_set_bottommost_compression(rocksdb_options_t* opt, int t) {
+ opt->rep.bottommost_compression = static_cast<CompressionType>(t);
+}
+
+int rocksdb_options_get_bottommost_compression(rocksdb_options_t* opt) {
+ return opt->rep.bottommost_compression;
+}
+
+void rocksdb_options_set_compression_per_level(rocksdb_options_t* opt,
+ const int* level_values,
+ size_t num_levels) {
+ opt->rep.compression_per_level.resize(num_levels);
+ for (size_t i = 0; i < num_levels; ++i) {
+ opt->rep.compression_per_level[i] =
+ static_cast<CompressionType>(level_values[i]);
+ }
+}
+
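+// compression_per_level takes one CompressionType value per level, indexed
+// from L0 upward. Illustrative sketch, assuming an existing `opts` handle and
+// the rocksdb_*_compression constants from the public C header:
+//
+//   const int per_level[] = {rocksdb_no_compression, rocksdb_no_compression,
+//                            rocksdb_lz4_compression,
+//                            rocksdb_zstd_compression};
+//   rocksdb_options_set_compression_per_level(opts, per_level, 4);
+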
+void rocksdb_options_set_bottommost_compression_options(rocksdb_options_t* opt,
+ int w_bits, int level,
+ int strategy,
+ int max_dict_bytes,
+ unsigned char enabled) {
+ opt->rep.bottommost_compression_opts.window_bits = w_bits;
+ opt->rep.bottommost_compression_opts.level = level;
+ opt->rep.bottommost_compression_opts.strategy = strategy;
+ opt->rep.bottommost_compression_opts.max_dict_bytes = max_dict_bytes;
+ opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_bottommost_compression_options_zstd_max_train_bytes(
+ rocksdb_options_t* opt, int zstd_max_train_bytes, unsigned char enabled) {
+ opt->rep.bottommost_compression_opts.zstd_max_train_bytes =
+ zstd_max_train_bytes;
+ opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_bottommost_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer,
+ unsigned char enabled) {
+ opt->rep.bottommost_compression_opts.use_zstd_dict_trainer =
+ use_zstd_dict_trainer;
+ opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+unsigned char
+rocksdb_options_get_bottommost_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t* opt) {
+ return opt->rep.bottommost_compression_opts.use_zstd_dict_trainer;
+}
+
+void rocksdb_options_set_bottommost_compression_options_max_dict_buffer_bytes(
+ rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes,
+ unsigned char enabled) {
+ opt->rep.bottommost_compression_opts.max_dict_buffer_bytes =
+ max_dict_buffer_bytes;
+ opt->rep.bottommost_compression_opts.enabled = enabled;
+}
+
+void rocksdb_options_set_compression_options(rocksdb_options_t* opt, int w_bits,
+ int level, int strategy,
+ int max_dict_bytes) {
+ opt->rep.compression_opts.window_bits = w_bits;
+ opt->rep.compression_opts.level = level;
+ opt->rep.compression_opts.strategy = strategy;
+ opt->rep.compression_opts.max_dict_bytes = max_dict_bytes;
+}
+
+void rocksdb_options_set_compression_options_zstd_max_train_bytes(
+ rocksdb_options_t* opt, int zstd_max_train_bytes) {
+ opt->rep.compression_opts.zstd_max_train_bytes = zstd_max_train_bytes;
+}
+
+int rocksdb_options_get_compression_options_zstd_max_train_bytes(
+ rocksdb_options_t* opt) {
+ return opt->rep.compression_opts.zstd_max_train_bytes;
+}
+
+void rocksdb_options_set_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t* opt, unsigned char use_zstd_dict_trainer) {
+ opt->rep.compression_opts.use_zstd_dict_trainer = use_zstd_dict_trainer;
+}
+
+unsigned char rocksdb_options_get_compression_options_use_zstd_dict_trainer(
+ rocksdb_options_t* opt) {
+ return opt->rep.compression_opts.use_zstd_dict_trainer;
+}
+
+void rocksdb_options_set_compression_options_parallel_threads(
+ rocksdb_options_t* opt, int value) {
+ opt->rep.compression_opts.parallel_threads = value;
+}
+
+int rocksdb_options_get_compression_options_parallel_threads(
+ rocksdb_options_t* opt) {
+ return opt->rep.compression_opts.parallel_threads;
+}
+
+void rocksdb_options_set_compression_options_max_dict_buffer_bytes(
+ rocksdb_options_t* opt, uint64_t max_dict_buffer_bytes) {
+ opt->rep.compression_opts.max_dict_buffer_bytes = max_dict_buffer_bytes;
+}
+
+uint64_t rocksdb_options_get_compression_options_max_dict_buffer_bytes(
+ rocksdb_options_t* opt) {
+ return opt->rep.compression_opts.max_dict_buffer_bytes;
+}
+
+void rocksdb_options_set_prefix_extractor(
+ rocksdb_options_t* opt, rocksdb_slicetransform_t* prefix_extractor) {
+ opt->rep.prefix_extractor.reset(prefix_extractor);
+}
+
+void rocksdb_options_set_use_fsync(rocksdb_options_t* opt, int use_fsync) {
+ opt->rep.use_fsync = use_fsync;
+}
+
+int rocksdb_options_get_use_fsync(rocksdb_options_t* opt) {
+ return opt->rep.use_fsync;
+}
+
+void rocksdb_options_set_db_log_dir(rocksdb_options_t* opt,
+ const char* db_log_dir) {
+ opt->rep.db_log_dir = db_log_dir;
+}
+
+void rocksdb_options_set_wal_dir(rocksdb_options_t* opt, const char* v) {
+ opt->rep.wal_dir = v;
+}
+
+void rocksdb_options_set_WAL_ttl_seconds(rocksdb_options_t* opt, uint64_t ttl) {
+ opt->rep.WAL_ttl_seconds = ttl;
+}
+
+uint64_t rocksdb_options_get_WAL_ttl_seconds(rocksdb_options_t* opt) {
+ return opt->rep.WAL_ttl_seconds;
+}
+
+void rocksdb_options_set_WAL_size_limit_MB(rocksdb_options_t* opt,
+ uint64_t limit) {
+ opt->rep.WAL_size_limit_MB = limit;
+}
+
+uint64_t rocksdb_options_get_WAL_size_limit_MB(rocksdb_options_t* opt) {
+ return opt->rep.WAL_size_limit_MB;
+}
+
+void rocksdb_options_set_manifest_preallocation_size(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.manifest_preallocation_size = v;
+}
+
+size_t rocksdb_options_get_manifest_preallocation_size(rocksdb_options_t* opt) {
+ return opt->rep.manifest_preallocation_size;
+}
+
+void rocksdb_options_set_use_direct_reads(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.use_direct_reads = v;
+}
+
+unsigned char rocksdb_options_get_use_direct_reads(rocksdb_options_t* opt) {
+ return opt->rep.use_direct_reads;
+}
+
+void rocksdb_options_set_use_direct_io_for_flush_and_compaction(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.use_direct_io_for_flush_and_compaction = v;
+}
+
+unsigned char rocksdb_options_get_use_direct_io_for_flush_and_compaction(
+ rocksdb_options_t* opt) {
+ return opt->rep.use_direct_io_for_flush_and_compaction;
+}
+
+void rocksdb_options_set_allow_mmap_reads(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.allow_mmap_reads = v;
+}
+
+unsigned char rocksdb_options_get_allow_mmap_reads(rocksdb_options_t* opt) {
+ return opt->rep.allow_mmap_reads;
+}
+
+void rocksdb_options_set_allow_mmap_writes(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.allow_mmap_writes = v;
+}
+
+unsigned char rocksdb_options_get_allow_mmap_writes(rocksdb_options_t* opt) {
+ return opt->rep.allow_mmap_writes;
+}
+
+void rocksdb_options_set_is_fd_close_on_exec(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.is_fd_close_on_exec = v;
+}
+
+unsigned char rocksdb_options_get_is_fd_close_on_exec(rocksdb_options_t* opt) {
+ return opt->rep.is_fd_close_on_exec;
+}
+
+void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt,
+ unsigned int v) {
+ opt->rep.stats_dump_period_sec = v;
+}
+
+unsigned int rocksdb_options_get_stats_dump_period_sec(rocksdb_options_t* opt) {
+ return opt->rep.stats_dump_period_sec;
+}
+
+void rocksdb_options_set_stats_persist_period_sec(rocksdb_options_t* opt,
+ unsigned int v) {
+ opt->rep.stats_persist_period_sec = v;
+}
+
+unsigned int rocksdb_options_get_stats_persist_period_sec(
+ rocksdb_options_t* opt) {
+ return opt->rep.stats_persist_period_sec;
+}
+
+void rocksdb_options_set_advise_random_on_open(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.advise_random_on_open = v;
+}
+
+unsigned char rocksdb_options_get_advise_random_on_open(
+ rocksdb_options_t* opt) {
+ return opt->rep.advise_random_on_open;
+}
+
+void rocksdb_options_set_access_hint_on_compaction_start(rocksdb_options_t* opt,
+ int v) {
+ switch (v) {
+ case 0:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::NONE;
+ break;
+ case 1:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::NORMAL;
+ break;
+ case 2:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::SEQUENTIAL;
+ break;
+ case 3:
+ opt->rep.access_hint_on_compaction_start =
+ ROCKSDB_NAMESPACE::Options::WILLNEED;
+ break;
+ default:
+ assert(0);
+ }
+}
+
+int rocksdb_options_get_access_hint_on_compaction_start(
+ rocksdb_options_t* opt) {
+ return opt->rep.access_hint_on_compaction_start;
+}
+
+void rocksdb_options_set_use_adaptive_mutex(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.use_adaptive_mutex = v;
+}
+
+unsigned char rocksdb_options_get_use_adaptive_mutex(rocksdb_options_t* opt) {
+ return opt->rep.use_adaptive_mutex;
+}
+
+void rocksdb_options_set_wal_bytes_per_sync(rocksdb_options_t* opt,
+ uint64_t v) {
+ opt->rep.wal_bytes_per_sync = v;
+}
+
+uint64_t rocksdb_options_get_wal_bytes_per_sync(rocksdb_options_t* opt) {
+ return opt->rep.wal_bytes_per_sync;
+}
+
+void rocksdb_options_set_bytes_per_sync(rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.bytes_per_sync = v;
+}
+
+uint64_t rocksdb_options_get_bytes_per_sync(rocksdb_options_t* opt) {
+ return opt->rep.bytes_per_sync;
+}
+
+void rocksdb_options_set_writable_file_max_buffer_size(rocksdb_options_t* opt,
+ uint64_t v) {
+ opt->rep.writable_file_max_buffer_size = static_cast<size_t>(v);
+}
+
+uint64_t rocksdb_options_get_writable_file_max_buffer_size(
+ rocksdb_options_t* opt) {
+ return opt->rep.writable_file_max_buffer_size;
+}
+
+void rocksdb_options_set_allow_concurrent_memtable_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.allow_concurrent_memtable_write = v;
+}
+
+unsigned char rocksdb_options_get_allow_concurrent_memtable_write(
+ rocksdb_options_t* opt) {
+ return opt->rep.allow_concurrent_memtable_write;
+}
+
+void rocksdb_options_set_enable_write_thread_adaptive_yield(
+ rocksdb_options_t* opt, unsigned char v) {
+ opt->rep.enable_write_thread_adaptive_yield = v;
+}
+
+unsigned char rocksdb_options_get_enable_write_thread_adaptive_yield(
+ rocksdb_options_t* opt) {
+ return opt->rep.enable_write_thread_adaptive_yield;
+}
+
+void rocksdb_options_set_max_sequential_skip_in_iterations(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.max_sequential_skip_in_iterations = v;
+}
+
+uint64_t rocksdb_options_get_max_sequential_skip_in_iterations(
+ rocksdb_options_t* opt) {
+ return opt->rep.max_sequential_skip_in_iterations;
+}
+
+void rocksdb_options_set_max_write_buffer_number(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.max_write_buffer_number = n;
+}
+
+int rocksdb_options_get_max_write_buffer_number(rocksdb_options_t* opt) {
+ return opt->rep.max_write_buffer_number;
+}
+
+void rocksdb_options_set_min_write_buffer_number_to_merge(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.min_write_buffer_number_to_merge = n;
+}
+
+int rocksdb_options_get_min_write_buffer_number_to_merge(
+ rocksdb_options_t* opt) {
+ return opt->rep.min_write_buffer_number_to_merge;
+}
+
+void rocksdb_options_set_max_write_buffer_number_to_maintain(
+ rocksdb_options_t* opt, int n) {
+ opt->rep.max_write_buffer_number_to_maintain = n;
+}
+
+int rocksdb_options_get_max_write_buffer_number_to_maintain(
+ rocksdb_options_t* opt) {
+ return opt->rep.max_write_buffer_number_to_maintain;
+}
+
+void rocksdb_options_set_max_write_buffer_size_to_maintain(
+ rocksdb_options_t* opt, int64_t n) {
+ opt->rep.max_write_buffer_size_to_maintain = n;
+}
+
+int64_t rocksdb_options_get_max_write_buffer_size_to_maintain(
+ rocksdb_options_t* opt) {
+ return opt->rep.max_write_buffer_size_to_maintain;
+}
+
+void rocksdb_options_set_enable_pipelined_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.enable_pipelined_write = v;
+}
+
+unsigned char rocksdb_options_get_enable_pipelined_write(
+ rocksdb_options_t* opt) {
+ return opt->rep.enable_pipelined_write;
+}
+
+void rocksdb_options_set_unordered_write(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.unordered_write = v;
+}
+
+unsigned char rocksdb_options_get_unordered_write(rocksdb_options_t* opt) {
+ return opt->rep.unordered_write;
+}
+
+void rocksdb_options_set_max_subcompactions(rocksdb_options_t* opt,
+ uint32_t n) {
+ opt->rep.max_subcompactions = n;
+}
+
+uint32_t rocksdb_options_get_max_subcompactions(rocksdb_options_t* opt) {
+ return opt->rep.max_subcompactions;
+}
+
+void rocksdb_options_set_max_background_jobs(rocksdb_options_t* opt, int n) {
+ opt->rep.max_background_jobs = n;
+}
+
+int rocksdb_options_get_max_background_jobs(rocksdb_options_t* opt) {
+ return opt->rep.max_background_jobs;
+}
+
+void rocksdb_options_set_max_background_compactions(rocksdb_options_t* opt,
+ int n) {
+ opt->rep.max_background_compactions = n;
+}
+
+int rocksdb_options_get_max_background_compactions(rocksdb_options_t* opt) {
+ return opt->rep.max_background_compactions;
+}
+
+void rocksdb_options_set_max_background_flushes(rocksdb_options_t* opt, int n) {
+ opt->rep.max_background_flushes = n;
+}
+
+int rocksdb_options_get_max_background_flushes(rocksdb_options_t* opt) {
+ return opt->rep.max_background_flushes;
+}
+
+void rocksdb_options_set_experimental_mempurge_threshold(rocksdb_options_t* opt,
+ double v) {
+ opt->rep.experimental_mempurge_threshold = v;
+}
+
+double rocksdb_options_get_experimental_mempurge_threshold(
+ rocksdb_options_t* opt) {
+ return opt->rep.experimental_mempurge_threshold;
+}
+
+void rocksdb_options_set_max_log_file_size(rocksdb_options_t* opt, size_t v) {
+ opt->rep.max_log_file_size = v;
+}
+
+size_t rocksdb_options_get_max_log_file_size(rocksdb_options_t* opt) {
+ return opt->rep.max_log_file_size;
+}
+
+void rocksdb_options_set_log_file_time_to_roll(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.log_file_time_to_roll = v;
+}
+
+size_t rocksdb_options_get_log_file_time_to_roll(rocksdb_options_t* opt) {
+ return opt->rep.log_file_time_to_roll;
+}
+
+void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) {
+ opt->rep.keep_log_file_num = v;
+}
+
+size_t rocksdb_options_get_keep_log_file_num(rocksdb_options_t* opt) {
+ return opt->rep.keep_log_file_num;
+}
+
+void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.recycle_log_file_num = v;
+}
+
+size_t rocksdb_options_get_recycle_log_file_num(rocksdb_options_t* opt) {
+ return opt->rep.recycle_log_file_num;
+}
+
+void rocksdb_options_set_soft_pending_compaction_bytes_limit(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.soft_pending_compaction_bytes_limit = v;
+}
+
+size_t rocksdb_options_get_soft_pending_compaction_bytes_limit(
+ rocksdb_options_t* opt) {
+ return opt->rep.soft_pending_compaction_bytes_limit;
+}
+
+void rocksdb_options_set_hard_pending_compaction_bytes_limit(
+ rocksdb_options_t* opt, size_t v) {
+ opt->rep.hard_pending_compaction_bytes_limit = v;
+}
+
+size_t rocksdb_options_get_hard_pending_compaction_bytes_limit(
+ rocksdb_options_t* opt) {
+ return opt->rep.hard_pending_compaction_bytes_limit;
+}
+
+void rocksdb_options_set_max_manifest_file_size(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.max_manifest_file_size = v;
+}
+
+size_t rocksdb_options_get_max_manifest_file_size(rocksdb_options_t* opt) {
+ return opt->rep.max_manifest_file_size;
+}
+
+void rocksdb_options_set_table_cache_numshardbits(rocksdb_options_t* opt,
+ int v) {
+ opt->rep.table_cache_numshardbits = v;
+}
+
+int rocksdb_options_get_table_cache_numshardbits(rocksdb_options_t* opt) {
+ return opt->rep.table_cache_numshardbits;
+}
+
+void rocksdb_options_set_arena_block_size(rocksdb_options_t* opt, size_t v) {
+ opt->rep.arena_block_size = v;
+}
+
+size_t rocksdb_options_get_arena_block_size(rocksdb_options_t* opt) {
+ return opt->rep.arena_block_size;
+}
+
+void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt,
+ int disable) {
+ opt->rep.disable_auto_compactions = disable;
+}
+
+unsigned char rocksdb_options_get_disable_auto_compactions(
+ rocksdb_options_t* opt) {
+ return opt->rep.disable_auto_compactions;
+}
+
+void rocksdb_options_set_optimize_filters_for_hits(rocksdb_options_t* opt,
+ int v) {
+ opt->rep.optimize_filters_for_hits = v;
+}
+
+unsigned char rocksdb_options_get_optimize_filters_for_hits(
+ rocksdb_options_t* opt) {
+ return opt->rep.optimize_filters_for_hits;
+}
+
+void rocksdb_options_set_delete_obsolete_files_period_micros(
+ rocksdb_options_t* opt, uint64_t v) {
+ opt->rep.delete_obsolete_files_period_micros = v;
+}
+
+uint64_t rocksdb_options_get_delete_obsolete_files_period_micros(
+ rocksdb_options_t* opt) {
+ return opt->rep.delete_obsolete_files_period_micros;
+}
+
+void rocksdb_options_prepare_for_bulk_load(rocksdb_options_t* opt) {
+ opt->rep.PrepareForBulkLoad();
+}
+
+void rocksdb_options_set_memtable_vector_rep(rocksdb_options_t* opt) {
+ opt->rep.memtable_factory.reset(new ROCKSDB_NAMESPACE::VectorRepFactory);
+}
+
+void rocksdb_options_set_memtable_prefix_bloom_size_ratio(
+ rocksdb_options_t* opt, double v) {
+ opt->rep.memtable_prefix_bloom_size_ratio = v;
+}
+
+double rocksdb_options_get_memtable_prefix_bloom_size_ratio(
+ rocksdb_options_t* opt) {
+ return opt->rep.memtable_prefix_bloom_size_ratio;
+}
+
+void rocksdb_options_set_memtable_huge_page_size(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.memtable_huge_page_size = v;
+}
+
+size_t rocksdb_options_get_memtable_huge_page_size(rocksdb_options_t* opt) {
+ return opt->rep.memtable_huge_page_size;
+}
+
+void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t* opt,
+ size_t bucket_count,
+ int32_t skiplist_height,
+ int32_t skiplist_branching_factor) {
+ ROCKSDB_NAMESPACE::MemTableRepFactory* factory =
+ ROCKSDB_NAMESPACE::NewHashSkipListRepFactory(
+ bucket_count, skiplist_height, skiplist_branching_factor);
+ opt->rep.memtable_factory.reset(factory);
+}
+
+void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t* opt,
+ size_t bucket_count) {
+ opt->rep.memtable_factory.reset(
+ ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count));
+}
+
+void rocksdb_options_set_plain_table_factory(rocksdb_options_t* opt,
+ uint32_t user_key_len,
+ int bloom_bits_per_key,
+ double hash_table_ratio,
+ size_t index_sparseness) {
+ ROCKSDB_NAMESPACE::PlainTableOptions options;
+ options.user_key_len = user_key_len;
+ options.bloom_bits_per_key = bloom_bits_per_key;
+ options.hash_table_ratio = hash_table_ratio;
+ options.index_sparseness = index_sparseness;
+
+ ROCKSDB_NAMESPACE::TableFactory* factory =
+ ROCKSDB_NAMESPACE::NewPlainTableFactory(options);
+ opt->rep.table_factory.reset(factory);
+}
+
+void rocksdb_options_set_max_successive_merges(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.max_successive_merges = v;
+}
+
+size_t rocksdb_options_get_max_successive_merges(rocksdb_options_t* opt) {
+ return opt->rep.max_successive_merges;
+}
+
+void rocksdb_options_set_bloom_locality(rocksdb_options_t* opt, uint32_t v) {
+ opt->rep.bloom_locality = v;
+}
+
+uint32_t rocksdb_options_get_bloom_locality(rocksdb_options_t* opt) {
+ return opt->rep.bloom_locality;
+}
+
+void rocksdb_options_set_inplace_update_support(rocksdb_options_t* opt,
+ unsigned char v) {
+ opt->rep.inplace_update_support = v;
+}
+
+unsigned char rocksdb_options_get_inplace_update_support(
+ rocksdb_options_t* opt) {
+ return opt->rep.inplace_update_support;
+}
+
+void rocksdb_options_set_inplace_update_num_locks(rocksdb_options_t* opt,
+ size_t v) {
+ opt->rep.inplace_update_num_locks = v;
+}
+
+size_t rocksdb_options_get_inplace_update_num_locks(rocksdb_options_t* opt) {
+ return opt->rep.inplace_update_num_locks;
+}
+
+void rocksdb_options_set_report_bg_io_stats(rocksdb_options_t* opt, int v) {
+ opt->rep.report_bg_io_stats = v;
+}
+
+unsigned char rocksdb_options_get_report_bg_io_stats(rocksdb_options_t* opt) {
+ return opt->rep.report_bg_io_stats;
+}
+
+void rocksdb_options_set_compaction_style(rocksdb_options_t* opt, int style) {
+ opt->rep.compaction_style =
+ static_cast<ROCKSDB_NAMESPACE::CompactionStyle>(style);
+}
+
+int rocksdb_options_get_compaction_style(rocksdb_options_t* opt) {
+ return opt->rep.compaction_style;
+}
+
+void rocksdb_options_set_universal_compaction_options(
+ rocksdb_options_t* opt, rocksdb_universal_compaction_options_t* uco) {
+ opt->rep.compaction_options_universal = *(uco->rep);
+}
+
+void rocksdb_options_set_fifo_compaction_options(
+ rocksdb_options_t* opt, rocksdb_fifo_compaction_options_t* fifo) {
+ opt->rep.compaction_options_fifo = fifo->rep;
+}
+
+char* rocksdb_options_statistics_get_string(rocksdb_options_t* opt) {
+ ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get();
+ if (statistics) {
+ return strdup(statistics->ToString().c_str());
+ }
+ return nullptr;
+}
+
+void rocksdb_options_set_ratelimiter(rocksdb_options_t* opt,
+ rocksdb_ratelimiter_t* limiter) {
+ if (limiter) {
+ opt->rep.rate_limiter = limiter->rep;
+ }
+}
+
+void rocksdb_options_set_atomic_flush(rocksdb_options_t* opt,
+ unsigned char atomic_flush) {
+ opt->rep.atomic_flush = atomic_flush;
+}
+
+unsigned char rocksdb_options_get_atomic_flush(rocksdb_options_t* opt) {
+ return opt->rep.atomic_flush;
+}
+
+void rocksdb_options_set_manual_wal_flush(rocksdb_options_t* opt,
+ unsigned char manual_wal_flush) {
+ opt->rep.manual_wal_flush = manual_wal_flush;
+}
+
+unsigned char rocksdb_options_get_manual_wal_flush(rocksdb_options_t* opt) {
+ return opt->rep.manual_wal_flush;
+}
+
+void rocksdb_options_set_wal_compression(rocksdb_options_t* opt, int val) {
+ opt->rep.wal_compression = static_cast<CompressionType>(val);
+}
+
+int rocksdb_options_get_wal_compression(rocksdb_options_t* opt) {
+ return opt->rep.wal_compression;
+}
+
+rocksdb_ratelimiter_t* rocksdb_ratelimiter_create(int64_t rate_bytes_per_sec,
+ int64_t refill_period_us,
+ int32_t fairness) {
+ rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t;
+ rate_limiter->rep.reset(
+ NewGenericRateLimiter(rate_bytes_per_sec, refill_period_us, fairness));
+ return rate_limiter;
+}
+
+void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t* limiter) {
+ delete limiter;
+}
+
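+// The rate limiter is held through a shared_ptr, so the C wrapper can be
+// destroyed once it has been attached to the options. Illustrative sketch
+// (16 MB/s budget, 100 ms refill period, fairness 10), assuming an existing
+// `opts` handle:
+//
+//   rocksdb_ratelimiter_t* rl =
+//       rocksdb_ratelimiter_create(16 * 1024 * 1024, 100 * 1000, 10);
+//   rocksdb_options_set_ratelimiter(opts, rl);
+//   rocksdb_ratelimiter_destroy(rl);
+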
+void rocksdb_options_set_row_cache(rocksdb_options_t* opt,
+ rocksdb_cache_t* cache) {
+ if (cache) {
+ opt->rep.row_cache = cache->rep;
+ }
+}
+
+void rocksdb_options_add_compact_on_deletion_collector_factory(
+ rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger) {
+ std::shared_ptr<ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory>
+ compact_on_del =
+ NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger);
+ opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
+}
+
+void rocksdb_set_perf_level(int v) {
+ PerfLevel level = static_cast<PerfLevel>(v);
+ SetPerfLevel(level);
+}
+
+rocksdb_perfcontext_t* rocksdb_perfcontext_create() {
+ rocksdb_perfcontext_t* context = new rocksdb_perfcontext_t;
+ context->rep = ROCKSDB_NAMESPACE::get_perf_context();
+ return context;
+}
+
+void rocksdb_perfcontext_reset(rocksdb_perfcontext_t* context) {
+ context->rep->Reset();
+}
+
+char* rocksdb_perfcontext_report(rocksdb_perfcontext_t* context,
+ unsigned char exclude_zero_counters) {
+ return strdup(context->rep->ToString(exclude_zero_counters).c_str());
+}
+
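+// Typical perf-context flow with the wrappers above: raise the perf level,
+// reset the thread-local context, run the workload, then read counters back
+// as a formatted report or per metric. Illustrative sketch; the perf-level
+// and metric constants come from the public C header, and
+// rocksdb_perfcontext_destroy / rocksdb_free are assumed to be defined
+// elsewhere in this file:
+//
+//   rocksdb_set_perf_level(rocksdb_enable_time_except_for_mutex);
+//   rocksdb_perfcontext_t* ctx = rocksdb_perfcontext_create();
+//   rocksdb_perfcontext_reset(ctx);
+//   /* ... reads and writes on this thread ... */
+//   uint64_t reads = rocksdb_perfcontext_metric(ctx, rocksdb_block_read_count);
+//   char* report = rocksdb_perfcontext_report(ctx, 1 /* exclude zeros */);
+//   rocksdb_free(report);
+//   rocksdb_perfcontext_destroy(ctx);
+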
+uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
+ int metric) {
+ PerfContext* rep = context->rep;
+ switch (metric) {
+ case rocksdb_user_key_comparison_count:
+ return rep->user_key_comparison_count;
+ case rocksdb_block_cache_hit_count:
+ return rep->block_cache_hit_count;
+ case rocksdb_block_read_count:
+ return rep->block_read_count;
+ case rocksdb_block_read_byte:
+ return rep->block_read_byte;
+ case rocksdb_block_read_time:
+ return rep->block_read_time;
+ case rocksdb_block_checksum_time:
+ return rep->block_checksum_time;
+ case rocksdb_block_decompress_time:
+ return rep->block_decompress_time;
+ case rocksdb_get_read_bytes:
+ return rep->get_read_bytes;
+ case rocksdb_multiget_read_bytes:
+ return rep->multiget_read_bytes;
+ case rocksdb_iter_read_bytes:
+ return rep->iter_read_bytes;
+ case rocksdb_internal_key_skipped_count:
+ return rep->internal_key_skipped_count;
+ case rocksdb_internal_delete_skipped_count:
+ return rep->internal_delete_skipped_count;
+ case rocksdb_internal_recent_skipped_count:
+ return rep->internal_recent_skipped_count;
+ case rocksdb_internal_merge_count:
+ return rep->internal_merge_count;
+ case rocksdb_get_snapshot_time:
+ return rep->get_snapshot_time;
+ case rocksdb_get_from_memtable_time:
+ return rep->get_from_memtable_time;
+ case rocksdb_get_from_memtable_count:
+ return rep->get_from_memtable_count;
+ case rocksdb_get_post_process_time:
+ return rep->get_post_process_time;
+ case rocksdb_get_from_output_files_time:
+ return rep->get_from_output_files_time;
+ case rocksdb_seek_on_memtable_time:
+ return rep->seek_on_memtable_time;
+ case rocksdb_seek_on_memtable_count:
+ return rep->seek_on_memtable_count;
+ case rocksdb_next_on_memtable_count:
+ return rep->next_on_memtable_count;
+ case rocksdb_prev_on_memtable_count:
+ return rep->prev_on_memtable_count;
+ case rocksdb_seek_child_seek_time:
+ return rep->seek_child_seek_time;
+ case rocksdb_seek_child_seek_count:
+ return rep->seek_child_seek_count;
+ case rocksdb_seek_min_heap_time:
+ return rep->seek_min_heap_time;
+ case rocksdb_seek_max_heap_time:
+ return rep->seek_max_heap_time;
+ case rocksdb_seek_internal_seek_time:
+ return rep->seek_internal_seek_time;
+ case rocksdb_find_next_user_entry_time:
+ return rep->find_next_user_entry_time;
+ case rocksdb_write_wal_time:
+ return rep->write_wal_time;
+ case rocksdb_write_memtable_time:
+ return rep->write_memtable_time;
+ case rocksdb_write_delay_time:
+ return rep->write_delay_time;
+ case rocksdb_write_pre_and_post_process_time:
+ return rep->write_pre_and_post_process_time;
+ case rocksdb_db_mutex_lock_nanos:
+ return rep->db_mutex_lock_nanos;
+ case rocksdb_db_condition_wait_nanos:
+ return rep->db_condition_wait_nanos;
+ case rocksdb_merge_operator_time_nanos:
+ return rep->merge_operator_time_nanos;
+ case rocksdb_read_index_block_nanos:
+ return rep->read_index_block_nanos;
+ case rocksdb_read_filter_block_nanos:
+ return rep->read_filter_block_nanos;
+ case rocksdb_new_table_block_iter_nanos:
+ return rep->new_table_block_iter_nanos;
+ case rocksdb_new_table_iterator_nanos:
+ return rep->new_table_iterator_nanos;
+ case rocksdb_block_seek_nanos:
+ return rep->block_seek_nanos;
+ case rocksdb_find_table_nanos:
+ return rep->find_table_nanos;
+ case rocksdb_bloom_memtable_hit_count:
+ return rep->bloom_memtable_hit_count;
+ case rocksdb_bloom_memtable_miss_count:
+ return rep->bloom_memtable_miss_count;
+ case rocksdb_bloom_sst_hit_count:
+ return rep->bloom_sst_hit_count;
+ case rocksdb_bloom_sst_miss_count:
+ return rep->bloom_sst_miss_count;
+ case rocksdb_key_lock_wait_time:
+ return rep->key_lock_wait_time;
+ case rocksdb_key_lock_wait_count:
+ return rep->key_lock_wait_count;
+ case rocksdb_env_new_sequential_file_nanos:
+ return rep->env_new_sequential_file_nanos;
+ case rocksdb_env_new_random_access_file_nanos:
+ return rep->env_new_random_access_file_nanos;
+ case rocksdb_env_new_writable_file_nanos:
+ return rep->env_new_writable_file_nanos;
+ case rocksdb_env_reuse_writable_file_nanos:
+ return rep->env_reuse_writable_file_nanos;
+ case rocksdb_env_new_random_rw_file_nanos:
+ return rep->env_new_random_rw_file_nanos;
+ case rocksdb_env_new_directory_nanos:
+ return rep->env_new_directory_nanos;
+ case rocksdb_env_file_exists_nanos:
+ return rep->env_file_exists_nanos;
+ case rocksdb_env_get_children_nanos:
+ return rep->env_get_children_nanos;
+ case rocksdb_env_get_children_file_attributes_nanos:
+ return rep->env_get_children_file_attributes_nanos;
+ case rocksdb_env_delete_file_nanos:
+ return rep->env_delete_file_nanos;
+ case rocksdb_env_create_dir_nanos:
+ return rep->env_create_dir_nanos;
+ case rocksdb_env_create_dir_if_missing_nanos:
+ return rep->env_create_dir_if_missing_nanos;
+ case rocksdb_env_delete_dir_nanos:
+ return rep->env_delete_dir_nanos;
+ case rocksdb_env_get_file_size_nanos:
+ return rep->env_get_file_size_nanos;
+ case rocksdb_env_get_file_modification_time_nanos:
+ return rep->env_get_file_modification_time_nanos;
+ case rocksdb_env_rename_file_nanos:
+ return rep->env_rename_file_nanos;
+ case rocksdb_env_link_file_nanos:
+ return rep->env_link_file_nanos;
+ case rocksdb_env_lock_file_nanos:
+ return rep->env_lock_file_nanos;
+ case rocksdb_env_unlock_file_nanos:
+ return rep->env_unlock_file_nanos;
+ case rocksdb_env_new_logger_nanos:
+ return rep->env_new_logger_nanos;
+ case rocksdb_number_async_seek:
+ return rep->number_async_seek;
+ case rocksdb_blob_cache_hit_count:
+ return rep->blob_cache_hit_count;
+ case rocksdb_blob_read_count:
+ return rep->blob_read_count;
+ case rocksdb_blob_read_byte:
+ return rep->blob_read_byte;
+ case rocksdb_blob_read_time:
+ return rep->blob_read_time;
+ case rocksdb_blob_checksum_time:
+ return rep->blob_checksum_time;
+ case rocksdb_blob_decompress_time:
+ return rep->blob_decompress_time;
+ case rocksdb_internal_range_del_reseek_count:
+ return rep->internal_range_del_reseek_count;
+ default:
+ break;
+ }
+ return 0;
+}
+
+void rocksdb_perfcontext_destroy(rocksdb_perfcontext_t* context) {
+ delete context;
+}
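+
+/*
+  Illustrative sketch of the perf-context helpers above (hypothetical `db`,
+  `roptions`, `key`/`keylen`; perf level 2 corresponds to enabling the
+  per-operation counters; error handling omitted):
+
+    rocksdb_set_perf_level(2);
+    rocksdb_perfcontext_t* ctx = rocksdb_perfcontext_create();
+    rocksdb_perfcontext_reset(ctx);
+    size_t vlen = 0;
+    char* err = NULL;
+    char* val = rocksdb_get(db, roptions, key, keylen, &vlen, &err);
+    uint64_t reads = rocksdb_perfcontext_metric(ctx, rocksdb_block_read_count);
+    char* report = rocksdb_perfcontext_report(ctx, 1);
+    free(val);
+    free(report);
+    rocksdb_perfcontext_destroy(ctx);
+*/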
+
+/*
+TODO:
+DB::OpenForReadOnly
+DB::KeyMayExist
+DB::GetOptions
+DB::GetSortedWalFiles
+DB::GetLatestSequenceNumber
+DB::GetUpdatesSince
+DB::GetDbIdentity
+DB::RunManualCompaction
+custom cache
+table_properties_collectors
+*/
+
+rocksdb_compactionfilter_t* rocksdb_compactionfilter_create(
+ void* state, void (*destructor)(void*),
+ unsigned char (*filter)(void*, int level, const char* key,
+ size_t key_length, const char* existing_value,
+ size_t value_length, char** new_value,
+ size_t* new_value_length,
+ unsigned char* value_changed),
+ const char* (*name)(void*)) {
+ rocksdb_compactionfilter_t* result = new rocksdb_compactionfilter_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->filter_ = filter;
+ result->ignore_snapshots_ = true;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_compactionfilter_set_ignore_snapshots(
+ rocksdb_compactionfilter_t* filter, unsigned char whether_ignore) {
+ filter->ignore_snapshots_ = whether_ignore;
+}
+
+void rocksdb_compactionfilter_destroy(rocksdb_compactionfilter_t* filter) {
+ delete filter;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_full_compaction(
+ rocksdb_compactionfiltercontext_t* context) {
+ return context->rep.is_full_compaction;
+}
+
+unsigned char rocksdb_compactionfiltercontext_is_manual_compaction(
+ rocksdb_compactionfiltercontext_t* context) {
+ return context->rep.is_manual_compaction;
+}
+
+rocksdb_compactionfilterfactory_t* rocksdb_compactionfilterfactory_create(
+ void* state, void (*destructor)(void*),
+ rocksdb_compactionfilter_t* (*create_compaction_filter)(
+ void*, rocksdb_compactionfiltercontext_t* context),
+ const char* (*name)(void*)) {
+ rocksdb_compactionfilterfactory_t* result =
+ new rocksdb_compactionfilterfactory_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->create_compaction_filter_ = create_compaction_filter;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_compactionfilterfactory_destroy(
+ rocksdb_compactionfilterfactory_t* factory) {
+ delete factory;
+}
+
+rocksdb_comparator_t* rocksdb_comparator_create(
+ void* state, void (*destructor)(void*),
+ int (*compare)(void*, const char* a, size_t alen, const char* b,
+ size_t blen),
+ const char* (*name)(void*)) {
+ rocksdb_comparator_t* result = new rocksdb_comparator_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->compare_ = compare;
+ result->name_ = name;
+ result->compare_ts_ = nullptr;
+ result->compare_without_ts_ = nullptr;
+ return result;
+}
+
+void rocksdb_comparator_destroy(rocksdb_comparator_t* cmp) { delete cmp; }
+
+rocksdb_comparator_t* rocksdb_comparator_with_ts_create(
+ void* state, void (*destructor)(void*),
+ int (*compare)(void*, const char* a, size_t alen, const char* b,
+ size_t blen),
+ int (*compare_ts)(void*, const char* a_ts, size_t a_tslen, const char* b_ts,
+ size_t b_tslen),
+ int (*compare_without_ts)(void*, const char* a, size_t alen,
+ unsigned char a_has_ts, const char* b,
+ size_t blen, unsigned char b_has_ts),
+ const char* (*name)(void*), size_t timestamp_size) {
+ rocksdb_comparator_t* result = new rocksdb_comparator_t(timestamp_size);
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->compare_ = compare;
+ result->compare_ts_ = compare_ts;
+ result->compare_without_ts_ = compare_without_ts;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_filterpolicy_destroy(rocksdb_filterpolicy_t* filter) {
+ delete filter;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(
+ double bits_per_key, bool original_format) {
+ // Make a rocksdb_filterpolicy_t, but override all of its methods so
+ // they delegate to a NewBloomFilterPolicy() instead of user
+ // supplied C functions.
+ struct Wrapper : public rocksdb_filterpolicy_t {
+ const FilterPolicy* rep_;
+ ~Wrapper() override { delete rep_; }
+ const char* Name() const override { return rep_->Name(); }
+ const char* CompatibilityName() const override {
+ return rep_->CompatibilityName();
+ }
+ // No need to override GetFilterBitsBuilder if this one is overridden
+ ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
+ const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
+ const override {
+ return rep_->GetBuilderWithContext(context);
+ }
+ ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader(
+ const Slice& contents) const override {
+ return rep_->GetFilterBitsReader(contents);
+ }
+ static void DoNothing(void*) {}
+ };
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ = NewBloomFilterPolicy(bits_per_key, original_format);
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_full(
+ double bits_per_key) {
+ return rocksdb_filterpolicy_create_bloom_format(bits_per_key, false);
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom(double bits_per_key) {
+ return rocksdb_filterpolicy_create_bloom_format(bits_per_key, true);
+}
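+
+/*
+  Illustrative sketch: attaching the bloom policy above to block-based table
+  options (hypothetical `opts`; 10.0 bits/key is only an example value). The
+  block-based options take ownership of the policy, so it should not also be
+  passed to rocksdb_filterpolicy_destroy():
+
+    rocksdb_filterpolicy_t* fp = rocksdb_filterpolicy_create_bloom(10.0);
+    rocksdb_block_based_table_options_t* bbto =
+        rocksdb_block_based_options_create();
+    rocksdb_block_based_options_set_filter_policy(bbto, fp);
+    rocksdb_options_set_block_based_table_factory(opts, bbto);
+*/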
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_format(
+ double bloom_equivalent_bits_per_key, int bloom_before_level) {
+ // Make a rocksdb_filterpolicy_t, but override all of its methods so
+ // they delegate to a NewRibbonFilterPolicy() instead of user
+ // supplied C functions.
+ struct Wrapper : public rocksdb_filterpolicy_t {
+ const FilterPolicy* rep_;
+ ~Wrapper() override { delete rep_; }
+ const char* Name() const override { return rep_->Name(); }
+ const char* CompatibilityName() const override {
+ return rep_->CompatibilityName();
+ }
+ ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
+ const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
+ const override {
+ return rep_->GetBuilderWithContext(context);
+ }
+ ROCKSDB_NAMESPACE::FilterBitsReader* GetFilterBitsReader(
+ const Slice& contents) const override {
+ return rep_->GetFilterBitsReader(contents);
+ }
+ static void DoNothing(void*) {}
+ };
+ Wrapper* wrapper = new Wrapper;
+ wrapper->rep_ =
+ NewRibbonFilterPolicy(bloom_equivalent_bits_per_key, bloom_before_level);
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &Wrapper::DoNothing;
+ return wrapper;
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon(
+ double bloom_equivalent_bits_per_key) {
+ return rocksdb_filterpolicy_create_ribbon_format(
+ bloom_equivalent_bits_per_key, /*bloom_before_level = disabled*/ -1);
+}
+
+rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_hybrid(
+ double bloom_equivalent_bits_per_key, int bloom_before_level) {
+ return rocksdb_filterpolicy_create_ribbon_format(
+ bloom_equivalent_bits_per_key, bloom_before_level);
+}
+
+rocksdb_mergeoperator_t* rocksdb_mergeoperator_create(
+ void* state, void (*destructor)(void*),
+ char* (*full_merge)(void*, const char* key, size_t key_length,
+ const char* existing_value,
+ size_t existing_value_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ char* (*partial_merge)(void*, const char* key, size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length),
+ void (*delete_value)(void*, const char* value, size_t value_length),
+ const char* (*name)(void*)) {
+ rocksdb_mergeoperator_t* result = new rocksdb_mergeoperator_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->full_merge_ = full_merge;
+ result->partial_merge_ = partial_merge;
+ result->delete_value_ = delete_value;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_mergeoperator_destroy(rocksdb_mergeoperator_t* merge_operator) {
+ delete merge_operator;
+}
+
+rocksdb_readoptions_t* rocksdb_readoptions_create() {
+ return new rocksdb_readoptions_t;
+}
+
+void rocksdb_readoptions_destroy(rocksdb_readoptions_t* opt) { delete opt; }
+
+void rocksdb_readoptions_set_verify_checksums(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.verify_checksums = v;
+}
+
+unsigned char rocksdb_readoptions_get_verify_checksums(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.verify_checksums;
+}
+
+void rocksdb_readoptions_set_fill_cache(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.fill_cache = v;
+}
+
+unsigned char rocksdb_readoptions_get_fill_cache(rocksdb_readoptions_t* opt) {
+ return opt->rep.fill_cache;
+}
+
+void rocksdb_readoptions_set_snapshot(rocksdb_readoptions_t* opt,
+ const rocksdb_snapshot_t* snap) {
+ opt->rep.snapshot = (snap ? snap->rep : nullptr);
+}
+
+void rocksdb_readoptions_set_iterate_upper_bound(rocksdb_readoptions_t* opt,
+ const char* key,
+ size_t keylen) {
+  if (key == nullptr) {
+    opt->upper_bound = Slice();
+    opt->rep.iterate_upper_bound = nullptr;
+  } else {
+ opt->upper_bound = Slice(key, keylen);
+ opt->rep.iterate_upper_bound = &opt->upper_bound;
+ }
+}
+
+void rocksdb_readoptions_set_iterate_lower_bound(rocksdb_readoptions_t* opt,
+ const char* key,
+ size_t keylen) {
+ if (key == nullptr) {
+ opt->lower_bound = Slice();
+ opt->rep.iterate_lower_bound = nullptr;
+ } else {
+ opt->lower_bound = Slice(key, keylen);
+ opt->rep.iterate_lower_bound = &opt->lower_bound;
+ }
+}
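+
+/*
+  The bound setters above keep a Slice that points into the caller's buffer,
+  so the `key` memory must stay valid for as long as the read options are in
+  use. Illustrative sketch (hypothetical `db` and `roptions`; every key the
+  iterator yields compares below "kez"):
+
+    const char upper[] = "kez";
+    rocksdb_readoptions_set_iterate_upper_bound(roptions, upper, 3);
+    rocksdb_iterator_t* it = rocksdb_create_iterator(db, roptions);
+    for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
+         rocksdb_iter_next(it)) {
+      size_t klen = 0;
+      const char* k = rocksdb_iter_key(it, &klen);
+    }
+    rocksdb_iter_destroy(it);
+*/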
+
+void rocksdb_readoptions_set_read_tier(rocksdb_readoptions_t* opt, int v) {
+ opt->rep.read_tier = static_cast<ROCKSDB_NAMESPACE::ReadTier>(v);
+}
+
+int rocksdb_readoptions_get_read_tier(rocksdb_readoptions_t* opt) {
+ return static_cast<int>(opt->rep.read_tier);
+}
+
+void rocksdb_readoptions_set_tailing(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.tailing = v;
+}
+
+unsigned char rocksdb_readoptions_get_tailing(rocksdb_readoptions_t* opt) {
+ return opt->rep.tailing;
+}
+
+void rocksdb_readoptions_set_managed(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.managed = v;
+}
+
+void rocksdb_readoptions_set_readahead_size(rocksdb_readoptions_t* opt,
+ size_t v) {
+ opt->rep.readahead_size = v;
+}
+
+size_t rocksdb_readoptions_get_readahead_size(rocksdb_readoptions_t* opt) {
+ return opt->rep.readahead_size;
+}
+
+void rocksdb_readoptions_set_prefix_same_as_start(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.prefix_same_as_start = v;
+}
+
+unsigned char rocksdb_readoptions_get_prefix_same_as_start(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.prefix_same_as_start;
+}
+
+void rocksdb_readoptions_set_pin_data(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.pin_data = v;
+}
+
+unsigned char rocksdb_readoptions_get_pin_data(rocksdb_readoptions_t* opt) {
+ return opt->rep.pin_data;
+}
+
+void rocksdb_readoptions_set_total_order_seek(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.total_order_seek = v;
+}
+
+unsigned char rocksdb_readoptions_get_total_order_seek(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.total_order_seek;
+}
+
+void rocksdb_readoptions_set_max_skippable_internal_keys(
+ rocksdb_readoptions_t* opt, uint64_t v) {
+ opt->rep.max_skippable_internal_keys = v;
+}
+
+uint64_t rocksdb_readoptions_get_max_skippable_internal_keys(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.max_skippable_internal_keys;
+}
+
+void rocksdb_readoptions_set_background_purge_on_iterator_cleanup(
+ rocksdb_readoptions_t* opt, unsigned char v) {
+ opt->rep.background_purge_on_iterator_cleanup = v;
+}
+
+unsigned char rocksdb_readoptions_get_background_purge_on_iterator_cleanup(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.background_purge_on_iterator_cleanup;
+}
+
+void rocksdb_readoptions_set_ignore_range_deletions(rocksdb_readoptions_t* opt,
+ unsigned char v) {
+ opt->rep.ignore_range_deletions = v;
+}
+
+unsigned char rocksdb_readoptions_get_ignore_range_deletions(
+ rocksdb_readoptions_t* opt) {
+ return opt->rep.ignore_range_deletions;
+}
+
+void rocksdb_readoptions_set_deadline(rocksdb_readoptions_t* opt,
+ uint64_t microseconds) {
+ opt->rep.deadline = std::chrono::microseconds(microseconds);
+}
+
+uint64_t rocksdb_readoptions_get_deadline(rocksdb_readoptions_t* opt) {
+ return opt->rep.deadline.count();
+}
+
+void rocksdb_readoptions_set_io_timeout(rocksdb_readoptions_t* opt,
+ uint64_t microseconds) {
+ opt->rep.io_timeout = std::chrono::microseconds(microseconds);
+}
+
+uint64_t rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) {
+ return opt->rep.io_timeout.count();
+}
+
+void rocksdb_readoptions_set_timestamp(rocksdb_readoptions_t* opt,
+ const char* ts, size_t tslen) {
+ if (ts == nullptr) {
+ opt->timestamp = Slice();
+ opt->rep.timestamp = nullptr;
+ } else {
+ opt->timestamp = Slice(ts, tslen);
+ opt->rep.timestamp = &opt->timestamp;
+ }
+}
+
+void rocksdb_readoptions_set_iter_start_ts(rocksdb_readoptions_t* opt,
+ const char* ts, size_t tslen) {
+ if (ts == nullptr) {
+ opt->iter_start_ts = Slice();
+ opt->rep.iter_start_ts = nullptr;
+ } else {
+ opt->iter_start_ts = Slice(ts, tslen);
+ opt->rep.iter_start_ts = &opt->iter_start_ts;
+ }
+}
+
+rocksdb_writeoptions_t* rocksdb_writeoptions_create() {
+ return new rocksdb_writeoptions_t;
+}
+
+void rocksdb_writeoptions_destroy(rocksdb_writeoptions_t* opt) { delete opt; }
+
+void rocksdb_writeoptions_set_sync(rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.sync = v;
+}
+
+unsigned char rocksdb_writeoptions_get_sync(rocksdb_writeoptions_t* opt) {
+ return opt->rep.sync;
+}
+
+void rocksdb_writeoptions_disable_WAL(rocksdb_writeoptions_t* opt,
+ int disable) {
+ opt->rep.disableWAL = disable;
+}
+
+unsigned char rocksdb_writeoptions_get_disable_WAL(
+ rocksdb_writeoptions_t* opt) {
+ return opt->rep.disableWAL;
+}
+
+void rocksdb_writeoptions_set_ignore_missing_column_families(
+ rocksdb_writeoptions_t* opt, unsigned char v) {
+ opt->rep.ignore_missing_column_families = v;
+}
+
+unsigned char rocksdb_writeoptions_get_ignore_missing_column_families(
+ rocksdb_writeoptions_t* opt) {
+ return opt->rep.ignore_missing_column_families;
+}
+
+void rocksdb_writeoptions_set_no_slowdown(rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.no_slowdown = v;
+}
+
+unsigned char rocksdb_writeoptions_get_no_slowdown(
+ rocksdb_writeoptions_t* opt) {
+ return opt->rep.no_slowdown;
+}
+
+void rocksdb_writeoptions_set_low_pri(rocksdb_writeoptions_t* opt,
+ unsigned char v) {
+ opt->rep.low_pri = v;
+}
+
+unsigned char rocksdb_writeoptions_get_low_pri(rocksdb_writeoptions_t* opt) {
+ return opt->rep.low_pri;
+}
+
+void rocksdb_writeoptions_set_memtable_insert_hint_per_batch(
+ rocksdb_writeoptions_t* opt, unsigned char v) {
+ opt->rep.memtable_insert_hint_per_batch = v;
+}
+
+unsigned char rocksdb_writeoptions_get_memtable_insert_hint_per_batch(
+ rocksdb_writeoptions_t* opt) {
+ return opt->rep.memtable_insert_hint_per_batch;
+}
+
+rocksdb_compactoptions_t* rocksdb_compactoptions_create() {
+ return new rocksdb_compactoptions_t;
+}
+
+void rocksdb_compactoptions_destroy(rocksdb_compactoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_compactoptions_set_bottommost_level_compaction(
+ rocksdb_compactoptions_t* opt, unsigned char v) {
+ opt->rep.bottommost_level_compaction =
+ static_cast<BottommostLevelCompaction>(v);
+}
+
+unsigned char rocksdb_compactoptions_get_bottommost_level_compaction(
+ rocksdb_compactoptions_t* opt) {
+ return static_cast<unsigned char>(opt->rep.bottommost_level_compaction);
+}
+
+void rocksdb_compactoptions_set_exclusive_manual_compaction(
+ rocksdb_compactoptions_t* opt, unsigned char v) {
+ opt->rep.exclusive_manual_compaction = v;
+}
+
+unsigned char rocksdb_compactoptions_get_exclusive_manual_compaction(
+ rocksdb_compactoptions_t* opt) {
+ return opt->rep.exclusive_manual_compaction;
+}
+
+void rocksdb_compactoptions_set_change_level(rocksdb_compactoptions_t* opt,
+ unsigned char v) {
+ opt->rep.change_level = v;
+}
+
+unsigned char rocksdb_compactoptions_get_change_level(
+ rocksdb_compactoptions_t* opt) {
+ return opt->rep.change_level;
+}
+
+void rocksdb_compactoptions_set_target_level(rocksdb_compactoptions_t* opt,
+ int n) {
+ opt->rep.target_level = n;
+}
+
+int rocksdb_compactoptions_get_target_level(rocksdb_compactoptions_t* opt) {
+ return opt->rep.target_level;
+}
+
+void rocksdb_compactoptions_set_full_history_ts_low(
+ rocksdb_compactoptions_t* opt, char* ts, size_t tslen) {
+ if (ts == nullptr) {
+ opt->full_history_ts_low = Slice();
+ opt->rep.full_history_ts_low = nullptr;
+ } else {
+ opt->full_history_ts_low = Slice(ts, tslen);
+ opt->rep.full_history_ts_low = &opt->full_history_ts_low;
+ }
+}
+
+rocksdb_flushoptions_t* rocksdb_flushoptions_create() {
+ return new rocksdb_flushoptions_t;
+}
+
+void rocksdb_flushoptions_destroy(rocksdb_flushoptions_t* opt) { delete opt; }
+
+void rocksdb_flushoptions_set_wait(rocksdb_flushoptions_t* opt,
+ unsigned char v) {
+ opt->rep.wait = v;
+}
+
+unsigned char rocksdb_flushoptions_get_wait(rocksdb_flushoptions_t* opt) {
+ return opt->rep.wait;
+}
+
+rocksdb_memory_allocator_t* rocksdb_jemalloc_nodump_allocator_create(
+ char** errptr) {
+ rocksdb_memory_allocator_t* allocator = new rocksdb_memory_allocator_t;
+ ROCKSDB_NAMESPACE::JemallocAllocatorOptions options;
+ SaveError(errptr, ROCKSDB_NAMESPACE::NewJemallocNodumpAllocator(
+ options, &allocator->rep));
+ return allocator;
+}
+
+void rocksdb_memory_allocator_destroy(rocksdb_memory_allocator_t* allocator) {
+ delete allocator;
+}
+
+rocksdb_lru_cache_options_t* rocksdb_lru_cache_options_create() {
+ return new rocksdb_lru_cache_options_t;
+}
+
+void rocksdb_lru_cache_options_destroy(rocksdb_lru_cache_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_lru_cache_options_set_capacity(rocksdb_lru_cache_options_t* opt,
+ size_t capacity) {
+ opt->rep.capacity = capacity;
+}
+
+void rocksdb_lru_cache_options_set_num_shard_bits(
+ rocksdb_lru_cache_options_t* opt, int num_shard_bits) {
+ opt->rep.num_shard_bits = num_shard_bits;
+}
+
+void rocksdb_lru_cache_options_set_memory_allocator(
+ rocksdb_lru_cache_options_t* opt, rocksdb_memory_allocator_t* allocator) {
+ opt->rep.memory_allocator = allocator->rep;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru(size_t capacity) {
+ rocksdb_cache_t* c = new rocksdb_cache_t;
+ c->rep = NewLRUCache(capacity);
+ return c;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru_with_strict_capacity_limit(
+ size_t capacity) {
+ rocksdb_cache_t* c = new rocksdb_cache_t;
+ c->rep = NewLRUCache(capacity);
+ c->rep->SetStrictCapacityLimit(true);
+ return c;
+}
+
+rocksdb_cache_t* rocksdb_cache_create_lru_opts(
+ rocksdb_lru_cache_options_t* opt) {
+ rocksdb_cache_t* c = new rocksdb_cache_t;
+ c->rep = NewLRUCache(opt->rep);
+ return c;
+}
+
+void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; }
+
+void rocksdb_cache_disown_data(rocksdb_cache_t* cache) {
+ cache->rep->DisownData();
+}
+
+void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) {
+ cache->rep->SetCapacity(capacity);
+}
+
+size_t rocksdb_cache_get_capacity(rocksdb_cache_t* cache) {
+ return cache->rep->GetCapacity();
+}
+
+size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) {
+ return cache->rep->GetUsage();
+}
+
+size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) {
+ return cache->rep->GetPinnedUsage();
+}
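+
+/*
+  Illustrative sketch: building a shared LRU block cache from the option
+  helpers above and wiring it into block-based table options (hypothetical
+  `opts`; 128 MB and 6 shard bits are only example values):
+
+    rocksdb_lru_cache_options_t* co = rocksdb_lru_cache_options_create();
+    rocksdb_lru_cache_options_set_capacity(co, 128 * 1024 * 1024);
+    rocksdb_lru_cache_options_set_num_shard_bits(co, 6);
+    rocksdb_cache_t* cache = rocksdb_cache_create_lru_opts(co);
+    rocksdb_lru_cache_options_destroy(co);
+
+    rocksdb_block_based_table_options_t* bbto =
+        rocksdb_block_based_options_create();
+    rocksdb_block_based_options_set_block_cache(bbto, cache);
+    rocksdb_options_set_block_based_table_factory(opts, bbto);
+*/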
+
+rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path,
+ uint64_t target_size) {
+ rocksdb_dbpath_t* result = new rocksdb_dbpath_t;
+ result->rep.path = std::string(path);
+ result->rep.target_size = target_size;
+ return result;
+}
+
+void rocksdb_dbpath_destroy(rocksdb_dbpath_t* dbpath) { delete dbpath; }
+
+rocksdb_env_t* rocksdb_create_default_env() {
+ rocksdb_env_t* result = new rocksdb_env_t;
+ result->rep = Env::Default();
+ result->is_default = true;
+ return result;
+}
+
+rocksdb_env_t* rocksdb_create_mem_env() {
+ rocksdb_env_t* result = new rocksdb_env_t;
+ result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default());
+ result->is_default = false;
+ return result;
+}
+
+void rocksdb_env_set_background_threads(rocksdb_env_t* env, int n) {
+ env->rep->SetBackgroundThreads(n);
+}
+
+int rocksdb_env_get_background_threads(rocksdb_env_t* env) {
+ return env->rep->GetBackgroundThreads();
+}
+
+void rocksdb_env_set_bottom_priority_background_threads(rocksdb_env_t* env,
+ int n) {
+ env->rep->SetBackgroundThreads(n, Env::BOTTOM);
+}
+
+int rocksdb_env_get_bottom_priority_background_threads(rocksdb_env_t* env) {
+ return env->rep->GetBackgroundThreads(Env::BOTTOM);
+}
+
+void rocksdb_env_set_high_priority_background_threads(rocksdb_env_t* env,
+ int n) {
+ env->rep->SetBackgroundThreads(n, Env::HIGH);
+}
+
+int rocksdb_env_get_high_priority_background_threads(rocksdb_env_t* env) {
+ return env->rep->GetBackgroundThreads(Env::HIGH);
+}
+
+void rocksdb_env_set_low_priority_background_threads(rocksdb_env_t* env,
+ int n) {
+ env->rep->SetBackgroundThreads(n, Env::LOW);
+}
+
+int rocksdb_env_get_low_priority_background_threads(rocksdb_env_t* env) {
+ return env->rep->GetBackgroundThreads(Env::LOW);
+}
+
+void rocksdb_env_join_all_threads(rocksdb_env_t* env) {
+ env->rep->WaitForJoin();
+}
+
+void rocksdb_env_lower_thread_pool_io_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolIOPriority();
+}
+
+void rocksdb_env_lower_high_priority_thread_pool_io_priority(
+ rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolIOPriority(Env::HIGH);
+}
+
+void rocksdb_env_lower_thread_pool_cpu_priority(rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolCPUPriority();
+}
+
+void rocksdb_env_lower_high_priority_thread_pool_cpu_priority(
+ rocksdb_env_t* env) {
+ env->rep->LowerThreadPoolCPUPriority(Env::HIGH);
+}
+
+void rocksdb_env_destroy(rocksdb_env_t* env) {
+ if (!env->is_default) delete env->rep;
+ delete env;
+}
+
+rocksdb_envoptions_t* rocksdb_envoptions_create() {
+ rocksdb_envoptions_t* opt = new rocksdb_envoptions_t;
+ return opt;
+}
+
+void rocksdb_envoptions_destroy(rocksdb_envoptions_t* opt) { delete opt; }
+
+rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create(
+ const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options) {
+ rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
+ writer->rep = new SstFileWriter(env->rep, io_options->rep);
+ return writer;
+}
+
+void rocksdb_create_dir_if_missing(rocksdb_env_t* env, const char* path,
+ char** errptr) {
+ SaveError(errptr, env->rep->CreateDirIfMissing(std::string(path)));
+}
+
+rocksdb_sstfilewriter_t* rocksdb_sstfilewriter_create_with_comparator(
+ const rocksdb_envoptions_t* env, const rocksdb_options_t* io_options,
+ const rocksdb_comparator_t* /*comparator*/) {
+ rocksdb_sstfilewriter_t* writer = new rocksdb_sstfilewriter_t;
+ writer->rep = new SstFileWriter(env->rep, io_options->rep);
+ return writer;
+}
+
+void rocksdb_sstfilewriter_open(rocksdb_sstfilewriter_t* writer,
+ const char* name, char** errptr) {
+ SaveError(errptr, writer->rep->Open(std::string(name)));
+}
+
+// Same behavior as rocksdb_sstfilewriter_put() below; both forward to
+// SstFileWriter::Put().
+void rocksdb_sstfilewriter_add(rocksdb_sstfilewriter_t* writer, const char* key,
+                               size_t keylen, const char* val, size_t vallen,
+                               char** errptr) {
+  SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_put(rocksdb_sstfilewriter_t* writer, const char* key,
+ size_t keylen, const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_put_with_ts(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ const char* ts, size_t tslen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Put(Slice(key, keylen), Slice(ts, tslen),
+ Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_merge(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Merge(Slice(key, keylen), Slice(val, vallen)));
+}
+
+void rocksdb_sstfilewriter_delete(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Delete(Slice(key, keylen)));
+}
+
+void rocksdb_sstfilewriter_delete_with_ts(rocksdb_sstfilewriter_t* writer,
+ const char* key, size_t keylen,
+ const char* ts, size_t tslen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Delete(Slice(key, keylen), Slice(ts, tslen)));
+}
+
+void rocksdb_sstfilewriter_delete_range(rocksdb_sstfilewriter_t* writer,
+ const char* begin_key,
+ size_t begin_keylen,
+ const char* end_key, size_t end_keylen,
+ char** errptr) {
+ SaveError(errptr, writer->rep->DeleteRange(Slice(begin_key, begin_keylen),
+ Slice(end_key, end_keylen)));
+}
+
+void rocksdb_sstfilewriter_finish(rocksdb_sstfilewriter_t* writer,
+ char** errptr) {
+ SaveError(errptr, writer->rep->Finish(nullptr));
+}
+
+void rocksdb_sstfilewriter_file_size(rocksdb_sstfilewriter_t* writer,
+ uint64_t* file_size) {
+ *file_size = writer->rep->FileSize();
+}
+
+void rocksdb_sstfilewriter_destroy(rocksdb_sstfilewriter_t* writer) {
+ delete writer->rep;
+ delete writer;
+}
+
+rocksdb_ingestexternalfileoptions_t*
+rocksdb_ingestexternalfileoptions_create() {
+ rocksdb_ingestexternalfileoptions_t* opt =
+ new rocksdb_ingestexternalfileoptions_t;
+ return opt;
+}
+
+void rocksdb_ingestexternalfileoptions_set_move_files(
+ rocksdb_ingestexternalfileoptions_t* opt, unsigned char move_files) {
+ opt->rep.move_files = move_files;
+}
+
+void rocksdb_ingestexternalfileoptions_set_snapshot_consistency(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char snapshot_consistency) {
+ opt->rep.snapshot_consistency = snapshot_consistency;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_global_seqno(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char allow_global_seqno) {
+ opt->rep.allow_global_seqno = allow_global_seqno;
+}
+
+void rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(
+ rocksdb_ingestexternalfileoptions_t* opt,
+ unsigned char allow_blocking_flush) {
+ opt->rep.allow_blocking_flush = allow_blocking_flush;
+}
+
+void rocksdb_ingestexternalfileoptions_set_ingest_behind(
+ rocksdb_ingestexternalfileoptions_t* opt, unsigned char ingest_behind) {
+ opt->rep.ingest_behind = ingest_behind;
+}
+
+void rocksdb_ingestexternalfileoptions_destroy(
+ rocksdb_ingestexternalfileoptions_t* opt) {
+ delete opt;
+}
+
+void rocksdb_ingest_external_file(
+ rocksdb_t* db, const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+ std::vector<std::string> files(list_len);
+ for (size_t i = 0; i < list_len; ++i) {
+ files[i] = std::string(file_list[i]);
+ }
+ SaveError(errptr, db->rep->IngestExternalFile(files, opt->rep));
+}
+
+void rocksdb_ingest_external_file_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* handle,
+ const char* const* file_list, const size_t list_len,
+ const rocksdb_ingestexternalfileoptions_t* opt, char** errptr) {
+ std::vector<std::string> files(list_len);
+ for (size_t i = 0; i < list_len; ++i) {
+ files[i] = std::string(file_list[i]);
+ }
+ SaveError(errptr, db->rep->IngestExternalFile(handle->rep, files, opt->rep));
+}
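+
+/*
+  Illustrative end-to-end sketch for the SstFileWriter and ingestion helpers
+  above (hypothetical `db` and `opts`; "/tmp/example.sst" is only a
+  placeholder path; error handling omitted):
+
+    char* err = NULL;
+    rocksdb_envoptions_t* env_opts = rocksdb_envoptions_create();
+    rocksdb_sstfilewriter_t* w = rocksdb_sstfilewriter_create(env_opts, opts);
+    rocksdb_sstfilewriter_open(w, "/tmp/example.sst", &err);
+    rocksdb_sstfilewriter_put(w, "key1", 4, "value1", 6, &err);
+    rocksdb_sstfilewriter_finish(w, &err);
+    rocksdb_sstfilewriter_destroy(w);
+    rocksdb_envoptions_destroy(env_opts);
+
+    const char* files[1] = {"/tmp/example.sst"};
+    rocksdb_ingestexternalfileoptions_t* ifo =
+        rocksdb_ingestexternalfileoptions_create();
+    rocksdb_ingest_external_file(db, files, 1, ifo, &err);
+    rocksdb_ingestexternalfileoptions_destroy(ifo);
+*/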
+
+void rocksdb_try_catch_up_with_primary(rocksdb_t* db, char** errptr) {
+ SaveError(errptr, db->rep->TryCatchUpWithPrimary());
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create(
+ void* state, void (*destructor)(void*),
+ char* (*transform)(void*, const char* key, size_t length,
+ size_t* dst_length),
+ unsigned char (*in_domain)(void*, const char* key, size_t length),
+ unsigned char (*in_range)(void*, const char* key, size_t length),
+ const char* (*name)(void*)) {
+ rocksdb_slicetransform_t* result = new rocksdb_slicetransform_t;
+ result->state_ = state;
+ result->destructor_ = destructor;
+ result->transform_ = transform;
+ result->in_domain_ = in_domain;
+ result->in_range_ = in_range;
+ result->name_ = name;
+ return result;
+}
+
+void rocksdb_slicetransform_destroy(rocksdb_slicetransform_t* st) { delete st; }
+
+struct SliceTransformWrapper : public rocksdb_slicetransform_t {
+ const SliceTransform* rep_;
+ ~SliceTransformWrapper() override { delete rep_; }
+ const char* Name() const override { return rep_->Name(); }
+ std::string GetId() const override { return rep_->GetId(); }
+ Slice Transform(const Slice& src) const override {
+ return rep_->Transform(src);
+ }
+ bool InDomain(const Slice& src) const override { return rep_->InDomain(src); }
+ bool InRange(const Slice& src) const override { return rep_->InRange(src); }
+ static void DoNothing(void*) {}
+};
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_fixed_prefix(
+ size_t prefixLen) {
+ SliceTransformWrapper* wrapper = new SliceTransformWrapper;
+ wrapper->rep_ = ROCKSDB_NAMESPACE::NewFixedPrefixTransform(prefixLen);
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &SliceTransformWrapper::DoNothing;
+ return wrapper;
+}
+
+rocksdb_slicetransform_t* rocksdb_slicetransform_create_noop() {
+ SliceTransformWrapper* wrapper = new SliceTransformWrapper;
+ wrapper->rep_ = ROCKSDB_NAMESPACE::NewNoopTransform();
+ wrapper->state_ = nullptr;
+ wrapper->destructor_ = &SliceTransformWrapper::DoNothing;
+ return wrapper;
+}
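+
+/*
+  Illustrative sketch: installing a fixed-prefix extractor and pairing it with
+  the prefix_same_as_start read option defined earlier (hypothetical `opts`
+  and `roptions`; a prefix length of 4 is only an example):
+
+    rocksdb_options_set_prefix_extractor(
+        opts, rocksdb_slicetransform_create_fixed_prefix(4));
+    rocksdb_readoptions_set_prefix_same_as_start(roptions, 1);
+*/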
+
+rocksdb_universal_compaction_options_t*
+rocksdb_universal_compaction_options_create() {
+ rocksdb_universal_compaction_options_t* result =
+ new rocksdb_universal_compaction_options_t;
+ result->rep = new ROCKSDB_NAMESPACE::CompactionOptionsUniversal;
+ return result;
+}
+
+void rocksdb_universal_compaction_options_set_size_ratio(
+ rocksdb_universal_compaction_options_t* uco, int ratio) {
+ uco->rep->size_ratio = ratio;
+}
+
+int rocksdb_universal_compaction_options_get_size_ratio(
+ rocksdb_universal_compaction_options_t* uco) {
+ return uco->rep->size_ratio;
+}
+
+void rocksdb_universal_compaction_options_set_min_merge_width(
+ rocksdb_universal_compaction_options_t* uco, int w) {
+ uco->rep->min_merge_width = w;
+}
+
+int rocksdb_universal_compaction_options_get_min_merge_width(
+ rocksdb_universal_compaction_options_t* uco) {
+ return uco->rep->min_merge_width;
+}
+
+void rocksdb_universal_compaction_options_set_max_merge_width(
+ rocksdb_universal_compaction_options_t* uco, int w) {
+ uco->rep->max_merge_width = w;
+}
+
+int rocksdb_universal_compaction_options_get_max_merge_width(
+ rocksdb_universal_compaction_options_t* uco) {
+ return uco->rep->max_merge_width;
+}
+
+void rocksdb_universal_compaction_options_set_max_size_amplification_percent(
+ rocksdb_universal_compaction_options_t* uco, int p) {
+ uco->rep->max_size_amplification_percent = p;
+}
+
+int rocksdb_universal_compaction_options_get_max_size_amplification_percent(
+ rocksdb_universal_compaction_options_t* uco) {
+ return uco->rep->max_size_amplification_percent;
+}
+
+void rocksdb_universal_compaction_options_set_compression_size_percent(
+ rocksdb_universal_compaction_options_t* uco, int p) {
+ uco->rep->compression_size_percent = p;
+}
+
+int rocksdb_universal_compaction_options_get_compression_size_percent(
+ rocksdb_universal_compaction_options_t* uco) {
+ return uco->rep->compression_size_percent;
+}
+
+void rocksdb_universal_compaction_options_set_stop_style(
+ rocksdb_universal_compaction_options_t* uco, int style) {
+ uco->rep->stop_style =
+ static_cast<ROCKSDB_NAMESPACE::CompactionStopStyle>(style);
+}
+
+int rocksdb_universal_compaction_options_get_stop_style(
+ rocksdb_universal_compaction_options_t* uco) {
+ return static_cast<int>(uco->rep->stop_style);
+}
+
+void rocksdb_universal_compaction_options_destroy(
+ rocksdb_universal_compaction_options_t* uco) {
+ delete uco->rep;
+ delete uco;
+}
+
+rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() {
+ rocksdb_fifo_compaction_options_t* result =
+ new rocksdb_fifo_compaction_options_t;
+ result->rep = CompactionOptionsFIFO();
+ return result;
+}
+
+void rocksdb_fifo_compaction_options_set_max_table_files_size(
+ rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
+ fifo_opts->rep.max_table_files_size = size;
+}
+
+uint64_t rocksdb_fifo_compaction_options_get_max_table_files_size(
+ rocksdb_fifo_compaction_options_t* fifo_opts) {
+ return fifo_opts->rep.max_table_files_size;
+}
+
+void rocksdb_fifo_compaction_options_destroy(
+ rocksdb_fifo_compaction_options_t* fifo_opts) {
+ delete fifo_opts;
+}
+
+void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt,
+ int level) {
+ if (level >= 0) {
+ assert(level <= opt->rep.num_levels);
+ opt->rep.compression_per_level.resize(opt->rep.num_levels);
+ for (int i = 0; i < level; i++) {
+ opt->rep.compression_per_level[i] = ROCKSDB_NAMESPACE::kNoCompression;
+ }
+ for (int i = level; i < opt->rep.num_levels; i++) {
+ opt->rep.compression_per_level[i] = opt->rep.compression;
+ }
+ }
+}
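+
+/*
+  For example, with opt->rep.num_levels == 7 and level == 2, the call above
+  leaves levels 0 and 1 uncompressed (kNoCompression) and applies
+  opt->rep.compression to levels 2 through 6.
+*/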
+
+int rocksdb_livefiles_count(const rocksdb_livefiles_t* lf) {
+ return static_cast<int>(lf->rep.size());
+}
+
+const char* rocksdb_livefiles_column_family_name(const rocksdb_livefiles_t* lf,
+ int index) {
+ return lf->rep[index].column_family_name.c_str();
+}
+
+const char* rocksdb_livefiles_name(const rocksdb_livefiles_t* lf, int index) {
+ return lf->rep[index].name.c_str();
+}
+
+int rocksdb_livefiles_level(const rocksdb_livefiles_t* lf, int index) {
+ return lf->rep[index].level;
+}
+
+size_t rocksdb_livefiles_size(const rocksdb_livefiles_t* lf, int index) {
+ return lf->rep[index].size;
+}
+
+const char* rocksdb_livefiles_smallestkey(const rocksdb_livefiles_t* lf,
+ int index, size_t* size) {
+ *size = lf->rep[index].smallestkey.size();
+ return lf->rep[index].smallestkey.data();
+}
+
+const char* rocksdb_livefiles_largestkey(const rocksdb_livefiles_t* lf,
+ int index, size_t* size) {
+ *size = lf->rep[index].largestkey.size();
+ return lf->rep[index].largestkey.data();
+}
+
+uint64_t rocksdb_livefiles_entries(const rocksdb_livefiles_t* lf, int index) {
+ return lf->rep[index].num_entries;
+}
+
+uint64_t rocksdb_livefiles_deletions(const rocksdb_livefiles_t* lf, int index) {
+ return lf->rep[index].num_deletions;
+}
+
+void rocksdb_livefiles_destroy(const rocksdb_livefiles_t* lf) {
+ delete lf;
+}
+
+void rocksdb_get_options_from_string(const rocksdb_options_t* base_options,
+ const char* opts_str,
+ rocksdb_options_t* new_options,
+ char** errptr) {
+ SaveError(errptr,
+ GetOptionsFromString(base_options->rep, std::string(opts_str),
+ &new_options->rep));
+}
+
+void rocksdb_delete_file_in_range(rocksdb_t* db, const char* start_key,
+ size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ SaveError(
+ errptr,
+ DeleteFilesInRange(
+ db->rep, db->rep->DefaultColumnFamily(),
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
+}
+
+void rocksdb_delete_file_in_range_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family,
+ const char* start_key, size_t start_key_len, const char* limit_key,
+ size_t limit_key_len, char** errptr) {
+ Slice a, b;
+ SaveError(
+ errptr,
+ DeleteFilesInRange(
+ db->rep, column_family->rep,
+ (start_key ? (a = Slice(start_key, start_key_len), &a) : nullptr),
+ (limit_key ? (b = Slice(limit_key, limit_key_len), &b) : nullptr)));
+}
+
+/* MetaData */
+
+rocksdb_column_family_metadata_t* rocksdb_get_column_family_metadata(
+ rocksdb_t* db) {
+ rocksdb_column_family_metadata_t* meta = new rocksdb_column_family_metadata_t;
+ db->rep->GetColumnFamilyMetaData(&meta->rep);
+ return meta;
+}
+
+rocksdb_column_family_metadata_t* rocksdb_get_column_family_metadata_cf(
+ rocksdb_t* db, rocksdb_column_family_handle_t* column_family) {
+ rocksdb_column_family_metadata_t* meta = new rocksdb_column_family_metadata_t;
+ db->rep->GetColumnFamilyMetaData(column_family->rep, &meta->rep);
+ return meta;
+}
+
+void rocksdb_column_family_metadata_destroy(
+ rocksdb_column_family_metadata_t* cf_meta) {
+ delete cf_meta;
+}
+
+uint64_t rocksdb_column_family_metadata_get_size(
+ rocksdb_column_family_metadata_t* cf_meta) {
+ return cf_meta->rep.size;
+}
+
+size_t rocksdb_column_family_metadata_get_file_count(
+ rocksdb_column_family_metadata_t* cf_meta) {
+ return cf_meta->rep.file_count;
+}
+
+char* rocksdb_column_family_metadata_get_name(
+ rocksdb_column_family_metadata_t* cf_meta) {
+ return strdup(cf_meta->rep.name.c_str());
+}
+
+size_t rocksdb_column_family_metadata_get_level_count(
+ rocksdb_column_family_metadata_t* cf_meta) {
+ return cf_meta->rep.levels.size();
+}
+
+rocksdb_level_metadata_t* rocksdb_column_family_metadata_get_level_metadata(
+ rocksdb_column_family_metadata_t* cf_meta, size_t i) {
+ if (i >= cf_meta->rep.levels.size()) {
+ return NULL;
+ }
+ rocksdb_level_metadata_t* level_meta =
+ (rocksdb_level_metadata_t*)malloc(sizeof(rocksdb_level_metadata_t));
+ level_meta->rep = &cf_meta->rep.levels[i];
+
+ return level_meta;
+}
+
+void rocksdb_level_metadata_destroy(rocksdb_level_metadata_t* level_meta) {
+  // Only free the wrapper itself; the parent rocksdb_column_family_metadata_t
+  // owns the LevelMetaData that rep points into.
+ free(level_meta);
+}
+
+int rocksdb_level_metadata_get_level(rocksdb_level_metadata_t* level_meta) {
+ return level_meta->rep->level;
+}
+
+uint64_t rocksdb_level_metadata_get_size(rocksdb_level_metadata_t* level_meta) {
+ return level_meta->rep->size;
+}
+
+size_t rocksdb_level_metadata_get_file_count(
+ rocksdb_level_metadata_t* level_meta) {
+ return level_meta->rep->files.size();
+}
+
+rocksdb_sst_file_metadata_t* rocksdb_level_metadata_get_sst_file_metadata(
+ rocksdb_level_metadata_t* level_meta, size_t i) {
+ if (i >= level_meta->rep->files.size()) {
+ return nullptr;
+ }
+ rocksdb_sst_file_metadata_t* file_meta =
+ (rocksdb_sst_file_metadata_t*)malloc(sizeof(rocksdb_sst_file_metadata_t));
+ file_meta->rep = &level_meta->rep->files[i];
+ return file_meta;
+}
+
+void rocksdb_sst_file_metadata_destroy(rocksdb_sst_file_metadata_t* file_meta) {
+  // Only free the wrapper itself; the parent rocksdb_level_metadata_t owns
+  // the SstFileMetaData that rep points into.
+ free(file_meta);
+}
+
+char* rocksdb_sst_file_metadata_get_relative_filename(
+ rocksdb_sst_file_metadata_t* file_meta) {
+ return strdup(file_meta->rep->relative_filename.c_str());
+}
+
+uint64_t rocksdb_sst_file_metadata_get_size(
+ rocksdb_sst_file_metadata_t* file_meta) {
+ return file_meta->rep->size;
+}
+
+char* rocksdb_sst_file_metadata_get_smallestkey(
+ rocksdb_sst_file_metadata_t* file_meta, size_t* key_len) {
+ *key_len = file_meta->rep->smallestkey.size();
+ return CopyString(file_meta->rep->smallestkey);
+}
+
+char* rocksdb_sst_file_metadata_get_largestkey(
+ rocksdb_sst_file_metadata_t* file_meta, size_t* key_len) {
+ *key_len = file_meta->rep->largestkey.size();
+ return CopyString(file_meta->rep->largestkey);
+}
+
+/* Transactions */
+
+rocksdb_transactiondb_options_t* rocksdb_transactiondb_options_create() {
+ return new rocksdb_transactiondb_options_t;
+}
+
+void rocksdb_transactiondb_options_destroy(
+ rocksdb_transactiondb_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_transactiondb_options_set_max_num_locks(
+ rocksdb_transactiondb_options_t* opt, int64_t max_num_locks) {
+ opt->rep.max_num_locks = max_num_locks;
+}
+
+void rocksdb_transactiondb_options_set_num_stripes(
+ rocksdb_transactiondb_options_t* opt, size_t num_stripes) {
+ opt->rep.num_stripes = num_stripes;
+}
+
+void rocksdb_transactiondb_options_set_transaction_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t txn_lock_timeout) {
+ opt->rep.transaction_lock_timeout = txn_lock_timeout;
+}
+
+void rocksdb_transactiondb_options_set_default_lock_timeout(
+ rocksdb_transactiondb_options_t* opt, int64_t default_lock_timeout) {
+ opt->rep.default_lock_timeout = default_lock_timeout;
+}
+
+rocksdb_transaction_options_t* rocksdb_transaction_options_create() {
+ return new rocksdb_transaction_options_t;
+}
+
+void rocksdb_transaction_options_destroy(rocksdb_transaction_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_transaction_options_set_set_snapshot(
+ rocksdb_transaction_options_t* opt, unsigned char v) {
+ opt->rep.set_snapshot = v;
+}
+
+void rocksdb_transaction_options_set_deadlock_detect(
+ rocksdb_transaction_options_t* opt, unsigned char v) {
+ opt->rep.deadlock_detect = v;
+}
+
+void rocksdb_transaction_options_set_lock_timeout(
+ rocksdb_transaction_options_t* opt, int64_t lock_timeout) {
+ opt->rep.lock_timeout = lock_timeout;
+}
+
+void rocksdb_transaction_options_set_expiration(
+ rocksdb_transaction_options_t* opt, int64_t expiration) {
+ opt->rep.expiration = expiration;
+}
+
+void rocksdb_transaction_options_set_deadlock_detect_depth(
+ rocksdb_transaction_options_t* opt, int64_t depth) {
+ opt->rep.deadlock_detect_depth = depth;
+}
+
+void rocksdb_transaction_options_set_max_write_batch_size(
+ rocksdb_transaction_options_t* opt, size_t size) {
+ opt->rep.max_write_batch_size = size;
+}
+
+void rocksdb_transaction_options_set_skip_prepare(
+ rocksdb_transaction_options_t* opt, unsigned char v) {
+ opt->rep.skip_prepare = v;
+}
+
+rocksdb_optimistictransaction_options_t*
+rocksdb_optimistictransaction_options_create() {
+ return new rocksdb_optimistictransaction_options_t;
+}
+
+void rocksdb_optimistictransaction_options_destroy(
+ rocksdb_optimistictransaction_options_t* opt) {
+ delete opt;
+}
+
+void rocksdb_optimistictransaction_options_set_set_snapshot(
+ rocksdb_optimistictransaction_options_t* opt, unsigned char v) {
+ opt->rep.set_snapshot = v;
+}
+
+char* rocksdb_optimistictransactiondb_property_value(
+ rocksdb_optimistictransactiondb_t* db, const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(Slice(propname), &tmp)) {
+ // We use strdup() since we expect human readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+int rocksdb_optimistictransactiondb_property_int(
+ rocksdb_optimistictransactiondb_t* db, const char* propname,
+ uint64_t* out_val) {
+ if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+rocksdb_column_family_handle_t* rocksdb_transactiondb_create_column_family(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_options_t* column_family_options,
+ const char* column_family_name, char** errptr) {
+ rocksdb_column_family_handle_t* handle = new rocksdb_column_family_handle_t;
+ SaveError(errptr, txn_db->rep->CreateColumnFamily(
+ ColumnFamilyOptions(column_family_options->rep),
+ std::string(column_family_name), &(handle->rep)));
+ return handle;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ char** errptr) {
+ TransactionDB* txn_db;
+ if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+ std::string(name), &txn_db))) {
+ return nullptr;
+ }
+ rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+ result->rep = txn_db;
+ return result;
+}
+
+rocksdb_transactiondb_t* rocksdb_transactiondb_open_column_families(
+ const rocksdb_options_t* options,
+ const rocksdb_transactiondb_options_t* txn_db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ TransactionDB* txn_db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, TransactionDB::Open(options->rep, txn_db_options->rep,
+ std::string(name), column_families,
+ &handles, &txn_db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_transactiondb_t* result = new rocksdb_transactiondb_t;
+ result->rep = txn_db;
+ return result;
+}
+
+const rocksdb_snapshot_t* rocksdb_transactiondb_create_snapshot(
+ rocksdb_transactiondb_t* txn_db) {
+ rocksdb_snapshot_t* result = new rocksdb_snapshot_t;
+ result->rep = txn_db->rep->GetSnapshot();
+ return result;
+}
+
+void rocksdb_transactiondb_release_snapshot(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_snapshot_t* snapshot) {
+ txn_db->rep->ReleaseSnapshot(snapshot->rep);
+ delete snapshot;
+}
+
+char* rocksdb_transactiondb_property_value(rocksdb_transactiondb_t* db,
+ const char* propname) {
+ std::string tmp;
+ if (db->rep->GetProperty(Slice(propname), &tmp)) {
+ // We use strdup() since we expect human readable output.
+ return strdup(tmp.c_str());
+ } else {
+ return nullptr;
+ }
+}
+
+int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db,
+ const char* propname,
+ uint64_t* out_val) {
+ if (db->rep->GetIntProperty(Slice(propname), out_val)) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
+rocksdb_transaction_t* rocksdb_transaction_begin(
+ rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_transaction_options_t* txn_options,
+ rocksdb_transaction_t* old_txn) {
+ if (old_txn == nullptr) {
+ rocksdb_transaction_t* result = new rocksdb_transaction_t;
+ result->rep = txn_db->rep->BeginTransaction(write_options->rep,
+ txn_options->rep, nullptr);
+ return result;
+ }
+ old_txn->rep = txn_db->rep->BeginTransaction(write_options->rep,
+ txn_options->rep, old_txn->rep);
+ return old_txn;
+}
+
+rocksdb_transaction_t** rocksdb_transactiondb_get_prepared_transactions(
+ rocksdb_transactiondb_t* txn_db, size_t* cnt) {
+ std::vector<Transaction*> txns;
+ txn_db->rep->GetAllPreparedTransactions(&txns);
+ *cnt = txns.size();
+ if (txns.empty()) {
+ return nullptr;
+ } else {
+ rocksdb_transaction_t** buf = (rocksdb_transaction_t**)malloc(
+ txns.size() * sizeof(rocksdb_transaction_t*));
+ for (size_t i = 0; i < txns.size(); i++) {
+ buf[i] = new rocksdb_transaction_t;
+ buf[i]->rep = txns[i];
+ }
+ return buf;
+ }
+}
+
+void rocksdb_transaction_set_name(rocksdb_transaction_t* txn, const char* name,
+ size_t name_len, char** errptr) {
+ std::string str = std::string(name, name_len);
+ SaveError(errptr, txn->rep->SetName(str));
+}
+
+char* rocksdb_transaction_get_name(rocksdb_transaction_t* txn,
+ size_t* name_len) {
+ auto name = txn->rep->GetName();
+ *name_len = name.size();
+ return CopyString(name);
+}
+
+void rocksdb_transaction_prepare(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->Prepare());
+}
+
+rocksdb_writebatch_wi_t* rocksdb_transaction_get_writebatch_wi(
+ rocksdb_transaction_t* txn) {
+ rocksdb_writebatch_wi_t* wi =
+ (rocksdb_writebatch_wi_t*)malloc(sizeof(rocksdb_writebatch_wi_t));
+ wi->rep = txn->rep->GetWriteBatch();
+
+ return wi;
+}
+
+void rocksdb_transaction_rebuild_from_writebatch(
+ rocksdb_transaction_t* txn, rocksdb_writebatch_t* writebatch,
+ char** errptr) {
+ SaveError(errptr, txn->rep->RebuildFromWriteBatch(&writebatch->rep));
+}
+
+void rocksdb_transaction_rebuild_from_writebatch_wi(rocksdb_transaction_t* txn,
+ rocksdb_writebatch_wi_t* wi,
+ char** errptr) {
+ SaveError(errptr, txn->rep->RebuildFromWriteBatch(wi->rep->GetWriteBatch()));
+}
+
+void rocksdb_transaction_commit(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->Commit());
+}
+
+void rocksdb_transaction_rollback(rocksdb_transaction_t* txn, char** errptr) {
+ SaveError(errptr, txn->rep->Rollback());
+}
+
+void rocksdb_transaction_set_savepoint(rocksdb_transaction_t* txn) {
+ txn->rep->SetSavePoint();
+}
+
+void rocksdb_transaction_rollback_to_savepoint(rocksdb_transaction_t* txn,
+ char** errptr) {
+ SaveError(errptr, txn->rep->RollbackToSavePoint());
+}
+
+void rocksdb_transaction_destroy(rocksdb_transaction_t* txn) {
+ delete txn->rep;
+ delete txn;
+}
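+
+/*
+  Illustrative sketch of the pessimistic transaction lifecycle above
+  (hypothetical "/tmp/txndb" path and pre-built `opts`; error handling
+  omitted):
+
+    char* err = NULL;
+    rocksdb_transactiondb_options_t* tdb_opts =
+        rocksdb_transactiondb_options_create();
+    rocksdb_transactiondb_t* tdb =
+        rocksdb_transactiondb_open(opts, tdb_opts, "/tmp/txndb", &err);
+
+    rocksdb_writeoptions_t* wopts = rocksdb_writeoptions_create();
+    rocksdb_transaction_options_t* topts = rocksdb_transaction_options_create();
+    rocksdb_transaction_t* txn =
+        rocksdb_transaction_begin(tdb, wopts, topts, NULL);
+    rocksdb_transaction_put(txn, "key", 3, "value", 5, &err);
+    rocksdb_transaction_commit(txn, &err);
+    rocksdb_transaction_destroy(txn);
+*/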
+
+const rocksdb_snapshot_t* rocksdb_transaction_get_snapshot(
+ rocksdb_transaction_t* txn) {
+  // This snapshot wrapper is later released with free(), so allocate it with
+  // malloc() here to avoid an allocator mismatch.
+ rocksdb_snapshot_t* result =
+ (rocksdb_snapshot_t*)malloc(sizeof(rocksdb_snapshot_t));
+ result->rep = txn->rep->GetSnapshot();
+ return result;
+}
+
+// Read a key inside a transaction
+char* rocksdb_transaction_get(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn->rep->Get(options->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn->rep->Get(options->rep, Slice(key, klen), &v->rep);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+char* rocksdb_transaction_get_cf(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, size_t* vlen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s =
+ txn->rep->Get(options->rep, column_family->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn->rep->Get(options->rep, column_family->rep, Slice(key, klen),
+ &v->rep);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+// Read a key inside a transaction
+char* rocksdb_transaction_get_for_update(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen,
+ size_t* vlen, unsigned char exclusive,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s =
+ txn->rep->GetForUpdate(options->rep, Slice(key, klen), &tmp, exclusive);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_for_update(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, unsigned char exclusive, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn->rep->GetForUpdate(options->rep, Slice(key, klen),
+ v->rep.GetSelf(), exclusive);
+ v->rep.PinSelf();
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+char* rocksdb_transaction_get_for_update_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ size_t* vlen, unsigned char exclusive, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
+ Slice(key, klen), &tmp, exclusive);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transaction_get_pinned_for_update_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ unsigned char exclusive, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn->rep->GetForUpdate(options->rep, column_family->rep,
+ Slice(key, klen), &v->rep, exclusive);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
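+// Example usage (illustrative sketch only; "counter", new_value and new_len
+// are placeholders): GetForUpdate reads a key and locks it, so the
+// read-modify-write below cannot race with other transactions. The returned
+// buffer is malloc'd and must be released with rocksdb_free(); the commit is
+// done with rocksdb_transaction_commit(), defined elsewhere in this file.
+//
+//   size_t vlen = 0;
+//   char* err = NULL;
+//   char* old = rocksdb_transaction_get_for_update(
+//       txn, ropts, "counter", 7, &vlen, 1 /* exclusive */, &err);
+//   // ... derive new_value/new_len from old ...
+//   rocksdb_transaction_put(txn, "counter", 7, new_value, new_len, &err);
+//   rocksdb_transaction_commit(txn, &err);
+//   rocksdb_free(old);
+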
+void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ size_t num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses =
+ txn->rep->MultiGet(options->rep, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_transaction_multi_get_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ std::vector<ColumnFamilyHandle*> cfs(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ cfs[i] = column_families[i]->rep;
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses =
+ txn->rep->MultiGet(options->rep, cfs, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
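+// Example usage (illustrative sketch only): MultiGet fills three parallel
+// output arrays. Each non-NULL value is a malloc'd copy to be released with
+// rocksdb_free(); errs[i] is NULL both on success and when the key is simply
+// not found, otherwise it is strdup'd and must be released with free().
+//
+//   const char* keys[2] = {"k1", "k2"};
+//   const size_t key_sizes[2] = {2, 2};
+//   char* values[2];
+//   size_t value_sizes[2];
+//   char* errs[2];
+//   rocksdb_transaction_multi_get(txn, ropts, 2, keys, key_sizes, values,
+//                                 value_sizes, errs);
+//   for (size_t i = 0; i < 2; i++) {
+//     if (values[i] != NULL) rocksdb_free(values[i]);
+//     if (errs[i] != NULL) free(errs[i]);
+//   }
+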
+// Read a key outside a transaction
+char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, size_t* vlen,
+ char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn_db->rep->Get(options->rep, Slice(key, klen), &tmp);
+ if (s.ok()) {
+ *vlen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vlen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ const char* key, size_t klen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn_db->rep->Get(options->rep, txn_db->rep->DefaultColumnFamily(),
+ Slice(key, klen), &v->rep);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+char* rocksdb_transactiondb_get_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, size_t* vallen, char** errptr) {
+ char* result = nullptr;
+ std::string tmp;
+ Status s = txn_db->rep->Get(options->rep, column_family->rep,
+ Slice(key, keylen), &tmp);
+ if (s.ok()) {
+ *vallen = tmp.size();
+ result = CopyString(tmp);
+ } else {
+ *vallen = 0;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ }
+ return result;
+}
+
+rocksdb_pinnableslice_t* rocksdb_transactiondb_get_pinned_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = txn_db->rep->Get(options->rep, column_family->rep,
+ Slice(key, keylen), &v->rep);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+void rocksdb_transactiondb_multi_get(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ size_t num_keys,
+ const char* const* keys_list,
+ const size_t* keys_list_sizes,
+ char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses =
+ txn_db->rep->MultiGet(options->rep, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+void rocksdb_transactiondb_multi_get_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ const rocksdb_column_family_handle_t* const* column_families,
+ size_t num_keys, const char* const* keys_list,
+ const size_t* keys_list_sizes, char** values_list,
+ size_t* values_list_sizes, char** errs) {
+ std::vector<Slice> keys(num_keys);
+ std::vector<ColumnFamilyHandle*> cfs(num_keys);
+ for (size_t i = 0; i < num_keys; i++) {
+ keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
+ cfs[i] = column_families[i]->rep;
+ }
+ std::vector<std::string> values(num_keys);
+ std::vector<Status> statuses =
+ txn_db->rep->MultiGet(options->rep, cfs, keys, &values);
+ for (size_t i = 0; i < num_keys; i++) {
+ if (statuses[i].ok()) {
+ values_list[i] = CopyString(values[i]);
+ values_list_sizes[i] = values[i].size();
+ errs[i] = nullptr;
+ } else {
+ values_list[i] = nullptr;
+ values_list_sizes[i] = 0;
+ if (!statuses[i].IsNotFound()) {
+ errs[i] = strdup(statuses[i].ToString().c_str());
+ } else {
+ errs[i] = nullptr;
+ }
+ }
+ }
+}
+
+// Put a key inside a transaction
+void rocksdb_transaction_put(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, const char* val, size_t vlen,
+ char** errptr) {
+ SaveError(errptr, txn->rep->Put(Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transaction_put_cf(rocksdb_transaction_t* txn,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn->rep->Put(column_family->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+void rocksdb_transaction_set_commit_timestamp(rocksdb_transaction_t* txn,
+ uint64_t commit_timestamp) {
+ txn->rep->SetCommitTimestamp(commit_timestamp);
+}
+
+void rocksdb_transaction_set_read_timestamp_for_validation(
+ rocksdb_transaction_t* txn, uint64_t read_timestamp) {
+ txn->rep->SetReadTimestampForValidation(read_timestamp);
+}
+
+// Put a key outside a transaction
+void rocksdb_transactiondb_put(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr,
+ txn_db->rep->Put(options->rep, Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transactiondb_put_cf(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t keylen,
+ const char* val, size_t vallen,
+ char** errptr) {
+ SaveError(errptr, txn_db->rep->Put(options->rep, column_family->rep,
+ Slice(key, keylen), Slice(val, vallen)));
+}
+
+// Write batch into transaction db
+void rocksdb_transactiondb_write(rocksdb_transactiondb_t* db,
+ const rocksdb_writeoptions_t* options,
+ rocksdb_writebatch_t* batch, char** errptr) {
+ SaveError(errptr, db->rep->Write(options->rep, &batch->rep));
+}
+
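+// Example usage (illustrative sketch only): a plain write batch can be
+// applied atomically through the TransactionDB without opening a transaction.
+//
+//   char* err = NULL;
+//   rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+//   rocksdb_writebatch_put(wb, "a", 1, "1", 1);
+//   rocksdb_writebatch_put(wb, "b", 1, "2", 1);
+//   rocksdb_transactiondb_write(txn_db, woptions, wb, &err);
+//   rocksdb_writebatch_destroy(wb);
+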
+// Merge a key inside a transaction
+void rocksdb_transaction_merge(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, const char* val, size_t vlen,
+ char** errptr) {
+ SaveError(errptr, txn->rep->Merge(Slice(key, klen), Slice(val, vlen)));
+}
+
+void rocksdb_transaction_merge_cf(rocksdb_transaction_t* txn,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn->rep->Merge(column_family->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+// Merge a key outside a transaction
+void rocksdb_transactiondb_merge(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, const char* val,
+ size_t vlen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Merge(options->rep, Slice(key, klen),
+ Slice(val, vlen)));
+}
+
+void rocksdb_transactiondb_merge_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key, size_t klen,
+ const char* val, size_t vlen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Merge(options->rep, column_family->rep,
+ Slice(key, klen), Slice(val, vlen)));
+}
+
+// Delete a key inside a transaction
+void rocksdb_transaction_delete(rocksdb_transaction_t* txn, const char* key,
+ size_t klen, char** errptr) {
+ SaveError(errptr, txn->rep->Delete(Slice(key, klen)));
+}
+
+void rocksdb_transaction_delete_cf(
+ rocksdb_transaction_t* txn, rocksdb_column_family_handle_t* column_family,
+ const char* key, size_t klen, char** errptr) {
+ SaveError(errptr, txn->rep->Delete(column_family->rep, Slice(key, klen)));
+}
+
+// Delete a key outside a transaction
+void rocksdb_transactiondb_delete(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_writeoptions_t* options,
+ const char* key, size_t klen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Delete(options->rep, Slice(key, klen)));
+}
+
+void rocksdb_transactiondb_delete_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_writeoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr) {
+ SaveError(errptr, txn_db->rep->Delete(options->rep, column_family->rep,
+ Slice(key, keylen)));
+}
+
+// Create an iterator inside a transaction
+rocksdb_iterator_t* rocksdb_transaction_create_iterator(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn->rep->GetIterator(options->rep);
+ return result;
+}
+
+// Create an iterator inside a transaction with column family
+rocksdb_iterator_t* rocksdb_transaction_create_iterator_cf(
+ rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn->rep->GetIterator(options->rep, column_family->rep);
+ return result;
+}
+
+// Create an iterator outside a transaction
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn_db->rep->NewIterator(options->rep);
+ return result;
+}
+
+rocksdb_iterator_t* rocksdb_transactiondb_create_iterator_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family) {
+ rocksdb_iterator_t* result = new rocksdb_iterator_t;
+ result->rep = txn_db->rep->NewIterator(options->rep, column_family->rep);
+ return result;
+}
+
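+// Example usage (illustrative sketch only): an iterator created from a
+// transaction observes the transaction's own uncommitted writes on top of the
+// committed data, and must be destroyed before the transaction itself.
+//
+//   rocksdb_iterator_t* it = rocksdb_transaction_create_iterator(txn, ropts);
+//   for (rocksdb_iter_seek_to_first(it); rocksdb_iter_valid(it);
+//        rocksdb_iter_next(it)) {
+//     size_t klen, vlen;
+//     const char* k = rocksdb_iter_key(it, &klen);
+//     const char* v = rocksdb_iter_value(it, &vlen);
+//     // k/v remain valid only until the next call on the iterator.
+//   }
+//   rocksdb_iter_destroy(it);
+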
+void rocksdb_transactiondb_close(rocksdb_transactiondb_t* txn_db) {
+ delete txn_db->rep;
+ delete txn_db;
+}
+
+void rocksdb_transactiondb_flush_wal(rocksdb_transactiondb_t* txn_db,
+ unsigned char sync, char** errptr) {
+ SaveError(errptr, txn_db->rep->FlushWAL(sync));
+}
+
+void rocksdb_transactiondb_flush(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_flushoptions_t* options,
+ char** errptr) {
+ SaveError(errptr, txn_db->rep->Flush(options->rep));
+}
+
+void rocksdb_transactiondb_flush_cf(
+ rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, char** errptr) {
+ SaveError(errptr, txn_db->rep->Flush(options->rep, column_family->rep));
+}
+
+rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create(
+ rocksdb_transactiondb_t* txn_db, char** errptr) {
+ Checkpoint* checkpoint;
+ if (SaveError(errptr, Checkpoint::Create(txn_db->rep, &checkpoint))) {
+ return nullptr;
+ }
+ rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+ result->rep = checkpoint;
+ return result;
+}
+
+rocksdb_optimistictransactiondb_t* rocksdb_optimistictransactiondb_open(
+ const rocksdb_options_t* options, const char* name, char** errptr) {
+ OptimisticTransactionDB* otxn_db;
+ if (SaveError(errptr, OptimisticTransactionDB::Open(
+ options->rep, std::string(name), &otxn_db))) {
+ return nullptr;
+ }
+ rocksdb_optimistictransactiondb_t* result =
+ new rocksdb_optimistictransactiondb_t;
+ result->rep = otxn_db;
+ return result;
+}
+
+rocksdb_optimistictransactiondb_t*
+rocksdb_optimistictransactiondb_open_column_families(
+ const rocksdb_options_t* db_options, const char* name,
+ int num_column_families, const char* const* column_family_names,
+ const rocksdb_options_t* const* column_family_options,
+ rocksdb_column_family_handle_t** column_family_handles, char** errptr) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (int i = 0; i < num_column_families; i++) {
+ column_families.push_back(ColumnFamilyDescriptor(
+ std::string(column_family_names[i]),
+ ColumnFamilyOptions(column_family_options[i]->rep)));
+ }
+
+ OptimisticTransactionDB* otxn_db;
+ std::vector<ColumnFamilyHandle*> handles;
+ if (SaveError(errptr, OptimisticTransactionDB::Open(
+ DBOptions(db_options->rep), std::string(name),
+ column_families, &handles, &otxn_db))) {
+ return nullptr;
+ }
+
+ for (size_t i = 0; i < handles.size(); i++) {
+ rocksdb_column_family_handle_t* c_handle =
+ new rocksdb_column_family_handle_t;
+ c_handle->rep = handles[i];
+ column_family_handles[i] = c_handle;
+ }
+ rocksdb_optimistictransactiondb_t* result =
+ new rocksdb_optimistictransactiondb_t;
+ result->rep = otxn_db;
+ return result;
+}
+
+rocksdb_t* rocksdb_optimistictransactiondb_get_base_db(
+ rocksdb_optimistictransactiondb_t* otxn_db) {
+ DB* base_db = otxn_db->rep->GetBaseDB();
+
+ if (base_db != nullptr) {
+ rocksdb_t* result = new rocksdb_t;
+ result->rep = base_db;
+ return result;
+ }
+
+ return nullptr;
+}
+
+void rocksdb_optimistictransactiondb_close_base_db(rocksdb_t* base_db) {
+ delete base_db;
+}
+
+rocksdb_transaction_t* rocksdb_optimistictransaction_begin(
+ rocksdb_optimistictransactiondb_t* otxn_db,
+ const rocksdb_writeoptions_t* write_options,
+ const rocksdb_optimistictransaction_options_t* otxn_options,
+ rocksdb_transaction_t* old_txn) {
+ if (old_txn == nullptr) {
+ rocksdb_transaction_t* result = new rocksdb_transaction_t;
+ result->rep = otxn_db->rep->BeginTransaction(write_options->rep,
+ otxn_options->rep, nullptr);
+ return result;
+ }
+ old_txn->rep = otxn_db->rep->BeginTransaction(
+ write_options->rep, otxn_options->rep, old_txn->rep);
+ return old_txn;
+}
+
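+// Example usage (illustrative sketch only): with an optimistic transaction,
+// conflicts are detected at commit time, so the commit status must be checked
+// and the work retried on conflict. Passing the previous transaction as
+// old_txn (as above) reuses the object instead of allocating a new one.
+//
+//   char* err = NULL;
+//   rocksdb_transaction_t* txn = rocksdb_optimistictransaction_begin(
+//       otxn_db, write_options, otxn_options, NULL);
+//   rocksdb_transaction_put(txn, "k", 1, "v", 1, &err);
+//   rocksdb_transaction_commit(txn, &err);  // may fail on a write conflict
+//   rocksdb_transaction_destroy(txn);
+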
+// Write batch into OptimisticTransactionDB
+void rocksdb_optimistictransactiondb_write(
+ rocksdb_optimistictransactiondb_t* otxn_db,
+ const rocksdb_writeoptions_t* options, rocksdb_writebatch_t* batch,
+ char** errptr) {
+ SaveError(errptr, otxn_db->rep->Write(options->rep, &batch->rep));
+}
+
+void rocksdb_optimistictransactiondb_close(
+ rocksdb_optimistictransactiondb_t* otxn_db) {
+ delete otxn_db->rep;
+ delete otxn_db;
+}
+
+rocksdb_checkpoint_t* rocksdb_optimistictransactiondb_checkpoint_object_create(
+ rocksdb_optimistictransactiondb_t* otxn_db, char** errptr) {
+ Checkpoint* checkpoint;
+ if (SaveError(errptr, Checkpoint::Create(otxn_db->rep, &checkpoint))) {
+ return nullptr;
+ }
+ rocksdb_checkpoint_t* result = new rocksdb_checkpoint_t;
+ result->rep = checkpoint;
+ return result;
+}
+
+void rocksdb_free(void* ptr) { free(ptr); }
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned(
+ rocksdb_t* db, const rocksdb_readoptions_t* options, const char* key,
+ size_t keylen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = db->rep->Get(options->rep, db->rep->DefaultColumnFamily(),
+ Slice(key, keylen), &v->rep);
+ if (!s.ok()) {
+ delete (v);
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+rocksdb_pinnableslice_t* rocksdb_get_pinned_cf(
+ rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family, const char* key,
+ size_t keylen, char** errptr) {
+ rocksdb_pinnableslice_t* v = new (rocksdb_pinnableslice_t);
+ Status s = db->rep->Get(options->rep, column_family->rep, Slice(key, keylen),
+ &v->rep);
+ if (!s.ok()) {
+ delete v;
+ if (!s.IsNotFound()) {
+ SaveError(errptr, s);
+ }
+ return nullptr;
+ }
+ return v;
+}
+
+void rocksdb_pinnableslice_destroy(rocksdb_pinnableslice_t* v) { delete v; }
+
+const char* rocksdb_pinnableslice_value(const rocksdb_pinnableslice_t* v,
+ size_t* vlen) {
+ if (!v) {
+ *vlen = 0;
+ return nullptr;
+ }
+
+ *vlen = v->rep.size();
+ return v->rep.data();
+}
+
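+// Example usage (illustrative sketch only): a pinned read avoids copying the
+// value out of the block cache. The pointer returned by
+// rocksdb_pinnableslice_value() stays valid only until the slice is
+// destroyed.
+//
+//   char* err = NULL;
+//   rocksdb_pinnableslice_t* p = rocksdb_get_pinned(db, ropts, "key", 3, &err);
+//   if (p != NULL) {
+//     size_t vlen;
+//     const char* v = rocksdb_pinnableslice_value(p, &vlen);
+//     // ... use v/vlen ...
+//     rocksdb_pinnableslice_destroy(p);
+//   }
+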
+// container to keep databases and caches in order to use
+// ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_consumers_t {
+ std::vector<rocksdb_t*> dbs;
+ std::unordered_set<rocksdb_cache_t*> caches;
+};
+
+// initializes new container of memory consumers
+rocksdb_memory_consumers_t* rocksdb_memory_consumers_create() {
+ return new rocksdb_memory_consumers_t;
+}
+
+// adds a database to the container of memory consumers
+void rocksdb_memory_consumers_add_db(rocksdb_memory_consumers_t* consumers,
+ rocksdb_t* db) {
+ consumers->dbs.push_back(db);
+}
+
+// adds cache to the container of memory consumers
+void rocksdb_memory_consumers_add_cache(rocksdb_memory_consumers_t* consumers,
+ rocksdb_cache_t* cache) {
+ consumers->caches.insert(cache);
+}
+
+// deletes container with memory consumers
+void rocksdb_memory_consumers_destroy(rocksdb_memory_consumers_t* consumers) {
+ delete consumers;
+}
+
+// contains memory usage statistics provided by ROCKSDB_NAMESPACE::MemoryUtil
+struct rocksdb_memory_usage_t {
+ uint64_t mem_table_total;
+ uint64_t mem_table_unflushed;
+ uint64_t mem_table_readers_total;
+ uint64_t cache_total;
+};
+
+// estimates amount of memory occupied by consumers (dbs and caches)
+rocksdb_memory_usage_t* rocksdb_approximate_memory_usage_create(
+ rocksdb_memory_consumers_t* consumers, char** errptr) {
+ vector<DB*> dbs;
+ for (auto db : consumers->dbs) {
+ dbs.push_back(db->rep);
+ }
+
+ unordered_set<const Cache*> cache_set;
+ for (auto cache : consumers->caches) {
+ cache_set.insert(const_cast<const Cache*>(cache->rep.get()));
+ }
+
+ std::map<ROCKSDB_NAMESPACE::MemoryUtil::UsageType, uint64_t> usage_by_type;
+
+ auto status = MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
+ &usage_by_type);
+ if (SaveError(errptr, status)) {
+ return nullptr;
+ }
+
+ auto result = new rocksdb_memory_usage_t;
+ result->mem_table_total = usage_by_type[MemoryUtil::kMemTableTotal];
+ result->mem_table_unflushed = usage_by_type[MemoryUtil::kMemTableUnFlushed];
+ result->mem_table_readers_total =
+ usage_by_type[MemoryUtil::kTableReadersTotal];
+ result->cache_total = usage_by_type[MemoryUtil::kCacheTotal];
+ return result;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_unflushed(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_unflushed;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_mem_table_readers_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->mem_table_readers_total;
+}
+
+uint64_t rocksdb_approximate_memory_usage_get_cache_total(
+ rocksdb_memory_usage_t* memory_usage) {
+ return memory_usage->cache_total;
+}
+
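+// Example usage (illustrative sketch only): register the DBs and caches of
+// interest, take a usage snapshot, read the counters, and destroy both
+// objects afterwards.
+//
+//   char* err = NULL;
+//   rocksdb_memory_consumers_t* consumers = rocksdb_memory_consumers_create();
+//   rocksdb_memory_consumers_add_db(consumers, db);
+//   rocksdb_memory_consumers_add_cache(consumers, cache);
+//   rocksdb_memory_usage_t* usage =
+//       rocksdb_approximate_memory_usage_create(consumers, &err);
+//   if (usage != NULL) {
+//     uint64_t memtables =
+//         rocksdb_approximate_memory_usage_get_mem_table_total(usage);
+//     rocksdb_approximate_memory_usage_destroy(usage);
+//   }
+//   rocksdb_memory_consumers_destroy(consumers);
+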
+void rocksdb_options_set_dump_malloc_stats(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.dump_malloc_stats = val;
+}
+
+void rocksdb_options_set_memtable_whole_key_filtering(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.memtable_whole_key_filtering = val;
+}
+
+void rocksdb_options_set_avoid_unnecessary_blocking_io(rocksdb_options_t* opt,
+ unsigned char val) {
+ opt->rep.avoid_unnecessary_blocking_io = val;
+}
+
+unsigned char rocksdb_options_get_avoid_unnecessary_blocking_io(
+ rocksdb_options_t* opt) {
+ return opt->rep.avoid_unnecessary_blocking_io;
+}
+
+// deletes container with memory usage estimates
+void rocksdb_approximate_memory_usage_destroy(rocksdb_memory_usage_t* usage) {
+ delete usage;
+}
+
+void rocksdb_cancel_all_background_work(rocksdb_t* db, unsigned char wait) {
+ CancelAllBackgroundWork(db->rep, wait);
+}
+
+void rocksdb_disable_manual_compaction(rocksdb_t* db) {
+ db->rep->DisableManualCompaction();
+}
+
+void rocksdb_enable_manual_compaction(rocksdb_t* db) {
+ db->rep->EnableManualCompaction();
+}
+
+} // end extern "C"
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/c_test.c b/src/rocksdb/db/c_test.c
new file mode 100644
index 000000000..249ab9023
--- /dev/null
+++ b/src/rocksdb/db/c_test.c
@@ -0,0 +1,3476 @@
+/* Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+ Use of this source code is governed by a BSD-style license that can be
+ found in the LICENSE file. See the AUTHORS file for names of contributors. */
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <stdio.h>
+
+#ifndef ROCKSDB_LITE // Lite does not support C API
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "rocksdb/c.h"
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#include <inttypes.h>
+
+// Cannot use port/port.h macros as this is a C file
+#ifdef OS_WIN
+#include <windows.h>
+
+// Combining process and thread ids is good enough for test-path uniqueness
+int geteuid() {
+ int result = 0;
+
+ result = ((int)GetCurrentProcessId() << 16);
+ result |= (int)GetCurrentThreadId();
+
+ return result;
+}
+
+#endif
+
+const char* phase = "";
+static char dbname[200];
+static char sstfilename[200];
+static char dbbackupname[200];
+static char dbcheckpointname[200];
+static char dbpathname[200];
+static char secondary_path[200];
+
+static void StartPhase(const char* name) {
+ fprintf(stderr, "=== Test %s\n", name);
+ phase = name;
+}
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4996) // getenv security warning
+#endif
+static const char* GetTempDir(void) {
+ const char* ret = getenv("TEST_TMPDIR");
+ if (ret == NULL || ret[0] == '\0')
+#ifdef OS_WIN
+ ret = getenv("TEMP");
+#else
+ ret = "/tmp";
+#endif
+ return ret;
+}
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#define CheckNoError(err) \
+ if ((err) != NULL) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, (err)); \
+ abort(); \
+ }
+
+#define CheckCondition(cond) \
+ if (!(cond)) { \
+ fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__, phase, #cond); \
+ abort(); \
+ }
+
+static void CheckEqual(const char* expected, const char* v, size_t n) {
+ if (expected == NULL && v == NULL) {
+ // ok
+ } else if (expected != NULL && v != NULL && n == strlen(expected) &&
+ memcmp(expected, v, n) == 0) {
+ // ok
+ return;
+ } else {
+ fprintf(stderr, "%s: expected '%s', got '%s'\n", phase,
+ (expected ? expected : "(null)"), (v ? v : "(null)"));
+ abort();
+ }
+}
+
+static void Free(char** ptr) {
+ if (*ptr) {
+ free(*ptr);
+ *ptr = NULL;
+ }
+}
+
+static void CheckValue(char* err, const char* expected, char** actual,
+ size_t actual_length) {
+ CheckNoError(err);
+ CheckEqual(expected, *actual, actual_length);
+ Free(actual);
+}
+
+static void CheckGet(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_get(db, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* handle, const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_get_cf(db, options, handle, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckPinGet(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ const char* val;
+ rocksdb_pinnableslice_t* p;
+ p = rocksdb_get_pinned(db, options, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckPinGetCF(rocksdb_t* db, const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* handle,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ const char* val;
+ rocksdb_pinnableslice_t* p;
+ p = rocksdb_get_pinned_cf(db, options, handle, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckMultiGetValues(size_t num_keys, char** values,
+ size_t* values_sizes, char** errs,
+ const char** expected) {
+ for (size_t i = 0; i < num_keys; i++) {
+ CheckNoError(errs[i]);
+ CheckEqual(expected[i], values[i], values_sizes[i]);
+ Free(&values[i]);
+ }
+}
+
+static void CheckIter(rocksdb_iterator_t* iter, const char* key,
+ const char* val) {
+ size_t len;
+ const char* str;
+ str = rocksdb_iter_key(iter, &len);
+ CheckEqual(key, str, len);
+ str = rocksdb_iter_value(iter, &len);
+ CheckEqual(val, str, len);
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckPut(void* ptr, const char* k, size_t klen, const char* v,
+ size_t vlen) {
+ int* state = (int*)ptr;
+ CheckCondition(*state < 2);
+ switch (*state) {
+ case 0:
+ CheckEqual("bar", k, klen);
+ CheckEqual("b", v, vlen);
+ break;
+ case 1:
+ CheckEqual("box", k, klen);
+ CheckEqual("c", v, vlen);
+ break;
+ }
+ (*state)++;
+}
+
+// Callback from rocksdb_writebatch_iterate()
+static void CheckDel(void* ptr, const char* k, size_t klen) {
+ int* state = (int*)ptr;
+ CheckCondition(*state == 2);
+ CheckEqual("bar", k, klen);
+ (*state)++;
+}
+
+static void CmpDestroy(void* arg) { (void)arg; }
+
+static int CmpCompare(void* arg, const char* a, size_t alen, const char* b,
+ size_t blen) {
+ (void)arg;
+ size_t n = (alen < blen) ? alen : blen;
+ int r = memcmp(a, b, n);
+ if (r == 0) {
+ if (alen < blen)
+ r = -1;
+ else if (alen > blen)
+ r = +1;
+ }
+ return r;
+}
+
+static const char* CmpName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+
+// Custom compaction filter
+static void CFilterDestroy(void* arg) { (void)arg; }
+static const char* CFilterName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+static unsigned char CFilterFilter(void* arg, int level, const char* key,
+ size_t key_length,
+ const char* existing_value,
+ size_t value_length, char** new_value,
+ size_t* new_value_length,
+ unsigned char* value_changed) {
+ (void)arg;
+ (void)level;
+ (void)existing_value;
+ (void)value_length;
+ if (key_length == 3) {
+ if (memcmp(key, "bar", key_length) == 0) {
+ return 1;
+ } else if (memcmp(key, "baz", key_length) == 0) {
+ *value_changed = 1;
+ *new_value = "newbazvalue";
+ *new_value_length = 11;
+ return 0;
+ }
+ }
+ return 0;
+}
+
+static void CFilterFactoryDestroy(void* arg) { (void)arg; }
+static const char* CFilterFactoryName(void* arg) {
+ (void)arg;
+ return "foo";
+}
+static rocksdb_compactionfilter_t* CFilterCreate(
+ void* arg, rocksdb_compactionfiltercontext_t* context) {
+ (void)arg;
+ (void)context;
+ return rocksdb_compactionfilter_create(NULL, CFilterDestroy, CFilterFilter,
+ CFilterName);
+}
+
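+// Illustrative sketch (assuming the factory constructors declared in
+// rocksdb/c.h) of how the callbacks above get wired into options: keys equal
+// to "bar" are dropped during compaction and "baz" is rewritten to
+// "newbazvalue".
+//
+//   rocksdb_compactionfilterfactory_t* factory =
+//       rocksdb_compactionfilterfactory_create(
+//           NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName);
+//   rocksdb_options_set_compaction_filter_factory(options, factory);
+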
+void CheckMetaData(rocksdb_column_family_metadata_t* cf_meta,
+ const char* expected_cf_name) {
+ char* cf_name = rocksdb_column_family_metadata_get_name(cf_meta);
+ assert(strcmp(cf_name, expected_cf_name) == 0);
+ rocksdb_free(cf_name);
+
+ size_t cf_size = rocksdb_column_family_metadata_get_size(cf_meta);
+ assert(cf_size > 0);
+ size_t cf_file_count = rocksdb_column_family_metadata_get_file_count(cf_meta);
+ assert(cf_file_count > 0);
+
+ uint64_t total_level_size = 0;
+ size_t total_file_count = 0;
+ size_t level_count = rocksdb_column_family_metadata_get_level_count(cf_meta);
+ assert(level_count > 0);
+ for (size_t l = 0; l < level_count; ++l) {
+ rocksdb_level_metadata_t* level_meta =
+ rocksdb_column_family_metadata_get_level_metadata(cf_meta, l);
+ assert(level_meta);
+ assert(rocksdb_level_metadata_get_level(level_meta) >= (int)l);
+ uint64_t level_size = rocksdb_level_metadata_get_size(level_meta);
+ uint64_t file_size_in_level = 0;
+
+ size_t file_count = rocksdb_level_metadata_get_file_count(level_meta);
+ total_file_count += file_count;
+ for (size_t f = 0; f < file_count; ++f) {
+ rocksdb_sst_file_metadata_t* file_meta =
+ rocksdb_level_metadata_get_sst_file_metadata(level_meta, f);
+ assert(file_meta);
+
+ uint64_t file_size = rocksdb_sst_file_metadata_get_size(file_meta);
+ assert(file_size > 0);
+ file_size_in_level += file_size;
+
+ char* file_name =
+ rocksdb_sst_file_metadata_get_relative_filename(file_meta);
+ assert(file_name);
+ assert(strlen(file_name) > 0);
+ rocksdb_free(file_name);
+
+ size_t smallest_key_len;
+ char* smallest_key = rocksdb_sst_file_metadata_get_smallestkey(
+ file_meta, &smallest_key_len);
+ assert(smallest_key);
+ assert(smallest_key_len > 0);
+ size_t largest_key_len;
+ char* largest_key =
+ rocksdb_sst_file_metadata_get_largestkey(file_meta, &largest_key_len);
+ assert(largest_key);
+ assert(largest_key_len > 0);
+ rocksdb_free(smallest_key);
+ rocksdb_free(largest_key);
+
+ rocksdb_sst_file_metadata_destroy(file_meta);
+ }
+ assert(level_size == file_size_in_level);
+ total_level_size += level_size;
+ rocksdb_level_metadata_destroy(level_meta);
+ }
+ assert(total_file_count > 0);
+ assert(cf_size == total_level_size);
+}
+
+void GetAndCheckMetaData(rocksdb_t* db) {
+ rocksdb_column_family_metadata_t* cf_meta =
+ rocksdb_get_column_family_metadata(db);
+
+ CheckMetaData(cf_meta, "default");
+
+ rocksdb_column_family_metadata_destroy(cf_meta);
+}
+
+void GetAndCheckMetaDataCf(rocksdb_t* db,
+ rocksdb_column_family_handle_t* handle,
+ const char* cf_name) {
+ // Compact to make sure we have at least one sst file to obtain metadata.
+ rocksdb_compact_range_cf(db, handle, NULL, 0, NULL, 0);
+
+ rocksdb_column_family_metadata_t* cf_meta =
+ rocksdb_get_column_family_metadata_cf(db, handle);
+
+ CheckMetaData(cf_meta, cf_name);
+
+ rocksdb_column_family_metadata_destroy(cf_meta);
+}
+
+static rocksdb_t* CheckCompaction(rocksdb_t* db, rocksdb_options_t* options,
+ rocksdb_readoptions_t* roptions,
+ rocksdb_writeoptions_t* woptions) {
+ char* err = NULL;
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "foovalue");
+ rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", "barvalue");
+ rocksdb_put(db, woptions, "baz", 3, "bazvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "baz", "bazvalue");
+
+ // Disable compaction
+ rocksdb_disable_manual_compaction(db);
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ // should not filter anything when disabled
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+ CheckGet(db, roptions, "baz", "bazvalue");
+ // Reenable compaction
+ rocksdb_enable_manual_compaction(db);
+
+ // Force compaction
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ // should have filtered bar, but not foo
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "baz", "newbazvalue");
+
+ rocksdb_suggest_compact_range(db, "bar", 3, "foo", 3, &err);
+ GetAndCheckMetaData(db);
+ CheckNoError(err);
+
+ return db;
+}
+
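+// Illustrative sketch of how CheckCompaction is driven (a compaction-filter
+// style phase, assuming the filter constructors declared in rocksdb/c.h): a
+// fresh DB is opened with CFilterFilter installed so the manual compaction
+// inside CheckCompaction exercises the filter.
+//
+//   rocksdb_options_t* options_with_filter = rocksdb_options_create();
+//   rocksdb_options_set_create_if_missing(options_with_filter, 1);
+//   rocksdb_compactionfilter_t* cfilter = rocksdb_compactionfilter_create(
+//       NULL, CFilterDestroy, CFilterFilter, CFilterName);
+//   rocksdb_options_set_compaction_filter(options_with_filter, cfilter);
+//   db = CheckCompaction(db, options_with_filter, roptions, woptions);
+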
+// Custom merge operator
+static void MergeOperatorDestroy(void* arg) { (void)arg; }
+static const char* MergeOperatorName(void* arg) {
+ (void)arg;
+ return "TestMergeOperator";
+}
+static char* MergeOperatorFullMerge(
+ void* arg, const char* key, size_t key_length, const char* existing_value,
+ size_t existing_value_length, const char* const* operands_list,
+ const size_t* operands_list_length, int num_operands,
+ unsigned char* success, size_t* new_value_length) {
+ (void)arg;
+ (void)key;
+ (void)key_length;
+ (void)existing_value;
+ (void)existing_value_length;
+ (void)operands_list;
+ (void)operands_list_length;
+ (void)num_operands;
+ *new_value_length = 4;
+ *success = 1;
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+static char* MergeOperatorPartialMerge(void* arg, const char* key,
+ size_t key_length,
+ const char* const* operands_list,
+ const size_t* operands_list_length,
+ int num_operands, unsigned char* success,
+ size_t* new_value_length) {
+ (void)arg;
+ (void)key;
+ (void)key_length;
+ (void)operands_list;
+ (void)operands_list_length;
+ (void)num_operands;
+ *new_value_length = 4;
+ *success = 1;
+ char* result = malloc(4);
+ memcpy(result, "fake", 4);
+ return result;
+}
+
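+// Illustrative sketch (assuming rocksdb_mergeoperator_create and
+// rocksdb_options_set_merge_operator as declared in rocksdb/c.h; the NULL
+// argument is the optional delete-value callback) of registering the merge
+// callbacks above:
+//
+//   rocksdb_mergeoperator_t* merge_op = rocksdb_mergeoperator_create(
+//       NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
+//       MergeOperatorPartialMerge, NULL, MergeOperatorName);
+//   rocksdb_options_set_merge_operator(options, merge_op);
+//   // With these options, rocksdb_merge() on any key resolves to the literal
+//   // value "fake" produced by the callbacks above.
+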
+static void CheckTxnGet(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options, const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transaction_get(txn, options, key, strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnGetCF(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transaction_get_cf(txn, options, column_family, key,
+ strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnPinGet(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ const char* key, const char* expected) {
+ rocksdb_pinnableslice_t* p = NULL;
+ const char* val = NULL;
+ char* err = NULL;
+ size_t val_len;
+ p = rocksdb_transaction_get_pinned(txn, options, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckTxnPinGetCF(rocksdb_transaction_t* txn,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ rocksdb_pinnableslice_t* p = NULL;
+ const char* val = NULL;
+ char* err = NULL;
+ size_t val_len;
+ p = rocksdb_transaction_get_pinned_cf(txn, options, column_family, key,
+ strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckTxnDBGet(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options, const char* key,
+ const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transactiondb_get(txn_db, options, key, strlen(key), &val_len,
+ &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ char* err = NULL;
+ size_t val_len;
+ char* val;
+ val = rocksdb_transactiondb_get_cf(txn_db, options, column_family, key,
+ strlen(key), &val_len, &err);
+ CheckNoError(err);
+ CheckEqual(expected, val, val_len);
+ Free(&val);
+}
+
+static void CheckTxnDBPinGet(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ const char* key, const char* expected) {
+ rocksdb_pinnableslice_t* p = NULL;
+ const char* val = NULL;
+ char* err = NULL;
+ size_t val_len;
+ p = rocksdb_transactiondb_get_pinned(txn_db, options, key, strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void CheckTxnDBPinGetCF(rocksdb_transactiondb_t* txn_db,
+ const rocksdb_readoptions_t* options,
+ rocksdb_column_family_handle_t* column_family,
+ const char* key, const char* expected) {
+ rocksdb_pinnableslice_t* p = NULL;
+ const char* val = NULL;
+ char* err = NULL;
+ size_t val_len;
+ p = rocksdb_transactiondb_get_pinned_cf(txn_db, options, column_family, key,
+ strlen(key), &err);
+ CheckNoError(err);
+ val = rocksdb_pinnableslice_value(p, &val_len);
+ CheckEqual(expected, val, val_len);
+ rocksdb_pinnableslice_destroy(p);
+}
+
+static void LoadAndCheckLatestOptions(const char* db_name, rocksdb_env_t* env,
+ bool ignore_unknown_options,
+ rocksdb_cache_t* cache,
+ rocksdb_comparator_t* cmp,
+ const size_t expected_num_column_families,
+ const char** expected_cf_names,
+ const char* expected_open_err) {
+ rocksdb_options_t* db_options;
+ size_t num_column_families;
+ char** list_column_family_names;
+ rocksdb_options_t** list_column_family_options;
+ char* err = 0;
+
+ // load the latest rocksdb options
+ rocksdb_load_latest_options(db_name, env, ignore_unknown_options, cache,
+ &db_options, &num_column_families,
+ &list_column_family_names,
+ &list_column_family_options, &err);
+ assert(num_column_families == expected_num_column_families);
+ CheckNoError(err);
+
+ // verify the loaded options by opening the db.
+ rocksdb_options_set_error_if_exists(db_options, 0);
+
+ char** list_const_cf_names =
+ (char**)malloc(num_column_families * sizeof(char*));
+ rocksdb_options_t** list_const_cf_options = (rocksdb_options_t**)malloc(
+ num_column_families * sizeof(rocksdb_options_t*));
+ for (size_t i = 0; i < num_column_families; ++i) {
+ assert(strcmp(list_column_family_names[i], expected_cf_names[i]) == 0);
+ list_const_cf_names[i] = list_column_family_names[i];
+ if (cmp) {
+ rocksdb_options_set_comparator(list_column_family_options[i], cmp);
+ }
+ list_const_cf_options[i] = list_column_family_options[i];
+ }
+ rocksdb_column_family_handle_t** handles =
+ (rocksdb_column_family_handle_t**)malloc(
+ num_column_families * sizeof(rocksdb_column_family_handle_t*));
+
+ rocksdb_t* db = rocksdb_open_column_families(
+ db_options, db_name, (int)num_column_families,
+ (const char* const*)list_const_cf_names,
+ (const rocksdb_options_t* const*)list_const_cf_options, handles, &err);
+ if (expected_open_err == NULL) {
+ CheckNoError(err);
+ for (size_t i = 0; i < num_column_families; ++i) {
+ rocksdb_column_family_handle_destroy(handles[i]);
+ }
+ free(handles);
+ rocksdb_close(db);
+ } else {
+ assert(err != NULL);
+ assert(strcmp(err, expected_open_err) == 0);
+ free(handles);
+ free(err);
+ }
+
+ free(list_const_cf_names);
+ free(list_const_cf_options);
+ rocksdb_load_latest_options_destroy(db_options, list_column_family_names,
+ list_column_family_options,
+ num_column_families);
+}
+
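+// Illustrative sketch of a typical invocation (the column family names are
+// placeholders for whatever the test created earlier; passing NULL for cmp
+// keeps the default comparator):
+//
+//   const char* expected_cf_names[2] = {"default", "cf1"};
+//   LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 2,
+//                             expected_cf_names, NULL);
+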
+int main(int argc, char** argv) {
+ (void)argc;
+ (void)argv;
+ rocksdb_t* db;
+ rocksdb_comparator_t* cmp;
+ rocksdb_cache_t* cache;
+ rocksdb_dbpath_t* dbpath;
+ rocksdb_env_t* env;
+ rocksdb_options_t* options;
+ rocksdb_compactoptions_t* coptions;
+ rocksdb_block_based_table_options_t* table_options;
+ rocksdb_readoptions_t* roptions;
+ rocksdb_writeoptions_t* woptions;
+ rocksdb_ratelimiter_t* rate_limiter;
+ rocksdb_transactiondb_t* txn_db;
+ rocksdb_transactiondb_options_t* txn_db_options;
+ rocksdb_transaction_t* txn;
+ rocksdb_transaction_options_t* txn_options;
+ rocksdb_optimistictransactiondb_t* otxn_db;
+ rocksdb_optimistictransaction_options_t* otxn_options;
+ char* err = NULL;
+ int run = -1;
+
+ snprintf(dbname, sizeof(dbname), "%s/rocksdb_c_test-%d", GetTempDir(),
+ ((int)geteuid()));
+
+ snprintf(dbbackupname, sizeof(dbbackupname), "%s/rocksdb_c_test-%d-backup",
+ GetTempDir(), ((int)geteuid()));
+
+ snprintf(dbcheckpointname, sizeof(dbcheckpointname),
+ "%s/rocksdb_c_test-%d-checkpoint", GetTempDir(), ((int)geteuid()));
+
+ snprintf(sstfilename, sizeof(sstfilename), "%s/rocksdb_c_test-%d-sst",
+ GetTempDir(), ((int)geteuid()));
+
+ snprintf(dbpathname, sizeof(dbpathname), "%s/rocksdb_c_test-%d-dbpath",
+ GetTempDir(), ((int)geteuid()));
+
+ StartPhase("create_objects");
+ cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName);
+ dbpath = rocksdb_dbpath_create(dbpathname, 1024 * 1024);
+ env = rocksdb_create_default_env();
+
+ rocksdb_create_dir_if_missing(env, GetTempDir(), &err);
+ CheckNoError(err);
+
+ cache = rocksdb_cache_create_lru(100000);
+
+ options = rocksdb_options_create();
+ rocksdb_options_set_comparator(options, cmp);
+ rocksdb_options_set_error_if_exists(options, 1);
+ rocksdb_options_set_env(options, env);
+ rocksdb_options_set_info_log(options, NULL);
+ rocksdb_options_set_write_buffer_size(options, 100000);
+ rocksdb_options_set_paranoid_checks(options, 1);
+ rocksdb_options_set_max_open_files(options, 10);
+
+ table_options = rocksdb_block_based_options_create();
+ rocksdb_block_based_options_set_block_cache(table_options, cache);
+ rocksdb_block_based_options_set_data_block_index_type(table_options, 1);
+ rocksdb_block_based_options_set_data_block_hash_ratio(table_options, 0.75);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+
+ rocksdb_options_set_compression(options, rocksdb_no_compression);
+ rocksdb_options_set_compression_options(options, -14, -1, 0, 0);
+ int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
+ rocksdb_no_compression, rocksdb_no_compression};
+ rocksdb_options_set_compression_per_level(options, compression_levels, 4);
+ rate_limiter = rocksdb_ratelimiter_create(1000 * 1024 * 1024, 100 * 1000, 10);
+ rocksdb_options_set_ratelimiter(options, rate_limiter);
+ rocksdb_ratelimiter_destroy(rate_limiter);
+
+ roptions = rocksdb_readoptions_create();
+ rocksdb_readoptions_set_verify_checksums(roptions, 1);
+ rocksdb_readoptions_set_fill_cache(roptions, 1);
+
+ woptions = rocksdb_writeoptions_create();
+ rocksdb_writeoptions_set_sync(woptions, 1);
+
+ coptions = rocksdb_compactoptions_create();
+ rocksdb_compactoptions_set_exclusive_manual_compaction(coptions, 1);
+
+ rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000,
+ 10001);
+
+ StartPhase("destroy");
+ rocksdb_destroy_db(options, dbname, &err);
+ Free(&err);
+
+ StartPhase("open_error");
+ rocksdb_open(options, dbname, &err);
+ CheckCondition(err != NULL);
+ Free(&err);
+
+ StartPhase("open");
+ rocksdb_options_set_create_if_missing(options, 1);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+
+ StartPhase("put");
+ rocksdb_put(db, woptions, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("backup_and_restore");
+ {
+ rocksdb_destroy_db(options, dbbackupname, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_t* be =
+ rocksdb_backup_engine_open(options, dbbackupname, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_create_new_backup(be, db, &err);
+ CheckNoError(err);
+
+ // need a change to trigger a new backup
+ rocksdb_delete(db, woptions, "does-not-exist", 14, &err);
+ CheckNoError(err);
+
+ rocksdb_backup_engine_create_new_backup(be, db, &err);
+ CheckNoError(err);
+
+ const rocksdb_backup_engine_info_t* bei =
+ rocksdb_backup_engine_get_backup_info(be);
+ CheckCondition(rocksdb_backup_engine_info_count(bei) > 1);
+ rocksdb_backup_engine_info_destroy(bei);
+
+ rocksdb_backup_engine_purge_old_backups(be, 1, &err);
+ CheckNoError(err);
+
+ bei = rocksdb_backup_engine_get_backup_info(be);
+ CheckCondition(rocksdb_backup_engine_info_count(bei) == 1);
+ rocksdb_backup_engine_info_destroy(bei);
+
+ rocksdb_delete(db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_close(db);
+
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_restore_options_t* restore_options =
+ rocksdb_restore_options_create();
+ rocksdb_restore_options_set_keep_log_files(restore_options, 0);
+ rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname,
+ restore_options, &err);
+ CheckNoError(err);
+ rocksdb_restore_options_destroy(restore_options);
+
+ rocksdb_options_set_error_if_exists(options, 0);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_options_set_error_if_exists(options, 1);
+
+ CheckGet(db, roptions, "foo", "hello");
+
+ rocksdb_backup_engine_close(be);
+ }
+
+ StartPhase("checkpoint");
+ {
+ rocksdb_destroy_db(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ rocksdb_checkpoint_t* checkpoint =
+ rocksdb_checkpoint_object_create(db, &err);
+ CheckNoError(err);
+
+ rocksdb_checkpoint_create(checkpoint, dbcheckpointname, 0, &err);
+ CheckNoError(err);
+
+ // start a new database from the checkpoint
+ rocksdb_close(db);
+ rocksdb_options_set_error_if_exists(options, 0);
+ db = rocksdb_open(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ CheckGet(db, roptions, "foo", "hello");
+
+ rocksdb_checkpoint_object_destroy(checkpoint);
+
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbcheckpointname, &err);
+ CheckNoError(err);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_options_set_error_if_exists(options, 1);
+ }
+
+ StartPhase("compactall");
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactrange");
+ rocksdb_compact_range(db, "a", 1, "z", 1);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactallopt");
+ rocksdb_compact_range_opt(db, coptions, NULL, 0, NULL, 0);
+ CheckGet(db, roptions, "foo", "hello");
+
+ StartPhase("compactrangeopt");
+ rocksdb_compact_range_opt(db, coptions, "a", 1, "z", 1);
+ CheckGet(db, roptions, "foo", "hello");
+
+ // Simple check cache usage
+ StartPhase("cache_usage");
+ {
+ rocksdb_readoptions_set_pin_data(roptions, 1);
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ rocksdb_iter_seek(iter, "foo", 3);
+
+ size_t usage = rocksdb_cache_get_usage(cache);
+ CheckCondition(usage > 0);
+
+ size_t pin_usage = rocksdb_cache_get_pinned_usage(cache);
+ CheckCondition(pin_usage > 0);
+
+ rocksdb_iter_next(iter);
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_pin_data(roptions, 0);
+ }
+
+ StartPhase("addfile");
+ {
+ rocksdb_envoptions_t* env_opt = rocksdb_envoptions_create();
+ rocksdb_options_t* io_options = rocksdb_options_create();
+ rocksdb_sstfilewriter_t* writer =
+ rocksdb_sstfilewriter_create(env_opt, io_options);
+
+ remove(sstfilename);
+ rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk1", 5, "v1", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v2", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v3", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_finish(writer, &err);
+ CheckNoError(err);
+
+ rocksdb_ingestexternalfileoptions_t* ing_opt =
+ rocksdb_ingestexternalfileoptions_create();
+ const char* file_list[1] = {sstfilename};
+ rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "sstk1", "v1");
+ CheckGet(db, roptions, "sstk2", "v2");
+ CheckGet(db, roptions, "sstk3", "v3");
+
+ remove(sstfilename);
+ rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk2", 5, "v4", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk22", 6, "v5", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "sstk3", 5, "v6", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_finish(writer, &err);
+ CheckNoError(err);
+
+ rocksdb_ingest_external_file(db, file_list, 1, ing_opt, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "sstk1", "v1");
+ CheckGet(db, roptions, "sstk2", "v4");
+ CheckGet(db, roptions, "sstk22", "v5");
+ CheckGet(db, roptions, "sstk3", "v6");
+
+ rocksdb_sstfilewriter_open(writer, sstfilename, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "abc1", 4, "v7", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "abc2", 4, "v8", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "abc3", 4, "v9", 2, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_put(writer, "abc4", 4, "v10", 3, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_delete_range(writer, "abc1", 4, "abc4", 4, &err);
+ CheckNoError(err);
+ rocksdb_sstfilewriter_finish(writer, &err);
+ CheckNoError(err);
+
+ rocksdb_ingestexternalfileoptions_destroy(ing_opt);
+ rocksdb_sstfilewriter_destroy(writer);
+ rocksdb_options_destroy(io_options);
+ rocksdb_envoptions_destroy(env_opt);
+
+ // Delete all keys we just ingested
+ rocksdb_delete(db, woptions, "sstk1", 5, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk2", 5, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk22", 6, &err);
+ CheckNoError(err);
+ rocksdb_delete(db, woptions, "sstk3", 5, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("writebatch");
+ {
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+ rocksdb_writebatch_delete(wb, "bar", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ int pos = 0;
+ rocksdb_writebatch_iterate(wb, &pos, CheckPut, CheckDel);
+ CheckCondition(pos == 3);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "bay", 3, "d", 1);
+ rocksdb_writebatch_delete_range(wb, "bar", 3, "bay", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "bay", "d");
+ rocksdb_writebatch_clear(wb);
+ const char* start_list[1] = {"bay"};
+ const size_t start_sizes[1] = {3};
+ const char* end_list[1] = {"baz"};
+ const size_t end_sizes[1] = {3};
+ rocksdb_writebatch_delete_rangev(wb, 1, start_list, start_sizes, end_list,
+ end_sizes);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bay", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_vectors");
+ {
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", "xyz");
+ rocksdb_writebatch_delete(wb, "zap", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_savepoint");
+ {
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_set_save_point(wb);
+ rocksdb_writebatch_set_save_point(wb);
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_pop_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_writebatch_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_writebatch_rollback_to_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_destroy(wb);
+ }
+
+ StartPhase("writebatch_rep");
+ {
+ rocksdb_writebatch_t* wb1 = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb1, "baz", 3, "d", 1);
+ rocksdb_writebatch_put(wb1, "quux", 4, "e", 1);
+ rocksdb_writebatch_delete(wb1, "quux", 4);
+ size_t repsize1 = 0;
+ const char* rep = rocksdb_writebatch_data(wb1, &repsize1);
+ rocksdb_writebatch_t* wb2 = rocksdb_writebatch_create_from(rep, repsize1);
+ CheckCondition(rocksdb_writebatch_count(wb1) ==
+ rocksdb_writebatch_count(wb2));
+ size_t repsize2 = 0;
+ CheckCondition(
+ memcmp(rep, rocksdb_writebatch_data(wb2, &repsize2), repsize1) == 0);
+ rocksdb_writebatch_destroy(wb1);
+ rocksdb_writebatch_destroy(wb2);
+ }
+
+ StartPhase("writebatch_wi");
+ {
+ rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_put(wbi, "foo", 3, "a", 1);
+ rocksdb_writebatch_wi_clear(wbi);
+ rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1);
+ rocksdb_writebatch_wi_put(wbi, "box", 3, "c", 1);
+ rocksdb_writebatch_wi_delete(wbi, "bar", 3);
+ int count = rocksdb_writebatch_wi_count(wbi);
+ CheckCondition(count == 3);
+ size_t size;
+ char* value;
+ value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "box", 3, &size,
+ &err);
+ CheckValue(err, "c", &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch(wbi, options, "bar", 3, &size,
+ &err);
+ CheckValue(err, NULL, &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions,
+ "foo", 3, &size, &err);
+ CheckValue(err, "hello", &value, size);
+ value = rocksdb_writebatch_wi_get_from_batch_and_db(wbi, db, roptions,
+ "box", 3, &size, &err);
+ CheckValue(err, "c", &value, size);
+ rocksdb_write_writebatch_wi(db, woptions, wbi, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "hello");
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ int pos = 0;
+ rocksdb_writebatch_wi_iterate(wbi, &pos, CheckPut, CheckDel);
+ CheckCondition(pos == 3);
+ rocksdb_writebatch_wi_clear(wbi);
+ rocksdb_writebatch_wi_destroy(wbi);
+ }
+
+ StartPhase("writebatch_wi_vectors");
+ {
+ rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1);
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", "xyz");
+ rocksdb_writebatch_wi_delete(wb, "zap", 3);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_wi_destroy(wb);
+ }
+
+ StartPhase("writebatch_wi_savepoint");
+ {
+ rocksdb_writebatch_wi_t* wb = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_set_save_point(wb);
+ const char* k_list[2] = {"z", "ap"};
+ const size_t k_sizes[2] = {1, 2};
+ const char* v_list[3] = {"x", "y", "z"};
+ const size_t v_sizes[3] = {1, 1, 1};
+ rocksdb_writebatch_wi_putv(wb, 2, k_list, k_sizes, 3, v_list, v_sizes);
+ rocksdb_writebatch_wi_rollback_to_save_point(wb, &err);
+ CheckNoError(err);
+ rocksdb_write_writebatch_wi(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "zap", NULL);
+ rocksdb_writebatch_wi_destroy(wb);
+ }
+
+ StartPhase("iter");
+ {
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_prev(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_prev(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_last(iter);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_seek(iter, "b", 1);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek_for_prev(iter, "g", 1);
+ CheckIter(iter, "foo", "hello");
+ rocksdb_iter_seek_for_prev(iter, "box", 3);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+ }
+
+ StartPhase("wbwi_iter");
+ {
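+ // create_iterator_with_base() merges the unwritten batch with the DB:
+ // "bar" (put in the batch) becomes visible, "foo" (deleted in the
+ // batch) is hidden, and "box" still comes from the base iterator.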
+ rocksdb_iterator_t* base_iter = rocksdb_create_iterator(db, roptions);
+ rocksdb_writebatch_wi_t* wbi = rocksdb_writebatch_wi_create(0, 1);
+ rocksdb_writebatch_wi_put(wbi, "bar", 3, "b", 1);
+ rocksdb_writebatch_wi_delete(wbi, "foo", 3);
+ rocksdb_iterator_t* iter =
+ rocksdb_writebatch_wi_create_iterator_with_base(wbi, base_iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_prev(iter);
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_prev(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_last(iter);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek(iter, "b", 1);
+ CheckIter(iter, "bar", "b");
+ rocksdb_iter_seek_for_prev(iter, "c", 1);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_seek_for_prev(iter, "box", 3);
+ CheckIter(iter, "box", "c");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+ rocksdb_writebatch_wi_destroy(wbi);
+ }
+
+ StartPhase("multiget");
+ {
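+ // A single call looks up several keys; missing keys come back as NULL
+ // values rather than as errors.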
+ const char* keys[3] = {"box", "foo", "notfound"};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"c", "hello", NULL};
+ rocksdb_multi_get(db, roptions, 3, keys, keys_sizes, vals, vals_sizes,
+ errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+
+ StartPhase("pin_get");
+ {
+ CheckPinGet(db, roptions, "box", "c");
+ CheckPinGet(db, roptions, "foo", "hello");
+ CheckPinGet(db, roptions, "notfound", NULL);
+ }
+
+ StartPhase("approximate_sizes");
+ {
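+ // Load 20000 keys, then ask for the approximate storage size of two
+ // key ranges; both estimates should be non-zero.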
+ int i;
+ int n = 20000;
+ char keybuf[100];
+ char valbuf[100];
+ uint64_t sizes[2];
+ const char* start[2] = {"a", "k00000000000000010000"};
+ size_t start_len[2] = {1, 21};
+ const char* limit[2] = {"k00000000000000010000", "z"};
+ size_t limit_len[2] = {21, 1};
+ rocksdb_writeoptions_set_sync(woptions, 0);
+ for (i = 0; i < n; i++) {
+ snprintf(keybuf, sizeof(keybuf), "k%020d", i);
+ snprintf(valbuf, sizeof(valbuf), "v%020d", i);
+ rocksdb_put(db, woptions, keybuf, strlen(keybuf), valbuf, strlen(valbuf),
+ &err);
+ CheckNoError(err);
+ }
+ rocksdb_approximate_sizes(db, 2, start, start_len, limit, limit_len, sizes,
+ &err);
+ CheckNoError(err);
+ CheckCondition(sizes[0] > 0);
+ CheckCondition(sizes[1] > 0);
+ }
+
+ StartPhase("property");
+ {
+ char* prop = rocksdb_property_value(db, "nosuchprop");
+ CheckCondition(prop == NULL);
+ prop = rocksdb_property_value(db, "rocksdb.stats");
+ CheckCondition(prop != NULL);
+ Free(&prop);
+ }
+
+ StartPhase("snapshot");
+ {
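+ // Reads that use the snapshot still see "foo" after it has been
+ // deleted from the live DB; without the snapshot the key is gone.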
+ const rocksdb_snapshot_t* snap;
+ snap = rocksdb_create_snapshot(db);
+ rocksdb_delete(db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_readoptions_set_snapshot(roptions, snap);
+ CheckGet(db, roptions, "foo", "hello");
+ rocksdb_readoptions_set_snapshot(roptions, NULL);
+ CheckGet(db, roptions, "foo", NULL);
+ rocksdb_release_snapshot(db, snap);
+ }
+ StartPhase("snapshot_with_memtable_inplace_update");
+ {
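+ // Reopen with inplace_update_support enabled; updates then overwrite
+ // the memtable entry in place, which defeats snapshot isolation for
+ // that key (verified below).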
+ rocksdb_close(db);
+ const rocksdb_snapshot_t* snap = NULL;
+ const char* s_key = "foo_snap";
+ const char* value1 = "hello_s1";
+ const char* value2 = "hello_s2";
+ rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
+ rocksdb_options_set_inplace_update_support(options, 1);
+ rocksdb_options_set_error_if_exists(options, 0);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, s_key, 8, value1, 8, &err);
+ snap = rocksdb_create_snapshot(db);
+ assert(snap != NULL);
+ rocksdb_put(db, woptions, s_key, 8, value2, 8, &err);
+ CheckNoError(err);
+ rocksdb_readoptions_set_snapshot(roptions, snap);
+ CheckGet(db, roptions, "foo", NULL);
+ // Because inplace update support is enabled, the snapshot cannot
+ // preserve the old value: the read below sees value2, not value1.
+ CheckGet(db, roptions, s_key, value2);
+ // restore the data and options
+ rocksdb_delete(db, woptions, s_key, 8, &err);
+ CheckGet(db, roptions, s_key, NULL);
+ rocksdb_release_snapshot(db, snap);
+ rocksdb_readoptions_set_snapshot(roptions, NULL);
+ rocksdb_options_set_inplace_update_support(options, 0);
+ rocksdb_options_set_allow_concurrent_memtable_write(options, 1);
+ rocksdb_options_set_error_if_exists(options, 1);
+ }
+ StartPhase("repair");
+ {
+ // If we did not compact here, the lazy deletion of files
+ // (https://reviews.facebook.net/D6123) would leave obsolete files
+ // around, and the repair process would find them and put them back
+ // into the database.
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+ rocksdb_close(db);
+ rocksdb_options_set_create_if_missing(options, 0);
+ rocksdb_options_set_error_if_exists(options, 0);
+ rocksdb_options_set_wal_recovery_mode(options, 2);
+ rocksdb_repair_db(options, dbname, &err);
+ CheckNoError(err);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", NULL);
+ CheckGet(db, roptions, "bar", NULL);
+ CheckGet(db, roptions, "box", "c");
+ rocksdb_options_set_create_if_missing(options, 1);
+ rocksdb_options_set_error_if_exists(options, 1);
+ }
+
+ StartPhase("filter");
+ for (run = 1; run <= 4; run++) {
+ // run=0 uses custom filter (not currently supported)
+ // run=1 uses old block-based bloom filter
+ // run=2 uses full bloom filter
+ // run=3 uses Ribbon
+ // run=4 uses Ribbon-Bloom hybrid configuration
+ CheckNoError(err);
+ rocksdb_filterpolicy_t* policy;
+ if (run == 1) {
+ policy = rocksdb_filterpolicy_create_bloom(8.0);
+ } else if (run == 2) {
+ policy = rocksdb_filterpolicy_create_bloom_full(8.0);
+ } else if (run == 3) {
+ policy = rocksdb_filterpolicy_create_ribbon(8.0);
+ } else {
+ policy = rocksdb_filterpolicy_create_ribbon_hybrid(8.0, 1);
+ }
+ rocksdb_block_based_options_set_filter_policy(table_options, policy);
+
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+
+ {
+ // Add enough keys to get just one reasonably populated Bloom filter
+ const int keys_to_add = 1500;
+ int i;
+ char keybuf[100];
+ for (i = 0; i < keys_to_add; i++) {
+ snprintf(keybuf, sizeof(keybuf), "yes%020d", i);
+ rocksdb_put(db, woptions, keybuf, strlen(keybuf), "val", 3, &err);
+ CheckNoError(err);
+ }
+ }
+ rocksdb_compact_range(db, NULL, 0, NULL, 0);
+
+ CheckGet(db, roptions, "foo", "foovalue");
+ CheckGet(db, roptions, "bar", "barvalue");
+
+ {
+ // Query keys that were never added; the number of these lookups that
+ // the filter lets through (measured via the perf context) fingerprints
+ // the filter implementation.
+ rocksdb_perfcontext_t* perf = rocksdb_perfcontext_create();
+ rocksdb_perfcontext_reset(perf);
+
+ const int keys_to_query = 10000;
+ int i;
+ char keybuf[100];
+ for (i = 0; i < keys_to_query; i++) {
+ snprintf(keybuf, sizeof(keybuf), "no%020d", i);
+ CheckGet(db, roptions, keybuf, NULL);
+ }
+
+ const int hits =
+ (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_hit_count);
+ if (run == 0) {
+ // The fake custom filter reports a match for half of the queries
+ // and a miss for the other half.
+ CheckCondition(hits == keys_to_query / 2);
+ } else if (run == 1 || run == 2 || run == 4) {
+ // For run == 1, block-based Bloom is no longer available in public
+ // API; attempting to enable it enables full Bloom instead.
+ //
+ // Essentially a fingerprint of full Bloom schema, format_version=5
+ CheckCondition(hits == 188);
+ } else {
+ // Essentially a fingerprint of Ribbon schema
+ CheckCondition(hits == 226);
+ }
+ CheckCondition(
+ (keys_to_query - hits) ==
+ (int)rocksdb_perfcontext_metric(perf, rocksdb_bloom_sst_miss_count));
+
+ rocksdb_perfcontext_destroy(perf);
+ }
+
+ // Reset the policy
+ rocksdb_block_based_options_set_filter_policy(table_options, NULL);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+ }
+
+ StartPhase("compaction_filter");
+ {
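+ // CheckCompaction() opens a fresh DB with these options and verifies
+ // the effect of the compaction filter on the stored keys.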
+ rocksdb_options_t* options_with_filter = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(options_with_filter, 1);
+ rocksdb_compactionfilter_t* cfilter;
+ cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy,
+ CFilterFilter, CFilterName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options_with_filter, dbname, &err);
+ rocksdb_options_set_compaction_filter(options_with_filter, cfilter);
+ db = CheckCompaction(db, options_with_filter, roptions, woptions);
+
+ rocksdb_options_set_compaction_filter(options_with_filter, NULL);
+ rocksdb_compactionfilter_destroy(cfilter);
+ rocksdb_options_destroy(options_with_filter);
+ }
+
+ StartPhase("compaction_filter_factory");
+ {
+ rocksdb_options_t* options_with_filter_factory = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(options_with_filter_factory, 1);
+ rocksdb_compactionfilterfactory_t* factory;
+ factory = rocksdb_compactionfilterfactory_create(
+ NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options_with_filter_factory, dbname, &err);
+ rocksdb_options_set_compaction_filter_factory(options_with_filter_factory,
+ factory);
+ db = CheckCompaction(db, options_with_filter_factory, roptions, woptions);
+
+ rocksdb_options_set_compaction_filter_factory(options_with_filter_factory,
+ NULL);
+ rocksdb_options_destroy(options_with_filter_factory);
+ }
+
+ StartPhase("merge_operator");
+ {
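+ // The test merge operator always produces "fake" here, both when
+ // merging onto an existing value and when merging onto a missing key.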
+ rocksdb_mergeoperator_t* merge_operator;
+ merge_operator = rocksdb_mergeoperator_create(
+ NULL, MergeOperatorDestroy, MergeOperatorFullMerge,
+ MergeOperatorPartialMerge, NULL, MergeOperatorName);
+ // Create new database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_set_merge_operator(options, merge_operator);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "foovalue");
+ rocksdb_merge(db, woptions, "foo", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "foo", "fake");
+
+ // Merge of a non-existing value
+ rocksdb_merge(db, woptions, "bar", 3, "barvalue", 8, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "bar", "fake");
+ }
+
+ StartPhase("columnfamilies");
+ {
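+ // Cover the column family lifecycle: create/list/drop "cf1", per-CF
+ // puts, deletes, batches, multi-get and iterators, plus option
+ // round-trips through LoadAndCheckLatestOptions().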
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ db = rocksdb_open(db_options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_close(db);
+ {
+ const char* expected_cf_names[1] = {"default"};
+ LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1,
+ expected_cf_names, NULL);
+ }
+
+ rocksdb_options_set_create_if_missing(db_options, 0);
+ db = rocksdb_open(db_options, dbname, &err);
+ rocksdb_column_family_handle_t* cfh;
+ cfh = rocksdb_create_column_family(db, db_options, "cf1", &err);
+ rocksdb_column_family_handle_destroy(cfh);
+ CheckNoError(err);
+ rocksdb_close(db);
+
+ size_t cflen;
+ char** column_fams =
+ rocksdb_list_column_families(db_options, dbname, &cflen, &err);
+ CheckNoError(err);
+ CheckEqual("default", column_fams[0], 7);
+ CheckEqual("cf1", column_fams[1], 3);
+ CheckCondition(cflen == 2);
+ rocksdb_list_column_families_destroy(column_fams, cflen);
+
+ rocksdb_options_t* cf_options = rocksdb_options_create();
+
+ const char* cf_names[2] = {"default", "cf1"};
+ const rocksdb_options_t* cf_opts[2] = {cf_options, cf_options};
+ rocksdb_column_family_handle_t* handles[2];
+
+ LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 2, cf_names,
+ NULL);
+
+ db = rocksdb_open_column_families(db_options, dbname, 2, cf_names, cf_opts,
+ handles, &err);
+ CheckNoError(err);
+
+ rocksdb_put_cf(db, woptions, handles[1], "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+
+ rocksdb_put_cf(db, woptions, handles[1], "foobar1", 7, "hello1", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar2", 7, "hello2", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar3", 7, "hello3", 6, &err);
+ CheckNoError(err);
+ rocksdb_put_cf(db, woptions, handles[1], "foobar4", 7, "hello4", 6, &err);
+ CheckNoError(err);
+ rocksdb_suggest_compact_range_cf(db, handles[1], "foo", 3, "foobar9", 7,
+ &err);
+ CheckNoError(err);
+
+ rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_options, 1);
+ rocksdb_flush_cf(db, flush_options, handles[1], &err);
+ CheckNoError(err);
+ rocksdb_flushoptions_destroy(flush_options);
+
+ CheckGetCF(db, roptions, handles[1], "foo", "hello");
+ CheckPinGetCF(db, roptions, handles[1], "foo", "hello");
+
+ rocksdb_delete_cf(db, woptions, handles[1], "foo", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_delete_range_cf(db, woptions, handles[1], "foobar2", 7, "foobar4",
+ 7, &err);
+ CheckNoError(err);
+
+ CheckGetCF(db, roptions, handles[1], "foo", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "foo", NULL);
+
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put_cf(wb, handles[1], "baz", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put_cf(wb, handles[1], "bar", 3, "b", 1);
+ rocksdb_writebatch_put_cf(wb, handles[1], "box", 3, "c", 1);
+ rocksdb_writebatch_put_cf(wb, handles[1], "buff", 4, "rocksdb", 7);
+ rocksdb_writebatch_delete_cf(wb, handles[1], "bar", 3);
+ rocksdb_write(db, woptions, wb, &err);
+ CheckNoError(err);
+ CheckGetCF(db, roptions, handles[1], "baz", NULL);
+ CheckGetCF(db, roptions, handles[1], "bar", NULL);
+ CheckGetCF(db, roptions, handles[1], "box", "c");
+ CheckGetCF(db, roptions, handles[1], "buff", "rocksdb");
+ CheckPinGetCF(db, roptions, handles[1], "baz", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "bar", NULL);
+ CheckPinGetCF(db, roptions, handles[1], "box", "c");
+ CheckPinGetCF(db, roptions, handles[1], "buff", "rocksdb");
+ rocksdb_writebatch_destroy(wb);
+
+ rocksdb_flush_wal(db, 1, &err);
+ CheckNoError(err);
+
+ const char* keys[3] = {"box", "box", "barfooxx"};
+ const rocksdb_column_family_handle_t* get_handles[3] = {
+ handles[0], handles[1], handles[1]};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ rocksdb_multi_get_cf(db, roptions, get_handles, 3, keys, keys_sizes, vals,
+ vals_sizes, errs);
+
+ int i;
+ for (i = 0; i < 3; i++) {
+ CheckEqual(NULL, errs[i], 0);
+ switch (i) {
+ case 0:
+ CheckEqual(NULL, vals[i], vals_sizes[i]); // wrong cf
+ break;
+ case 1:
+ CheckEqual("c", vals[i], vals_sizes[i]); // bingo
+ break;
+ case 2:
+ CheckEqual(NULL, vals[i], vals_sizes[i]); // normal not found
+ break;
+ }
+ Free(&vals[i]);
+ }
+
+ {
+ const char* batched_keys[4] = {"box", "buff", "barfooxx", "box"};
+ const size_t batched_keys_sizes[4] = {3, 4, 8, 3};
+ const char* expected_value[4] = {"c", "rocksdb", NULL, "c"};
+ char* batched_errs[4];
+
+ rocksdb_pinnableslice_t* pvals[4];
+ rocksdb_batched_multi_get_cf(db, roptions, handles[1], 4, batched_keys,
+ batched_keys_sizes, pvals, batched_errs,
+ false);
+ const char* val;
+ size_t val_len;
+ for (i = 0; i < 4; ++i) {
+ val = rocksdb_pinnableslice_value(pvals[i], &val_len);
+ CheckNoError(batched_errs[i]);
+ CheckEqual(expected_value[i], val, val_len);
+ rocksdb_pinnableslice_destroy(pvals[i]);
+ }
+ }
+
+ {
+ unsigned char value_found = 0;
+
+ CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11,
+ NULL, NULL, NULL, 0, NULL));
+ CheckCondition(!rocksdb_key_may_exist(db, roptions, "invalid_key", 11,
+ &vals[0], &vals_sizes[0], NULL, 0,
+ &value_found));
+ if (value_found) {
+ Free(&vals[0]);
+ }
+
+ CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1],
+ "invalid_key", 11, NULL, NULL,
+ NULL, 0, NULL));
+ CheckCondition(!rocksdb_key_may_exist_cf(db, roptions, handles[1],
+ "invalid_key", 11, &vals[0],
+ &vals_sizes[0], NULL, 0, NULL));
+ if (value_found) {
+ Free(&vals[0]);
+ }
+ }
+
+ rocksdb_iterator_t* iter =
+ rocksdb_create_iterator_cf(db, roptions, handles[1]);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
+ i++;
+ }
+ CheckCondition(i == 4);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_column_family_handle_t* iters_cf_handles[2] = {handles[0],
+ handles[1]};
+ rocksdb_iterator_t* iters_handles[2];
+ rocksdb_create_iterators(db, roptions, iters_cf_handles, iters_handles, 2,
+ &err);
+ CheckNoError(err);
+
+ iter = iters_handles[0];
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_destroy(iter);
+
+ iter = iters_handles[1];
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ for (i = 0; rocksdb_iter_valid(iter) != 0; rocksdb_iter_next(iter)) {
+ i++;
+ }
+ CheckCondition(i == 4);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ GetAndCheckMetaDataCf(db, handles[1], cf_names[1]);
+
+ rocksdb_drop_column_family(db, handles[1], &err);
+ CheckNoError(err);
+ for (i = 0; i < 2; i++) {
+ rocksdb_column_family_handle_destroy(handles[i]);
+ }
+ rocksdb_close(db);
+ {
+ // As the column family has been dropped, we expect only one column
+ // family to remain.
+ const char* expected_cf_names[1] = {"default"};
+ LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1,
+ expected_cf_names, NULL);
+ }
+ rocksdb_destroy_db(options, dbname, &err);
+ rocksdb_options_destroy(db_options);
+ rocksdb_options_destroy(cf_options);
+ }
+
+ StartPhase("prefix");
+ {
+ // Create new database
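+ // configured with a fixed 3-byte prefix extractor, a hash-skiplist
+ // memtable and the plain table format; prefix seeks then only iterate
+ // keys sharing the seek prefix unless total_order_seek is enabled.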
+ rocksdb_options_set_allow_mmap_reads(options, 1);
+ rocksdb_options_set_prefix_extractor(
+ options, rocksdb_slicetransform_create_fixed_prefix(3));
+ rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
+ rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
+ rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_put(db, woptions, "foo1", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo2", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo3", 4, "foo", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar1", 4, "bar", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar2", 4, "bar", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "bar3", 4, "bar", 3, &err);
+ CheckNoError(err);
+
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_seek(iter, "bar", 3);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ CheckCondition(rocksdb_iter_valid(iter));
+
+ CheckIter(iter, "bar1", "bar");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "bar2", "bar");
+ rocksdb_iter_next(iter);
+ CheckIter(iter, "bar3", "bar");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_readoptions_set_total_order_seek(roptions, 1);
+ iter = rocksdb_create_iterator(db, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_seek(iter, "ba", 2);
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar1", "bar");
+
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_total_order_seek(roptions, 0);
+
+ rocksdb_close(db);
+
+ {
+ const char* expected_cf_names[1] = {"default"};
+ LoadAndCheckLatestOptions(dbname, env, false, cache, NULL, 1,
+ expected_cf_names,
+ "Invalid argument: leveldb.BytewiseComparator: "
+ "does not match existing comparator foo");
+ LoadAndCheckLatestOptions(dbname, env, false, cache, cmp, 1,
+ expected_cf_names, NULL);
+ }
+ rocksdb_destroy_db(options, dbname, &err);
+ }
+
+ // Check memory usage stats
+ StartPhase("approximate_memory_usage");
+ {
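+ // Register the DB and the block cache as memory consumers and check
+ // that the memtable usage estimates do not shrink after a write.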
+ // Create database
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_memory_consumers_t* consumers;
+ consumers = rocksdb_memory_consumers_create();
+ rocksdb_memory_consumers_add_db(consumers, db);
+ rocksdb_memory_consumers_add_cache(consumers, cache);
+
+ // take memory usage report before write-read operation
+ rocksdb_memory_usage_t* mu1;
+ mu1 = rocksdb_approximate_memory_usage_create(consumers, &err);
+ CheckNoError(err);
+
+ // Put data (this should affect memtables)
+ rocksdb_put(db, woptions, "memory", 6, "test", 4, &err);
+ CheckNoError(err);
+ CheckGet(db, roptions, "memory", "test");
+
+ // take memory usage report after write-read operation
+ rocksdb_memory_usage_t* mu2;
+ mu2 = rocksdb_approximate_memory_usage_create(consumers, &err);
+ CheckNoError(err);
+
+ // amount of memory used within memtables should grow
+ CheckCondition(rocksdb_approximate_memory_usage_get_mem_table_total(mu2) >=
+ rocksdb_approximate_memory_usage_get_mem_table_total(mu1));
+ CheckCondition(
+ rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu2) >=
+ rocksdb_approximate_memory_usage_get_mem_table_unflushed(mu1));
+
+ rocksdb_memory_consumers_destroy(consumers);
+ rocksdb_approximate_memory_usage_destroy(mu1);
+ rocksdb_approximate_memory_usage_destroy(mu2);
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("cuckoo_options");
+ {
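+ // Only exercises the cuckoo table option setters and opening the DB
+ // with that table factory; no data is written in this phase.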
+ rocksdb_cuckoo_table_options_t* cuckoo_options;
+ cuckoo_options = rocksdb_cuckoo_options_create();
+ rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5);
+ rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200);
+ rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10);
+ rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1);
+ rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0);
+ rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_cuckoo_options_destroy(cuckoo_options);
+ }
+
+ StartPhase("options");
+ {
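+ // Every setter below is paired with its getter; the same values are
+ // then re-checked on a copy made with rocksdb_options_create_copy(),
+ // and the copy is mutated to show it is independent of the original.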
+ rocksdb_options_t* o;
+ o = rocksdb_options_create();
+
+ // Set and check options.
+ rocksdb_options_set_allow_ingest_behind(o, 1);
+ CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o));
+
+ rocksdb_options_compaction_readahead_size(o, 10);
+ CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o));
+
+ rocksdb_options_set_create_if_missing(o, 1);
+ CheckCondition(1 == rocksdb_options_get_create_if_missing(o));
+
+ rocksdb_options_set_create_missing_column_families(o, 1);
+ CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o));
+
+ rocksdb_options_set_error_if_exists(o, 1);
+ CheckCondition(1 == rocksdb_options_get_error_if_exists(o));
+
+ rocksdb_options_set_paranoid_checks(o, 1);
+ CheckCondition(1 == rocksdb_options_get_paranoid_checks(o));
+
+ rocksdb_options_set_info_log_level(o, 3);
+ CheckCondition(3 == rocksdb_options_get_info_log_level(o));
+
+ rocksdb_options_set_write_buffer_size(o, 100);
+ CheckCondition(100 == rocksdb_options_get_write_buffer_size(o));
+
+ rocksdb_options_set_db_write_buffer_size(o, 1000);
+ CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o));
+
+ rocksdb_options_set_max_open_files(o, 21);
+ CheckCondition(21 == rocksdb_options_get_max_open_files(o));
+
+ rocksdb_options_set_max_file_opening_threads(o, 5);
+ CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o));
+
+ rocksdb_options_set_max_total_wal_size(o, 400);
+ CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o));
+
+ rocksdb_options_set_num_levels(o, 7);
+ CheckCondition(7 == rocksdb_options_get_num_levels(o));
+
+ rocksdb_options_set_level0_file_num_compaction_trigger(o, 4);
+ CheckCondition(4 ==
+ rocksdb_options_get_level0_file_num_compaction_trigger(o));
+
+ rocksdb_options_set_level0_slowdown_writes_trigger(o, 6);
+ CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o));
+
+ rocksdb_options_set_level0_stop_writes_trigger(o, 8);
+ CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o));
+
+ rocksdb_options_set_target_file_size_base(o, 256);
+ CheckCondition(256 == rocksdb_options_get_target_file_size_base(o));
+
+ rocksdb_options_set_target_file_size_multiplier(o, 3);
+ CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(o));
+
+ rocksdb_options_set_max_bytes_for_level_base(o, 1024);
+ CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(o));
+
+ rocksdb_options_set_level_compaction_dynamic_level_bytes(o, 1);
+ CheckCondition(1 ==
+ rocksdb_options_get_level_compaction_dynamic_level_bytes(o));
+
+ rocksdb_options_set_max_bytes_for_level_multiplier(o, 2.0);
+ CheckCondition(2.0 ==
+ rocksdb_options_get_max_bytes_for_level_multiplier(o));
+
+ rocksdb_options_set_skip_stats_update_on_db_open(o, 1);
+ CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o));
+
+ rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(o, 1);
+ CheckCondition(
+ 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o));
+
+ rocksdb_options_set_max_write_buffer_number(o, 97);
+ CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o));
+
+ rocksdb_options_set_min_write_buffer_number_to_merge(o, 23);
+ CheckCondition(23 ==
+ rocksdb_options_get_min_write_buffer_number_to_merge(o));
+
+ rocksdb_options_set_max_write_buffer_number_to_maintain(o, 64);
+ CheckCondition(64 ==
+ rocksdb_options_get_max_write_buffer_number_to_maintain(o));
+
+ rocksdb_options_set_max_write_buffer_size_to_maintain(o, 50000);
+ CheckCondition(50000 ==
+ rocksdb_options_get_max_write_buffer_size_to_maintain(o));
+
+ rocksdb_options_set_enable_pipelined_write(o, 1);
+ CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o));
+
+ rocksdb_options_set_unordered_write(o, 1);
+ CheckCondition(1 == rocksdb_options_get_unordered_write(o));
+
+ rocksdb_options_set_max_subcompactions(o, 123456);
+ CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o));
+
+ rocksdb_options_set_max_background_jobs(o, 2);
+ CheckCondition(2 == rocksdb_options_get_max_background_jobs(o));
+
+ rocksdb_options_set_max_background_compactions(o, 3);
+ CheckCondition(3 == rocksdb_options_get_max_background_compactions(o));
+
+ rocksdb_options_set_max_background_flushes(o, 5);
+ CheckCondition(5 == rocksdb_options_get_max_background_flushes(o));
+
+ rocksdb_options_set_max_log_file_size(o, 6);
+ CheckCondition(6 == rocksdb_options_get_max_log_file_size(o));
+
+ rocksdb_options_set_log_file_time_to_roll(o, 7);
+ CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o));
+
+ rocksdb_options_set_keep_log_file_num(o, 8);
+ CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o));
+
+ rocksdb_options_set_recycle_log_file_num(o, 9);
+ CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o));
+
+ rocksdb_options_set_soft_pending_compaction_bytes_limit(o, 10);
+ CheckCondition(10 ==
+ rocksdb_options_get_soft_pending_compaction_bytes_limit(o));
+
+ rocksdb_options_set_hard_pending_compaction_bytes_limit(o, 11);
+ CheckCondition(11 ==
+ rocksdb_options_get_hard_pending_compaction_bytes_limit(o));
+
+ rocksdb_options_set_max_manifest_file_size(o, 12);
+ CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o));
+
+ rocksdb_options_set_table_cache_numshardbits(o, 13);
+ CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o));
+
+ rocksdb_options_set_arena_block_size(o, 14);
+ CheckCondition(14 == rocksdb_options_get_arena_block_size(o));
+
+ rocksdb_options_set_use_fsync(o, 1);
+ CheckCondition(1 == rocksdb_options_get_use_fsync(o));
+
+ rocksdb_options_set_WAL_ttl_seconds(o, 15);
+ CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o));
+
+ rocksdb_options_set_WAL_size_limit_MB(o, 16);
+ CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o));
+
+ rocksdb_options_set_manifest_preallocation_size(o, 17);
+ CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(o));
+
+ rocksdb_options_set_allow_mmap_reads(o, 1);
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o));
+
+ rocksdb_options_set_allow_mmap_writes(o, 1);
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o));
+
+ rocksdb_options_set_use_direct_reads(o, 1);
+ CheckCondition(1 == rocksdb_options_get_use_direct_reads(o));
+
+ rocksdb_options_set_use_direct_io_for_flush_and_compaction(o, 1);
+ CheckCondition(
+ 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o));
+
+ rocksdb_options_set_is_fd_close_on_exec(o, 1);
+ CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o));
+
+ rocksdb_options_set_stats_dump_period_sec(o, 18);
+ CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o));
+
+ rocksdb_options_set_stats_persist_period_sec(o, 5);
+ CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o));
+
+ rocksdb_options_set_advise_random_on_open(o, 1);
+ CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o));
+
+ rocksdb_options_set_access_hint_on_compaction_start(o, 3);
+ CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o));
+
+ rocksdb_options_set_use_adaptive_mutex(o, 1);
+ CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o));
+
+ rocksdb_options_set_bytes_per_sync(o, 19);
+ CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o));
+
+ rocksdb_options_set_wal_bytes_per_sync(o, 20);
+ CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(o));
+
+ rocksdb_options_set_writable_file_max_buffer_size(o, 21);
+ CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o));
+
+ rocksdb_options_set_allow_concurrent_memtable_write(o, 1);
+ CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o));
+
+ rocksdb_options_set_enable_write_thread_adaptive_yield(o, 1);
+ CheckCondition(1 ==
+ rocksdb_options_get_enable_write_thread_adaptive_yield(o));
+
+ rocksdb_options_set_max_sequential_skip_in_iterations(o, 22);
+ CheckCondition(22 ==
+ rocksdb_options_get_max_sequential_skip_in_iterations(o));
+
+ rocksdb_options_set_disable_auto_compactions(o, 1);
+ CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o));
+
+ rocksdb_options_set_optimize_filters_for_hits(o, 1);
+ CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o));
+
+ rocksdb_options_set_delete_obsolete_files_period_micros(o, 23);
+ CheckCondition(23 ==
+ rocksdb_options_get_delete_obsolete_files_period_micros(o));
+
+ rocksdb_options_set_memtable_prefix_bloom_size_ratio(o, 2.0);
+ CheckCondition(2.0 ==
+ rocksdb_options_get_memtable_prefix_bloom_size_ratio(o));
+
+ rocksdb_options_set_max_compaction_bytes(o, 24);
+ CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o));
+
+ rocksdb_options_set_memtable_huge_page_size(o, 25);
+ CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(o));
+
+ rocksdb_options_set_max_successive_merges(o, 26);
+ CheckCondition(26 == rocksdb_options_get_max_successive_merges(o));
+
+ rocksdb_options_set_bloom_locality(o, 27);
+ CheckCondition(27 == rocksdb_options_get_bloom_locality(o));
+
+ rocksdb_options_set_inplace_update_support(o, 1);
+ CheckCondition(1 == rocksdb_options_get_inplace_update_support(o));
+
+ rocksdb_options_set_inplace_update_num_locks(o, 28);
+ CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o));
+
+ rocksdb_options_set_report_bg_io_stats(o, 1);
+ CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(o));
+
+ rocksdb_options_set_wal_recovery_mode(o, 2);
+ CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o));
+
+ rocksdb_options_set_compression(o, 5);
+ CheckCondition(5 == rocksdb_options_get_compression(o));
+
+ rocksdb_options_set_bottommost_compression(o, 4);
+ CheckCondition(4 == rocksdb_options_get_bottommost_compression(o));
+
+ rocksdb_options_set_compaction_style(o, 2);
+ CheckCondition(2 == rocksdb_options_get_compaction_style(o));
+
+ rocksdb_options_set_atomic_flush(o, 1);
+ CheckCondition(1 == rocksdb_options_get_atomic_flush(o));
+
+ rocksdb_options_set_manual_wal_flush(o, 1);
+ CheckCondition(1 == rocksdb_options_get_manual_wal_flush(o));
+
+ rocksdb_options_set_wal_compression(o, 1);
+ CheckCondition(1 == rocksdb_options_get_wal_compression(o));
+
+ rocksdb_options_set_experimental_mempurge_threshold(o, 29.0);
+ CheckCondition(29.0 ==
+ rocksdb_options_get_experimental_mempurge_threshold(o));
+
+ /* Blob Options */
+ rocksdb_options_set_enable_blob_files(o, 1);
+ CheckCondition(1 == rocksdb_options_get_enable_blob_files(o));
+
+ rocksdb_options_set_min_blob_size(o, 29);
+ CheckCondition(29 == rocksdb_options_get_min_blob_size(o));
+
+ rocksdb_options_set_blob_file_size(o, 30);
+ CheckCondition(30 == rocksdb_options_get_blob_file_size(o));
+
+ rocksdb_options_set_blob_compression_type(o, 4);
+ CheckCondition(4 == rocksdb_options_get_blob_compression_type(o));
+
+ rocksdb_options_set_enable_blob_gc(o, 1);
+ CheckCondition(1 == rocksdb_options_get_enable_blob_gc(o));
+
+ rocksdb_options_set_blob_gc_age_cutoff(o, 0.5);
+ CheckCondition(0.5 == rocksdb_options_get_blob_gc_age_cutoff(o));
+
+ rocksdb_options_set_blob_gc_force_threshold(o, 0.75);
+ CheckCondition(0.75 == rocksdb_options_get_blob_gc_force_threshold(o));
+
+ rocksdb_options_set_blob_compaction_readahead_size(o, 262144);
+ CheckCondition(262144 ==
+ rocksdb_options_get_blob_compaction_readahead_size(o));
+
+ rocksdb_options_set_blob_file_starting_level(o, 5);
+ CheckCondition(5 == rocksdb_options_get_blob_file_starting_level(o));
+
+ rocksdb_options_set_prepopulate_blob_cache(o, 1 /* flush only */);
+ CheckCondition(1 == rocksdb_options_get_prepopulate_blob_cache(o));
+
+ // Create a copy that should be equal to the original.
+ rocksdb_options_t* copy;
+ copy = rocksdb_options_create_copy(o);
+
+ CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(copy));
+ CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(copy));
+ CheckCondition(1 == rocksdb_options_get_create_if_missing(copy));
+ CheckCondition(1 ==
+ rocksdb_options_get_create_missing_column_families(copy));
+ CheckCondition(1 == rocksdb_options_get_error_if_exists(copy));
+ CheckCondition(1 == rocksdb_options_get_paranoid_checks(copy));
+ CheckCondition(3 == rocksdb_options_get_info_log_level(copy));
+ CheckCondition(100 == rocksdb_options_get_write_buffer_size(copy));
+ CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(copy));
+ CheckCondition(21 == rocksdb_options_get_max_open_files(copy));
+ CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(copy));
+ CheckCondition(400 == rocksdb_options_get_max_total_wal_size(copy));
+ CheckCondition(7 == rocksdb_options_get_num_levels(copy));
+ CheckCondition(
+ 4 == rocksdb_options_get_level0_file_num_compaction_trigger(copy));
+ CheckCondition(6 ==
+ rocksdb_options_get_level0_slowdown_writes_trigger(copy));
+ CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(copy));
+ CheckCondition(256 == rocksdb_options_get_target_file_size_base(copy));
+ CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(copy));
+ CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy));
+ CheckCondition(2.0 ==
+ rocksdb_options_get_max_bytes_for_level_multiplier(copy));
+ CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy));
+ CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(copy));
+ CheckCondition(23 ==
+ rocksdb_options_get_min_write_buffer_number_to_merge(copy));
+ CheckCondition(
+ 64 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy));
+ CheckCondition(50000 ==
+ rocksdb_options_get_max_write_buffer_size_to_maintain(copy));
+ CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(copy));
+ CheckCondition(1 == rocksdb_options_get_unordered_write(copy));
+ CheckCondition(123456 == rocksdb_options_get_max_subcompactions(copy));
+ CheckCondition(2 == rocksdb_options_get_max_background_jobs(copy));
+ CheckCondition(3 == rocksdb_options_get_max_background_compactions(copy));
+ CheckCondition(5 == rocksdb_options_get_max_background_flushes(copy));
+ CheckCondition(6 == rocksdb_options_get_max_log_file_size(copy));
+ CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(copy));
+ CheckCondition(8 == rocksdb_options_get_keep_log_file_num(copy));
+ CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(copy));
+ CheckCondition(
+ 10 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy));
+ CheckCondition(
+ 11 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy));
+ CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(copy));
+ CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(copy));
+ CheckCondition(14 == rocksdb_options_get_arena_block_size(copy));
+ CheckCondition(1 == rocksdb_options_get_use_fsync(copy));
+ CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(copy));
+ CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(copy));
+ CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(copy));
+ CheckCondition(1 == rocksdb_options_get_use_direct_reads(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy));
+ CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(copy));
+ CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(copy));
+ CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(copy));
+ CheckCondition(1 == rocksdb_options_get_advise_random_on_open(copy));
+ CheckCondition(3 ==
+ rocksdb_options_get_access_hint_on_compaction_start(copy));
+ CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(copy));
+ CheckCondition(19 == rocksdb_options_get_bytes_per_sync(copy));
+ CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(copy));
+ CheckCondition(21 ==
+ rocksdb_options_get_writable_file_max_buffer_size(copy));
+ CheckCondition(1 ==
+ rocksdb_options_get_allow_concurrent_memtable_write(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_enable_write_thread_adaptive_yield(copy));
+ CheckCondition(22 ==
+ rocksdb_options_get_max_sequential_skip_in_iterations(copy));
+ CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(copy));
+ CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(copy));
+ CheckCondition(
+ 23 == rocksdb_options_get_delete_obsolete_files_period_micros(copy));
+ CheckCondition(2.0 ==
+ rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy));
+ CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(copy));
+ CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(copy));
+ CheckCondition(26 == rocksdb_options_get_max_successive_merges(copy));
+ CheckCondition(27 == rocksdb_options_get_bloom_locality(copy));
+ CheckCondition(1 == rocksdb_options_get_inplace_update_support(copy));
+ CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(copy));
+ CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(copy));
+ CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(copy));
+ CheckCondition(5 == rocksdb_options_get_compression(copy));
+ CheckCondition(4 == rocksdb_options_get_bottommost_compression(copy));
+ CheckCondition(2 == rocksdb_options_get_compaction_style(copy));
+ CheckCondition(1 == rocksdb_options_get_atomic_flush(copy));
+ CheckCondition(29.0 ==
+ rocksdb_options_get_experimental_mempurge_threshold(copy));
+
+ // Copies should be independent.
+ rocksdb_options_set_allow_ingest_behind(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_allow_ingest_behind(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_ingest_behind(o));
+
+ rocksdb_options_compaction_readahead_size(copy, 20);
+ CheckCondition(20 == rocksdb_options_get_compaction_readahead_size(copy));
+ CheckCondition(10 == rocksdb_options_get_compaction_readahead_size(o));
+
+ rocksdb_options_set_create_if_missing(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_create_if_missing(copy));
+ CheckCondition(1 == rocksdb_options_get_create_if_missing(o));
+
+ rocksdb_options_set_create_missing_column_families(copy, 0);
+ CheckCondition(0 ==
+ rocksdb_options_get_create_missing_column_families(copy));
+ CheckCondition(1 == rocksdb_options_get_create_missing_column_families(o));
+
+ rocksdb_options_set_error_if_exists(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_error_if_exists(copy));
+ CheckCondition(1 == rocksdb_options_get_error_if_exists(o));
+
+ rocksdb_options_set_paranoid_checks(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_paranoid_checks(copy));
+ CheckCondition(1 == rocksdb_options_get_paranoid_checks(o));
+
+ rocksdb_options_set_info_log_level(copy, 2);
+ CheckCondition(2 == rocksdb_options_get_info_log_level(copy));
+ CheckCondition(3 == rocksdb_options_get_info_log_level(o));
+
+ rocksdb_options_set_write_buffer_size(copy, 200);
+ CheckCondition(200 == rocksdb_options_get_write_buffer_size(copy));
+ CheckCondition(100 == rocksdb_options_get_write_buffer_size(o));
+
+ rocksdb_options_set_db_write_buffer_size(copy, 2000);
+ CheckCondition(2000 == rocksdb_options_get_db_write_buffer_size(copy));
+ CheckCondition(1000 == rocksdb_options_get_db_write_buffer_size(o));
+
+ rocksdb_options_set_max_open_files(copy, 42);
+ CheckCondition(42 == rocksdb_options_get_max_open_files(copy));
+ CheckCondition(21 == rocksdb_options_get_max_open_files(o));
+
+ rocksdb_options_set_max_file_opening_threads(copy, 3);
+ CheckCondition(3 == rocksdb_options_get_max_file_opening_threads(copy));
+ CheckCondition(5 == rocksdb_options_get_max_file_opening_threads(o));
+
+ rocksdb_options_set_max_total_wal_size(copy, 4000);
+ CheckCondition(4000 == rocksdb_options_get_max_total_wal_size(copy));
+ CheckCondition(400 == rocksdb_options_get_max_total_wal_size(o));
+
+ rocksdb_options_set_num_levels(copy, 6);
+ CheckCondition(6 == rocksdb_options_get_num_levels(copy));
+ CheckCondition(7 == rocksdb_options_get_num_levels(o));
+
+ rocksdb_options_set_level0_file_num_compaction_trigger(copy, 14);
+ CheckCondition(
+ 14 == rocksdb_options_get_level0_file_num_compaction_trigger(copy));
+ CheckCondition(4 ==
+ rocksdb_options_get_level0_file_num_compaction_trigger(o));
+
+ rocksdb_options_set_level0_slowdown_writes_trigger(copy, 61);
+ CheckCondition(61 ==
+ rocksdb_options_get_level0_slowdown_writes_trigger(copy));
+ CheckCondition(6 == rocksdb_options_get_level0_slowdown_writes_trigger(o));
+
+ rocksdb_options_set_level0_stop_writes_trigger(copy, 17);
+ CheckCondition(17 == rocksdb_options_get_level0_stop_writes_trigger(copy));
+ CheckCondition(8 == rocksdb_options_get_level0_stop_writes_trigger(o));
+
+ rocksdb_options_set_target_file_size_base(copy, 128);
+ CheckCondition(128 == rocksdb_options_get_target_file_size_base(copy));
+ CheckCondition(256 == rocksdb_options_get_target_file_size_base(o));
+
+ rocksdb_options_set_target_file_size_multiplier(copy, 13);
+ CheckCondition(13 == rocksdb_options_get_target_file_size_multiplier(copy));
+ CheckCondition(3 == rocksdb_options_get_target_file_size_multiplier(o));
+
+ rocksdb_options_set_max_bytes_for_level_base(copy, 900);
+ CheckCondition(900 == rocksdb_options_get_max_bytes_for_level_base(copy));
+ CheckCondition(1024 == rocksdb_options_get_max_bytes_for_level_base(o));
+
+ rocksdb_options_set_level_compaction_dynamic_level_bytes(copy, 0);
+ CheckCondition(
+ 0 == rocksdb_options_get_level_compaction_dynamic_level_bytes(copy));
+ CheckCondition(1 ==
+ rocksdb_options_get_level_compaction_dynamic_level_bytes(o));
+
+ rocksdb_options_set_max_bytes_for_level_multiplier(copy, 8.0);
+ CheckCondition(8.0 ==
+ rocksdb_options_get_max_bytes_for_level_multiplier(copy));
+ CheckCondition(2.0 ==
+ rocksdb_options_get_max_bytes_for_level_multiplier(o));
+
+ rocksdb_options_set_skip_stats_update_on_db_open(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_skip_stats_update_on_db_open(copy));
+ CheckCondition(1 == rocksdb_options_get_skip_stats_update_on_db_open(o));
+
+ rocksdb_options_set_skip_checking_sst_file_sizes_on_db_open(copy, 0);
+ CheckCondition(
+ 0 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_skip_checking_sst_file_sizes_on_db_open(o));
+
+ rocksdb_options_set_max_write_buffer_number(copy, 2000);
+ CheckCondition(2000 == rocksdb_options_get_max_write_buffer_number(copy));
+ CheckCondition(97 == rocksdb_options_get_max_write_buffer_number(o));
+
+ rocksdb_options_set_min_write_buffer_number_to_merge(copy, 146);
+ CheckCondition(146 ==
+ rocksdb_options_get_min_write_buffer_number_to_merge(copy));
+ CheckCondition(23 ==
+ rocksdb_options_get_min_write_buffer_number_to_merge(o));
+
+ rocksdb_options_set_max_write_buffer_number_to_maintain(copy, 128);
+ CheckCondition(
+ 128 == rocksdb_options_get_max_write_buffer_number_to_maintain(copy));
+ CheckCondition(64 ==
+ rocksdb_options_get_max_write_buffer_number_to_maintain(o));
+
+ rocksdb_options_set_max_write_buffer_size_to_maintain(copy, 9000);
+ CheckCondition(9000 ==
+ rocksdb_options_get_max_write_buffer_size_to_maintain(copy));
+ CheckCondition(50000 ==
+ rocksdb_options_get_max_write_buffer_size_to_maintain(o));
+
+ rocksdb_options_set_enable_pipelined_write(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_enable_pipelined_write(copy));
+ CheckCondition(1 == rocksdb_options_get_enable_pipelined_write(o));
+
+ rocksdb_options_set_unordered_write(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_unordered_write(copy));
+ CheckCondition(1 == rocksdb_options_get_unordered_write(o));
+
+ rocksdb_options_set_max_subcompactions(copy, 90001);
+ CheckCondition(90001 == rocksdb_options_get_max_subcompactions(copy));
+ CheckCondition(123456 == rocksdb_options_get_max_subcompactions(o));
+
+ rocksdb_options_set_max_background_jobs(copy, 12);
+ CheckCondition(12 == rocksdb_options_get_max_background_jobs(copy));
+ CheckCondition(2 == rocksdb_options_get_max_background_jobs(o));
+
+ rocksdb_options_set_max_background_compactions(copy, 13);
+ CheckCondition(13 == rocksdb_options_get_max_background_compactions(copy));
+ CheckCondition(3 == rocksdb_options_get_max_background_compactions(o));
+
+ rocksdb_options_set_max_background_flushes(copy, 15);
+ CheckCondition(15 == rocksdb_options_get_max_background_flushes(copy));
+ CheckCondition(5 == rocksdb_options_get_max_background_flushes(o));
+
+ rocksdb_options_set_max_log_file_size(copy, 16);
+ CheckCondition(16 == rocksdb_options_get_max_log_file_size(copy));
+ CheckCondition(6 == rocksdb_options_get_max_log_file_size(o));
+
+ rocksdb_options_set_log_file_time_to_roll(copy, 17);
+ CheckCondition(17 == rocksdb_options_get_log_file_time_to_roll(copy));
+ CheckCondition(7 == rocksdb_options_get_log_file_time_to_roll(o));
+
+ rocksdb_options_set_keep_log_file_num(copy, 18);
+ CheckCondition(18 == rocksdb_options_get_keep_log_file_num(copy));
+ CheckCondition(8 == rocksdb_options_get_keep_log_file_num(o));
+
+ rocksdb_options_set_recycle_log_file_num(copy, 19);
+ CheckCondition(19 == rocksdb_options_get_recycle_log_file_num(copy));
+ CheckCondition(9 == rocksdb_options_get_recycle_log_file_num(o));
+
+ rocksdb_options_set_soft_pending_compaction_bytes_limit(copy, 110);
+ CheckCondition(
+ 110 == rocksdb_options_get_soft_pending_compaction_bytes_limit(copy));
+ CheckCondition(10 ==
+ rocksdb_options_get_soft_pending_compaction_bytes_limit(o));
+
+ rocksdb_options_set_hard_pending_compaction_bytes_limit(copy, 111);
+ CheckCondition(
+ 111 == rocksdb_options_get_hard_pending_compaction_bytes_limit(copy));
+ CheckCondition(11 ==
+ rocksdb_options_get_hard_pending_compaction_bytes_limit(o));
+
+ rocksdb_options_set_max_manifest_file_size(copy, 112);
+ CheckCondition(112 == rocksdb_options_get_max_manifest_file_size(copy));
+ CheckCondition(12 == rocksdb_options_get_max_manifest_file_size(o));
+
+ rocksdb_options_set_table_cache_numshardbits(copy, 113);
+ CheckCondition(113 == rocksdb_options_get_table_cache_numshardbits(copy));
+ CheckCondition(13 == rocksdb_options_get_table_cache_numshardbits(o));
+
+ rocksdb_options_set_arena_block_size(copy, 114);
+ CheckCondition(114 == rocksdb_options_get_arena_block_size(copy));
+ CheckCondition(14 == rocksdb_options_get_arena_block_size(o));
+
+ rocksdb_options_set_use_fsync(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_use_fsync(copy));
+ CheckCondition(1 == rocksdb_options_get_use_fsync(o));
+
+ rocksdb_options_set_WAL_ttl_seconds(copy, 115);
+ CheckCondition(115 == rocksdb_options_get_WAL_ttl_seconds(copy));
+ CheckCondition(15 == rocksdb_options_get_WAL_ttl_seconds(o));
+
+ rocksdb_options_set_WAL_size_limit_MB(copy, 116);
+ CheckCondition(116 == rocksdb_options_get_WAL_size_limit_MB(copy));
+ CheckCondition(16 == rocksdb_options_get_WAL_size_limit_MB(o));
+
+ rocksdb_options_set_manifest_preallocation_size(copy, 117);
+ CheckCondition(117 ==
+ rocksdb_options_get_manifest_preallocation_size(copy));
+ CheckCondition(17 == rocksdb_options_get_manifest_preallocation_size(o));
+
+ rocksdb_options_set_allow_mmap_reads(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_allow_mmap_reads(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_reads(o));
+
+ rocksdb_options_set_allow_mmap_writes(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_allow_mmap_writes(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_mmap_writes(o));
+
+ rocksdb_options_set_use_direct_reads(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_use_direct_reads(copy));
+ CheckCondition(1 == rocksdb_options_get_use_direct_reads(o));
+
+ rocksdb_options_set_use_direct_io_for_flush_and_compaction(copy, 0);
+ CheckCondition(
+ 0 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(copy));
+ CheckCondition(
+ 1 == rocksdb_options_get_use_direct_io_for_flush_and_compaction(o));
+
+ rocksdb_options_set_is_fd_close_on_exec(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_is_fd_close_on_exec(copy));
+ CheckCondition(1 == rocksdb_options_get_is_fd_close_on_exec(o));
+
+ rocksdb_options_set_stats_dump_period_sec(copy, 218);
+ CheckCondition(218 == rocksdb_options_get_stats_dump_period_sec(copy));
+ CheckCondition(18 == rocksdb_options_get_stats_dump_period_sec(o));
+
+ rocksdb_options_set_stats_persist_period_sec(copy, 600);
+ CheckCondition(600 == rocksdb_options_get_stats_persist_period_sec(copy));
+ CheckCondition(5 == rocksdb_options_get_stats_persist_period_sec(o));
+
+ rocksdb_options_set_advise_random_on_open(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_advise_random_on_open(copy));
+ CheckCondition(1 == rocksdb_options_get_advise_random_on_open(o));
+
+ rocksdb_options_set_access_hint_on_compaction_start(copy, 2);
+ CheckCondition(2 ==
+ rocksdb_options_get_access_hint_on_compaction_start(copy));
+ CheckCondition(3 == rocksdb_options_get_access_hint_on_compaction_start(o));
+
+ rocksdb_options_set_use_adaptive_mutex(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_use_adaptive_mutex(copy));
+ CheckCondition(1 == rocksdb_options_get_use_adaptive_mutex(o));
+
+ rocksdb_options_set_bytes_per_sync(copy, 219);
+ CheckCondition(219 == rocksdb_options_get_bytes_per_sync(copy));
+ CheckCondition(19 == rocksdb_options_get_bytes_per_sync(o));
+
+ rocksdb_options_set_wal_bytes_per_sync(copy, 120);
+ CheckCondition(120 == rocksdb_options_get_wal_bytes_per_sync(copy));
+ CheckCondition(20 == rocksdb_options_get_wal_bytes_per_sync(o));
+
+ rocksdb_options_set_writable_file_max_buffer_size(copy, 121);
+ CheckCondition(121 ==
+ rocksdb_options_get_writable_file_max_buffer_size(copy));
+ CheckCondition(21 == rocksdb_options_get_writable_file_max_buffer_size(o));
+
+ rocksdb_options_set_allow_concurrent_memtable_write(copy, 0);
+ CheckCondition(0 ==
+ rocksdb_options_get_allow_concurrent_memtable_write(copy));
+ CheckCondition(1 == rocksdb_options_get_allow_concurrent_memtable_write(o));
+
+ rocksdb_options_set_enable_write_thread_adaptive_yield(copy, 0);
+ CheckCondition(
+ 0 == rocksdb_options_get_enable_write_thread_adaptive_yield(copy));
+ CheckCondition(1 ==
+ rocksdb_options_get_enable_write_thread_adaptive_yield(o));
+
+ rocksdb_options_set_max_sequential_skip_in_iterations(copy, 122);
+ CheckCondition(122 ==
+ rocksdb_options_get_max_sequential_skip_in_iterations(copy));
+ CheckCondition(22 ==
+ rocksdb_options_get_max_sequential_skip_in_iterations(o));
+
+ rocksdb_options_set_disable_auto_compactions(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_disable_auto_compactions(copy));
+ CheckCondition(1 == rocksdb_options_get_disable_auto_compactions(o));
+
+ rocksdb_options_set_optimize_filters_for_hits(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_optimize_filters_for_hits(copy));
+ CheckCondition(1 == rocksdb_options_get_optimize_filters_for_hits(o));
+
+ rocksdb_options_set_delete_obsolete_files_period_micros(copy, 123);
+ CheckCondition(
+ 123 == rocksdb_options_get_delete_obsolete_files_period_micros(copy));
+ CheckCondition(23 ==
+ rocksdb_options_get_delete_obsolete_files_period_micros(o));
+
+ rocksdb_options_set_memtable_prefix_bloom_size_ratio(copy, 4.0);
+ CheckCondition(4.0 ==
+ rocksdb_options_get_memtable_prefix_bloom_size_ratio(copy));
+ CheckCondition(2.0 ==
+ rocksdb_options_get_memtable_prefix_bloom_size_ratio(o));
+
+ rocksdb_options_set_max_compaction_bytes(copy, 124);
+ CheckCondition(124 == rocksdb_options_get_max_compaction_bytes(copy));
+ CheckCondition(24 == rocksdb_options_get_max_compaction_bytes(o));
+
+ rocksdb_options_set_memtable_huge_page_size(copy, 125);
+ CheckCondition(125 == rocksdb_options_get_memtable_huge_page_size(copy));
+ CheckCondition(25 == rocksdb_options_get_memtable_huge_page_size(o));
+
+ rocksdb_options_set_max_successive_merges(copy, 126);
+ CheckCondition(126 == rocksdb_options_get_max_successive_merges(copy));
+ CheckCondition(26 == rocksdb_options_get_max_successive_merges(o));
+
+ rocksdb_options_set_bloom_locality(copy, 127);
+ CheckCondition(127 == rocksdb_options_get_bloom_locality(copy));
+ CheckCondition(27 == rocksdb_options_get_bloom_locality(o));
+
+ rocksdb_options_set_inplace_update_support(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_inplace_update_support(copy));
+ CheckCondition(1 == rocksdb_options_get_inplace_update_support(o));
+
+ rocksdb_options_set_inplace_update_num_locks(copy, 128);
+ CheckCondition(128 == rocksdb_options_get_inplace_update_num_locks(copy));
+ CheckCondition(28 == rocksdb_options_get_inplace_update_num_locks(o));
+
+ rocksdb_options_set_report_bg_io_stats(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_report_bg_io_stats(copy));
+ CheckCondition(1 == rocksdb_options_get_report_bg_io_stats(o));
+
+ rocksdb_options_set_wal_recovery_mode(copy, 1);
+ CheckCondition(1 == rocksdb_options_get_wal_recovery_mode(copy));
+ CheckCondition(2 == rocksdb_options_get_wal_recovery_mode(o));
+
+ rocksdb_options_set_compression(copy, 4);
+ CheckCondition(4 == rocksdb_options_get_compression(copy));
+ CheckCondition(5 == rocksdb_options_get_compression(o));
+
+ rocksdb_options_set_bottommost_compression(copy, 3);
+ CheckCondition(3 == rocksdb_options_get_bottommost_compression(copy));
+ CheckCondition(4 == rocksdb_options_get_bottommost_compression(o));
+
+ rocksdb_options_set_compaction_style(copy, 1);
+ CheckCondition(1 == rocksdb_options_get_compaction_style(copy));
+ CheckCondition(2 == rocksdb_options_get_compaction_style(o));
+
+ rocksdb_options_set_atomic_flush(copy, 0);
+ CheckCondition(0 == rocksdb_options_get_atomic_flush(copy));
+ CheckCondition(1 == rocksdb_options_get_atomic_flush(o));
+
+ rocksdb_options_set_experimental_mempurge_threshold(copy, 229.0);
+ CheckCondition(229.0 ==
+ rocksdb_options_get_experimental_mempurge_threshold(copy));
+ CheckCondition(29.0 ==
+ rocksdb_options_get_experimental_mempurge_threshold(o));
+
+ rocksdb_options_destroy(copy);
+ rocksdb_options_destroy(o);
+ }
+
+ StartPhase("read_options");
+ {
+ rocksdb_readoptions_t* ro;
+ ro = rocksdb_readoptions_create();
+
+ rocksdb_readoptions_set_verify_checksums(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_verify_checksums(ro));
+
+ rocksdb_readoptions_set_fill_cache(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_fill_cache(ro));
+
+ rocksdb_readoptions_set_read_tier(ro, 2);
+ CheckCondition(2 == rocksdb_readoptions_get_read_tier(ro));
+
+ rocksdb_readoptions_set_tailing(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_tailing(ro));
+
+ rocksdb_readoptions_set_readahead_size(ro, 100);
+ CheckCondition(100 == rocksdb_readoptions_get_readahead_size(ro));
+
+ rocksdb_readoptions_set_prefix_same_as_start(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_prefix_same_as_start(ro));
+
+ rocksdb_readoptions_set_pin_data(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_pin_data(ro));
+
+ rocksdb_readoptions_set_total_order_seek(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_total_order_seek(ro));
+
+ rocksdb_readoptions_set_max_skippable_internal_keys(ro, 200);
+ CheckCondition(200 ==
+ rocksdb_readoptions_get_max_skippable_internal_keys(ro));
+
+ rocksdb_readoptions_set_background_purge_on_iterator_cleanup(ro, 1);
+ CheckCondition(
+ 1 == rocksdb_readoptions_get_background_purge_on_iterator_cleanup(ro));
+
+ rocksdb_readoptions_set_ignore_range_deletions(ro, 1);
+ CheckCondition(1 == rocksdb_readoptions_get_ignore_range_deletions(ro));
+
+ rocksdb_readoptions_set_deadline(ro, 300);
+ CheckCondition(300 == rocksdb_readoptions_get_deadline(ro));
+
+ rocksdb_readoptions_set_io_timeout(ro, 400);
+ CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro));
+
+ rocksdb_readoptions_destroy(ro);
+ }
+
+ StartPhase("write_options");
+ {
+ rocksdb_writeoptions_t* wo;
+ wo = rocksdb_writeoptions_create();
+
+ rocksdb_writeoptions_set_sync(wo, 1);
+ CheckCondition(1 == rocksdb_writeoptions_get_sync(wo));
+
+ rocksdb_writeoptions_disable_WAL(wo, 1);
+ CheckCondition(1 == rocksdb_writeoptions_get_disable_WAL(wo));
+
+ rocksdb_writeoptions_set_ignore_missing_column_families(wo, 1);
+ CheckCondition(1 ==
+ rocksdb_writeoptions_get_ignore_missing_column_families(wo));
+
+ rocksdb_writeoptions_set_no_slowdown(wo, 1);
+ CheckCondition(1 == rocksdb_writeoptions_get_no_slowdown(wo));
+
+ rocksdb_writeoptions_set_low_pri(wo, 1);
+ CheckCondition(1 == rocksdb_writeoptions_get_low_pri(wo));
+
+ rocksdb_writeoptions_set_memtable_insert_hint_per_batch(wo, 1);
+ CheckCondition(1 ==
+ rocksdb_writeoptions_get_memtable_insert_hint_per_batch(wo));
+
+ rocksdb_writeoptions_destroy(wo);
+ }
+
+ StartPhase("compact_options");
+ {
+ rocksdb_compactoptions_t* co;
+ co = rocksdb_compactoptions_create();
+
+ rocksdb_compactoptions_set_exclusive_manual_compaction(co, 1);
+ CheckCondition(1 ==
+ rocksdb_compactoptions_get_exclusive_manual_compaction(co));
+
+ rocksdb_compactoptions_set_bottommost_level_compaction(co, 1);
+ CheckCondition(1 ==
+ rocksdb_compactoptions_get_bottommost_level_compaction(co));
+
+ rocksdb_compactoptions_set_change_level(co, 1);
+ CheckCondition(1 == rocksdb_compactoptions_get_change_level(co));
+
+ rocksdb_compactoptions_set_target_level(co, 1);
+ CheckCondition(1 == rocksdb_compactoptions_get_target_level(co));
+
+ rocksdb_compactoptions_destroy(co);
+ }
+
+ StartPhase("flush_options");
+ {
+ rocksdb_flushoptions_t* fo;
+ fo = rocksdb_flushoptions_create();
+
+ rocksdb_flushoptions_set_wait(fo, 1);
+ CheckCondition(1 == rocksdb_flushoptions_get_wait(fo));
+
+ rocksdb_flushoptions_destroy(fo);
+ }
+
+ StartPhase("cache_options");
+ {
+ rocksdb_cache_t* co;
+ co = rocksdb_cache_create_lru(100);
+ CheckCondition(100 == rocksdb_cache_get_capacity(co));
+
+ rocksdb_cache_set_capacity(co, 200);
+ CheckCondition(200 == rocksdb_cache_get_capacity(co));
+
+ rocksdb_cache_destroy(co);
+ }
+
+ StartPhase("jemalloc_nodump_allocator");
+ {
+ rocksdb_memory_allocator_t* allocator;
+ allocator = rocksdb_jemalloc_nodump_allocator_create(&err);
+ if (err != NULL) {
+ // not supported on all platforms, allow unsupported error
+ const char* ni = "Not implemented: ";
+ size_t ni_len = strlen(ni);
+ size_t err_len = strlen(err);
+
+ CheckCondition(err_len >= ni_len);
+ CheckCondition(memcmp(ni, err, ni_len) == 0);
+ Free(&err);
+ } else {
+ rocksdb_cache_t* co;
+ rocksdb_lru_cache_options_t* copts;
+
+ copts = rocksdb_lru_cache_options_create();
+
+ rocksdb_lru_cache_options_set_capacity(copts, 100);
+ rocksdb_lru_cache_options_set_memory_allocator(copts, allocator);
+
+ co = rocksdb_cache_create_lru_opts(copts);
+ CheckCondition(100 == rocksdb_cache_get_capacity(co));
+
+ rocksdb_cache_destroy(co);
+ rocksdb_lru_cache_options_destroy(copts);
+ }
+ rocksdb_memory_allocator_destroy(allocator);
+ }
+
+ StartPhase("env");
+ {
+ rocksdb_env_t* e;
+ e = rocksdb_create_default_env();
+
+ rocksdb_env_set_background_threads(e, 10);
+ CheckCondition(10 == rocksdb_env_get_background_threads(e));
+
+ rocksdb_env_set_high_priority_background_threads(e, 20);
+ CheckCondition(20 == rocksdb_env_get_high_priority_background_threads(e));
+
+ rocksdb_env_set_low_priority_background_threads(e, 30);
+ CheckCondition(30 == rocksdb_env_get_low_priority_background_threads(e));
+
+ rocksdb_env_set_bottom_priority_background_threads(e, 40);
+ CheckCondition(40 == rocksdb_env_get_bottom_priority_background_threads(e));
+
+ rocksdb_env_destroy(e);
+ }
+
+ StartPhase("universal_compaction_options");
+ {
+ rocksdb_universal_compaction_options_t* uco;
+ uco = rocksdb_universal_compaction_options_create();
+
+ rocksdb_universal_compaction_options_set_size_ratio(uco, 5);
+ CheckCondition(5 ==
+ rocksdb_universal_compaction_options_get_size_ratio(uco));
+
+ rocksdb_universal_compaction_options_set_min_merge_width(uco, 15);
+ CheckCondition(
+ 15 == rocksdb_universal_compaction_options_get_min_merge_width(uco));
+
+ rocksdb_universal_compaction_options_set_max_merge_width(uco, 25);
+ CheckCondition(
+ 25 == rocksdb_universal_compaction_options_get_max_merge_width(uco));
+
+ rocksdb_universal_compaction_options_set_max_size_amplification_percent(uco,
+ 35);
+ CheckCondition(
+ 35 ==
+ rocksdb_universal_compaction_options_get_max_size_amplification_percent(
+ uco));
+
+ rocksdb_universal_compaction_options_set_compression_size_percent(uco, 45);
+ CheckCondition(
+ 45 ==
+ rocksdb_universal_compaction_options_get_compression_size_percent(uco));
+
+ rocksdb_universal_compaction_options_set_stop_style(uco, 1);
+ CheckCondition(1 ==
+ rocksdb_universal_compaction_options_get_stop_style(uco));
+
+ rocksdb_universal_compaction_options_destroy(uco);
+ }
+
+ StartPhase("fifo_compaction_options");
+ {
+ rocksdb_fifo_compaction_options_t* fco;
+ fco = rocksdb_fifo_compaction_options_create();
+
+ rocksdb_fifo_compaction_options_set_max_table_files_size(fco, 100000);
+ CheckCondition(
+ 100000 ==
+ rocksdb_fifo_compaction_options_get_max_table_files_size(fco));
+
+ rocksdb_fifo_compaction_options_destroy(fco);
+ }
+
+ StartPhase("backup_engine_option");
+ {
+ rocksdb_backup_engine_options_t* bdo;
+ bdo = rocksdb_backup_engine_options_create("path");
+
+ rocksdb_backup_engine_options_set_share_table_files(bdo, 1);
+ CheckCondition(1 ==
+ rocksdb_backup_engine_options_get_share_table_files(bdo));
+
+ rocksdb_backup_engine_options_set_sync(bdo, 1);
+ CheckCondition(1 == rocksdb_backup_engine_options_get_sync(bdo));
+
+ rocksdb_backup_engine_options_set_destroy_old_data(bdo, 1);
+ CheckCondition(1 ==
+ rocksdb_backup_engine_options_get_destroy_old_data(bdo));
+
+ rocksdb_backup_engine_options_set_backup_log_files(bdo, 1);
+ CheckCondition(1 ==
+ rocksdb_backup_engine_options_get_backup_log_files(bdo));
+
+ rocksdb_backup_engine_options_set_backup_rate_limit(bdo, 123);
+ CheckCondition(123 ==
+ rocksdb_backup_engine_options_get_backup_rate_limit(bdo));
+
+ rocksdb_backup_engine_options_set_restore_rate_limit(bdo, 37);
+ CheckCondition(37 ==
+ rocksdb_backup_engine_options_get_restore_rate_limit(bdo));
+
+ rocksdb_backup_engine_options_set_max_background_operations(bdo, 20);
+ CheckCondition(
+ 20 == rocksdb_backup_engine_options_get_max_background_operations(bdo));
+
+ rocksdb_backup_engine_options_set_callback_trigger_interval_size(bdo, 9000);
+ CheckCondition(
+ 9000 ==
+ rocksdb_backup_engine_options_get_callback_trigger_interval_size(bdo));
+
+ rocksdb_backup_engine_options_set_max_valid_backups_to_open(bdo, 40);
+ CheckCondition(
+ 40 == rocksdb_backup_engine_options_get_max_valid_backups_to_open(bdo));
+
+ rocksdb_backup_engine_options_set_share_files_with_checksum_naming(bdo, 2);
+ CheckCondition(
+ 2 == rocksdb_backup_engine_options_get_share_files_with_checksum_naming(
+ bdo));
+
+ rocksdb_backup_engine_options_destroy(bdo);
+ }
+
+ StartPhase("compression_options");
+ {
+ rocksdb_options_t* co;
+ co = rocksdb_options_create();
+
+ rocksdb_options_set_compression_options_zstd_max_train_bytes(co, 100);
+ CheckCondition(
+ 100 ==
+ rocksdb_options_get_compression_options_zstd_max_train_bytes(co));
+
+ rocksdb_options_set_compression_options_parallel_threads(co, 2);
+ CheckCondition(
+ 2 == rocksdb_options_get_compression_options_parallel_threads(co));
+
+ rocksdb_options_set_compression_options_max_dict_buffer_bytes(co, 200);
+ CheckCondition(
+ 200 ==
+ rocksdb_options_get_compression_options_max_dict_buffer_bytes(co));
+
+ rocksdb_options_set_compression_options_use_zstd_dict_trainer(co, 0);
+ CheckCondition(
+ 0 == rocksdb_options_get_compression_options_use_zstd_dict_trainer(co));
+ rocksdb_options_destroy(co);
+ }
+
+ StartPhase("iterate_upper_bound");
+ {
+ // Create new empty database
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_options_set_prefix_extractor(options, NULL);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_put(db, woptions, "a", 1, "0", 1, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err);
+ CheckNoError(err);
+ rocksdb_put(db, woptions, "g1", 2, "0", 1, &err);
+ CheckNoError(err);
+
+ // testing basic case with no iterate_upper_bound and no prefix_extractor
+ {
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+ rocksdb_iter_seek(iter, "foo", 3);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo", "bar");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo1", "bar1");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "g1", "0");
+
+ rocksdb_iter_destroy(iter);
+ }
+
+ // testing iterate_upper_bound and forward iterator
+ // to make sure it stops at bound
+ {
+ // iterate_upper_bound points beyond the last expected entry
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4);
+
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions);
+
+ rocksdb_iter_seek(iter, "foo", 3);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo", "bar");
+
+ rocksdb_iter_next(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "foo1", "bar1");
+
+ rocksdb_iter_next(iter);
+ // should stop here...
+ CheckCondition(!rocksdb_iter_valid(iter));
+
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0);
+ }
+ }
+
+ StartPhase("transactions");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ // open a TransactionDB
+ txn_db_options = rocksdb_transactiondb_options_create();
+ txn_options = rocksdb_transaction_options_create();
+ rocksdb_options_set_create_if_missing(options, 1);
+ txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+ CheckNoError(err);
+
+ // put outside a transaction
+ rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+ CheckTxnDBPinGet(txn_db, roptions, "foo", "hello");
+
+ // delete from outside transaction
+ rocksdb_transactiondb_delete(txn_db, woptions, "foo", 3, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "foo", NULL);
+
+ // write batch into TransactionDB
+ rocksdb_writebatch_t* wb = rocksdb_writebatch_create();
+ rocksdb_writebatch_put(wb, "foo", 3, "a", 1);
+ rocksdb_writebatch_clear(wb);
+ rocksdb_writebatch_put(wb, "bar", 3, "b", 1);
+ rocksdb_writebatch_put(wb, "box", 3, "c", 1);
+ rocksdb_writebatch_delete(wb, "bar", 3);
+ rocksdb_transactiondb_write(txn_db, woptions, wb, &err);
+ rocksdb_writebatch_destroy(wb);
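+    // after the writebatch clear and the delete above, only "box" => "c" is
+    // expected to remain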
+ CheckTxnDBGet(txn_db, roptions, "box", "c");
+ CheckTxnDBPinGet(txn_db, roptions, "box", "c");
+ CheckNoError(err);
+
+ // multi get
+ {
+ const char* keys[3] = {"box", "foo", "notfound"};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"c", NULL, NULL};
+ rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes,
+ vals, vals_sizes, errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+
+ // begin a transaction
+ txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+ // put
+ rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn, roptions, "foo", "hello");
+ CheckTxnPinGet(txn, roptions, "foo", "hello");
+ {
+ const char* keys[3] = {"box", "foo", "notfound"};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"c", "hello", NULL};
+ rocksdb_transaction_multi_get(txn, roptions, 3, keys, keys_sizes, vals,
+ vals_sizes, errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+ // delete
+ rocksdb_transaction_delete(txn, "foo", 3, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn, roptions, "foo", NULL);
+ CheckTxnPinGet(txn, roptions, "foo", NULL);
+
+ rocksdb_transaction_put(txn, "foo", 3, "hello", 5, &err);
+ CheckNoError(err);
+
+ // read from outside transaction, before commit
+ CheckTxnDBGet(txn_db, roptions, "foo", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "foo", NULL);
+ {
+ const char* keys[3] = {"box", "foo", "notfound"};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"c", NULL, NULL};
+ rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes,
+ vals, vals_sizes, errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+
+ // commit
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+
+ // read from outside transaction, after commit
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+ CheckTxnDBPinGet(txn_db, roptions, "foo", "hello");
+ {
+ const char* keys[3] = {"box", "foo", "notfound"};
+ const size_t keys_sizes[3] = {3, 3, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"c", "hello", NULL};
+ rocksdb_transactiondb_multi_get(txn_db, roptions, 3, keys, keys_sizes,
+ vals, vals_sizes, errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+
+ // reuse old transaction
+ txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, txn);
+
+ // snapshot
+ const rocksdb_snapshot_t* snapshot;
+ snapshot = rocksdb_transactiondb_create_snapshot(txn_db);
+ rocksdb_readoptions_set_snapshot(roptions, snapshot);
+
+ rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err);
+ CheckNoError(err);
+
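+    // reads through the snapshot still see the pre-snapshot value "hello";
+    // once the snapshot is released below, reads observe the newer "hey"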
+ CheckTxnDBGet(txn_db, roptions, "foo", "hello");
+ CheckTxnDBPinGet(txn_db, roptions, "foo", "hello");
+ rocksdb_readoptions_set_snapshot(roptions, NULL);
+ rocksdb_transactiondb_release_snapshot(txn_db, snapshot);
+ CheckTxnDBGet(txn_db, roptions, "foo", "hey");
+ CheckTxnDBPinGet(txn_db, roptions, "foo", "hey");
+
+ // iterate
+ rocksdb_transaction_put(txn, "bar", 3, "hi", 2, &err);
+ rocksdb_iterator_t* iter =
+ rocksdb_transaction_create_iterator(txn, roptions);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "bar", "hi");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ // rollback
+ rocksdb_transaction_rollback(txn, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "bar", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "bar", NULL);
+
+ // save point
+ rocksdb_transaction_put(txn, "foo1", 4, "hi1", 3, &err);
+ rocksdb_transaction_set_savepoint(txn);
+ CheckTxnGet(txn, roptions, "foo1", "hi1");
+ CheckTxnPinGet(txn, roptions, "foo1", "hi1");
+ rocksdb_transaction_put(txn, "foo2", 4, "hi2", 3, &err);
+ CheckTxnGet(txn, roptions, "foo2", "hi2");
+ CheckTxnPinGet(txn, roptions, "foo2", "hi2");
+
+ // rollback to savepoint
+ rocksdb_transaction_rollback_to_savepoint(txn, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn, roptions, "foo2", NULL);
+ CheckTxnGet(txn, roptions, "foo1", "hi1");
+ CheckTxnPinGet(txn, roptions, "foo2", NULL);
+ CheckTxnPinGet(txn, roptions, "foo1", "hi1");
+ CheckTxnDBGet(txn_db, roptions, "foo1", NULL);
+ CheckTxnDBGet(txn_db, roptions, "foo2", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "foo1", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "foo2", NULL);
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+ CheckTxnDBGet(txn_db, roptions, "foo1", "hi1");
+ CheckTxnDBGet(txn_db, roptions, "foo2", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "foo1", "hi1");
+ CheckTxnDBPinGet(txn_db, roptions, "foo2", NULL);
+
+ // Column families.
+ rocksdb_column_family_handle_t* cfh;
+ cfh = rocksdb_transactiondb_create_column_family(txn_db, options,
+ "txn_db_cf", &err);
+ CheckNoError(err);
+
+ rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 6, "cf_hello",
+ 8, &err);
+ CheckNoError(err);
+ CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello");
+ CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", "cf_hello");
+ {
+ const rocksdb_column_family_handle_t* get_handles[2] = {cfh, cfh};
+ const char* keys[2] = {"cf_foo", "notfound"};
+ const size_t keys_sizes[2] = {6, 8};
+ char* vals[2];
+ size_t vals_sizes[2];
+ char* errs[2];
+ const char* expected[2] = {"cf_hello", NULL};
+ rocksdb_transactiondb_multi_get_cf(txn_db, roptions, get_handles, 2, keys,
+ keys_sizes, vals, vals_sizes, errs);
+ CheckMultiGetValues(2, vals, vals_sizes, errs, expected);
+ }
+
+ rocksdb_transactiondb_delete_cf(txn_db, woptions, cfh, "cf_foo", 6, &err);
+ CheckNoError(err);
+ CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
+ CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
+
+ // flush
+ rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_options, 1);
+ rocksdb_transactiondb_flush_wal(txn_db, 1, &err);
+ CheckNoError(err);
+ rocksdb_transactiondb_flush_cf(txn_db, flush_options, cfh, &err);
+ CheckNoError(err);
+ rocksdb_transactiondb_flush(txn_db, flush_options, &err);
+ CheckNoError(err);
+ rocksdb_flushoptions_destroy(flush_options);
+
+ // close and destroy
+ rocksdb_column_family_handle_destroy(cfh);
+ rocksdb_transaction_destroy(txn);
+ rocksdb_transactiondb_close(txn_db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_transaction_options_destroy(txn_options);
+ rocksdb_transactiondb_options_destroy(txn_db_options);
+ }
+
+ StartPhase("two-phase commit");
+ {
+ // open a TransactionDB
+ txn_db_options = rocksdb_transactiondb_options_create();
+ txn_options = rocksdb_transaction_options_create();
+ rocksdb_options_set_create_if_missing(options, 1);
+ txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+ CheckNoError(err);
+
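+    // with skip_prepare disabled, committing without a prior prepare and
+    // preparing a transaction that has no name are both expected to fail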
+ rocksdb_transaction_options_set_skip_prepare(txn_options, 0);
+ txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+ rocksdb_transaction_commit(txn, &err);
+ CheckCondition(err != NULL);
+ Free(&err);
+ err = NULL;
+ rocksdb_transaction_prepare(txn, &err);
+ CheckCondition(err != NULL);
+ Free(&err);
+ err = NULL;
+ rocksdb_transaction_set_name(txn, "txn1", 4, &err);
+ CheckNoError(err);
+ rocksdb_transaction_prepare(txn, &err);
+ CheckNoError(err);
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+ rocksdb_transaction_destroy(txn);
+
+ // prepare 2 transactions and close db.
+ rocksdb_transaction_t* txn1 =
+ rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+ rocksdb_transaction_put(txn1, "bar1", 4, "1", 1, &err);
+ CheckNoError(err);
+ rocksdb_transaction_set_name(txn1, "txn1", 4, &err);
+ CheckNoError(err);
+ rocksdb_transaction_prepare(txn1, &err);
+ CheckNoError(err);
+ rocksdb_transaction_t* txn2 =
+ rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+ rocksdb_transaction_put(txn2, "bar2", 4, "2", 1, &err);
+ CheckNoError(err);
+ rocksdb_transaction_set_name(txn2, "txn2", 4, &err);
+ CheckNoError(err);
+ rocksdb_transaction_prepare(txn2, &err);
+ CheckNoError(err);
+ rocksdb_transaction_destroy(txn1);
+ rocksdb_transaction_destroy(txn2);
+ rocksdb_transactiondb_close(txn_db);
+ rocksdb_transaction_options_destroy(txn_options);
+ rocksdb_transactiondb_options_destroy(txn_db_options);
+
+    // reopen the db and recover all prepared transactions.
+ txn_db_options = rocksdb_transactiondb_options_create();
+ txn_options = rocksdb_transaction_options_create();
+ rocksdb_options_set_error_if_exists(options, 0);
+ txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+ CheckNoError(err);
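+    // prepared but not yet committed writes must not be visible after reopen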
+ CheckTxnDBPinGet(txn_db, roptions, "bar1", NULL);
+ CheckTxnDBPinGet(txn_db, roptions, "bar2", NULL);
+ size_t cnt;
+ rocksdb_transaction_t** txns =
+ rocksdb_transactiondb_get_prepared_transactions(txn_db, &cnt);
+ CheckCondition(cnt == 2);
+ size_t i;
+ for (i = 0; i < cnt; i++) {
+ txn = txns[i];
+ size_t name_len = 0;
+ char* name = rocksdb_transaction_get_name(txn, &name_len);
+ CheckCondition(name_len == 4);
+ if (strncmp(name, "txn1", name_len) == 0) {
+ rocksdb_transaction_commit(txn, &err);
+ } else if (strncmp(name, "txn2", name_len) == 0) {
+ rocksdb_transaction_rollback(txn, &err);
+ }
+ rocksdb_free(name);
+ CheckNoError(err);
+ rocksdb_transaction_destroy(txn);
+ }
+ rocksdb_free(txns);
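+    // txn1 was committed above, so "bar1" is visible; txn2 was rolled back,
+    // so "bar2" is not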
+ CheckTxnDBGet(txn_db, roptions, "bar1", "1");
+ CheckTxnDBGet(txn_db, roptions, "bar2", NULL);
+ rocksdb_transactiondb_put(txn_db, woptions, "bar2", 4, "2", 1, &err);
+ CheckNoError(err);
+
+ // close and destroy
+ rocksdb_transactiondb_close(txn_db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_transaction_options_destroy(txn_options);
+ rocksdb_transactiondb_options_destroy(txn_db_options);
+ }
+
+ StartPhase("optimistic_transactions");
+ {
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ rocksdb_options_set_allow_concurrent_memtable_write(db_options, 1);
+ otxn_db = rocksdb_optimistictransactiondb_open(db_options, dbname, &err);
+ otxn_options = rocksdb_optimistictransaction_options_create();
+ rocksdb_transaction_t* txn1 = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ rocksdb_transaction_t* txn2 = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ rocksdb_transaction_put(txn1, "key", 3, "value", 5, &err);
+ CheckNoError(err);
+ rocksdb_transaction_put(txn2, "key1", 4, "value1", 6, &err);
+ CheckNoError(err);
+ CheckTxnGet(txn1, roptions, "key", "value");
+ CheckTxnPinGet(txn1, roptions, "key", "value");
+ rocksdb_transaction_commit(txn1, &err);
+ CheckNoError(err);
+ rocksdb_transaction_commit(txn2, &err);
+ CheckNoError(err);
+ rocksdb_transaction_destroy(txn1);
+ rocksdb_transaction_destroy(txn2);
+
+ // Check column family
+ db = rocksdb_optimistictransactiondb_get_base_db(otxn_db);
+ rocksdb_put(db, woptions, "key", 3, "value", 5, &err);
+ CheckNoError(err);
+ rocksdb_column_family_handle_t *cfh1, *cfh2;
+ cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err);
+ cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err);
+ txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+ NULL);
+ rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err);
+ CheckNoError(err);
+ rocksdb_transaction_put_cf(txn, cfh2, "key_cf2", 7, "val_cf2", 7, &err);
+ CheckNoError(err);
+ rocksdb_transaction_commit(txn, &err);
+ CheckNoError(err);
+ txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
+ txn);
+ CheckGetCF(db, roptions, cfh1, "key_cf1", "val_cf1");
+ CheckTxnGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1");
+ CheckTxnPinGetCF(txn, roptions, cfh1, "key_cf1", "val_cf1");
+ {
+ const rocksdb_column_family_handle_t* get_handles[3] = {cfh1, cfh2, cfh2};
+ const char* keys[3] = {"key_cf1", "key_cf2", "notfound"};
+ const size_t keys_sizes[3] = {7, 7, 8};
+ char* vals[3];
+ size_t vals_sizes[3];
+ char* errs[3];
+ const char* expected[3] = {"val_cf1", "val_cf2", NULL};
+ rocksdb_transaction_multi_get_cf(txn, roptions, get_handles, 3, keys,
+ keys_sizes, vals, vals_sizes, errs);
+ CheckMultiGetValues(3, vals, vals_sizes, errs, expected);
+ }
+
+ // Check iterator with column family
+ rocksdb_transaction_put_cf(txn, cfh1, "key1_cf", 7, "val1_cf", 7, &err);
+ CheckNoError(err);
+ rocksdb_iterator_t* iter =
+ rocksdb_transaction_create_iterator_cf(txn, roptions, cfh1);
+ CheckCondition(!rocksdb_iter_valid(iter));
+ rocksdb_iter_seek_to_first(iter);
+ CheckCondition(rocksdb_iter_valid(iter));
+ CheckIter(iter, "key1_cf", "val1_cf");
+ rocksdb_iter_get_error(iter, &err);
+ CheckNoError(err);
+ rocksdb_iter_destroy(iter);
+
+ rocksdb_transaction_destroy(txn);
+ rocksdb_column_family_handle_destroy(cfh1);
+ rocksdb_column_family_handle_destroy(cfh2);
+ rocksdb_optimistictransactiondb_close_base_db(db);
+ rocksdb_optimistictransactiondb_close(otxn_db);
+
+ // Check open optimistic transaction db with column families
+ size_t cf_len;
+ char** column_fams =
+ rocksdb_list_column_families(db_options, dbname, &cf_len, &err);
+ CheckNoError(err);
+ CheckEqual("default", column_fams[0], 7);
+ CheckEqual("txn_db_cf1", column_fams[1], 10);
+ CheckEqual("txn_db_cf2", column_fams[2], 10);
+ CheckCondition(cf_len == 3);
+ rocksdb_list_column_families_destroy(column_fams, cf_len);
+
+ const char* cf_names[3] = {"default", "txn_db_cf1", "txn_db_cf2"};
+ rocksdb_options_t* cf_options = rocksdb_options_create();
+ const rocksdb_options_t* cf_opts[3] = {cf_options, cf_options, cf_options};
+
+ rocksdb_options_set_error_if_exists(cf_options, 0);
+ rocksdb_column_family_handle_t* cf_handles[3];
+ otxn_db = rocksdb_optimistictransactiondb_open_column_families(
+ db_options, dbname, 3, cf_names, cf_opts, cf_handles, &err);
+ CheckNoError(err);
+ rocksdb_transaction_t* txn_cf = rocksdb_optimistictransaction_begin(
+ otxn_db, woptions, otxn_options, NULL);
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[0], "key", "value");
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1");
+ CheckTxnGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2");
+ CheckTxnPinGetCF(txn_cf, roptions, cf_handles[0], "key", "value");
+ CheckTxnPinGetCF(txn_cf, roptions, cf_handles[1], "key_cf1", "val_cf1");
+ CheckTxnPinGetCF(txn_cf, roptions, cf_handles[2], "key_cf2", "val_cf2");
+ rocksdb_transaction_destroy(txn_cf);
+ rocksdb_options_destroy(cf_options);
+ rocksdb_column_family_handle_destroy(cf_handles[0]);
+ rocksdb_column_family_handle_destroy(cf_handles[1]);
+ rocksdb_column_family_handle_destroy(cf_handles[2]);
+ rocksdb_optimistictransactiondb_close(otxn_db);
+ rocksdb_destroy_db(db_options, dbname, &err);
+ rocksdb_options_destroy(db_options);
+ rocksdb_optimistictransaction_options_destroy(otxn_options);
+ CheckNoError(err);
+ }
+
+ // Simple sanity check that setting memtable rep works.
+ StartPhase("memtable_reps");
+ {
+ // Create database with vector memtable.
+ rocksdb_options_set_memtable_vector_rep(options);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ // Create database with hash skiplist memtable.
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
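+    // note: if no prefix extractor is configured, option sanitization may
+    // silently fall back to the default skiplist rep (see SanitizeOptions in
+    // column_family.cc)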
+ rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ // Check that secondary instance works.
+ StartPhase("open_as_secondary");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+
+ rocksdb_options_t* db_options = rocksdb_options_create();
+ rocksdb_options_set_create_if_missing(db_options, 1);
+ db = rocksdb_open(db_options, dbname, &err);
+ CheckNoError(err);
+ rocksdb_t* db1;
+ rocksdb_options_t* opts = rocksdb_options_create();
+ rocksdb_options_set_max_open_files(opts, -1);
+ rocksdb_options_set_create_if_missing(opts, 1);
+ snprintf(secondary_path, sizeof(secondary_path),
+ "%s/rocksdb_c_test_secondary-%d", GetTempDir(), ((int)geteuid()));
+ db1 = rocksdb_open_as_secondary(opts, dbname, secondary_path, &err);
+ CheckNoError(err);
+
+ rocksdb_writeoptions_set_sync(woptions, 0);
+ rocksdb_writeoptions_disable_WAL(woptions, 1);
+ rocksdb_put(db, woptions, "key0", 4, "value0", 6, &err);
+ CheckNoError(err);
+ rocksdb_flushoptions_t* flush_opts = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_opts, 1);
+ rocksdb_flush(db, flush_opts, &err);
+ CheckNoError(err);
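+    // catching up replays the primary's MANIFEST and WAL updates so the
+    // secondary can observe the newly flushed data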
+ rocksdb_try_catch_up_with_primary(db1, &err);
+ CheckNoError(err);
+ rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+ rocksdb_readoptions_set_verify_checksums(ropts, 1);
+ rocksdb_readoptions_set_snapshot(ropts, NULL);
+ CheckGet(db, ropts, "key0", "value0");
+ CheckGet(db1, ropts, "key0", "value0");
+
+ rocksdb_writeoptions_disable_WAL(woptions, 0);
+ rocksdb_put(db, woptions, "key1", 4, "value1", 6, &err);
+ CheckNoError(err);
+ rocksdb_try_catch_up_with_primary(db1, &err);
+ CheckNoError(err);
+ CheckGet(db1, ropts, "key0", "value0");
+ CheckGet(db1, ropts, "key1", "value1");
+
+ rocksdb_close(db1);
+ rocksdb_destroy_db(opts, secondary_path, &err);
+ CheckNoError(err);
+
+ rocksdb_options_destroy(db_options);
+ rocksdb_options_destroy(opts);
+ rocksdb_readoptions_destroy(ropts);
+ rocksdb_flushoptions_destroy(flush_opts);
+ }
+
+  // Simple sanity check that setting db_paths in the options works.
+ StartPhase("open_db_paths");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+
+ const rocksdb_dbpath_t* paths[1] = {dbpath};
+ rocksdb_options_set_db_paths(options, paths, 1);
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+ }
+
+ StartPhase("filter_with_prefix_seek");
+ {
+ rocksdb_close(db);
+ rocksdb_destroy_db(options, dbname, &err);
+ CheckNoError(err);
+
+ rocksdb_options_set_prefix_extractor(
+ options, rocksdb_slicetransform_create_fixed_prefix(1));
+ rocksdb_filterpolicy_t* filter_policy =
+ rocksdb_filterpolicy_create_bloom_full(8.0);
+ rocksdb_block_based_options_set_filter_policy(table_options, filter_policy);
+ rocksdb_options_set_block_based_table_factory(options, table_options);
+
+ db = rocksdb_open(options, dbname, &err);
+ CheckNoError(err);
+
+ int i;
+ for (i = 0; i < 10; ++i) {
+ char key = '0' + (char)i;
+ rocksdb_put(db, woptions, &key, 1, "", 1, &err);
+ CheckNoError(err);
+ }
+
+    // Flush to generate an L0 file so that the filter will be used later.
+ rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
+ rocksdb_flushoptions_set_wait(flush_options, 1);
+ rocksdb_flush(db, flush_options, &err);
+ rocksdb_flushoptions_destroy(flush_options);
+ CheckNoError(err);
+
+ rocksdb_readoptions_t* ropts = rocksdb_readoptions_create();
+ rocksdb_iterator_t* iter = rocksdb_create_iterator(db, ropts);
+
+ rocksdb_iter_seek(iter, "0", 1);
+ int cnt = 0;
+ while (rocksdb_iter_valid(iter)) {
+ ++cnt;
+ rocksdb_iter_next(iter);
+ }
+ CheckCondition(10 == cnt);
+
+ rocksdb_iter_destroy(iter);
+ rocksdb_readoptions_destroy(ropts);
+ }
+
+ StartPhase("cancel_all_background_work");
+ rocksdb_cancel_all_background_work(db, 1);
+
+ StartPhase("cleanup");
+ rocksdb_close(db);
+ rocksdb_options_destroy(options);
+ rocksdb_block_based_options_destroy(table_options);
+ rocksdb_readoptions_destroy(roptions);
+ rocksdb_writeoptions_destroy(woptions);
+ rocksdb_compactoptions_destroy(coptions);
+ rocksdb_cache_destroy(cache);
+ rocksdb_comparator_destroy(cmp);
+ rocksdb_dbpath_destroy(dbpath);
+ rocksdb_env_destroy(env);
+
+ fprintf(stderr, "PASS\n");
+ return 0;
+}
+
+#else
+
+int main(void) {
+ fprintf(stderr, "SKIPPED\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/column_family.cc b/src/rocksdb/db/column_family.cc
new file mode 100644
index 000000000..268060ddf
--- /dev/null
+++ b/src/rocksdb/db/column_family.cc
@@ -0,0 +1,1683 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/column_family.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_source.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+#include "db/db_impl/db_impl.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/table_properties_collector.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/table.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
+ ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
+ : cfd_(column_family_data), db_(db), mutex_(mutex) {
+ if (cfd_ != nullptr) {
+ cfd_->Ref();
+ }
+}
+
+ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
+ if (cfd_ != nullptr) {
+#ifndef ROCKSDB_LITE
+ for (auto& listener : cfd_->ioptions()->listeners) {
+ listener->OnColumnFamilyHandleDeletionStarted(this);
+ }
+#endif // ROCKSDB_LITE
+    // Job id == 0 means that this is not our background process, but rather
+    // a user thread.
+    // Need to hold some shared pointers owned by the initial_cf_options
+    // before the final cleanup finishes.
+ ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options();
+ JobContext job_context(0);
+ mutex_->Lock();
+ bool dropped = cfd_->IsDropped();
+ if (cfd_->UnrefAndTryDelete()) {
+ if (dropped) {
+ db_->FindObsoleteFiles(&job_context, false, true);
+ }
+ }
+ mutex_->Unlock();
+ if (job_context.HaveSomethingToDelete()) {
+ bool defer_purge =
+ db_->immutable_db_options().avoid_unnecessary_blocking_io;
+ db_->PurgeObsoleteFiles(job_context, defer_purge);
+ }
+ job_context.Clean();
+ }
+}
+
+uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
+
+const std::string& ColumnFamilyHandleImpl::GetName() const {
+ return cfd()->GetName();
+}
+
+Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
+#ifndef ROCKSDB_LITE
+ // accessing mutable cf-options requires db mutex.
+ InstrumentedMutexLock l(mutex_);
+ *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions());
+ return Status::OK();
+#else
+ (void)desc;
+ return Status::NotSupported();
+#endif // !ROCKSDB_LITE
+}
+
+const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
+ return cfd()->user_comparator();
+}
+
+void GetIntTblPropCollectorFactory(
+ const ImmutableCFOptions& ioptions,
+ IntTblPropCollectorFactories* int_tbl_prop_collector_factories) {
+ assert(int_tbl_prop_collector_factories);
+
+ auto& collector_factories = ioptions.table_properties_collector_factories;
+ for (size_t i = 0; i < ioptions.table_properties_collector_factories.size();
+ ++i) {
+ assert(collector_factories[i]);
+ int_tbl_prop_collector_factories->emplace_back(
+ new UserKeyTablePropertiesCollectorFactory(collector_factories[i]));
+ }
+}
+
+Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
+ if (!cf_options.compression_per_level.empty()) {
+ for (size_t level = 0; level < cf_options.compression_per_level.size();
+ ++level) {
+ if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
+ return Status::InvalidArgument(
+ "Compression type " +
+ CompressionTypeToString(cf_options.compression_per_level[level]) +
+ " is not linked with the binary.");
+ }
+ }
+ } else {
+ if (!CompressionTypeSupported(cf_options.compression)) {
+ return Status::InvalidArgument(
+ "Compression type " +
+ CompressionTypeToString(cf_options.compression) +
+ " is not linked with the binary.");
+ }
+ }
+ if (cf_options.compression_opts.zstd_max_train_bytes > 0) {
+ if (cf_options.compression_opts.use_zstd_dict_trainer) {
+ if (!ZSTD_TrainDictionarySupported()) {
+ return Status::InvalidArgument(
+ "zstd dictionary trainer cannot be used because ZSTD 1.1.3+ "
+ "is not linked with the binary.");
+ }
+ } else if (!ZSTD_FinalizeDictionarySupported()) {
+ return Status::InvalidArgument(
+ "zstd finalizeDictionary cannot be used because ZSTD 1.4.5+ "
+ "is not linked with the binary.");
+ }
+ if (cf_options.compression_opts.max_dict_bytes == 0) {
+ return Status::InvalidArgument(
+ "The dictionary size limit (`CompressionOptions::max_dict_bytes`) "
+ "should be nonzero if we're using zstd's dictionary generator.");
+ }
+ }
+
+ if (!CompressionTypeSupported(cf_options.blob_compression_type)) {
+ std::ostringstream oss;
+ oss << "The specified blob compression type "
+ << CompressionTypeToString(cf_options.blob_compression_type)
+ << " is not available.";
+
+ return Status::InvalidArgument(oss.str());
+ }
+
+ return Status::OK();
+}
+
+Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) {
+ if (cf_options.inplace_update_support) {
+ return Status::InvalidArgument(
+        "In-place memtable updates (inplace_update_support) are not "
+        "compatible with concurrent writes (allow_concurrent_memtable_write)");
+ }
+ if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) {
+ return Status::InvalidArgument(
+        "Memtable doesn't support concurrent writes "
+        "(allow_concurrent_memtable_write)");
+ }
+ return Status::OK();
+}
+
+Status CheckCFPathsSupported(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options) {
+ // More than one cf_paths are supported only in universal
+ // and level compaction styles. This function also checks the case
+ // in which cf_paths is not specified, which results in db_paths
+ // being used.
+ if ((cf_options.compaction_style != kCompactionStyleUniversal) &&
+ (cf_options.compaction_style != kCompactionStyleLevel)) {
+ if (cf_options.cf_paths.size() > 1) {
+ return Status::NotSupported(
+          "More than one CF path is only supported in "
+ "universal and level compaction styles. ");
+ } else if (cf_options.cf_paths.empty() && db_options.db_paths.size() > 1) {
+ return Status::NotSupported(
+          "More than one DB path is only supported in "
+ "universal and level compaction styles. ");
+ }
+ }
+ return Status::OK();
+}
+
+namespace {
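+// Sentinel values (UINT64_MAX - 1) meaning "not explicitly set by the user";
+// SanitizeOptions() below replaces them with concrete defaults.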
+const uint64_t kDefaultTtl = 0xfffffffffffffffe;
+const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe;
+} // anonymous namespace
+
+ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+ const ColumnFamilyOptions& src) {
+ ColumnFamilyOptions result = src;
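+  // Clamp write_buffer_size to [64 KB, 4 GB - 1] on 32-bit builds and to
+  // [64 KB, 64 GB] on 64-bit builds.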
+ size_t clamp_max = std::conditional<
+ sizeof(size_t) == 4, std::integral_constant<size_t, 0xffffffff>,
+ std::integral_constant<uint64_t, 64ull << 30>>::type::value;
+ ClipToRange(&result.write_buffer_size, (static_cast<size_t>(64)) << 10,
+ clamp_max);
+  // if the user sets arena_block_size, we trust them to use this value.
+  // Otherwise, calculate a proper value from write_buffer_size.
+ if (result.arena_block_size <= 0) {
+ result.arena_block_size =
+ std::min(size_t{1024 * 1024}, result.write_buffer_size / 8);
+
+ // Align up to 4k
+ const size_t align = 4 * 1024;
+ result.arena_block_size =
+ ((result.arena_block_size + align - 1) / align) * align;
+ }
+ result.min_write_buffer_number_to_merge =
+ std::min(result.min_write_buffer_number_to_merge,
+ result.max_write_buffer_number - 1);
+ if (result.min_write_buffer_number_to_merge < 1) {
+ result.min_write_buffer_number_to_merge = 1;
+ }
+
+ if (db_options.atomic_flush && result.min_write_buffer_number_to_merge > 1) {
+ ROCKS_LOG_WARN(
+ db_options.logger,
+ "Currently, if atomic_flush is true, then triggering flush for any "
+ "column family internally (non-manual flush) will trigger flushing "
+ "all column families even if the number of memtables is smaller "
+        "than min_write_buffer_number_to_merge. Therefore, configuring "
+        "min_write_buffer_number_to_merge > 1 is not compatible and should "
+        "be sanitized to 1. Not doing so will lead to data loss and "
+ "inconsistent state across multiple column families when WAL is "
+ "disabled, which is a common setting for atomic flush");
+
+ result.min_write_buffer_number_to_merge = 1;
+ }
+
+ if (result.num_levels < 1) {
+ result.num_levels = 1;
+ }
+ if (result.compaction_style == kCompactionStyleLevel &&
+ result.num_levels < 2) {
+ result.num_levels = 2;
+ }
+
+ if (result.compaction_style == kCompactionStyleUniversal &&
+ db_options.allow_ingest_behind && result.num_levels < 3) {
+ result.num_levels = 3;
+ }
+
+ if (result.max_write_buffer_number < 2) {
+ result.max_write_buffer_number = 2;
+ }
+  // fall back to max_write_buffer_number_to_maintain if
+  // max_write_buffer_size_to_maintain is not set
+ if (result.max_write_buffer_size_to_maintain < 0) {
+ result.max_write_buffer_size_to_maintain =
+ result.max_write_buffer_number *
+ static_cast<int64_t>(result.write_buffer_size);
+ } else if (result.max_write_buffer_size_to_maintain == 0 &&
+ result.max_write_buffer_number_to_maintain < 0) {
+ result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
+ }
+ // bloom filter size shouldn't exceed 1/4 of memtable size.
+ if (result.memtable_prefix_bloom_size_ratio > 0.25) {
+ result.memtable_prefix_bloom_size_ratio = 0.25;
+ } else if (result.memtable_prefix_bloom_size_ratio < 0) {
+ result.memtable_prefix_bloom_size_ratio = 0;
+ }
+
+ if (!result.prefix_extractor) {
+ assert(result.memtable_factory);
+ Slice name = result.memtable_factory->Name();
+ if (name.compare("HashSkipListRepFactory") == 0 ||
+ name.compare("HashLinkListRepFactory") == 0) {
+ result.memtable_factory = std::make_shared<SkipListFactory>();
+ }
+ }
+
+ if (result.compaction_style == kCompactionStyleFIFO) {
+ // since we delete level0 files in FIFO compaction when there are too many
+ // of them, these options don't really mean anything
+ result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+ result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+ }
+
+ if (result.max_bytes_for_level_multiplier <= 0) {
+ result.max_bytes_for_level_multiplier = 1;
+ }
+
+ if (result.level0_file_num_compaction_trigger == 0) {
+ ROCKS_LOG_WARN(db_options.logger,
+ "level0_file_num_compaction_trigger cannot be 0");
+ result.level0_file_num_compaction_trigger = 1;
+ }
+
+ if (result.level0_stop_writes_trigger <
+ result.level0_slowdown_writes_trigger ||
+ result.level0_slowdown_writes_trigger <
+ result.level0_file_num_compaction_trigger) {
+ ROCKS_LOG_WARN(db_options.logger,
+ "This condition must be satisfied: "
+ "level0_stop_writes_trigger(%d) >= "
+ "level0_slowdown_writes_trigger(%d) >= "
+ "level0_file_num_compaction_trigger(%d)",
+ result.level0_stop_writes_trigger,
+ result.level0_slowdown_writes_trigger,
+ result.level0_file_num_compaction_trigger);
+ if (result.level0_slowdown_writes_trigger <
+ result.level0_file_num_compaction_trigger) {
+ result.level0_slowdown_writes_trigger =
+ result.level0_file_num_compaction_trigger;
+ }
+ if (result.level0_stop_writes_trigger <
+ result.level0_slowdown_writes_trigger) {
+ result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger;
+ }
+ ROCKS_LOG_WARN(db_options.logger,
+ "Adjust the value to "
+                   "level0_stop_writes_trigger(%d) "
+                   "level0_slowdown_writes_trigger(%d) "
+ "level0_file_num_compaction_trigger(%d)",
+ result.level0_stop_writes_trigger,
+ result.level0_slowdown_writes_trigger,
+ result.level0_file_num_compaction_trigger);
+ }
+
+ if (result.soft_pending_compaction_bytes_limit == 0) {
+ result.soft_pending_compaction_bytes_limit =
+ result.hard_pending_compaction_bytes_limit;
+ } else if (result.hard_pending_compaction_bytes_limit > 0 &&
+ result.soft_pending_compaction_bytes_limit >
+ result.hard_pending_compaction_bytes_limit) {
+ result.soft_pending_compaction_bytes_limit =
+ result.hard_pending_compaction_bytes_limit;
+ }
+
+#ifndef ROCKSDB_LITE
+  // When the DB is stopped, it's possible that there are some .trash files
+  // that were not deleted yet. When we open the DB, we will find these .trash
+  // files and schedule them to be deleted (or delete them immediately if
+  // SstFileManager was not used).
+ auto sfm =
+ static_cast<SstFileManagerImpl*>(db_options.sst_file_manager.get());
+ for (size_t i = 0; i < result.cf_paths.size(); i++) {
+ DeleteScheduler::CleanupDirectory(db_options.env, sfm,
+ result.cf_paths[i].path)
+ .PermitUncheckedError();
+ }
+#endif
+
+ if (result.cf_paths.empty()) {
+ result.cf_paths = db_options.db_paths;
+ }
+
+ if (result.level_compaction_dynamic_level_bytes) {
+ if (result.compaction_style != kCompactionStyleLevel) {
+ ROCKS_LOG_WARN(db_options.info_log.get(),
+                     "level_compaction_dynamic_level_bytes only makes sense "
+                     "for level-based compaction");
+ result.level_compaction_dynamic_level_bytes = false;
+ } else if (result.cf_paths.size() > 1U) {
+      // we don't yet know how to make this feature and multiple
+      // DB paths work together.
+      ROCKS_LOG_WARN(db_options.info_log.get(),
+                     "multiple cf_paths/db_paths and "
+                     "level_compaction_dynamic_level_bytes "
+                     "can't be used together");
+ result.level_compaction_dynamic_level_bytes = false;
+ }
+ }
+
+ if (result.max_compaction_bytes == 0) {
+ result.max_compaction_bytes = result.target_file_size_base * 25;
+ }
+
+ bool is_block_based_table = (result.table_factory->IsInstanceOf(
+ TableFactory::kBlockBasedTableName()));
+
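+  // 30 days, in seconds.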
+ const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60;
+ if (result.ttl == kDefaultTtl) {
+ if (is_block_based_table &&
+ result.compaction_style != kCompactionStyleFIFO) {
+ result.ttl = kAdjustedTtl;
+ } else {
+ result.ttl = 0;
+ }
+ }
+
+ const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60;
+
+ // Turn on periodic compactions and set them to occur once every 30 days if
+ // compaction filters are used and periodic_compaction_seconds is set to the
+ // default value.
+ if (result.compaction_style != kCompactionStyleFIFO) {
+ if ((result.compaction_filter != nullptr ||
+ result.compaction_filter_factory != nullptr) &&
+ result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
+ is_block_based_table) {
+ result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+ }
+ } else {
+ // result.compaction_style == kCompactionStyleFIFO
+ if (result.ttl == 0) {
+ if (is_block_based_table) {
+ if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+ result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+ }
+ result.ttl = result.periodic_compaction_seconds;
+ }
+ } else if (result.periodic_compaction_seconds != 0) {
+ result.ttl = std::min(result.ttl, result.periodic_compaction_seconds);
+ }
+ }
+
+  // TTL compactions work similarly to periodic compactions in universal
+  // compaction in most cases. So, if ttl is set, execute the periodic
+  // compaction codepath.
+ if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) {
+ if (result.periodic_compaction_seconds != 0) {
+ result.periodic_compaction_seconds =
+ std::min(result.ttl, result.periodic_compaction_seconds);
+ } else {
+ result.periodic_compaction_seconds = result.ttl;
+ }
+ }
+
+ if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
+ result.periodic_compaction_seconds = 0;
+ }
+
+ return result;
+}
+
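+// kSVInUse is a sentinel pointer (the address of `dummy`, never a real
+// SuperVersion) placed in the thread-local cache while a thread is using the
+// cached SuperVersion; kSVObsolete marks a cached entry as stale.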
+int SuperVersion::dummy = 0;
+void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
+void* const SuperVersion::kSVObsolete = nullptr;
+
+SuperVersion::~SuperVersion() {
+ for (auto td : to_delete) {
+ delete td;
+ }
+}
+
+SuperVersion* SuperVersion::Ref() {
+ refs.fetch_add(1, std::memory_order_relaxed);
+ return this;
+}
+
+bool SuperVersion::Unref() {
+ // fetch_sub returns the previous value of ref
+ uint32_t previous_refs = refs.fetch_sub(1);
+ assert(previous_refs > 0);
+ return previous_refs == 1;
+}
+
+void SuperVersion::Cleanup() {
+ assert(refs.load(std::memory_order_relaxed) == 0);
+  // Since this SuperVersion object is being deleted,
+  // decrement the reference to the immutable MemtableList
+  // that this SV object was pointing to.
+ imm->Unref(&to_delete);
+ MemTable* m = mem->Unref();
+ if (m != nullptr) {
+ auto* memory_usage = current->cfd()->imm()->current_memory_usage();
+ assert(*memory_usage >= m->ApproximateMemoryUsage());
+ *memory_usage -= m->ApproximateMemoryUsage();
+ to_delete.push_back(m);
+ }
+ current->Unref();
+ cfd->UnrefAndTryDelete();
+}
+
+void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+ MemTableListVersion* new_imm, Version* new_current) {
+ cfd = new_cfd;
+ mem = new_mem;
+ imm = new_imm;
+ current = new_current;
+ cfd->Ref();
+ mem->Ref();
+ imm->Ref();
+ current->Ref();
+ refs.store(1, std::memory_order_relaxed);
+}
+
+namespace {
+void SuperVersionUnrefHandle(void* ptr) {
+ // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
+ // destroyed. When the former happens, the thread shouldn't see kSVInUse.
+ // When the latter happens, only super_version_ holds a reference
+ // to ColumnFamilyData, so no further queries are possible.
+ SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+ bool was_last_ref __attribute__((__unused__));
+ was_last_ref = sv->Unref();
+ // Thread-local SuperVersions can't outlive ColumnFamilyData::super_version_.
+ // This is important because we can't do SuperVersion cleanup here.
+ // That would require locking DB mutex, which would deadlock because
+ // SuperVersionUnrefHandle is called with locked ThreadLocalPtr mutex.
+ assert(!was_last_ref);
+}
+} // anonymous namespace
+
+std::vector<std::string> ColumnFamilyData::GetDbPaths() const {
+ std::vector<std::string> paths;
+ paths.reserve(ioptions_.cf_paths.size());
+ for (const DbPath& db_path : ioptions_.cf_paths) {
+ paths.emplace_back(db_path.path);
+ }
+ return paths;
+}
+
+const uint32_t ColumnFamilyData::kDummyColumnFamilyDataId =
+ std::numeric_limits<uint32_t>::max();
+
+ColumnFamilyData::ColumnFamilyData(
+ uint32_t id, const std::string& name, Version* _dummy_versions,
+ Cache* _table_cache, WriteBufferManager* write_buffer_manager,
+ const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options,
+ const FileOptions* file_options, ColumnFamilySet* column_family_set,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
+ const std::string& db_session_id)
+ : id_(id),
+ name_(name),
+ dummy_versions_(_dummy_versions),
+ current_(nullptr),
+ refs_(0),
+ initialized_(false),
+ dropped_(false),
+ internal_comparator_(cf_options.comparator),
+ initial_cf_options_(SanitizeOptions(db_options, cf_options)),
+ ioptions_(db_options, initial_cf_options_),
+ mutable_cf_options_(initial_cf_options_),
+ is_delete_range_supported_(
+ cf_options.table_factory->IsDeleteRangeSupported()),
+ write_buffer_manager_(write_buffer_manager),
+ mem_(nullptr),
+ imm_(ioptions_.min_write_buffer_number_to_merge,
+ ioptions_.max_write_buffer_number_to_maintain,
+ ioptions_.max_write_buffer_size_to_maintain),
+ super_version_(nullptr),
+ super_version_number_(0),
+ local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
+ next_(nullptr),
+ prev_(nullptr),
+ log_number_(0),
+ flush_reason_(FlushReason::kOthers),
+ column_family_set_(column_family_set),
+ queued_for_flush_(false),
+ queued_for_compaction_(false),
+ prev_compaction_needed_bytes_(0),
+ allow_2pc_(db_options.allow_2pc),
+ last_memtable_id_(0),
+ db_paths_registered_(false),
+ mempurge_used_(false) {
+ if (id_ != kDummyColumnFamilyDataId) {
+    // TODO(cc): RegisterDbPaths can be expensive; consider moving it
+    // outside of this constructor, which might be called with the db mutex
+    // held.
+    // TODO(cc): consider using ioptions_.fs; currently some tests rely on
+    // EnvWrapper, which is the main reason we use env here.
+ Status s = ioptions_.env->RegisterDbPaths(GetDbPaths());
+ if (s.ok()) {
+ db_paths_registered_ = true;
+ } else {
+ ROCKS_LOG_ERROR(
+ ioptions_.logger,
+ "Failed to register data paths of column family (id: %d, name: %s)",
+ id_, name_.c_str());
+ }
+ }
+ Ref();
+
+ // Convert user defined table properties collector factories to internal ones.
+ GetIntTblPropCollectorFactory(ioptions_, &int_tbl_prop_collector_factories_);
+
+ // if _dummy_versions is nullptr, then this is a dummy column family.
+ if (_dummy_versions != nullptr) {
+ internal_stats_.reset(
+ new InternalStats(ioptions_.num_levels, ioptions_.clock, this));
+ table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache,
+ block_cache_tracer, io_tracer,
+ db_session_id));
+ blob_file_cache_.reset(
+ new BlobFileCache(_table_cache, ioptions(), soptions(), id_,
+ internal_stats_->GetBlobFileReadHist(), io_tracer));
+ blob_source_.reset(new BlobSource(ioptions(), db_id, db_session_id,
+ blob_file_cache_.get()));
+
+ if (ioptions_.compaction_style == kCompactionStyleLevel) {
+ compaction_picker_.reset(
+ new LevelCompactionPicker(ioptions_, &internal_comparator_));
+#ifndef ROCKSDB_LITE
+ } else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ compaction_picker_.reset(
+ new UniversalCompactionPicker(ioptions_, &internal_comparator_));
+ } else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
+ compaction_picker_.reset(
+ new FIFOCompactionPicker(ioptions_, &internal_comparator_));
+ } else if (ioptions_.compaction_style == kCompactionStyleNone) {
+ compaction_picker_.reset(
+ new NullCompactionPicker(ioptions_, &internal_comparator_));
+ ROCKS_LOG_WARN(ioptions_.logger,
+ "Column family %s does not use any background compaction. "
+ "Compactions can only be done via CompactFiles\n",
+ GetName().c_str());
+#endif // !ROCKSDB_LITE
+ } else {
+ ROCKS_LOG_ERROR(ioptions_.logger,
+ "Unable to recognize the specified compaction style %d. "
+ "Column family %s will use kCompactionStyleLevel.\n",
+ ioptions_.compaction_style, GetName().c_str());
+ compaction_picker_.reset(
+ new LevelCompactionPicker(ioptions_, &internal_comparator_));
+ }
+
+ if (column_family_set_->NumberOfColumnFamilies() < 10) {
+ ROCKS_LOG_INFO(ioptions_.logger,
+ "--------------- Options for column family [%s]:\n",
+ name.c_str());
+ initial_cf_options_.Dump(ioptions_.logger);
+ } else {
+ ROCKS_LOG_INFO(ioptions_.logger, "\t(skipping printing options)\n");
+ }
+ }
+
+ RecalculateWriteStallConditions(mutable_cf_options_);
+
+ if (cf_options.table_factory->IsInstanceOf(
+ TableFactory::kBlockBasedTableName()) &&
+ cf_options.table_factory->GetOptions<BlockBasedTableOptions>()) {
+ const BlockBasedTableOptions* bbto =
+ cf_options.table_factory->GetOptions<BlockBasedTableOptions>();
+ const auto& options_overrides = bbto->cache_usage_options.options_overrides;
+ const auto file_metadata_charged =
+ options_overrides.at(CacheEntryRole::kFileMetadata).charged;
+ if (bbto->block_cache &&
+ file_metadata_charged == CacheEntryRoleOptions::Decision::kEnabled) {
+ // TODO(hx235): Add a `ConcurrentCacheReservationManager` at DB scope
+ // responsible for reservation of `ObsoleteFileInfo` so that we can keep
+ // this `file_metadata_cache_res_mgr_` nonconcurrent
+ file_metadata_cache_res_mgr_.reset(new ConcurrentCacheReservationManager(
+ std::make_shared<
+ CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>>(
+ bbto->block_cache)));
+ }
+ }
+}
+
+// DB mutex held
+ColumnFamilyData::~ColumnFamilyData() {
+ assert(refs_.load(std::memory_order_relaxed) == 0);
+ // remove from linked list
+ auto prev = prev_;
+ auto next = next_;
+ prev->next_ = next;
+ next->prev_ = prev;
+
+ if (!dropped_ && column_family_set_ != nullptr) {
+ // If it's dropped, it's already removed from column family set
+ // If column_family_set_ == nullptr, this is dummy CFD and not in
+ // ColumnFamilySet
+ column_family_set_->RemoveColumnFamily(this);
+ }
+
+ if (current_ != nullptr) {
+ current_->Unref();
+ }
+
+ // It would be wrong if this ColumnFamilyData were still in flush_queue_ or
+ // compaction_queue_ when we destroy it.
+ assert(!queued_for_flush_);
+ assert(!queued_for_compaction_);
+ assert(super_version_ == nullptr);
+
+ if (dummy_versions_ != nullptr) {
+ // List must be empty
+ assert(dummy_versions_->Next() == dummy_versions_);
+ bool deleted __attribute__((__unused__));
+ deleted = dummy_versions_->Unref();
+ assert(deleted);
+ }
+
+ if (mem_ != nullptr) {
+ delete mem_->Unref();
+ }
+ autovector<MemTable*> to_delete;
+ imm_.current()->Unref(&to_delete);
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+
+ if (db_paths_registered_) {
+ // TODO(cc): consider using ioptions_.fs; currently some tests rely on
+ // EnvWrapper, which is the main reason we use env here.
+ Status s = ioptions_.env->UnregisterDbPaths(GetDbPaths());
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ ioptions_.logger,
+ "Failed to unregister data paths of column family (id: %d, name: %s)",
+ id_, name_.c_str());
+ }
+ }
+}
+
+bool ColumnFamilyData::UnrefAndTryDelete() {
+ int old_refs = refs_.fetch_sub(1);
+ assert(old_refs > 0);
+
+ if (old_refs == 1) {
+ assert(super_version_ == nullptr);
+ delete this;
+ return true;
+ }
+
+ if (old_refs == 2 && super_version_ != nullptr) {
+ // Only the super_version_ holds me
+ SuperVersion* sv = super_version_;
+ super_version_ = nullptr;
+
+ // Release SuperVersion references kept in ThreadLocalPtr.
+ local_sv_.reset();
+
+ if (sv->Unref()) {
+ // Note: sv will delete this ColumnFamilyData during Cleanup()
+ assert(sv->cfd == this);
+ sv->Cleanup();
+ delete sv;
+ return true;
+ }
+ }
+ return false;
+}
+
+void ColumnFamilyData::SetDropped() {
+ // can't drop default CF
+ assert(id_ != 0);
+ dropped_ = true;
+ write_controller_token_.reset();
+
+ // remove from column_family_set
+ column_family_set_->RemoveColumnFamily(this);
+}
+
+ColumnFamilyOptions ColumnFamilyData::GetLatestCFOptions() const {
+ return BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
+}
+
+uint64_t ColumnFamilyData::OldestLogToKeep() {
+ auto current_log = GetLogNumber();
+
+ if (allow_2pc_) {
+ auto imm_prep_log = imm()->PrecomputeMinLogContainingPrepSection();
+ auto mem_prep_log = mem()->GetMinLogContainingPrepSection();
+
+ if (imm_prep_log > 0 && imm_prep_log < current_log) {
+ current_log = imm_prep_log;
+ }
+
+ if (mem_prep_log > 0 && mem_prep_log < current_log) {
+ current_log = mem_prep_log;
+ }
+ }
+
+ return current_log;
+}
+
+const double kIncSlowdownRatio = 0.8;
+const double kDecSlowdownRatio = 1 / kIncSlowdownRatio;
+const double kNearStopSlowdownRatio = 0.6;
+const double kDelayRecoverSlowdownRatio = 1.4;
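+// Illustrative example of how these ratios are applied (assuming a current
+// delayed write rate of 16 MB/s): in SetupDelay() below, compaction debt that
+// has not shrunk multiplies the rate by kIncSlowdownRatio (16 -> 12.8 MB/s),
+// a near-stop or stop condition multiplies it by kNearStopSlowdownRatio
+// (16 -> 9.6 MB/s), and paid-down debt multiplies it by kDecSlowdownRatio
+// (16 -> 20 MB/s), capped at the user-specified maximum.
+// kDelayRecoverSlowdownRatio is applied in RecalculateWriteStallConditions()
+// when the DB recovers from a delay condition (16 -> 22.4 MB/s).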
+
+namespace {
+// If penalize_stop is true, we further reduce slowdown rate.
+std::unique_ptr<WriteControllerToken> SetupDelay(
+ WriteController* write_controller, uint64_t compaction_needed_bytes,
+ uint64_t prev_compaction_need_bytes, bool penalize_stop,
+ bool auto_compactions_disabled) {
+ const uint64_t kMinWriteRate = 16 * 1024u; // Minimum write rate 16KB/s.
+
+ uint64_t max_write_rate = write_controller->max_delayed_write_rate();
+ uint64_t write_rate = write_controller->delayed_write_rate();
+
+ if (auto_compactions_disabled) {
+ // When auto compaction is disabled, always use the value the user gave.
+ write_rate = max_write_rate;
+ } else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) {
+ // If the user gives a rate less than kMinWriteRate, don't adjust it.
+ //
+ // If already delayed, we need to adjust based on previous compaction debt.
+ // When two or more column families require delay, we always increase or
+ // reduce the write rate based on information for one single column family.
+ // It is likely to be OK, but we can improve it if there is a problem.
+ // Ignore the compaction_needed_bytes = 0 case because
+ // compaction_needed_bytes is only available in level-based compaction.
+ //
+ // If the compaction debt stays the same as before, we also further slow
+ // down. It usually means a memtable is full. It's mainly for the case
+ // where both flush and compaction are much slower than the rate at which
+ // we insert into memtables, so we need to actively slow down before we get
+ // a feedback signal from compactions and flushes to avoid a full stop
+ // caused by hitting the max write buffer number.
+ //
+ // If the DB has just fallen into the stop condition, we need to further
+ // reduce the write rate to avoid the stop condition.
+ if (penalize_stop) {
+ // Penalize the near stop or stop condition by more aggressive slowdown.
+ // This is to provide the long term slowdown increase signal.
+ // The penalty is more than the reward of recovering to the normal
+ // condition.
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kNearStopSlowdownRatio);
+ if (write_rate < kMinWriteRate) {
+ write_rate = kMinWriteRate;
+ }
+ } else if (prev_compaction_need_bytes > 0 &&
+ prev_compaction_need_bytes <= compaction_needed_bytes) {
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kIncSlowdownRatio);
+ if (write_rate < kMinWriteRate) {
+ write_rate = kMinWriteRate;
+ }
+ } else if (prev_compaction_need_bytes > compaction_needed_bytes) {
+ // We are speeding up by a ratio of kDecSlowdownRatio when we have paid
+ // down compaction debt. But we'll never speed up beyond the write rate
+ // given by the user.
+ write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
+ kDecSlowdownRatio);
+ if (write_rate > max_write_rate) {
+ write_rate = max_write_rate;
+ }
+ }
+ }
+ return write_controller->GetDelayToken(write_rate);
+}
+
+int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
+ int level0_slowdown_writes_trigger) {
+ // SanitizeOptions() ensures it.
+ assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger);
+
+ if (level0_file_num_compaction_trigger < 0) {
+ return std::numeric_limits<int>::max();
+ }
+
+ const int64_t twice_level0_trigger =
+ static_cast<int64_t>(level0_file_num_compaction_trigger) * 2;
+
+ const int64_t one_fourth_trigger_slowdown =
+ static_cast<int64_t>(level0_file_num_compaction_trigger) +
+ ((level0_slowdown_writes_trigger - level0_file_num_compaction_trigger) /
+ 4);
+
+ assert(twice_level0_trigger >= 0);
+ assert(one_fourth_trigger_slowdown >= 0);
+
+ // 1/4 of the way between the L0 compaction trigger threshold and the
+ // slowdown condition, or twice the compaction trigger, whichever is
+ // smaller.
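+ // For example, with level0_file_num_compaction_trigger = 4 and
+ // level0_slowdown_writes_trigger = 20, twice_level0_trigger is 8 and
+ // one_fourth_trigger_slowdown is 4 + (20 - 4) / 4 = 8, so compaction is
+ // sped up once L0 reaches 8 files.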
+ int64_t res = std::min(twice_level0_trigger, one_fourth_trigger_slowdown);
+ if (res >= std::numeric_limits<int32_t>::max()) {
+ return std::numeric_limits<int32_t>::max();
+ } else {
+ // res fits in int
+ return static_cast<int>(res);
+ }
+}
+} // anonymous namespace
+
+std::pair<WriteStallCondition, ColumnFamilyData::WriteStallCause>
+ColumnFamilyData::GetWriteStallConditionAndCause(
+ int num_unflushed_memtables, int num_l0_files,
+ uint64_t num_compaction_needed_bytes,
+ const MutableCFOptions& mutable_cf_options,
+ const ImmutableCFOptions& immutable_cf_options) {
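+ // Conditions are checked from most to least severe: the stop conditions
+ // (memtable count, L0 file count, pending compaction bytes) take precedence
+ // over the corresponding delay conditions, and kNormal is returned only
+ // when none of them apply.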
+ if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) {
+ return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ num_l0_files >= mutable_cf_options.level0_stop_writes_trigger) {
+ return {WriteStallCondition::kStopped, WriteStallCause::kL0FileCountLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+ num_compaction_needed_bytes >=
+ mutable_cf_options.hard_pending_compaction_bytes_limit) {
+ return {WriteStallCondition::kStopped,
+ WriteStallCause::kPendingCompactionBytes};
+ } else if (mutable_cf_options.max_write_buffer_number > 3 &&
+ num_unflushed_memtables >=
+ mutable_cf_options.max_write_buffer_number - 1 &&
+ num_unflushed_memtables - 1 >=
+ immutable_cf_options.min_write_buffer_number_to_merge) {
+ return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
+ num_l0_files >=
+ mutable_cf_options.level0_slowdown_writes_trigger) {
+ return {WriteStallCondition::kDelayed, WriteStallCause::kL0FileCountLimit};
+ } else if (!mutable_cf_options.disable_auto_compactions &&
+ mutable_cf_options.soft_pending_compaction_bytes_limit > 0 &&
+ num_compaction_needed_bytes >=
+ mutable_cf_options.soft_pending_compaction_bytes_limit) {
+ return {WriteStallCondition::kDelayed,
+ WriteStallCause::kPendingCompactionBytes};
+ }
+ return {WriteStallCondition::kNormal, WriteStallCause::kNone};
+}
+
+WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
+ const MutableCFOptions& mutable_cf_options) {
+ auto write_stall_condition = WriteStallCondition::kNormal;
+ if (current_ != nullptr) {
+ auto* vstorage = current_->storage_info();
+ auto write_controller = column_family_set_->write_controller_;
+ uint64_t compaction_needed_bytes =
+ vstorage->estimated_compaction_needed_bytes();
+
+ auto write_stall_condition_and_cause = GetWriteStallConditionAndCause(
+ imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(),
+ vstorage->estimated_compaction_needed_bytes(), mutable_cf_options,
+ *ioptions());
+ write_stall_condition = write_stall_condition_and_cause.first;
+ auto write_stall_cause = write_stall_condition_and_cause.second;
+
+ bool was_stopped = write_controller->IsStopped();
+ bool needed_delay = write_controller->NeedsDelay();
+
+ if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kMemtableLimit) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.logger,
+ "[%s] Stopping writes because we have %d immutable memtables "
+ "(waiting for flush), max_write_buffer_number is set to %d",
+ name_.c_str(), imm()->NumNotFlushed(),
+ mutable_cf_options.max_write_buffer_number);
+ } else if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1);
+ if (compaction_picker_->IsLevel0CompactionInProgress()) {
+ internal_stats_->AddCFStats(
+ InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1);
+ }
+ ROCKS_LOG_WARN(ioptions_.logger,
+ "[%s] Stopping writes because we have %d level-0 files",
+ name_.c_str(), vstorage->l0_delay_trigger_count());
+ } else if (write_stall_condition == WriteStallCondition::kStopped &&
+ write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+ write_controller_token_ = write_controller->GetStopToken();
+ internal_stats_->AddCFStats(
+ InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.logger,
+ "[%s] Stopping writes because of estimated pending compaction "
+ "bytes %" PRIu64,
+ name_.c_str(), compaction_needed_bytes);
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kMemtableLimit) {
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.logger,
+ "[%s] Stalling writes because we have %d immutable memtables "
+ "(waiting for flush), max_write_buffer_number is set to %d "
+ "rate %" PRIu64,
+ name_.c_str(), imm()->NumNotFlushed(),
+ mutable_cf_options.max_write_buffer_number,
+ write_controller->delayed_write_rate());
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kL0FileCountLimit) {
+ // L0 is within two files of the stop trigger.
+ bool near_stop = vstorage->l0_delay_trigger_count() >=
+ mutable_cf_options.level0_stop_writes_trigger - 2;
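+ // For example, with level0_stop_writes_trigger = 36, near_stop becomes
+ // true once there are 34 or more L0 files.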
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped || near_stop,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ 1);
+ if (compaction_picker_->IsLevel0CompactionInProgress()) {
+ internal_stats_->AddCFStats(
+ InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1);
+ }
+ ROCKS_LOG_WARN(ioptions_.logger,
+ "[%s] Stalling writes because we have %d level-0 files "
+ "rate %" PRIu64,
+ name_.c_str(), vstorage->l0_delay_trigger_count(),
+ write_controller->delayed_write_rate());
+ } else if (write_stall_condition == WriteStallCondition::kDelayed &&
+ write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
+ // If the distance to the hard limit is less than 1/4 of the gap between
+ // the soft and hard pending compaction bytes limits, we consider it near
+ // stop and speed up the slowdown.
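+ // For example, with soft_pending_compaction_bytes_limit = 64 GB and
+ // hard_pending_compaction_bytes_limit = 128 GB, near_stop becomes true
+ // once compaction_needed_bytes exceeds 64 GB + 3 * (128 GB - 64 GB) / 4 =
+ // 112 GB, i.e. within 16 GB of the hard limit.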
+ bool near_stop =
+ mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
+ (compaction_needed_bytes -
+ mutable_cf_options.soft_pending_compaction_bytes_limit) >
+ 3 *
+ (mutable_cf_options.hard_pending_compaction_bytes_limit -
+ mutable_cf_options.soft_pending_compaction_bytes_limit) /
+ 4;
+
+ write_controller_token_ =
+ SetupDelay(write_controller, compaction_needed_bytes,
+ prev_compaction_needed_bytes_, was_stopped || near_stop,
+ mutable_cf_options.disable_auto_compactions);
+ internal_stats_->AddCFStats(
+ InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1);
+ ROCKS_LOG_WARN(
+ ioptions_.logger,
+ "[%s] Stalling writes because of estimated pending compaction "
+ "bytes %" PRIu64 " rate %" PRIu64,
+ name_.c_str(), vstorage->estimated_compaction_needed_bytes(),
+ write_controller->delayed_write_rate());
+ } else {
+ assert(write_stall_condition == WriteStallCondition::kNormal);
+ if (vstorage->l0_delay_trigger_count() >=
+ GetL0ThresholdSpeedupCompaction(
+ mutable_cf_options.level0_file_num_compaction_trigger,
+ mutable_cf_options.level0_slowdown_writes_trigger)) {
+ write_controller_token_ =
+ write_controller->GetCompactionPressureToken();
+ ROCKS_LOG_INFO(
+ ioptions_.logger,
+ "[%s] Increasing compaction threads because we have %d level-0 "
+ "files ",
+ name_.c_str(), vstorage->l0_delay_trigger_count());
+ } else if (vstorage->estimated_compaction_needed_bytes() >=
+ mutable_cf_options.soft_pending_compaction_bytes_limit / 4) {
+ // Increase compaction threads if bytes needed for compaction exceeds
+ // 1/4 of threshold for slowing down.
+ // If soft pending compaction byte limit is not set, always speed up
+ // compaction.
+ write_controller_token_ =
+ write_controller->GetCompactionPressureToken();
+ if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) {
+ ROCKS_LOG_INFO(
+ ioptions_.logger,
+ "[%s] Increasing compaction threads because of estimated pending "
+ "compaction "
+ "bytes %" PRIu64,
+ name_.c_str(), vstorage->estimated_compaction_needed_bytes());
+ }
+ } else {
+ write_controller_token_.reset();
+ }
+ // If the DB recovers from delay conditions, we reward it by raising the
+ // delayed write rate by roughly double the slowdown ratio
+ // (kDelayRecoverSlowdownRatio). This balances the long-term slowdown
+ // increase signal.
+ if (needed_delay) {
+ uint64_t write_rate = write_controller->delayed_write_rate();
+ write_controller->set_delayed_write_rate(static_cast<uint64_t>(
+ static_cast<double>(write_rate) * kDelayRecoverSlowdownRatio));
+ // Set the low-pri limit to be 1/4 of the delayed write rate.
+ // Note we don't reset this value even after the delay condition is
+ // released. The low-pri rate will continue to apply if there is
+ // compaction pressure.
+ write_controller->low_pri_rate_limiter()->SetBytesPerSecond(write_rate /
+ 4);
+ }
+ }
+ prev_compaction_needed_bytes_ = compaction_needed_bytes;
+ }
+ return write_stall_condition;
+}
+
+const FileOptions* ColumnFamilyData::soptions() const {
+ return &(column_family_set_->file_options_);
+}
+
+void ColumnFamilyData::SetCurrent(Version* current_version) {
+ current_ = current_version;
+}
+
+uint64_t ColumnFamilyData::GetNumLiveVersions() const {
+ return VersionSet::GetNumLiveVersions(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalSstFilesSize() const {
+ return VersionSet::GetTotalSstFilesSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetTotalBlobFileSize() const {
+ return VersionSet::GetTotalBlobFileSize(dummy_versions_);
+}
+
+uint64_t ColumnFamilyData::GetLiveSstFilesSize() const {
+ return current_->GetSstFilesSize();
+}
+
+MemTable* ColumnFamilyData::ConstructNewMemtable(
+ const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+ return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
+ write_buffer_manager_, earliest_seq, id_);
+}
+
+void ColumnFamilyData::CreateNewMemtable(
+ const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
+ if (mem_ != nullptr) {
+ delete mem_->Unref();
+ }
+ SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
+ mem_->Ref();
+}
+
+bool ColumnFamilyData::NeedsCompaction() const {
+ return !mutable_cf_options_.disable_auto_compactions &&
+ compaction_picker_->NeedsCompaction(current_->storage_info());
+}
+
+Compaction* ColumnFamilyData::PickCompaction(
+ const MutableCFOptions& mutable_options,
+ const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) {
+ SequenceNumber earliest_mem_seqno =
+ std::min(mem_->GetEarliestSequenceNumber(),
+ imm_.current()->GetEarliestSequenceNumber(false));
+ auto* result = compaction_picker_->PickCompaction(
+ GetName(), mutable_options, mutable_db_options, current_->storage_info(),
+ log_buffer, earliest_mem_seqno);
+ if (result != nullptr) {
+ result->SetInputVersion(current_);
+ }
+ return result;
+}
+
+bool ColumnFamilyData::RangeOverlapWithCompaction(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int level) const {
+ return compaction_picker_->RangeOverlapWithCompaction(
+ smallest_user_key, largest_user_key, level);
+}
+
+Status ColumnFamilyData::RangesOverlapWithMemtables(
+ const autovector<Range>& ranges, SuperVersion* super_version,
+ bool allow_data_in_errors, bool* overlap) {
+ assert(overlap != nullptr);
+ *overlap = false;
+ // Create an InternalIterator over all unflushed memtables
+ Arena arena;
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
+ merge_iter_builder.AddIterator(
+ super_version->mem->NewIterator(read_opts, &arena));
+ super_version->imm->AddIterators(read_opts, &merge_iter_builder,
+ false /* add_range_tombstone_iter */);
+ ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
+
+ auto read_seq = super_version->current->version_set()->LastSequence();
+ ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq);
+ auto* active_range_del_iter = super_version->mem->NewRangeTombstoneIterator(
+ read_opts, read_seq, false /* immutable_memtable */);
+ range_del_agg.AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(active_range_del_iter));
+ Status status;
+ status = super_version->imm->AddRangeTombstoneIterators(
+ read_opts, nullptr /* arena */, &range_del_agg);
+ // AddRangeTombstoneIterators always returns Status::OK.
+ assert(status.ok());
+
+ for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) {
+ auto* vstorage = super_version->current->storage_info();
+ auto* ucmp = vstorage->InternalComparator()->user_comparator();
+ InternalKey range_start(ranges[i].start, kMaxSequenceNumber,
+ kValueTypeForSeek);
+ memtable_iter->Seek(range_start.Encode());
+ status = memtable_iter->status();
+ ParsedInternalKey seek_result;
+
+ if (status.ok() && memtable_iter->Valid()) {
+ status = ParseInternalKey(memtable_iter->key(), &seek_result,
+ allow_data_in_errors);
+ }
+
+ if (status.ok()) {
+ if (memtable_iter->Valid() &&
+ ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) {
+ *overlap = true;
+ } else if (range_del_agg.IsRangeOverlapped(ranges[i].start,
+ ranges[i].limit)) {
+ *overlap = true;
+ }
+ }
+ }
+ return status;
+}
+
+const int ColumnFamilyData::kCompactAllLevels = -1;
+const int ColumnFamilyData::kCompactToBaseLevel = -2;
+
+Compaction* ColumnFamilyData::CompactRange(
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, int input_level,
+ int output_level, const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* conflict,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+ auto* result = compaction_picker_->CompactRange(
+ GetName(), mutable_cf_options, mutable_db_options,
+ current_->storage_info(), input_level, output_level,
+ compact_range_options, begin, end, compaction_end, conflict,
+ max_file_num_to_ignore, trim_ts);
+ if (result != nullptr) {
+ result->SetInputVersion(current_);
+ }
+ return result;
+}
+
+SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(DBImpl* db) {
+ SuperVersion* sv = GetThreadLocalSuperVersion(db);
+ sv->Ref();
+ if (!ReturnThreadLocalSuperVersion(sv)) {
+ // This Unref() corresponds to the Ref() in GetThreadLocalSuperVersion()
+ // when the thread-local pointer was populated. So, the Ref() earlier in
+ // this function still prevents the returned SuperVersion* from being
+ // deleted out from under the caller.
+ sv->Unref();
+ }
+ return sv;
+}
+
+SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
+ // The SuperVersion is cached in thread local storage to avoid acquiring the
+ // mutex when the SuperVersion has not changed since the last use. When a new
+ // SuperVersion is installed, the compaction or flush thread cleans up the
+ // cached SuperVersion in all existing thread local storage. To avoid
+ // acquiring the mutex for this operation, we use an atomic Swap() on the
+ // thread local pointer to guarantee exclusive access. If the thread local
+ // pointer is being used while a new SuperVersion is installed, the cached
+ // SuperVersion can become stale. In that case, the background thread would
+ // have swapped in kSVObsolete. We re-check the value when returning the
+ // SuperVersion back to thread local storage, with an atomic compare-and-swap.
+ // The SuperVersion will need to be released if it is detected to be stale.
+ void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
+ // Invariant:
+ // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage.
+ // (2) The Swap above (always) installs kSVInUse; ThreadLocal storage
+ // should only hold kSVInUse before the ReturnThreadLocalSuperVersion()
+ // call (if no Scrape happens).
+ assert(ptr != SuperVersion::kSVInUse);
+ SuperVersion* sv = static_cast<SuperVersion*>(ptr);
+ if (sv == SuperVersion::kSVObsolete ||
+ sv->version_number != super_version_number_.load()) {
+ RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES);
+ SuperVersion* sv_to_delete = nullptr;
+
+ if (sv && sv->Unref()) {
+ RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS);
+ db->mutex()->Lock();
+ // NOTE: underlying resources held by superversion (sst files) might
+ // not be released until the next background job.
+ sv->Cleanup();
+ if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
+ db->AddSuperVersionsToFreeQueue(sv);
+ db->SchedulePurge();
+ } else {
+ sv_to_delete = sv;
+ }
+ } else {
+ db->mutex()->Lock();
+ }
+ sv = super_version_->Ref();
+ db->mutex()->Unlock();
+
+ delete sv_to_delete;
+ }
+ assert(sv != nullptr);
+ return sv;
+}
+
+bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
+ assert(sv != nullptr);
+ // Put the SuperVersion back
+ void* expected = SuperVersion::kSVInUse;
+ if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
+ // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
+ // storage has not been altered and no Scrape has happened. The
+ // SuperVersion is still current.
+ return true;
+ } else {
+ // ThreadLocal scrape happened in the process of this GetImpl call (after
+ // thread local Swap() at the beginning and before CompareAndSwap()).
+ // This means the SuperVersion it holds is obsolete.
+ assert(expected == SuperVersion::kSVObsolete);
+ }
+ return false;
+}
+
+void ColumnFamilyData::InstallSuperVersion(SuperVersionContext* sv_context,
+ InstrumentedMutex* db_mutex) {
+ db_mutex->AssertHeld();
+ return InstallSuperVersion(sv_context, mutable_cf_options_);
+}
+
+void ColumnFamilyData::InstallSuperVersion(
+ SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options) {
+ SuperVersion* new_superversion = sv_context->new_superversion.release();
+ new_superversion->mutable_cf_options = mutable_cf_options;
+ new_superversion->Init(this, mem_, imm_.current(), current_);
+ SuperVersion* old_superversion = super_version_;
+ super_version_ = new_superversion;
+ ++super_version_number_;
+ super_version_->version_number = super_version_number_;
+ if (old_superversion == nullptr || old_superversion->current != current() ||
+ old_superversion->mem != mem_ ||
+ old_superversion->imm != imm_.current()) {
+ // Should not recalculate the slowdown condition if nothing has changed,
+ // since currently RecalculateWriteStallConditions() treats a recalculation
+ // as a signal that further slowing down is needed.
+ super_version_->write_stall_condition =
+ RecalculateWriteStallConditions(mutable_cf_options);
+ } else {
+ super_version_->write_stall_condition =
+ old_superversion->write_stall_condition;
+ }
+ if (old_superversion != nullptr) {
+ // Reset SuperVersions cached in thread local storage.
+ // This should be done before old_superversion->Unref(). That's to ensure
+ // that local_sv_ never holds the last reference to SuperVersion, since
+ // it has no means to safely do SuperVersion cleanup.
+ ResetThreadLocalSuperVersions();
+
+ if (old_superversion->mutable_cf_options.write_buffer_size !=
+ mutable_cf_options.write_buffer_size) {
+ mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
+ }
+ if (old_superversion->write_stall_condition !=
+ new_superversion->write_stall_condition) {
+ sv_context->PushWriteStallNotification(
+ old_superversion->write_stall_condition,
+ new_superversion->write_stall_condition, GetName(), ioptions());
+ }
+ if (old_superversion->Unref()) {
+ old_superversion->Cleanup();
+ sv_context->superversions_to_free.push_back(old_superversion);
+ }
+ }
+}
+
+void ColumnFamilyData::ResetThreadLocalSuperVersions() {
+ autovector<void*> sv_ptrs;
+ local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
+ for (auto ptr : sv_ptrs) {
+ assert(ptr);
+ if (ptr == SuperVersion::kSVInUse) {
+ continue;
+ }
+ auto sv = static_cast<SuperVersion*>(ptr);
+ bool was_last_ref __attribute__((__unused__));
+ was_last_ref = sv->Unref();
+ // sv couldn't have been the last reference because
+ // ResetThreadLocalSuperVersions() is called before
+ // unref'ing super_version_.
+ assert(!was_last_ref);
+ }
+}
+
+Status ColumnFamilyData::ValidateOptions(
+ const DBOptions& db_options, const ColumnFamilyOptions& cf_options) {
+ Status s;
+ s = CheckCompressionSupported(cf_options);
+ if (s.ok() && db_options.allow_concurrent_memtable_write) {
+ s = CheckConcurrentWritesSupported(cf_options);
+ }
+ if (s.ok() && db_options.unordered_write &&
+ cf_options.max_successive_merges != 0) {
+ s = Status::InvalidArgument(
+ "max_successive_merges > 0 is incompatible with unordered_write");
+ }
+ if (s.ok()) {
+ s = CheckCFPathsSupported(db_options, cf_options);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) {
+ if (!cf_options.table_factory->IsInstanceOf(
+ TableFactory::kBlockBasedTableName())) {
+ return Status::NotSupported(
+ "TTL is only supported in Block-Based Table format. ");
+ }
+ }
+
+ if (cf_options.periodic_compaction_seconds > 0 &&
+ cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) {
+ if (!cf_options.table_factory->IsInstanceOf(
+ TableFactory::kBlockBasedTableName())) {
+ return Status::NotSupported(
+ "Periodic Compaction is only supported in "
+ "Block-Based Table format. ");
+ }
+ }
+
+ if (cf_options.enable_blob_garbage_collection) {
+ if (cf_options.blob_garbage_collection_age_cutoff < 0.0 ||
+ cf_options.blob_garbage_collection_age_cutoff > 1.0) {
+ return Status::InvalidArgument(
+ "The age cutoff for blob garbage collection should be in the range "
+ "[0.0, 1.0].");
+ }
+ if (cf_options.blob_garbage_collection_force_threshold < 0.0 ||
+ cf_options.blob_garbage_collection_force_threshold > 1.0) {
+ return Status::InvalidArgument(
+ "The garbage ratio threshold for forcing blob garbage collection "
+ "should be in the range [0.0, 1.0].");
+ }
+ }
+
+ if (cf_options.compaction_style == kCompactionStyleFIFO &&
+ db_options.max_open_files != -1 && cf_options.ttl > 0) {
+ return Status::NotSupported(
+ "FIFO compaction only supported with max_open_files = -1.");
+ }
+
+ std::vector<uint32_t> supported{0, 1, 2, 4, 8};
+ if (std::find(supported.begin(), supported.end(),
+ cf_options.memtable_protection_bytes_per_key) ==
+ supported.end()) {
+ return Status::NotSupported(
+ "Memtable per key-value checksum protection only supports 0, 1, 2, 4 "
+ "or 8 bytes per key.");
+ }
+ return s;
+}
+
+#ifndef ROCKSDB_LITE
+Status ColumnFamilyData::SetOptions(
+ const DBOptions& db_opts,
+ const std::unordered_map<std::string, std::string>& options_map) {
+ ColumnFamilyOptions cf_opts =
+ BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
+ ConfigOptions config_opts;
+ config_opts.mutable_options_only = true;
+ Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map,
+ &cf_opts);
+ if (s.ok()) {
+ s = ValidateOptions(db_opts, cf_opts);
+ }
+ if (s.ok()) {
+ mutable_cf_options_ = MutableCFOptions(cf_opts);
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ }
+ return s;
+}
+#endif // ROCKSDB_LITE
+
+// REQUIRES: DB mutex held
+Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
+ if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
+ return Env::WLTH_NOT_SET;
+ }
+ if (level == 0) {
+ return Env::WLTH_MEDIUM;
+ }
+ int base_level = current_->storage_info()->base_level();
+
+ // L1: medium, L2: long, ...
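+ // For example, with base_level == 1: L0 and L1 get WLTH_MEDIUM, L2 gets
+ // WLTH_LONG, and L3 and beyond get WLTH_EXTREME.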
+ if (level - base_level >= 2) {
+ return Env::WLTH_EXTREME;
+ } else if (level < base_level) {
+ // There is no restriction that prevents the level passed in from being
+ // smaller than base_level.
+ return Env::WLTH_MEDIUM;
+ }
+ return static_cast<Env::WriteLifeTimeHint>(
+ level - base_level + static_cast<int>(Env::WLTH_MEDIUM));
+}
+
+Status ColumnFamilyData::AddDirectories(
+ std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs) {
+ Status s;
+ assert(created_dirs != nullptr);
+ assert(data_dirs_.empty());
+ for (auto& p : ioptions_.cf_paths) {
+ auto existing_dir = created_dirs->find(p.path);
+
+ if (existing_dir == created_dirs->end()) {
+ std::unique_ptr<FSDirectory> path_directory;
+ s = DBImpl::CreateAndNewDirectory(ioptions_.fs.get(), p.path,
+ &path_directory);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(path_directory != nullptr);
+ data_dirs_.emplace_back(path_directory.release());
+ (*created_dirs)[p.path] = data_dirs_.back();
+ } else {
+ data_dirs_.emplace_back(existing_dir->second);
+ }
+ }
+ assert(data_dirs_.size() == ioptions_.cf_paths.size());
+ return s;
+}
+
+FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const {
+ if (data_dirs_.empty()) {
+ return nullptr;
+ }
+
+ assert(path_id < data_dirs_.size());
+ return data_dirs_[path_id].get();
+}
+
+ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
+ const ImmutableDBOptions* db_options,
+ const FileOptions& file_options,
+ Cache* table_cache,
+ WriteBufferManager* _write_buffer_manager,
+ WriteController* _write_controller,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_id,
+ const std::string& db_session_id)
+ : max_column_family_(0),
+ file_options_(file_options),
+ dummy_cfd_(new ColumnFamilyData(
+ ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr,
+ nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr,
+ block_cache_tracer, io_tracer, db_id, db_session_id)),
+ default_cfd_cache_(nullptr),
+ db_name_(dbname),
+ db_options_(db_options),
+ table_cache_(table_cache),
+ write_buffer_manager_(_write_buffer_manager),
+ write_controller_(_write_controller),
+ block_cache_tracer_(block_cache_tracer),
+ io_tracer_(io_tracer),
+ db_id_(db_id),
+ db_session_id_(db_session_id) {
+ // initialize linked list
+ dummy_cfd_->prev_ = dummy_cfd_;
+ dummy_cfd_->next_ = dummy_cfd_;
+}
+
+ColumnFamilySet::~ColumnFamilySet() {
+ while (column_family_data_.size() > 0) {
+ // cfd destructor will delete itself from column_family_data_
+ auto cfd = column_family_data_.begin()->second;
+ bool last_ref __attribute__((__unused__));
+ last_ref = cfd->UnrefAndTryDelete();
+ assert(last_ref);
+ }
+ bool dummy_last_ref __attribute__((__unused__));
+ dummy_last_ref = dummy_cfd_->UnrefAndTryDelete();
+ assert(dummy_last_ref);
+}
+
+ColumnFamilyData* ColumnFamilySet::GetDefault() const {
+ assert(default_cfd_cache_ != nullptr);
+ return default_cfd_cache_;
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
+ auto cfd_iter = column_family_data_.find(id);
+ if (cfd_iter != column_family_data_.end()) {
+ return cfd_iter->second;
+ } else {
+ return nullptr;
+ }
+}
+
+ColumnFamilyData* ColumnFamilySet::GetColumnFamily(
+ const std::string& name) const {
+ auto cfd_iter = column_families_.find(name);
+ if (cfd_iter != column_families_.end()) {
+ auto cfd = GetColumnFamily(cfd_iter->second);
+ assert(cfd != nullptr);
+ return cfd;
+ } else {
+ return nullptr;
+ }
+}
+
+uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
+ return ++max_column_family_;
+}
+
+uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
+
+void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
+ max_column_family_ = std::max(new_max_column_family, max_column_family_);
+}
+
+size_t ColumnFamilySet::NumberOfColumnFamilies() const {
+ return column_families_.size();
+}
+
+// under a DB mutex AND write thread
+ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
+ const std::string& name, uint32_t id, Version* dummy_versions,
+ const ColumnFamilyOptions& options) {
+ assert(column_families_.find(name) == column_families_.end());
+ ColumnFamilyData* new_cfd = new ColumnFamilyData(
+ id, name, dummy_versions, table_cache_, write_buffer_manager_, options,
+ *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_,
+ db_id_, db_session_id_);
+ column_families_.insert({name, id});
+ column_family_data_.insert({id, new_cfd});
+ max_column_family_ = std::max(max_column_family_, id);
+ // add to linked list
+ new_cfd->next_ = dummy_cfd_;
+ auto prev = dummy_cfd_->prev_;
+ new_cfd->prev_ = prev;
+ prev->next_ = new_cfd;
+ dummy_cfd_->prev_ = new_cfd;
+ if (id == 0) {
+ default_cfd_cache_ = new_cfd;
+ }
+ return new_cfd;
+}
+
+// under a DB mutex AND from a write thread
+void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
+ auto cfd_iter = column_family_data_.find(cfd->GetID());
+ assert(cfd_iter != column_family_data_.end());
+ column_family_data_.erase(cfd_iter);
+ column_families_.erase(cfd->GetName());
+}
+
+// under a DB mutex OR from a write thread
+bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
+ if (column_family_id == 0) {
+ // optimization for common case
+ current_ = column_family_set_->GetDefault();
+ } else {
+ current_ = column_family_set_->GetColumnFamily(column_family_id);
+ }
+ handle_.SetCFD(current_);
+ return current_ != nullptr;
+}
+
+uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
+ assert(current_ != nullptr);
+ return current_->GetLogNumber();
+}
+
+MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
+ assert(current_ != nullptr);
+ return current_->mem();
+}
+
+ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
+ assert(current_ != nullptr);
+ return &handle_;
+}
+
+uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
+ uint32_t column_family_id = 0;
+ if (column_family != nullptr) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ column_family_id = cfh->GetID();
+ }
+ return column_family_id;
+}
+
+const Comparator* GetColumnFamilyUserComparator(
+ ColumnFamilyHandle* column_family) {
+ if (column_family != nullptr) {
+ return column_family->GetComparator();
+ }
+ return nullptr;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/column_family.h b/src/rocksdb/db/column_family.h
new file mode 100644
index 000000000..3e6d01d22
--- /dev/null
+++ b/src/rocksdb/db/column_family.h
@@ -0,0 +1,845 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "cache/cache_reservation_manager.h"
+#include "db/memtable_list.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/write_batch_internal.h"
+#include "db/write_controller.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/hash_containers.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Version;
+class VersionSet;
+class VersionStorageInfo;
+class MemTable;
+class MemTableListVersion;
+class CompactionPicker;
+class Compaction;
+class InternalKey;
+class InternalStats;
+class ColumnFamilyData;
+class DBImpl;
+class LogBuffer;
+class InstrumentedMutex;
+class InstrumentedMutexLock;
+struct SuperVersionContext;
+class BlobFileCache;
+class BlobSource;
+
+extern const double kIncSlowdownRatio;
+// This file contains a list of data structures for managing column family
+// level metadata.
+//
+// The basic relationships among classes declared here are illustrated as
+// follows:
+//
+// +----------------------+ +----------------------+ +--------+
+// +---+ ColumnFamilyHandle 1 | +--+ ColumnFamilyHandle 2 | | DBImpl |
+// | +----------------------+ | +----------------------+ +----+---+
+// | +--------------------------+ |
+// | | +-----------------------------+
+// | | |
+// | | +-----------------------------v-------------------------------+
+// | | | |
+// | | | ColumnFamilySet |
+// | | | |
+// | | +-------------+--------------------------+----------------+---+
+// | | | | |
+// | +-------------------------------------+ | |
+// | | | | v
+// | +-------------v-------------+ +-----v----v---------+
+// | | | | |
+// | | ColumnFamilyData 1 | | ColumnFamilyData 2 | ......
+// | | | | |
+// +---> | | |
+// | +---------+ | |
+// | | MemTable| | |
+// | | List | | |
+// +--------+---+--+-+----+----+ +--------------------++
+// | | | |
+// | | | |
+// | | | +-----------------------+
+// | | +-----------+ |
+// v +--------+ | |
+// +--------+--------+ | | |
+// | | | | +----------v----------+
+// +---> |SuperVersion 1.a +-----------------> |
+// | +------+ | | MemTableListVersion |
+// +---+-------------+ | | | | |
+// | | | | +----+------------+---+
+// | current | | | | |
+// | +-------------+ | |mem | |
+// | | | | | |
+// +-v---v-------+ +---v--v---+ +-----v----+ +----v-----+
+// | | | | | | | |
+// | Version 1.a | | memtable | | memtable | | memtable |
+// | | | 1.a | | 1.b | | 1.c |
+// +-------------+ | | | | | |
+// +----------+ +----------+ +----------+
+//
+// DBImpl keeps a ColumnFamilySet, which references all column families by
+// pointing to the respective ColumnFamilyData object of each column family.
+// This is how DBImpl can list and operate on all the column families.
+// ColumnFamilyHandle also points to ColumnFamilyData directly, so that
+// when a user executes a query, it can directly find the memtables, Version,
+// and SuperVersion of the column family, without going through
+// ColumnFamilySet.
+//
+// ColumnFamilySet points to the latest view of the LSM-tree (list of memtables
+// and SST files) indirectly, while ongoing operations may hold references
+// to a current or an out-of-date SuperVersion, which in turn points to a
+// point-in-time view of the LSM-tree. This guarantees that the memtables and
+// SST files being operated on will not go away until the SuperVersion's
+// reference count drops to zero and it is destroyed.
+//
+// The following graph illustrates possible referencing relationships:
+//
+// Column +--------------+ current +-----------+
+// Family +---->+ +------------------->+ |
+// Data | SuperVersion +----------+ | Version A |
+// | 3 | imm | | |
+// Iter2 +----->+ | +-------v------+ +-----------+
+// +-----+--------+ | MemtableList +----------------> Empty
+// | | Version r | +-----------+
+// | +--------------+ | |
+// +------------------+ current| Version B |
+// +--------------+ | +----->+ |
+// | | | | +-----+-----+
+// Compaction +>+ SuperVersion +-------------+ ^
+// Job | 2 +------+ | |current
+// | +----+ | | mem | +------------+
+// +--------------+ | | +---------------------> |
+// | +------------------------> MemTable a |
+// | mem | | |
+// +--------------+ | | +------------+
+// | +--------------------------+
+// Iter1 +-----> SuperVersion | | +------------+
+// | 1 +------------------------------>+ |
+// | +-+ | mem | MemTable b |
+// +--------------+ | | | |
+// | | +--------------+ +-----^------+
+// | |imm | MemtableList | |
+// | +--->+ Version s +------------+
+// | +--------------+
+// | +--------------+
+// | | MemtableList |
+// +------>+ Version t +--------> Empty
+// imm +--------------+
+//
+// In this example, even though the current LSM-tree consists of Version A and
+// memtable a, which are also referenced by the latest SuperVersion
+// (SuperVersion 3), two older SuperVersions, SuperVersion2 and SuperVersion1,
+// still exist, and are referenced by a compaction job and an old iterator
+// Iter1, respectively. SuperVersion2 contains Version B, memtable a, and
+// memtable b; SuperVersion1 contains Version B and memtable b (mutable). As a
+// result, Version B and memtable b are prevented from being destroyed or
+// deleted.
+
+// ColumnFamilyHandleImpl is the class that clients use to access different
+// column families. It has a non-trivial destructor, which gets called when the
+// client is done using the column family.
+class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
+ public:
+ // create while holding the mutex
+ ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db,
+ InstrumentedMutex* mutex);
+ // destroy without mutex
+ virtual ~ColumnFamilyHandleImpl();
+ virtual ColumnFamilyData* cfd() const { return cfd_; }
+
+ virtual uint32_t GetID() const override;
+ virtual const std::string& GetName() const override;
+ virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
+ virtual const Comparator* GetComparator() const override;
+
+ private:
+ ColumnFamilyData* cfd_;
+ DBImpl* db_;
+ InstrumentedMutex* mutex_;
+};
+
+// Does not ref-count ColumnFamilyData.
+// We use this dummy ColumnFamilyHandleImpl because sometimes MemTableInserter
+// calls DBImpl methods. When this happens, MemTableInserter needs access to a
+// ColumnFamilyHandle (same as the client would need). In that case, we feed
+// MemTableInserter a dummy ColumnFamilyHandle and enable it to call DBImpl
+// methods.
+class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
+ public:
+ ColumnFamilyHandleInternal()
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+ internal_cfd_(nullptr) {}
+
+ void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
+ virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+
+ private:
+ ColumnFamilyData* internal_cfd_;
+};
+
+// holds references to memtable, all immutable memtables and version
+struct SuperVersion {
+ // Accessing members of this class is not thread-safe and requires external
+ // synchronization (i.e., DB mutex held or on the write thread).
+ ColumnFamilyData* cfd;
+ MemTable* mem;
+ MemTableListVersion* imm;
+ Version* current;
+ MutableCFOptions mutable_cf_options;
+ // Version number of the current SuperVersion
+ uint64_t version_number;
+ WriteStallCondition write_stall_condition;
+
+ // should be called outside the mutex
+ SuperVersion() = default;
+ ~SuperVersion();
+ SuperVersion* Ref();
+ // If Unref() returns true, Cleanup() should be called with mutex held
+ // before deleting this SuperVersion.
+ bool Unref();
+
+ // call these two methods with db mutex held
+ // Cleanup unrefs mem, imm and current. Also, it stores all memtables
+ // that need to be deleted in the to_delete vector. Unrefing those
+ // objects needs to be done under the mutex.
+ void Cleanup();
+ void Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
+ MemTableListVersion* new_imm, Version* new_current);
+
+ // The value of dummy is not actually used. kSVInUse takes its address as a
+ // marker in the thread local storage to indicate that the SuperVersion is in
+ // use by a thread. This way, the value of kSVInUse is guaranteed not to
+ // conflict with any SuperVersion object address and is portable across
+ // platforms.
+ static int dummy;
+ static void* const kSVInUse;
+ static void* const kSVObsolete;
+
+ private:
+ std::atomic<uint32_t> refs;
+ // We need to_delete because during Cleanup(), imm->Unref() returns
+ // all memtables that we need to free through this vector. We then
+ // delete all those memtables outside of the mutex, during destruction.
+ autovector<MemTable*> to_delete;
+};
+
+extern Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options);
+
+extern Status CheckConcurrentWritesSupported(
+ const ColumnFamilyOptions& cf_options);
+
+extern Status CheckCFPathsSupported(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options);
+
+extern ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
+ const ColumnFamilyOptions& src);
+// Wrap the user-defined table properties collector factories from `cf_options`
+// into internal ones in int_tbl_prop_collector_factories. Add a system internal
+// one too.
+extern void GetIntTblPropCollectorFactory(
+ const ImmutableCFOptions& ioptions,
+ IntTblPropCollectorFactories* int_tbl_prop_collector_factories);
+
+class ColumnFamilySet;
+
+// This class keeps all the data that a column family needs.
+// Most methods require DB mutex held, unless otherwise noted
+class ColumnFamilyData {
+ public:
+ ~ColumnFamilyData();
+
+ // thread-safe
+ uint32_t GetID() const { return id_; }
+ // thread-safe
+ const std::string& GetName() const { return name_; }
+
+ // Ref() can only be called from a context where the caller can guarantee
+ // that ColumnFamilyData is alive (while holding a non-zero ref already,
+ // holding a DB mutex, or as the leader in a write batch group).
+ void Ref() { refs_.fetch_add(1); }
+
+ // UnrefAndTryDelete() decreases the reference count and frees the object if
+ // needed; it returns true if the object was freed, false otherwise.
+ // UnrefAndTryDelete() can only be called while holding a DB mutex, or during
+ // single-threaded recovery.
+ bool UnrefAndTryDelete();
+
+ // SetDropped() can only be called under the following conditions:
+ // 1) while holding a DB mutex,
+ // 2) from a single-threaded write thread, AND
+ // 3) from single-threaded VersionSet::LogAndApply()
+ // After dropping a column family, no other operation on that column family
+ // will be executed. All the files and memory will, however, be kept around
+ // until the client drops the column family handle. That way, the client can
+ // still access data from the dropped column family.
+ // A column family can be dropped and still be alive. In that state:
+ // *) Compaction and flush are not executed on the dropped column family.
+ // *) The client can continue reading from the column family. Writes will fail
+ // unless WriteOptions::ignore_missing_column_families is true.
+ // When the dropped column family is unreferenced, we:
+ // *) remove the column family from the linked list maintained by
+ // ColumnFamilySet,
+ // *) delete all memory associated with that column family, and
+ // *) delete all the files associated with that column family.
+ void SetDropped();
+ bool IsDropped() const { return dropped_.load(std::memory_order_relaxed); }
+
+ // thread-safe
+ int NumberLevels() const { return ioptions_.num_levels; }
+
+ void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ void SetFlushReason(FlushReason flush_reason) {
+ flush_reason_ = flush_reason;
+ }
+ FlushReason GetFlushReason() const { return flush_reason_; }
+ // thread-safe
+ const FileOptions* soptions() const;
+ const ImmutableOptions* ioptions() const { return &ioptions_; }
+ // REQUIRES: DB mutex held
+ // This returns the MutableCFOptions used by the current SuperVersion.
+ // You should use this API to reference MutableCFOptions most of the time.
+ const MutableCFOptions* GetCurrentMutableCFOptions() const {
+ return &(super_version_->mutable_cf_options);
+ }
+ // REQUIRES: DB mutex held
+ // This returns the latest MutableCFOptions, which may not be in effect yet.
+ const MutableCFOptions* GetLatestMutableCFOptions() const {
+ return &mutable_cf_options_;
+ }
+
+ // REQUIRES: DB mutex held
+ // Build ColumnFamilyOptions with the immutable options and the latest
+ // mutable options.
+ ColumnFamilyOptions GetLatestCFOptions() const;
+
+ bool is_delete_range_supported() { return is_delete_range_supported_; }
+
+ // Validate CF options against DB options
+ static Status ValidateOptions(const DBOptions& db_options,
+ const ColumnFamilyOptions& cf_options);
+#ifndef ROCKSDB_LITE
+ // REQUIRES: DB mutex held
+ Status SetOptions(
+ const DBOptions& db_options,
+ const std::unordered_map<std::string, std::string>& options_map);
+#endif // ROCKSDB_LITE
+
+ InternalStats* internal_stats() { return internal_stats_.get(); }
+
+ MemTableList* imm() { return &imm_; }
+ MemTable* mem() { return mem_; }
+
+ bool IsEmpty() {
+ return mem()->GetFirstSequenceNumber() == 0 && imm()->NumNotFlushed() == 0;
+ }
+
+ Version* current() { return current_; }
+ Version* dummy_versions() { return dummy_versions_; }
+ void SetCurrent(Version* _current);
+ uint64_t GetNumLiveVersions() const; // REQUIRE: DB mutex held
+ uint64_t GetTotalSstFilesSize() const; // REQUIRE: DB mutex held
+ uint64_t GetLiveSstFilesSize() const; // REQUIRE: DB mutex held
+ uint64_t GetTotalBlobFileSize() const; // REQUIRE: DB mutex held
+ void SetMemtable(MemTable* new_mem) {
+ uint64_t memtable_id = last_memtable_id_.fetch_add(1) + 1;
+ new_mem->SetID(memtable_id);
+ mem_ = new_mem;
+ }
+
+ // calculate the oldest log needed for the durability of this column family
+ uint64_t OldestLogToKeep();
+
+ // See Memtable constructor for explanation of earliest_seq param.
+ MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
+ SequenceNumber earliest_seq);
+ void CreateNewMemtable(const MutableCFOptions& mutable_cf_options,
+ SequenceNumber earliest_seq);
+
+ TableCache* table_cache() const { return table_cache_.get(); }
+ BlobSource* blob_source() const { return blob_source_.get(); }
+
+ // See documentation in compaction_picker.h
+ // REQUIRES: DB mutex held
+ bool NeedsCompaction() const;
+ // REQUIRES: DB mutex held
+ Compaction* PickCompaction(const MutableCFOptions& mutable_options,
+ const MutableDBOptions& mutable_db_options,
+ LogBuffer* log_buffer);
+
+ // Check if the passed range overlap with any running compactions.
+ // REQUIRES: DB mutex held
+ bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level) const;
+
+ // Check if the passed ranges overlap with any unflushed memtables
+ // (immutable or mutable).
+ //
+ // @param super_version A referenced SuperVersion that will be held for the
+ // duration of this function.
+ //
+ // Thread-safe
+ Status RangesOverlapWithMemtables(const autovector<Range>& ranges,
+ SuperVersion* super_version,
+ bool allow_data_in_errors, bool* overlap);
+
+ // A flag to tell a manual compaction is to compact all levels together
+ // instead of a specific level.
+ static const int kCompactAllLevels;
+ // A flag to tell a manual compaction's output is base level.
+ static const int kCompactToBaseLevel;
+ // REQUIRES: DB mutex held
+ Compaction* CompactRange(const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore,
+ const std::string& trim_ts);
+
+ CompactionPicker* compaction_picker() { return compaction_picker_.get(); }
+ // thread-safe
+ const Comparator* user_comparator() const {
+ return internal_comparator_.user_comparator();
+ }
+ // thread-safe
+ const InternalKeyComparator& internal_comparator() const {
+ return internal_comparator_;
+ }
+
+ const IntTblPropCollectorFactories* int_tbl_prop_collector_factories() const {
+ return &int_tbl_prop_collector_factories_;
+ }
+
+ SuperVersion* GetSuperVersion() { return super_version_; }
+ // thread-safe
+ // Return an already-referenced SuperVersion to be used safely.
+ SuperVersion* GetReferencedSuperVersion(DBImpl* db);
+ // thread-safe
+ // Get the SuperVersion stored in thread local storage. If it does not exist,
+ // get a reference from the current SuperVersion.
+ SuperVersion* GetThreadLocalSuperVersion(DBImpl* db);
+ // Try to return SuperVersion back to thread local storage. Return true on
+ // success and false on failure. It fails when the thread local storage
+ // contains anything other than SuperVersion::kSVInUse flag.
+ bool ReturnThreadLocalSuperVersion(SuperVersion* sv);
+ // thread-safe
+ uint64_t GetSuperVersionNumber() const {
+ return super_version_number_.load();
+ }
+ // Installs the new SuperVersion carried by sv_context. If the previous
+ // SuperVersion's reference count drops to zero, it is queued in sv_context
+ // for deletion. sv_context carries a pre-allocated SuperVersion so that
+ // clients can allocate it outside of the mutex.
+ // IMPORTANT: Only call this from DBImpl::InstallSuperVersion()
+ void InstallSuperVersion(SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options);
+ void InstallSuperVersion(SuperVersionContext* sv_context,
+ InstrumentedMutex* db_mutex);
+
+ void ResetThreadLocalSuperVersions();
+
+ // Protected by DB mutex
+ void set_queued_for_flush(bool value) { queued_for_flush_ = value; }
+ void set_queued_for_compaction(bool value) { queued_for_compaction_ = value; }
+ bool queued_for_flush() { return queued_for_flush_; }
+ bool queued_for_compaction() { return queued_for_compaction_; }
+
+ enum class WriteStallCause {
+ kNone,
+ kMemtableLimit,
+ kL0FileCountLimit,
+ kPendingCompactionBytes,
+ };
+ static std::pair<WriteStallCondition, WriteStallCause>
+ GetWriteStallConditionAndCause(
+ int num_unflushed_memtables, int num_l0_files,
+ uint64_t num_compaction_needed_bytes,
+ const MutableCFOptions& mutable_cf_options,
+ const ImmutableCFOptions& immutable_cf_options);
+
+ // Recalculate some stall conditions, which change only during compaction,
+ // when adding a new memtable, and/or on recalculation of the compaction
+ // score.
+ WriteStallCondition RecalculateWriteStallConditions(
+ const MutableCFOptions& mutable_cf_options);
+
+ void set_initialized() { initialized_.store(true); }
+
+ bool initialized() const { return initialized_.load(); }
+
+ const ColumnFamilyOptions& initial_cf_options() {
+ return initial_cf_options_;
+ }
+
+ Env::WriteLifeTimeHint CalculateSSTWriteHint(int level);
+
+ // created_dirs remembers the directories already created, so that we don't
+ // need to repeat the same directory creation operation.
+ Status AddDirectories(
+ std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs);
+
+ FSDirectory* GetDataDir(size_t path_id) const;
+
+ // full_history_ts_low_ can only increase.
+ void SetFullHistoryTsLow(std::string ts_low) {
+ assert(!ts_low.empty());
+ const Comparator* ucmp = user_comparator();
+ assert(ucmp);
+ if (full_history_ts_low_.empty() ||
+ ucmp->CompareTimestamp(ts_low, full_history_ts_low_) > 0) {
+ full_history_ts_low_ = std::move(ts_low);
+ }
+ }
+
+ const std::string& GetFullHistoryTsLow() const {
+ return full_history_ts_low_;
+ }
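+ //
+ // Example for SetFullHistoryTsLow()/GetFullHistoryTsLow() above (a
+ // hypothetical sketch; assumes the comparator uses an 8-byte timestamp
+ // encoded with PutFixed64):
+ //
+ //   std::string ts_low;
+ //   PutFixed64(&ts_low, 100);  // encode timestamp 100
+ //   cfd->SetFullHistoryTsLow(ts_low);
+ //   assert(cfd->GetFullHistoryTsLow() == ts_low);
+ //   // Calling again with a smaller timestamp is a no-op, since
+ //   // full_history_ts_low_ can only increase.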
+
+ ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
+ WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
+ std::shared_ptr<CacheReservationManager>
+ GetFileMetadataCacheReservationManager() {
+ return file_metadata_cache_res_mgr_;
+ }
+
+ SequenceNumber GetFirstMemtableSequenceNumber() const;
+
+ static const uint32_t kDummyColumnFamilyDataId;
+
+ // Keep track of whether the mempurge feature was ever used.
+ void SetMempurgeUsed() { mempurge_used_ = true; }
+ bool GetMempurgeUsed() { return mempurge_used_; }
+
+ private:
+ friend class ColumnFamilySet;
+ ColumnFamilyData(uint32_t id, const std::string& name,
+ Version* dummy_versions, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ const ColumnFamilyOptions& options,
+ const ImmutableDBOptions& db_options,
+ const FileOptions* file_options,
+ ColumnFamilySet* column_family_set,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_id, const std::string& db_session_id);
+
+ std::vector<std::string> GetDbPaths() const;
+
+ uint32_t id_;
+ const std::string name_;
+ Version* dummy_versions_; // Head of circular doubly-linked list of versions.
+ Version* current_; // == dummy_versions->prev_
+
+ std::atomic<int> refs_; // outstanding references to ColumnFamilyData
+ std::atomic<bool> initialized_;
+ std::atomic<bool> dropped_; // true if client dropped it
+
+ const InternalKeyComparator internal_comparator_;
+ IntTblPropCollectorFactories int_tbl_prop_collector_factories_;
+
+ const ColumnFamilyOptions initial_cf_options_;
+ const ImmutableOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+
+ const bool is_delete_range_supported_;
+
+ std::unique_ptr<TableCache> table_cache_;
+ std::unique_ptr<BlobFileCache> blob_file_cache_;
+ std::unique_ptr<BlobSource> blob_source_;
+
+ std::unique_ptr<InternalStats> internal_stats_;
+
+ WriteBufferManager* write_buffer_manager_;
+
+ MemTable* mem_;
+ MemTableList imm_;
+ SuperVersion* super_version_;
+
+ // An ordinal representing the current SuperVersion. Updated by
+ // InstallSuperVersion(), i.e. incremented every time super_version_
+ // changes.
+ std::atomic<uint64_t> super_version_number_;
+
+ // Thread's local copy of SuperVersion pointer
+ // This needs to be destructed before mutex_
+ std::unique_ptr<ThreadLocalPtr> local_sv_;
+
+ // pointers for a circular linked list. we use it to support iterations over
+ // all column families that are alive (note: dropped column families can also
+ // be alive as long as client holds a reference)
+ ColumnFamilyData* next_;
+ ColumnFamilyData* prev_;
+
+ // This is the earliest log file number that contains data from this
+ // Column Family. All earlier log files must be ignored and not
+ // recovered from
+ uint64_t log_number_;
+
+ std::atomic<FlushReason> flush_reason_;
+
+ // An object that keeps all the compaction stats
+ // and picks the next compaction
+ std::unique_ptr<CompactionPicker> compaction_picker_;
+
+ ColumnFamilySet* column_family_set_;
+
+ std::unique_ptr<WriteControllerToken> write_controller_token_;
+
+ // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_
+ bool queued_for_flush_;
+
+ // If true --> this ColumnFamily is currently present in
+ // DBImpl::compaction_queue_
+ bool queued_for_compaction_;
+
+ uint64_t prev_compaction_needed_bytes_;
+
+ // if the database was opened with 2pc enabled
+ bool allow_2pc_;
+
+ // Memtable id to track flush.
+ std::atomic<uint64_t> last_memtable_id_;
+
+ // Directories corresponding to cf_paths.
+ std::vector<std::shared_ptr<FSDirectory>> data_dirs_;
+
+ bool db_paths_registered_;
+
+ std::string full_history_ts_low_;
+
+ // For charging memory usage of file metadata created for newly added files to
+ // a Version associated with this CFD
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
+ bool mempurge_used_;
+};
+
+// ColumnFamilySet has interesting thread-safety requirements
+// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB
+// mutex AND executed in the write thread.
+// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND
+// single-threaded write thread. It is also called during Recovery and in
+// DumpManifest().
+// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be
+// held and it needs to be executed from the write thread. SetDropped() also
+// guarantees that it will be called only from single-threaded LogAndApply(),
+// but this condition is not that important.
+// * Iteration -- hold DB mutex. If you want to release the DB mutex in the
+// body of the iteration, wrap in a RefedColumnFamilySet.
+// * GetDefault() -- thread safe
+// * GetColumnFamily() -- either inside of DB mutex or from a write thread
+// * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(),
+// NumberOfColumnFamilies -- inside of DB mutex
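+//
+// Iteration example (a hypothetical sketch; `column_family_set` is a
+// ColumnFamilySet* and the DB mutex is held for the whole loop):
+//
+//   for (ColumnFamilyData* cfd : *column_family_set) {
+//     if (cfd->IsDropped()) {
+//       continue;
+//     }
+//     // inspect cfd->GetName(), cfd->current(), ...
+//   }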
+class ColumnFamilySet {
+ public:
+ // ColumnFamilySet supports iteration
+ class iterator {
+ public:
+ explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {}
+ // NOTE: minimum operators for for-loop iteration
+ iterator& operator++() {
+ current_ = current_->next_;
+ return *this;
+ }
+ bool operator!=(const iterator& other) const {
+ return this->current_ != other.current_;
+ }
+ ColumnFamilyData* operator*() { return current_; }
+
+ private:
+ ColumnFamilyData* current_;
+ };
+
+ ColumnFamilySet(const std::string& dbname,
+ const ImmutableDBOptions* db_options,
+ const FileOptions& file_options, Cache* table_cache,
+ WriteBufferManager* _write_buffer_manager,
+ WriteController* _write_controller,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_id, const std::string& db_session_id);
+ ~ColumnFamilySet();
+
+ ColumnFamilyData* GetDefault() const;
+ // GetColumnFamily() calls return nullptr if column family is not found
+ ColumnFamilyData* GetColumnFamily(uint32_t id) const;
+ ColumnFamilyData* GetColumnFamily(const std::string& name) const;
+ // This call returns the next available column family ID. It guarantees that
+ // no column family with an ID greater than or equal to the returned value
+ // exists in the currently running instance or at any point in this RocksDB
+ // instance's history.
+ uint32_t GetNextColumnFamilyID();
+ uint32_t GetMaxColumnFamily();
+ void UpdateMaxColumnFamily(uint32_t new_max_column_family);
+ size_t NumberOfColumnFamilies() const;
+
+ ColumnFamilyData* CreateColumnFamily(const std::string& name, uint32_t id,
+ Version* dummy_version,
+ const ColumnFamilyOptions& options);
+
+ iterator begin() { return iterator(dummy_cfd_->next_); }
+ iterator end() { return iterator(dummy_cfd_); }
+
+ Cache* get_table_cache() { return table_cache_; }
+
+ WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; }
+
+ WriteController* write_controller() { return write_controller_; }
+
+ private:
+ friend class ColumnFamilyData;
+ // helper function that gets called from cfd destructor
+ // REQUIRES: DB mutex held
+ void RemoveColumnFamily(ColumnFamilyData* cfd);
+
+ // column_families_ and column_family_data_ need to be protected:
+ // * when mutating, both conditions have to be satisfied:
+ // 1. DB mutex locked
+ // 2. thread currently in single-threaded write thread
+ // * when reading, at least one condition needs to be satisfied:
+ // 1. DB mutex locked
+ // 2. accessed from a single-threaded write thread
+ UnorderedMap<std::string, uint32_t> column_families_;
+ UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_;
+
+ uint32_t max_column_family_;
+ const FileOptions file_options_;
+
+ ColumnFamilyData* dummy_cfd_;
+ // We don't hold the refcount here, since the default column family always
+ // exists. We are also not responsible for cleaning up default_cfd_cache_.
+ // This is just a cache that makes the common case (accessing the default
+ // column family) faster.
+ ColumnFamilyData* default_cfd_cache_;
+
+ const std::string db_name_;
+ const ImmutableDBOptions* const db_options_;
+ Cache* table_cache_;
+ WriteBufferManager* write_buffer_manager_;
+ WriteController* write_controller_;
+ BlockCacheTracer* const block_cache_tracer_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ const std::string& db_id_;
+ std::string db_session_id_;
+};
+
+// A wrapper for ColumnFamilySet that supports releasing DB mutex during each
+// iteration over the iterator, because the cfd is Refed and Unrefed during
+// each iteration to prevent concurrent CF drop from destroying it (until
+// Unref).
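+//
+// Hypothetical usage sketch (`mutex` is the DB mutex, already held on entry,
+// and `column_family_set` is the underlying ColumnFamilySet*):
+//
+//   for (ColumnFamilyData* cfd : RefedColumnFamilySet(column_family_set)) {
+//     mutex->Unlock();
+//     // ... slow work on cfd without the DB mutex; the iterator's reference
+//     // keeps cfd alive even if the column family is dropped concurrently ...
+//     mutex->Lock();
+//   }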
+class RefedColumnFamilySet {
+ public:
+ explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {}
+
+ class iterator {
+ public:
+ explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) {
+ MaybeRef(*wrapped_);
+ }
+ ~iterator() { MaybeUnref(*wrapped_); }
+ inline void MaybeRef(ColumnFamilyData* cfd) {
+ if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
+ cfd->Ref();
+ }
+ }
+ inline void MaybeUnref(ColumnFamilyData* cfd) {
+ if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ // NOTE: minimum operators for for-loop iteration
+ inline iterator& operator++() {
+ ColumnFamilyData* old = *wrapped_;
+ ++wrapped_;
+ // Can only unref & potentially free cfd after accessing its next_
+ MaybeUnref(old);
+ MaybeRef(*wrapped_);
+ return *this;
+ }
+ inline bool operator!=(const iterator& other) const {
+ return this->wrapped_ != other.wrapped_;
+ }
+ inline ColumnFamilyData* operator*() { return *wrapped_; }
+
+ private:
+ ColumnFamilySet::iterator wrapped_;
+ };
+
+ iterator begin() { return iterator(wrapped_->begin()); }
+ iterator end() { return iterator(wrapped_->end()); }
+
+ private:
+ ColumnFamilySet* wrapped_;
+};
+
+// We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access
+// memtables of different column families (specified by ID in the write batch)
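+//
+// Hypothetical write-path sketch (`cf_mems` is the ColumnFamilyMemTablesImpl
+// owned by DBImpl; the caller is on the write thread or holds the DB mutex):
+//
+//   if (cf_mems->Seek(column_family_id)) {
+//     MemTable* mem = cf_mems->GetMemTable();
+//     // ... WriteBatch replay inserts the entry into `mem` ...
+//   } else {
+//     // Unknown column family; the write fails with InvalidArgument unless
+//     // WriteOptions::ignore_missing_column_families is set.
+//   }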
+class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
+ public:
+ explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set)
+ : column_family_set_(column_family_set), current_(nullptr) {}
+
+ // Constructs a ColumnFamilyMemTablesImpl equivalent to one constructed
+ // with the arguments used to construct *orig.
+ explicit ColumnFamilyMemTablesImpl(ColumnFamilyMemTablesImpl* orig)
+ : column_family_set_(orig->column_family_set_), current_(nullptr) {}
+
+ // Sets current_ to the ColumnFamilyData with the given column_family_id.
+ // Returns false if the column family doesn't exist.
+ // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+ // either under the DB mutex OR from the write thread
+ bool Seek(uint32_t column_family_id) override;
+
+ // Returns log number of the selected column family
+ // REQUIRES: under a DB mutex OR from a write thread
+ uint64_t GetLogNumber() const override;
+
+ // REQUIRES: Seek() called first
+ // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+ // either under the DB mutex OR from the write thread
+ virtual MemTable* GetMemTable() const override;
+
+ // Returns column family handle for the selected column family
+ // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+ // either under the DB mutex OR from the write thread
+ virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
+
+ // Cannot be called while another thread is calling Seek().
+ // REQUIRES: calls through DBImpl::column_family_memtables_ must be made
+ // either under the DB mutex OR from the write thread
+ virtual ColumnFamilyData* current() override { return current_; }
+
+ private:
+ ColumnFamilySet* column_family_set_;
+ ColumnFamilyData* current_;
+ ColumnFamilyHandleInternal handle_;
+};
+
+extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
+
+extern const Comparator* GetColumnFamilyUserComparator(
+ ColumnFamilyHandle* column_family);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/column_family_test.cc b/src/rocksdb/db/column_family_test.cc
new file mode 100644
index 000000000..d33cbe50a
--- /dev/null
+++ b/src/rocksdb/db/column_family_test.cc
@@ -0,0 +1,3453 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+
+// counts how many operations were performed
+class EnvCounter : public SpecialEnv {
+ public:
+ explicit EnvCounter(Env* base)
+ : SpecialEnv(base), num_new_writable_file_(0) {}
+ int GetNumberOfNewWritableFileCalls() { return num_new_writable_file_; }
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& soptions) override {
+ ++num_new_writable_file_;
+ return EnvWrapper::NewWritableFile(f, r, soptions);
+ }
+
+ private:
+ std::atomic<int> num_new_writable_file_;
+};
+
+class ColumnFamilyTestBase : public testing::Test {
+ public:
+ explicit ColumnFamilyTestBase(uint32_t format) : rnd_(139), format_(format) {
+ Env* base_env = Env::Default();
+ EXPECT_OK(
+ test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_));
+ EXPECT_NE(nullptr, base_env);
+ env_ = new EnvCounter(base_env);
+ env_->skip_fsync_ = true;
+ dbname_ = test::PerThreadDBPath("column_family_test");
+ db_options_.create_if_missing = true;
+ db_options_.fail_if_options_file_error = true;
+ db_options_.env = env_;
+ EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
+ }
+
+ ~ColumnFamilyTestBase() override {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (auto h : handles_) {
+ ColumnFamilyDescriptor cfdescriptor;
+ Status s = h->GetDescriptor(&cfdescriptor);
+#ifdef ROCKSDB_LITE
+ EXPECT_TRUE(s.IsNotSupported());
+#else
+ EXPECT_OK(s);
+#endif // ROCKSDB_LITE
+ column_families.push_back(cfdescriptor);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(column_families);
+ delete env_;
+ }
+
+ BlockBasedTableOptions GetBlockBasedTableOptions() {
+ BlockBasedTableOptions options;
+ options.format_version = format_;
+ return options;
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) {
+ if (k == 0) {
+ // Ugh. Random seed of 0 used to produce no entropy. This code
+ // preserves the implementation that was in place when all of the
+ // magic values in this file were picked.
+ *storage = std::string(kValueSize, ' ');
+ } else {
+ Random r(k);
+ *storage = r.RandomString(kValueSize);
+ }
+ return Slice(*storage);
+ }
+
+ void Build(int base, int n, int flush_every = 0) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+
+ for (int i = 0; i < n; i++) {
+ if (flush_every != 0 && i != 0 && i % flush_every == 0) {
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ }
+
+ int keyi = base + i;
+ Slice key(DBTestBase::Key(keyi));
+
+ batch.Clear();
+ batch.Put(handles_[0], key, Value(keyi, &value_space));
+ batch.Put(handles_[1], key, Value(keyi, &value_space));
+ batch.Put(handles_[2], key, Value(keyi, &value_space));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ }
+ }
+
+ void CheckMissed() {
+ uint64_t next_expected = 0;
+ uint64_t missed = 0;
+ int bad_keys = 0;
+ int bad_values = 0;
+ int correct = 0;
+ std::string value_space;
+ for (int cf = 0; cf < 3; cf++) {
+ next_expected = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions(false, true), handles_[cf]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ uint64_t key;
+ Slice in(iter->key());
+ in.remove_prefix(3);
+ if (!ConsumeDecimalNumber(&in, &key) || !in.empty() ||
+ key < next_expected) {
+ bad_keys++;
+ continue;
+ }
+ missed += (key - next_expected);
+ next_expected = key + 1;
+ if (iter->value() != Value(static_cast<int>(key), &value_space)) {
+ bad_values++;
+ } else {
+ correct++;
+ }
+ }
+ delete iter;
+ }
+
+ ASSERT_EQ(0, bad_keys);
+ ASSERT_EQ(0, bad_values);
+ ASSERT_EQ(0, missed);
+ (void)correct;
+ }
+
+ void Close() {
+ for (auto h : handles_) {
+ if (h) {
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(h));
+ }
+ }
+ handles_.clear();
+ names_.clear();
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status TryOpen(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ names_.clear();
+ for (size_t i = 0; i < cf.size(); ++i) {
+ column_families.emplace_back(
+ cf[i], options.size() == 0 ? column_family_options_ : options[i]);
+ names_.push_back(cf[i]);
+ }
+ return DB::Open(db_options_, dbname_, column_families, &handles_, &db_);
+ }
+
+ Status OpenReadOnly(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ names_.clear();
+ for (size_t i = 0; i < cf.size(); ++i) {
+ column_families.emplace_back(
+ cf[i], options.size() == 0 ? column_family_options_ : options[i]);
+ names_.push_back(cf[i]);
+ }
+ return DB::OpenForReadOnly(db_options_, dbname_, column_families, &handles_,
+ &db_);
+ }
+
+#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
+ void AssertOpenReadOnly(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ ASSERT_OK(OpenReadOnly(cf, options));
+ }
+#endif // !ROCKSDB_LITE
+
+ void Open(std::vector<std::string> cf,
+ std::vector<ColumnFamilyOptions> options = {}) {
+ ASSERT_OK(TryOpen(cf, options));
+ }
+
+ void Open() { Open({"default"}); }
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+ int GetProperty(int cf, std::string property) {
+ std::string value;
+ EXPECT_TRUE(dbfull()->GetProperty(handles_[cf], property, &value));
+#ifndef CYGWIN
+ return std::stoi(value);
+#else
+ return std::strtol(value.c_str(), 0 /* off */, 10 /* base */);
+#endif
+ }
+
+ bool IsDbWriteStopped() {
+#ifndef ROCKSDB_LITE
+ uint64_t v;
+ EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.is-write-stopped", &v));
+ return (v == 1);
+#else
+ return dbfull()->TEST_write_controler().IsStopped();
+#endif // !ROCKSDB_LITE
+ }
+
+ uint64_t GetDbDelayedWriteRate() {
+#ifndef ROCKSDB_LITE
+ uint64_t v;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.actual-delayed-write-rate", &v));
+ return v;
+#else
+ if (!dbfull()->TEST_write_controler().NeedsDelay()) {
+ return 0;
+ }
+ return dbfull()->TEST_write_controler().delayed_write_rate();
+#endif // !ROCKSDB_LITE
+ }
+
+ void Destroy(const std::vector<ColumnFamilyDescriptor>& column_families =
+ std::vector<ColumnFamilyDescriptor>()) {
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_),
+ column_families));
+ }
+
+ void CreateColumnFamilies(
+ const std::vector<std::string>& cfs,
+ const std::vector<ColumnFamilyOptions> options = {}) {
+ int cfi = static_cast<int>(handles_.size());
+ handles_.resize(cfi + cfs.size());
+ names_.resize(cfi + cfs.size());
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ const auto& current_cf_opt =
+ options.size() == 0 ? column_family_options_ : options[i];
+ ASSERT_OK(
+ db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi]));
+ names_[cfi] = cfs[i];
+
+#ifndef ROCKSDB_LITE // RocksDBLite does not support GetDescriptor
+ // Verify the CF options of the returned CF handle.
+ ColumnFamilyDescriptor desc;
+ ASSERT_OK(handles_[cfi]->GetDescriptor(&desc));
+ // Need to sanitize the default column family options before comparing
+ // them.
+ ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
+ ConfigOptions(), desc.options,
+ SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt)));
+#endif // !ROCKSDB_LITE
+ cfi++;
+ }
+ }
+
+ void Reopen(const std::vector<ColumnFamilyOptions> options = {}) {
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+ Close();
+ assert(options.size() == 0 || names.size() == options.size());
+ Open(names, options);
+ }
+
+ void CreateColumnFamiliesAndReopen(const std::vector<std::string>& cfs) {
+ CreateColumnFamilies(cfs);
+ Reopen();
+ }
+
+ void DropColumnFamilies(const std::vector<int>& cfs) {
+ for (auto cf : cfs) {
+ ASSERT_OK(db_->DropColumnFamily(handles_[cf]));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[cf]));
+ handles_[cf] = nullptr;
+ names_[cf] = "";
+ }
+ }
+
+ void PutRandomData(int cf, int num, int key_value_size, bool save = false) {
+ if (cf >= static_cast<int>(keys_.size())) {
+ keys_.resize(cf + 1);
+ }
+ for (int i = 0; i < num; ++i) {
+ // 10 bytes for key, rest is value
+ if (!save) {
+ ASSERT_OK(Put(cf, test::RandomKey(&rnd_, 11),
+ rnd_.RandomString(key_value_size - 10)));
+ } else {
+ std::string key = test::RandomKey(&rnd_, 11);
+ keys_[cf].insert(key);
+ ASSERT_OK(Put(cf, key, rnd_.RandomString(key_value_size - 10)));
+ }
+ }
+ ASSERT_OK(db_->FlushWAL(/*sync=*/false));
+ }
+
+#ifndef ROCKSDB_LITE // TEST functions in DB are not supported in lite
+ void WaitForFlush(int cf) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ }
+
+ void WaitForCompaction() { ASSERT_OK(dbfull()->TEST_WaitForCompact()); }
+
+ uint64_t MaxTotalInMemoryState() {
+ return dbfull()->TEST_MaxTotalInMemoryState();
+ }
+
+ void AssertMaxTotalInMemoryState(uint64_t value) {
+ ASSERT_EQ(value, MaxTotalInMemoryState());
+ }
+#endif // !ROCKSDB_LITE
+
+ Status Put(int cf, const std::string& key, const std::string& value) {
+ return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+ }
+ Status Merge(int cf, const std::string& key, const std::string& value) {
+ return db_->Merge(WriteOptions(), handles_[cf], Slice(key), Slice(value));
+ }
+ Status Flush(int cf) { return db_->Flush(FlushOptions(), handles_[cf]); }
+
+ std::string Get(int cf, const std::string& key) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], Slice(key), &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ void CompactAll(int cf) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr,
+ nullptr));
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+ }
+
+ int NumTableFilesAtLevel(int level, int cf) {
+ return GetProperty(cf,
+ "rocksdb.num-files-at-level" + std::to_string(level));
+ }
+
+#ifndef ROCKSDB_LITE
+ // Return spread of files per level
+ std::string FilesPerLevel(int cf) {
+ std::string result;
+ int last_non_zero_offset = 0;
+ for (int level = 0; level < dbfull()->NumberLevels(handles_[cf]); level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = static_cast<int>(result.size());
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+#endif
+
+ void AssertFilesPerLevel(const std::string& value, int cf) {
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(value, FilesPerLevel(cf));
+#else
+ (void)value;
+ (void)cf;
+#endif
+ }
+
+#ifndef ROCKSDB_LITE // GetLiveFilesMetaData is not supported
+ int CountLiveFiles() {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ return static_cast<int>(metadata.size());
+ }
+#endif // !ROCKSDB_LITE
+
+ void AssertCountLiveFiles(int expected_value) {
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(expected_value, CountLiveFiles());
+#else
+ (void)expected_value;
+#endif
+ }
+
+ // Do n memtable flushes, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int cf, int n, const std::string& small,
+ const std::string& large) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[cf]));
+ }
+ }
+
+#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
+ int CountLiveLogFiles() {
+ int micros_wait_for_log_deletion = 20000;
+ env_->SleepForMicroseconds(micros_wait_for_log_deletion);
+ int ret = 0;
+ VectorLogPtr wal_files;
+ Status s;
+ // GetSortedWalFiles is a flaky function -- it gets all the wal_dir
+ // children files and then later checks for their existence. If some of the
+ // log files don't exist anymore, it reports an error. It does all of this
+ // without the DB mutex held, so if a background process deletes a log file
+ // while the function is being executed, it returns an error. We retry the
+ // function up to 10 times to avoid this error failing the test.
+ for (int retries = 0; retries < 10; ++retries) {
+ wal_files.clear();
+ s = db_->GetSortedWalFiles(wal_files);
+ if (s.ok()) {
+ break;
+ }
+ }
+ EXPECT_OK(s);
+ for (const auto& wal : wal_files) {
+ if (wal->Type() == kAliveLogFile) {
+ ++ret;
+ }
+ }
+ return ret;
+ }
+#endif // !ROCKSDB_LITE
+
+ void AssertCountLiveLogFiles(int value) {
+#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
+ ASSERT_EQ(value, CountLiveLogFiles());
+#else
+ (void)value;
+#endif // !ROCKSDB_LITE
+ }
+
+ void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
+ assert(num_per_cf.size() == handles_.size());
+
+#ifndef ROCKSDB_LITE // GetProperty is not supported in lite
+ for (size_t i = 0; i < num_per_cf.size(); ++i) {
+ ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
+ "rocksdb.num-immutable-mem-table"));
+ }
+#endif // !ROCKSDB_LITE
+ }
+
+ void CopyFile(const std::string& source, const std::string& destination,
+ uint64_t size = 0) {
+ const EnvOptions soptions;
+ std::unique_ptr<SequentialFile> srcfile;
+ ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+ std::unique_ptr<WritableFile> destfile;
+ ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+ if (size == 0) {
+ // default argument means copy everything
+ ASSERT_OK(env_->GetFileSize(source, &size));
+ }
+
+ char buffer[4096];
+ Slice slice;
+ while (size > 0) {
+ uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+ ASSERT_OK(srcfile->Read(one, &slice, buffer));
+ ASSERT_OK(destfile->Append(slice));
+ size -= slice.size();
+ }
+ ASSERT_OK(destfile->Close());
+ }
+
+ int GetSstFileCount(std::string path) {
+ std::vector<std::string> files;
+ DBTestBase::GetSstFiles(env_, path, &files);
+ return static_cast<int>(files.size());
+ }
+
+ void RecalculateWriteStallConditions(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options) {
+ // add lock to avoid race condition between
+ // `RecalculateWriteStallConditions` which writes to CFStats and
+ // background `DBImpl::DumpStats()` threads which read CFStats
+ dbfull()->TEST_LockMutex();
+ cfd->RecalculateWriteStallConditions(mutable_cf_options);
+ dbfull()->TEST_UnlockMutex();
+ }
+
+ std::vector<ColumnFamilyHandle*> handles_;
+ std::vector<std::string> names_;
+ std::vector<std::set<std::string>> keys_;
+ ColumnFamilyOptions column_family_options_;
+ DBOptions db_options_;
+ std::string dbname_;
+ DB* db_ = nullptr;
+ EnvCounter* env_;
+ std::shared_ptr<Env> env_guard_;
+ Random rnd_;
+ uint32_t format_;
+};
+
+class ColumnFamilyTest
+ : public ColumnFamilyTestBase,
+ virtual public ::testing::WithParamInterface<uint32_t> {
+ public:
+ ColumnFamilyTest() : ColumnFamilyTestBase(GetParam()) {}
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, ColumnFamilyTest,
+ testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, ColumnFamilyTest,
+ testing::Values(kLatestFormatVersion));
+
+TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
+ for (int iter = 0; iter < 3; ++iter) {
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handles_[i]);
+ ASSERT_EQ(i, cfh->GetID());
+ }
+ if (iter == 1) {
+ Reopen();
+ }
+ DropColumnFamilies({3});
+ Reopen();
+ if (iter == 2) {
+ // this tests if max_column_family is correctly persisted with
+ // WriteSnapshot()
+ Reopen();
+ }
+ CreateColumnFamilies({"three2"});
+ // ID 3 that was used for dropped column family "three" should not be
+ // reused
+ auto cfh3 = static_cast_with_check<ColumnFamilyHandleImpl>(handles_[3]);
+ ASSERT_EQ(4U, cfh3->GetID());
+ Close();
+ Destroy();
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {
+ Open();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteOptionsFile:1",
+ "ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1"},
+ {"ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2",
+ "DBImpl::WriteOptionsFile:2"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread(
+ [&] { CreateColumnFamilies({"one"}); });
+
+ TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:1");
+ uint64_t pv;
+ db_->GetAggregatedIntProperty(DB::Properties::kEstimateTableReadersMem, &pv);
+ TEST_SYNC_POINT("ColumnFamilyTest.CreateCFRaceWithGetAggProperty:2");
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif // !ROCKSDB_LITE
+
+class FlushEmptyCFTestWithParam
+ : public ColumnFamilyTestBase,
+ virtual public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ FlushEmptyCFTestWithParam()
+ : ColumnFamilyTestBase(std::get<0>(GetParam())),
+ allow_2pc_(std::get<1>(GetParam())) {}
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ bool allow_2pc_;
+};
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ db_options_.allow_2pc = allow_2pc_;
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ // Generate log file A.
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 1
+
+ Reopen();
+ // Log file A is not dropped after reopening because default column family's
+ // min log number is 0.
+ // It flushes to SST file X
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 2
+ ASSERT_OK(Put(1, "bar", "v2")); // seqID 3
+ // Current log file is file B now. While flushing, a new log file C is created
+ // and is set to current. Both CFs' min log number is set to file C in memory,
+ // so after flushing, file B is deleted. At the same time, the min log number
+ // of the default CF is not written to the manifest. Log file A still remains.
+ // Flushed to SST file Y.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Put(1, "bar", "v3")); // seqID 4
+ ASSERT_OK(Put(1, "foo", "v4")); // seqID 5
+ ASSERT_OK(db_->FlushWAL(/*sync=*/false));
+
+ // Preserve file system state up to here to simulate a crash condition.
+ fault_env->SetFilesystemActive(false);
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+
+ Close();
+ fault_env->ResetState();
+
+ // Before opening, there are four files:
+ // Log file A contains seqID 1
+ // Log file C contains seqID 4, 5
+ // SST file X contains seqID 1
+ // SST file Y contains seqID 2, 3
+ // Min log number:
+ // default CF: 0
+ // CF one, two: C
+ // When opening the DB, all the seqID should be preserved.
+ Open(names, {});
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+ Close();
+
+ db_options_.env = env_;
+}
+
+TEST_P(FlushEmptyCFTestWithParam, FlushEmptyCFTest2) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ db_options_.allow_2pc = allow_2pc_;
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ // Generate log file A.
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 1
+
+ Reopen();
+ // Log file A is not dropped after reopening because default column family's
+ // min log number is 0.
+ // It flushes to SST file X
+ ASSERT_OK(Put(1, "foo", "v1")); // seqID 2
+ ASSERT_OK(Put(1, "bar", "v2")); // seqID 3
+ // Current log file is file B now. While flushing, a new log file C is created
+ // and is set to current. Both CFs' min log number is set to file C so after
+ // flushing file B is deleted. Log file A still remains.
+ // Flushed to SST file Y.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(0, "bar", "v2")); // seqID 4
+ ASSERT_OK(Put(2, "bar", "v2")); // seqID 5
+ ASSERT_OK(Put(1, "bar", "v3")); // seqID 6
+ // Flushing all column families. This forces all CFs' min log to current. This
+ // is written to the manifest file. Log file C is cleared.
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Flush(2));
+ // Write to log file D
+ ASSERT_OK(Put(1, "bar", "v4")); // seqID 7
+ ASSERT_OK(Put(1, "bar", "v5")); // seqID 8
+ ASSERT_OK(db_->FlushWAL(/*sync=*/false));
+ // Preserve file system state up to here to simulate a crash condition.
+ fault_env->SetFilesystemActive(false);
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+
+ Close();
+ fault_env->ResetState();
+ // Before opening, there are two logfiles:
+ // Log file A contains seqID 1
+ // Log file D contains seqID 7, 8
+ // Min log number:
+ // default CF: D
+ // CF one, two: D
+ // When opening the DB, log file D should be replayed using the seqID
+ // specified in the file.
+ Open(names, {});
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "bar"));
+ Close();
+
+ db_options_.env = env_;
+}
+
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, FlushEmptyCFTestWithParam,
+ testing::Values(std::make_tuple(test::kDefaultFormatVersion, true),
+ std::make_tuple(test::kDefaultFormatVersion, false)));
+INSTANTIATE_TEST_CASE_P(
+ FormatLatest, FlushEmptyCFTestWithParam,
+ testing::Values(std::make_tuple(kLatestFormatVersion, true),
+ std::make_tuple(kLatestFormatVersion, false)));
+
+TEST_P(ColumnFamilyTest, AddDrop) {
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "fodor"));
+ DropColumnFamilies({2});
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ CreateColumnFamilies({"four"});
+ ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(3, "fodor"));
+ Close();
+ ASSERT_TRUE(TryOpen({"default"}).IsInvalidArgument());
+ Open({"default", "one", "three", "four"});
+ DropColumnFamilies({1});
+ Reopen();
+ Close();
+
+ std::vector<std::string> families;
+ ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+ std::sort(families.begin(), families.end());
+ ASSERT_TRUE(families ==
+ std::vector<std::string>({"default", "four", "three"}));
+}
+
+TEST_P(ColumnFamilyTest, BulkAddDrop) {
+ constexpr int kNumCF = 1000;
+ ColumnFamilyOptions cf_options;
+ WriteOptions write_options;
+ Open();
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyHandle*> cf_handles;
+ for (int i = 1; i <= kNumCF; i++) {
+ cf_names.push_back("cf1-" + std::to_string(i));
+ }
+ ASSERT_OK(db_->CreateColumnFamilies(cf_options, cf_names, &cf_handles));
+ for (int i = 1; i <= kNumCF; i++) {
+ ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar"));
+ }
+ ASSERT_OK(db_->DropColumnFamilies(cf_handles));
+ std::vector<ColumnFamilyDescriptor> cf_descriptors;
+ for (auto* handle : cf_handles) {
+ delete handle;
+ }
+ cf_handles.clear();
+ for (int i = 1; i <= kNumCF; i++) {
+ cf_descriptors.emplace_back("cf2-" + std::to_string(i),
+ ColumnFamilyOptions());
+ }
+ ASSERT_OK(db_->CreateColumnFamilies(cf_descriptors, &cf_handles));
+ for (int i = 1; i <= kNumCF; i++) {
+ ASSERT_OK(db_->Put(write_options, cf_handles[i - 1], "foo", "bar"));
+ }
+ ASSERT_OK(db_->DropColumnFamilies(cf_handles));
+ for (auto* handle : cf_handles) {
+ delete handle;
+ }
+ Close();
+ std::vector<std::string> families;
+ ASSERT_OK(DB::ListColumnFamilies(db_options_, dbname_, &families));
+ std::sort(families.begin(), families.end());
+ ASSERT_TRUE(families == std::vector<std::string>({"default"}));
+}
+
+TEST_P(ColumnFamilyTest, DropTest) {
+ // first iteration - don't reopen DB before dropping
+ // second iteration - reopen DB before dropping
+ for (int iter = 0; iter < 2; ++iter) {
+ Open({"default"});
+ CreateColumnFamiliesAndReopen({"pikachu"});
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush(1));
+
+ if (iter == 1) {
+ Reopen();
+ }
+ ASSERT_EQ("bar1", Get(1, "1"));
+
+ AssertCountLiveFiles(1);
+ DropColumnFamilies({1});
+ // make sure that all files are deleted when we drop the column family
+ AssertCountLiveFiles(0);
+ Destroy();
+ }
+}
+
+TEST_P(ColumnFamilyTest, WriteBatchFailure) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(handles_[0], Slice("existing"), Slice("column-family")));
+ ASSERT_OK(
+ batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ DropColumnFamilies({1});
+ WriteOptions woptions_ignore_missing_cf;
+ woptions_ignore_missing_cf.ignore_missing_column_families = true;
+ ASSERT_OK(
+ batch.Put(handles_[0], Slice("still here"), Slice("column-family")));
+ ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
+ ASSERT_EQ("column-family", Get(0, "still here"));
+ Status s = db_->Write(WriteOptions(), &batch);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, ReadWrite) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "bar", "v2"));
+ ASSERT_OK(Put(1, "mirko", "v3"));
+ ASSERT_OK(Put(0, "foo", "v2"));
+ ASSERT_OK(Put(2, "fodor", "v5"));
+
+ for (int iter = 0; iter <= 3; ++iter) {
+ ASSERT_EQ("v2", Get(0, "foo"));
+ ASSERT_EQ("v2", Get(0, "bar"));
+ ASSERT_EQ("v3", Get(1, "mirko"));
+ ASSERT_EQ("v5", Get(2, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+ if (iter <= 1) {
+ Reopen();
+ }
+ }
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) {
+ std::string backup_logs = dbname_ + "/backup_logs";
+
+ // delete old files in backup_logs directory
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+ ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+ std::vector<std::string> old_files;
+ ASSERT_OK(env_->GetChildren(backup_logs, &old_files));
+ for (auto& file : old_files) {
+ ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file));
+ }
+
+ column_family_options_.merge_operator =
+ MergeOperators::CreateUInt64AddOperator();
+ db_options_.wal_dir = dbname_ + "/logs";
+ Destroy();
+ Open();
+ CreateColumnFamilies({"cf1", "cf2"});
+
+ // fill up the DB
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+ ASSERT_OK(Merge(0, "foo", one));
+ ASSERT_OK(Merge(1, "mirko", one));
+ ASSERT_OK(Merge(0, "foo", one));
+ ASSERT_OK(Merge(2, "bla", one));
+ ASSERT_OK(Merge(2, "fodor", one));
+ ASSERT_OK(Merge(0, "bar", one));
+ ASSERT_OK(Merge(2, "bla", one));
+ ASSERT_OK(Merge(1, "mirko", two));
+ ASSERT_OK(Merge(1, "franjo", one));
+
+ // copy the logs to backup
+ std::vector<std::string> logs;
+ ASSERT_OK(env_->GetChildren(db_options_.wal_dir, &logs));
+ for (auto& log : logs) {
+ CopyFile(db_options_.wal_dir + "/" + log, backup_logs + "/" + log);
+ }
+
+ // recover the DB
+ Close();
+
+ // 1. check consistency
+ // 2. copy the logs from backup back to WAL dir. if the recovery happens
+ // again on the same log files, this should lead to incorrect results
+ // due to applying merge operator twice
+ // 3. check consistency
+ for (int iter = 0; iter < 2; ++iter) {
+ // assert consistency
+ Open({"default", "cf1", "cf2"});
+ ASSERT_EQ(two, Get(0, "foo"));
+ ASSERT_EQ(one, Get(0, "bar"));
+ ASSERT_EQ(three, Get(1, "mirko"));
+ ASSERT_EQ(one, Get(1, "franjo"));
+ ASSERT_EQ(one, Get(2, "fodor"));
+ ASSERT_EQ(two, Get(2, "bla"));
+ Close();
+
+ if (iter == 0) {
+ // copy the logs from backup back to wal dir
+ for (auto& log : logs) {
+ CopyFile(backup_logs + "/" + log, db_options_.wal_dir + "/" + log);
+ }
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE // TEST functions used are not supported
+TEST_P(ColumnFamilyTest, FlushTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "bar", "v2"));
+ ASSERT_OK(Put(1, "mirko", "v3"));
+ ASSERT_OK(Put(0, "foo", "v2"));
+ ASSERT_OK(Put(2, "fodor", "v5"));
+
+ for (int j = 0; j < 2; j++) {
+ ReadOptions ro;
+ std::vector<Iterator*> iterators;
+ // Hold super version.
+ if (j == 0) {
+ ASSERT_OK(db_->NewIterators(ro, handles_, &iterators));
+ }
+
+ for (int i = 0; i < 3; ++i) {
+ uint64_t max_total_in_memory_state = MaxTotalInMemoryState();
+ ASSERT_OK(Flush(i));
+ AssertMaxTotalInMemoryState(max_total_in_memory_state);
+ }
+ ASSERT_OK(Put(1, "foofoo", "bar"));
+ ASSERT_OK(Put(0, "foofoo", "bar"));
+
+ for (auto* it : iterators) {
+ ASSERT_OK(it->status());
+ delete it;
+ }
+ }
+ Reopen();
+
+ for (int iter = 0; iter <= 2; ++iter) {
+ ASSERT_EQ("v2", Get(0, "foo"));
+ ASSERT_EQ("v2", Get(0, "bar"));
+ ASSERT_EQ("v3", Get(1, "mirko"));
+ ASSERT_EQ("v5", Get(2, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "fodor"));
+ ASSERT_EQ("NOT_FOUND", Get(2, "foo"));
+ if (iter <= 1) {
+ Reopen();
+ }
+ }
+ Close();
+}
+
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, LogDeletionTest) {
+ db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+ column_family_options_.arena_block_size = 4 * 1024;
+ column_family_options_.write_buffer_size = 128000; // 128KB
+ Open();
+ CreateColumnFamilies({"one", "two", "three", "four"});
+ // Each bracket is one log file. If a number is in (), it means
+ // we don't need it anymore (it's been flushed).
+ // []
+ AssertCountLiveLogFiles(0);
+ PutRandomData(0, 1, 128);
+ // [0]
+ PutRandomData(1, 1, 128);
+ // [0, 1]
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [0, (1)] [1]
+ AssertCountLiveLogFiles(2);
+ PutRandomData(0, 1, 128);
+ // [0, (1)] [0, 1]
+ AssertCountLiveLogFiles(2);
+ PutRandomData(2, 1, 128);
+ // [0, (1)] [0, 1, 2]
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [0, (1)] [0, 1, (2)] [2]
+ AssertCountLiveLogFiles(3);
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [0, (1)] [0, 1, (2)] [(2)] [2]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(3, 1, 128);
+ // [0, (1)] [0, 1, (2)] [(2)] [2, 3]
+ PutRandomData(1, 1, 128);
+ // [0, (1)] [0, 1, (2)] [(2)] [1, 2, 3]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [0, (1)] [0, (1), (2)] [(2)] [(1), 2, 3] [1]
+ AssertCountLiveLogFiles(5);
+ PutRandomData(0, 1000, 128);
+ WaitForFlush(0);
+ // [(0), (1)] [(0), (1), (2)] [(2)] [(1), 2, 3] [1, (0)] [0]
+ // delete obsolete logs -->
+ // [(1), 2, 3] [1, (0)] [0]
+ AssertCountLiveLogFiles(3);
+ PutRandomData(0, 1000, 128);
+ WaitForFlush(0);
+ // [(1), 2, 3] [1, (0)], [(0)] [0]
+ AssertCountLiveLogFiles(4);
+ PutRandomData(1, 1000, 128);
+ WaitForFlush(1);
+ // [(1), 2, 3] [(1), (0)] [(0)] [0, (1)] [1]
+ AssertCountLiveLogFiles(5);
+ PutRandomData(2, 1000, 128);
+ WaitForFlush(2);
+ // [(1), (2), 3] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2]
+ AssertCountLiveLogFiles(6);
+ PutRandomData(3, 1000, 128);
+ WaitForFlush(3);
+ // [(1), (2), (3)] [(1), (0)] [(0)] [0, (1)] [1, (2)], [2, (3)] [3]
+ // delete obsolete logs -->
+ // [0, (1)] [1, (2)], [2, (3)] [3]
+ AssertCountLiveLogFiles(4);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CrashAfterFlush) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ db_options_.env = fault_env.get();
+ Open();
+ CreateColumnFamilies({"one"});
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(handles_[0], Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Put(handles_[1], Slice("foo"), Slice("bar")));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush(0));
+ fault_env->SetFilesystemActive(false);
+
+ std::vector<std::string> names;
+ for (auto name : names_) {
+ if (name != "") {
+ names.push_back(name);
+ }
+ }
+ Close();
+ ASSERT_OK(fault_env->DropUnsyncedFileData());
+ fault_env->ResetState();
+ Open(names, {});
+
+ // Write batch should be atomic.
+ ASSERT_EQ(Get(0, "foo"), Get(1, "foo"));
+
+ Close();
+ db_options_.env = env_;
+}
+
+TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) {
+ ASSERT_OK(TryOpen({"default"}));
+ Close();
+ ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument());
+}
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+// Makes sure that obsolete log files get deleted
+TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
+ // disable flushing stale column families
+ db_options_.max_total_wal_size = std::numeric_limits<uint64_t>::max();
+ Open();
+ CreateColumnFamilies({"one", "two", "three"});
+ ColumnFamilyOptions default_cf, one, two, three;
+ // Set up options. All column families have max_write_buffer_number set to 10.
+ // "default" -> 100KB memtable, start flushing immediately
+ // "one" -> 200KB memtable, start flushing with two immutable memtables
+ // "two" -> 1MB memtable, start flushing with three immutable memtables
+ // "three" -> 90KB memtable, start flushing with four immutable memtables
+ default_cf.write_buffer_size = 100000;
+ default_cf.arena_block_size = 4 * 4096;
+ default_cf.max_write_buffer_number = 10;
+ default_cf.min_write_buffer_number_to_merge = 1;
+ default_cf.max_write_buffer_size_to_maintain = 0;
+ one.write_buffer_size = 200000;
+ one.arena_block_size = 4 * 4096;
+ one.max_write_buffer_number = 10;
+ one.min_write_buffer_number_to_merge = 2;
+ one.max_write_buffer_size_to_maintain =
+ static_cast<int>(one.write_buffer_size);
+ two.write_buffer_size = 1000000;
+ two.arena_block_size = 4 * 4096;
+ two.max_write_buffer_number = 10;
+ two.min_write_buffer_number_to_merge = 3;
+ two.max_write_buffer_size_to_maintain =
+ static_cast<int>(two.write_buffer_size);
+ three.write_buffer_size = 4096 * 22;
+ three.arena_block_size = 4096;
+ three.max_write_buffer_number = 10;
+ three.min_write_buffer_number_to_merge = 4;
+ three.max_write_buffer_size_to_maintain =
+ static_cast<int>(three.write_buffer_size);
+
+ Reopen({default_cf, one, two, three});
+
+ int micros_wait_for_flush = 10000;
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(1);
+ PutRandomData(1, 200, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+ AssertCountLiveLogFiles(2);
+ PutRandomData(2, 1000, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 1, 0});
+ AssertCountLiveLogFiles(3);
+ PutRandomData(2, 1000, 1000);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 0});
+ AssertCountLiveLogFiles(4);
+ PutRandomData(3, 93, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 1});
+ AssertCountLiveLogFiles(5);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 2});
+ AssertCountLiveLogFiles(6);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+ AssertCountLiveLogFiles(7);
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 1, 2, 3});
+ AssertCountLiveLogFiles(8);
+ PutRandomData(2, 100, 10000);
+ WaitForFlush(2);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 3});
+ AssertCountLiveLogFiles(9);
+ PutRandomData(3, 88, 990);
+ WaitForFlush(3);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 0});
+ AssertCountLiveLogFiles(10);
+ PutRandomData(3, 88, 990);
+ env_->SleepForMicroseconds(micros_wait_for_flush);
+ AssertNumberOfImmutableMemtables({0, 1, 0, 1});
+ AssertCountLiveLogFiles(11);
+ PutRandomData(1, 200, 1000);
+ WaitForFlush(1);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 1});
+ AssertCountLiveLogFiles(5);
+ PutRandomData(3, 88 * 3, 990);
+ WaitForFlush(3);
+ PutRandomData(3, 88 * 4, 990);
+ WaitForFlush(3);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+ PutRandomData(0, 100, 1000);
+ WaitForFlush(0);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+ PutRandomData(2, 3 * 1000, 1000);
+ WaitForFlush(2);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(12);
+ PutRandomData(1, 2 * 200, 1000);
+ WaitForFlush(1);
+ AssertNumberOfImmutableMemtables({0, 0, 0, 0});
+ AssertCountLiveLogFiles(7);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+// The test is commented out because we want to test that a snapshot is
+// not created for memtables that don't support it, but there isn't a memtable
+// that doesn't support snapshots right now. If we have one later, we can
+// re-enable the test.
+//
+// #ifndef ROCKSDB_LITE // Cuckoo is not supported in lite
+// TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) {
+// db_options_.allow_concurrent_memtable_write = false;
+// Open();
+// auto* s1 = dbfull()->GetSnapshot();
+// ASSERT_TRUE(s1 != nullptr);
+// dbfull()->ReleaseSnapshot(s1);
+
+// // Add a column family that doesn't support snapshot
+// ColumnFamilyOptions first;
+// first.memtable_factory.reset(new DummyMemtableNotSupportingSnapshot());
+// CreateColumnFamilies({"first"}, {first});
+// auto* s2 = dbfull()->GetSnapshot();
+// ASSERT_TRUE(s2 == nullptr);
+
+// // Add a column family that supports snapshot. Snapshot stays not
+// supported. ColumnFamilyOptions second; CreateColumnFamilies({"second"},
+// {second}); auto* s3 = dbfull()->GetSnapshot(); ASSERT_TRUE(s3 == nullptr);
+// Close();
+// }
+// #endif // !ROCKSDB_LITE
+
+class TestComparator : public Comparator {
+ int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
+ const ROCKSDB_NAMESPACE::Slice& /*b*/) const override {
+ return 0;
+ }
+ const char* Name() const override { return "Test"; }
+ void FindShortestSeparator(
+ std::string* /*start*/,
+ const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {}
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+static TestComparator third_comparator;
+static TestComparator fourth_comparator;
+
+// Test that we can retrieve the comparator from a created CF
+TEST_P(ColumnFamilyTest, GetComparator) {
+ Open();
+ // Add a column family with no comparator specified
+ CreateColumnFamilies({"first"});
+ const Comparator* comp = handles_[0]->GetComparator();
+ ASSERT_EQ(comp, BytewiseComparator());
+
+ // Add three column families - one with no comparator and two
+ // with comparators specified
+ ColumnFamilyOptions second, third, fourth;
+ second.comparator = &third_comparator;
+ third.comparator = &fourth_comparator;
+ CreateColumnFamilies({"second", "third", "fourth"}, {second, third, fourth});
+ ASSERT_EQ(handles_[1]->GetComparator(), BytewiseComparator());
+ ASSERT_EQ(handles_[2]->GetComparator(), &third_comparator);
+ ASSERT_EQ(handles_[3]->GetComparator(), &fourth_comparator);
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, DifferentMergeOperators) {
+ Open();
+ CreateColumnFamilies({"first", "second"});
+ ColumnFamilyOptions default_cf, first, second;
+ first.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ second.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen({default_cf, first, second});
+
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+
+ ASSERT_OK(Put(0, "foo", two));
+ ASSERT_OK(Put(0, "foo", one));
+ ASSERT_TRUE(Merge(0, "foo", two).IsNotSupported());
+ ASSERT_EQ(Get(0, "foo"), one);
+
+ ASSERT_OK(Put(1, "foo", two));
+ ASSERT_OK(Put(1, "foo", one));
+ ASSERT_OK(Merge(1, "foo", two));
+ ASSERT_EQ(Get(1, "foo"), three);
+
+ ASSERT_OK(Put(2, "foo", two));
+ ASSERT_OK(Put(2, "foo", one));
+ ASSERT_OK(Merge(2, "foo", two));
+ ASSERT_EQ(Get(2, "foo"), one + "," + two);
+ Close();
+}
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = static_cast<uint64_t>(1) << 60;
+
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) {
+ PutRandomData(1, 10, 12000);
+ PutRandomData(1, 1, 10);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(std::to_string(i + 1), 2);
+ }
+
+ // TRIGGER compaction "one"
+ PutRandomData(1, 10, 12000);
+ PutRandomData(1, 1, 10);
+
+ // TRIGGER compaction "two"
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+// Sync points not supported in RocksDB Lite
+
+TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
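+ // Sync-point choreography: the manual compaction on CF "one" pauses in
+ // the AfterRun callback after signaling MultiManual:4 and waits at
+ // MultiManual:3. :4 releases the second thread (:1) to run a concurrent
+ // manual compaction on CF "two"; once that CompactRange returns and
+ // signals :2, both the paused compaction (:3) and the main thread (:5)
+ // continue.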
+ std::atomic_bool cf_1_1{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::MultiManual:4", "ColumnFamilyTest::MultiManual:1"},
+ {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:5"},
+ {"ColumnFamilyTest::MultiManual:2", "ColumnFamilyTest::MultiManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:3");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(std::to_string(i + 1), 2);
+ }
+ threads.emplace_back([&] {
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:1");
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:2");
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::MultiManual:5");
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
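+ // Sync-point choreography: the automatic compaction on CF "one" pauses in
+ // the AfterRun callback (signals AutoManual:4, waits at :3). :4 releases
+ // the main thread (:1), which then launches a manual compaction on CF
+ // "two" in a separate thread; when that CompactRange returns and signals
+ // :2, the paused automatic compaction (:3) and the main thread (:5) resume.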
+ std::atomic_bool cf_1_1{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:1"},
+ {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:5"},
+ {"ColumnFamilyTest::AutoManual:2", "ColumnFamilyTest::AutoManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(std::to_string(i + 1), 2);
+ }
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[2], nullptr, nullptr));
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+ threads.join();
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ two.compaction_style = kCompactionStyleLevel;
+ two.num_levels = 4;
+ two.level0_file_num_compaction_trigger = 3;
+ two.write_buffer_size = 100000;
+
+ Reopen({default_cf, one, two});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
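+ // Sync-point choreography: the manual compaction on CF "one" pauses in the
+ // AfterRun callback (signals ManualAuto:4, waits at :3). :4 releases the
+ // main thread (:1) to fill CF "two" until an automatic compaction triggers
+ // there; once the main thread reaches :5 and that automatic compaction hits
+ // AfterRun, :2 fires and releases the paused manual compaction (:3).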
+ std::atomic_bool cf_1_1{true};
+ std::atomic_bool cf_1_2{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:1"},
+ {"ColumnFamilyTest::ManualAuto:5", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:2", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ // SETUP column family "two" -- level style with 4 levels
+ for (int i = 0; i < two.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(2, 10, 12000);
+ PutRandomData(2, 1, 10);
+ WaitForFlush(2);
+ AssertFilesPerLevel(std::to_string(i + 1), 2);
+ }
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+ threads.join();
+
+ // WAIT for compactions
+ WaitForCompaction();
+
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+
+ // VERIFY compaction "two"
+ AssertFilesPerLevel("0,1", 2);
+ CompactAll(2);
+ AssertFilesPerLevel("0,1", 2);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
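+ // Sync-point choreography: the first (exclusive) manual compaction pauses
+ // in the AfterRun callback (signals ManualManual:4, waits at :3). :4
+ // releases the main thread (:5), which adds more L0 files and starts a
+ // second, non-exclusive manual compaction on the same CF; :1 then releases
+ // the paused compaction (:3) and allows the second compaction's AfterRun
+ // callback to signal :2.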
+ std::atomic_bool cf_1_1{true};
+ std::atomic_bool cf_1_2{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:2"},
+ {"ColumnFamilyTest::ManualManual:4", "ColumnFamilyTest::ManualManual:5"},
+ {"ColumnFamilyTest::ManualManual:1", "ColumnFamilyTest::ManualManual:2"},
+ {"ColumnFamilyTest::ManualManual:1",
+ "ColumnFamilyTest::ManualManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:3");
+ } else if (cf_1_2.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = true;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:5");
+
+ WaitForFlush(1);
+
+ // Add more L0 files and force another manual compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(
+ std::to_string(one.level0_file_num_compaction_trigger + i), 1);
+ }
+
+ ROCKSDB_NAMESPACE::port::Thread threads1([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualManual:1");
+
+ threads.join();
+ threads1.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
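+ // Sync-point choreography: the manual compaction pauses in the AfterRun
+ // callback (signals ManualAuto:4, waits at :3). :4 releases the main
+ // thread (:5), which piles up enough new L0 files to trigger an automatic
+ // compaction on the same CF; :1 then releases the paused manual compaction
+ // (:3) and allows the automatic compaction's AfterRun callback to signal :2.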
+ std::atomic_bool cf_1_1{true};
+ std::atomic_bool cf_1_2{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+ WaitForFlush(1);
+
+ // Add more L0 files and force automatic compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(
+ std::to_string(one.level0_file_num_compaction_trigger + i), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ threads.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ ASSERT_LE(NumTableFilesAtLevel(0, 1), 2);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(ColumnFamilyTest, SameCFManualAutomaticCompactionsLevel) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleLevel;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 3 files
+ one.level0_file_num_compaction_trigger = 3;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // SETUP column family "one" -- level style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
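+ // Level-style variant: the manual compaction pauses in the AfterRun
+ // callback (signals ManualAuto:4, waits at :3). :3 is released only after
+ // the automatic level compaction has been picked
+ // (LevelCompactionPicker::PickCompactionBySize:0) and the main thread has
+ // reached :1; the automatic compaction's AfterRun callback then signals :2.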
+ std::atomic_bool cf_1_1{true};
+ std::atomic_bool cf_1_2{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:2"},
+ {"ColumnFamilyTest::ManualAuto:4", "ColumnFamilyTest::ManualAuto:5"},
+ {"ColumnFamilyTest::ManualAuto:3", "ColumnFamilyTest::ManualAuto:2"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "ColumnFamilyTest::ManualAuto:3"},
+ {"ColumnFamilyTest::ManualAuto:1", "ColumnFamilyTest::ManualAuto:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:3");
+ } else if (cf_1_2.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:5");
+
+ // Add more L0 files and force automatic compaction
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(
+ std::to_string(one.level0_file_num_compaction_trigger + i), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::ManualAuto:1");
+
+ threads.join();
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("0,1", 1);
+
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// In this test, we generate enough files to trigger automatic compactions.
+// The automatic compaction waits in NonTrivial:AfterRun.
+// We then generate more files and issue a manual compaction.
+// The manual compaction hits a conflict because the automatic compaction
+// holds the files it needs. Once the conflict is hit, the automatic
+// compaction runs to completion, and then the manual compaction runs and ends.
+TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) {
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyOptions default_cf, one;
+ db_options_.max_open_files = 20; // only 10 files in file cache
+ db_options_.max_background_compactions = 3;
+
+ default_cf.compaction_style = kCompactionStyleLevel;
+ default_cf.num_levels = 3;
+ default_cf.write_buffer_size = 64 << 10; // 64KB
+ default_cf.target_file_size_base = 30 << 10;
+ default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ table_options.no_block_cache = true;
+ default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ one.compaction_style = kCompactionStyleUniversal;
+
+ one.num_levels = 1;
+ // trigger compaction if there are >= 4 files
+ one.level0_file_num_compaction_trigger = 4;
+ one.write_buffer_size = 120000;
+
+ Reopen({default_cf, one});
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ std::atomic_bool cf_1_1{true};
+ std::atomic_bool cf_1_2{true};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:2"},
+ {"ColumnFamilyTest::AutoManual:4", "ColumnFamilyTest::AutoManual:5"},
+ {"CompactionPicker::CompactRange:Conflict",
+ "ColumnFamilyTest::AutoManual:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (cf_1_1.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:4");
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:3");
+ } else if (cf_1_2.exchange(false)) {
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // SETUP column family "one" -- universal style
+ for (int i = 0; i < one.level0_file_num_compaction_trigger; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ AssertFilesPerLevel(std::to_string(i + 1), 1);
+ }
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:5");
+
+ // Add more L0 files while the automatic compaction is paused
+ for (int i = 0; i < one.level0_file_num_compaction_trigger - 2; ++i) {
+ PutRandomData(1, 10, 12000, true);
+ PutRandomData(1, 1, 10, true);
+ WaitForFlush(1);
+ }
+
+ CompactRangeOptions compact_options;
+ compact_options.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+
+ TEST_SYNC_POINT("ColumnFamilyTest::AutoManual:1");
+
+ WaitForCompaction();
+ // VERIFY compaction "one"
+ AssertFilesPerLevel("1", 1);
+ // Compare against saved keys
+ std::set<std::string>::iterator key_iter = keys_[1].begin();
+ while (key_iter != keys_[1].end()) {
+ ASSERT_NE("NOT_FOUND", Get(1, *key_iter));
+ key_iter++;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // Tailing iterator not supported
+namespace {
+std::string IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ EXPECT_OK(iter->status());
+ result = "(invalid)";
+ }
+ return result;
+}
+} // anonymous namespace
+
+TEST_P(ColumnFamilyTest, NewIteratorsTest) {
+ // iter == 0 -- no tailing
+ // iter == 1 -- tailing
+ for (int iter = 0; iter < 2; ++iter) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+ ASSERT_OK(Put(0, "a", "b"));
+ ASSERT_OK(Put(1, "b", "a"));
+ ASSERT_OK(Put(2, "c", "m"));
+ ASSERT_OK(Put(2, "v", "t"));
+ std::vector<Iterator*> iterators;
+ ReadOptions options;
+ options.tailing = (iter == 1);
+ ASSERT_OK(db_->NewIterators(options, handles_, &iterators));
+
+ for (auto it : iterators) {
+ it->SeekToFirst();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+ ASSERT_EQ(IterStatus(iterators[1]), "b->a");
+ ASSERT_EQ(IterStatus(iterators[2]), "c->m");
+
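+ // This write happens after the iterators were created; only a tailing
+ // iterator on CF "one" will observe it.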
+ ASSERT_OK(Put(1, "x", "x"));
+
+ for (auto it : iterators) {
+ it->Next();
+ }
+
+ ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+ if (iter == 0) {
+ // no tailing
+ ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+ } else {
+ // tailing
+ ASSERT_EQ(IterStatus(iterators[1]), "x->x");
+ }
+ ASSERT_EQ(IterStatus(iterators[2]), "v->t");
+
+ for (auto it : iterators) {
+ delete it;
+ }
+ Destroy();
+ }
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
+TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+ ASSERT_OK(Put(0, "a", "b"));
+ ASSERT_OK(Put(1, "foo", "bla"));
+ ASSERT_OK(Put(2, "foo", "blabla"));
+ ASSERT_OK(Put(3, "foo", "blablabla"));
+ ASSERT_OK(Put(4, "foo", "blablablabla"));
+
+ DropColumnFamilies({2});
+ Close();
+ // open only a subset of column families
+ AssertOpenReadOnly({"default", "one", "four"});
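+ // Handle indices after the read-only open: 0 = "default", 1 = "one",
+ // 2 = "four".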
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ ASSERT_EQ("bla", Get(1, "foo"));
+ ASSERT_EQ("blablablabla", Get(2, "foo"));
+
+ // test NewIterators()
+ {
+ std::vector<Iterator*> iterators;
+ ASSERT_OK(db_->NewIterators(ReadOptions(), handles_, &iterators));
+ for (auto it : iterators) {
+ it->SeekToFirst();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "a->b");
+ ASSERT_EQ(IterStatus(iterators[1]), "foo->bla");
+ ASSERT_EQ(IterStatus(iterators[2]), "foo->blablablabla");
+ for (auto it : iterators) {
+ it->Next();
+ }
+ ASSERT_EQ(IterStatus(iterators[0]), "(invalid)");
+ ASSERT_EQ(IterStatus(iterators[1]), "(invalid)");
+ ASSERT_EQ(IterStatus(iterators[2]), "(invalid)");
+
+ for (auto it : iterators) {
+ delete it;
+ }
+ }
+
+ Close();
+ // can't open dropped column family
+ Status s = OpenReadOnly({"default", "one", "two"});
+ ASSERT_TRUE(!s.ok());
+
+ // Can't open without specifying default column family
+ s = OpenReadOnly({"one", "four"});
+ ASSERT_TRUE(!s.ok());
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported in lite
+TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
+
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ PutRandomData(static_cast<int>(i), 10, 100);
+ }
+ int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls();
+ // this will trigger the flushes
+ for (int i = 0; i <= 4; ++i) {
+ ASSERT_OK(Flush(i));
+ }
+
+ for (int i = 0; i < 4; ++i) {
+ WaitForFlush(i);
+ }
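+ // Expect one new SST file per column family plus a single new WAL:
+ // subsequent flushes reuse the freshly rolled (still empty) log instead
+ // of rolling a new one each time.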
+ int total_new_writable_files =
+ env_->GetNumberOfNewWritableFileCalls() - num_writable_file_start;
+ ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForCompaction() is not supported in lite
+TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
+ Open();
+ CreateColumnFamilies({"one", "two"});
+ ColumnFamilyOptions default_cf, one, two;
+ default_cf.write_buffer_size = 100000; // small write buffer size
+ default_cf.arena_block_size = 4096;
+ default_cf.disable_auto_compactions = true;
+ one.disable_auto_compactions = true;
+ two.disable_auto_compactions = true;
+ db_options_.max_total_wal_size = 210000;
+
+ Reopen({default_cf, one, two});
+
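+ // The 10 bytes written to CF "two" keep it pinned to the oldest WAL; once
+ // total WAL size exceeds max_total_wal_size, that stale CF must be flushed
+ // so the old log can be released.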
+ PutRandomData(2, 1, 10); // 10 bytes
+ for (int i = 0; i < 2; ++i) {
+ PutRandomData(0, 100, 1000); // flush
+ WaitForFlush(0);
+
+ AssertCountLiveFiles(i + 1);
+ }
+ // Third flush. Now CF [two] should be detected as stale and flushed;
+ // column family [one] should not be flushed since it's empty.
+ PutRandomData(0, 100, 1000); // flush
+ WaitForFlush(0);
+ WaitForFlush(2);
+ // at least 3 files for the default column family, 1 file for column family
+ // [two], and zero files for column family [one], because it's empty
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_GE(metadata.size(), 4);
+ bool has_cf1_sst = false;
+ bool has_cf2_sst = false;
+ for (const auto& file : metadata) {
+ if (file.column_family_name == "one") {
+ has_cf1_sst = true;
+ } else if (file.column_family_name == "two") {
+ has_cf2_sst = true;
+ }
+ }
+ ASSERT_FALSE(has_cf1_sst);
+ ASSERT_TRUE(has_cf2_sst);
+
+ ASSERT_OK(Flush(0));
+ ASSERT_EQ(0, dbfull()->TEST_total_log_size());
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) {
+ Status s = TryOpen({"one", "two"});
+ ASSERT_TRUE(!s.ok());
+ db_options_.create_missing_column_families = true;
+ s = TryOpen({"default", "one", "two"});
+ ASSERT_TRUE(s.ok());
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, SanitizeOptions) {
+ DBOptions db_options;
+ for (int s = kCompactionStyleLevel; s <= kCompactionStyleUniversal; ++s) {
+ for (int l = 0; l <= 2; l++) {
+ for (int i = 1; i <= 3; i++) {
+ for (int j = 1; j <= 3; j++) {
+ for (int k = 1; k <= 3; k++) {
+ ColumnFamilyOptions original;
+ original.compaction_style = static_cast<CompactionStyle>(s);
+ original.num_levels = l;
+ original.level0_stop_writes_trigger = i;
+ original.level0_slowdown_writes_trigger = j;
+ original.level0_file_num_compaction_trigger = k;
+ original.write_buffer_size =
+ l * 4 * 1024 * 1024 + i * 1024 * 1024 + j * 1024 + k;
+
+ ColumnFamilyOptions result =
+ SanitizeOptions(ImmutableDBOptions(db_options), original);
+ ASSERT_TRUE(result.level0_stop_writes_trigger >=
+ result.level0_slowdown_writes_trigger);
+ ASSERT_TRUE(result.level0_slowdown_writes_trigger >=
+ result.level0_file_num_compaction_trigger);
+ ASSERT_TRUE(result.level0_file_num_compaction_trigger ==
+ original.level0_file_num_compaction_trigger);
+ if (s == kCompactionStyleLevel) {
+ ASSERT_GE(result.num_levels, 2);
+ } else {
+ ASSERT_GE(result.num_levels, 1);
+ if (original.num_levels >= 1) {
+ ASSERT_EQ(result.num_levels, original.num_levels);
+ }
+ }
+
+ // Make sure SanitizeOptions sets arena_block_size to 1/8 of
+ // the write_buffer_size, rounded up to a multiple of 4k and
+ // capped at 1MB.
+ size_t expected_arena_block_size =
+ l * 4 * 1024 * 1024 / 8 + i * 1024 * 1024 / 8;
+ if (j + k != 0) {
+ // not a multiple of 4k, round up 4k
+ expected_arena_block_size += 4 * 1024;
+ }
+ expected_arena_block_size =
+ std::min(size_t{1024 * 1024}, expected_arena_block_size);
+ ASSERT_EQ(expected_arena_block_size, result.arena_block_size);
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(ColumnFamilyTest, ReadDroppedColumnFamily) {
+ // iter 0 -- drop CF, don't reopen
+ // iter 1 -- destroy CF handle, then reopen
+ for (int iter = 0; iter < 2; ++iter) {
+ db_options_.create_missing_column_families = true;
+ db_options_.max_open_files = 20;
+ // delete obsolete files always
+ db_options_.delete_obsolete_files_period_micros = 0;
+ Open({"default", "one", "two"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options, options});
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(0, kKeysNum, 100);
+ PutRandomData(1, kKeysNum, 100);
+ PutRandomData(2, kKeysNum, 100);
+
+ {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[2]));
+ iterator->SeekToFirst();
+
+ if (iter == 0) {
+ // Drop CF two
+ ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+ } else {
+ // delete CF two
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(handles_[2]));
+ handles_[2] = nullptr;
+ }
+ // Make sure iterator created can still be used.
+ int count = 0;
+ for (; iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+
+ // Add bunch more data to other CFs
+ PutRandomData(0, kKeysNum, 100);
+ PutRandomData(1, kKeysNum, 100);
+
+ if (iter == 1) {
+ Reopen();
+ }
+
+ // Since we didn't delete the CF handle, RocksDB's contract guarantees that
+ // we're still able to read the dropped CF
+ for (int i = 0; i < 3; ++i) {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[i]));
+ int count = 0;
+ for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum * ((i == 2) ? 1 : 2));
+ }
+
+ Close();
+ Destroy();
+ }
+}
+
+TEST_P(ColumnFamilyTest, LiveIteratorWithDroppedColumnFamily) {
+ db_options_.create_missing_column_families = true;
+ db_options_.max_open_files = 20;
+ // delete obsolete files always
+ db_options_.delete_obsolete_files_period_micros = 0;
+ Open({"default", "one", "two"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options, options});
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(1, kKeysNum, 100);
+ {
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ iterator->SeekToFirst();
+
+ DropColumnFamilies({1});
+
+ // Make sure iterator created can still be used.
+ int count = 0;
+ for (; iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+
+ Reopen();
+ Close();
+ Destroy();
+}
+
+TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) {
+ db_options_.create_missing_column_families = true;
+ Open({"default", "one"});
+ ColumnFamilyOptions options;
+ options.level0_file_num_compaction_trigger = 100;
+ options.level0_slowdown_writes_trigger = 200;
+ options.level0_stop_writes_trigger = 200;
+ options.max_write_buffer_number = 20;
+ options.write_buffer_size = 100000; // small write buffer size
+ Reopen({options, options});
+
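+ // Interleave DropColumnFamily() with the in-flight flush: the flush job
+ // writes its L0 table and installs its results only after the drop has
+ // started, while the drop itself completes only after the flush results
+ // are installed.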
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"VersionSet::LogAndApply::ColumnFamilyDrop:0",
+ "FlushJob::WriteLevel0Table"},
+ {"VersionSet::LogAndApply::ColumnFamilyDrop:1",
+ "FlushJob::InstallResults"},
+ {"FlushJob::InstallResults",
+ "VersionSet::LogAndApply::ColumnFamilyDrop:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ test::SleepingBackgroundTask sleeping_task;
+
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ // Make sure the task is sleeping. Otherwise, it might start to execute
+ // after sleeping_task.WaitUntilDone() and cause TSAN warning.
+ sleeping_task.WaitUntilSleeping();
+
+ // 1MB should create ~10 files for each CF
+ int kKeysNum = 10000;
+ PutRandomData(1, kKeysNum, 100);
+
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] { ASSERT_OK(db_->DropColumnFamily(handles_[1])); });
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ sleeping_task.Reset();
+ // Now we sleep again. This is just so we're certain that the flush job
+ // finished.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ // Make sure the task is sleeping. Otherwise, it might start to execute
+ // after sleeping_task.WaitUntilDone() and cause TSAN warning.
+ sleeping_task.WaitUntilSleeping();
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+
+ {
+ // Since we didn't delete the CF handle, RocksDB's contract guarantees that
+ // we're still able to read the dropped CF
+ std::unique_ptr<Iterator> iterator(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ int count = 0;
+ for (iterator->SeekToFirst(); iterator->Valid(); iterator->Next()) {
+ ASSERT_OK(iterator->status());
+ ++count;
+ }
+ ASSERT_OK(iterator->status());
+ ASSERT_EQ(count, kKeysNum);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ Close();
+ Destroy();
+}
+
+#ifndef ROCKSDB_LITE
+// skipped as persisting options is not supported in ROCKSDB_LITE
+namespace {
+std::atomic<int> test_stage(0);
+std::atomic<bool> ordered_by_writethread(false);
+const int kMainThreadStartPersistingOptionsFile = 1;
+const int kChildThreadFinishDroppingColumnFamily = 2;
+void DropSingleColumnFamily(ColumnFamilyTest* cf_test, int cf_id,
+ std::vector<Comparator*>* comparators) {
+ while (test_stage < kMainThreadStartPersistingOptionsFile &&
+ !ordered_by_writethread) {
+ Env::Default()->SleepForMicroseconds(100);
+ }
+ cf_test->DropColumnFamilies({cf_id});
+ if ((*comparators)[cf_id]) {
+ delete (*comparators)[cf_id];
+ (*comparators)[cf_id] = nullptr;
+ }
+ test_stage = kChildThreadFinishDroppingColumnFamily;
+}
+} // anonymous namespace
+
+TEST_P(ColumnFamilyTest, CreateAndDropRace) {
+ const int kCfCount = 5;
+ std::vector<ColumnFamilyOptions> cf_opts;
+ std::vector<Comparator*> comparators;
+ for (int i = 0; i < kCfCount; ++i) {
+ cf_opts.emplace_back();
+ comparators.push_back(new test::SimpleSuffixReverseComparator());
+ cf_opts.back().comparator = comparators.back();
+ }
+ db_options_.create_if_missing = true;
+ db_options_.create_missing_column_families = true;
+
+ auto main_thread_id = std::this_thread::get_id();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PersistRocksDBOptions:start", [&](void* /*arg*/) {
+ auto current_thread_id = std::this_thread::get_id();
+ // If it's the main thread hitting this sync-point, then it
+ // will be blocked until some other thread updates the test_stage.
+ if (main_thread_id == current_thread_id) {
+ test_stage = kMainThreadStartPersistingOptionsFile;
+ while (test_stage < kChildThreadFinishDroppingColumnFamily &&
+ !ordered_by_writethread) {
+ Env::Default()->SleepForMicroseconds(100);
+ }
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::EnterUnbatched:Wait", [&](void* /*arg*/) {
+ // This means a thread doing DropColumnFamily() is waiting for
+ // another thread to finish persisting options.
+ // In that case, we update the test_stage to unblock the main thread.
+ ordered_by_writethread = true;
+ });
+
+ // Create a database with four column families
+ Open({"default", "one", "two", "three"},
+ {cf_opts[0], cf_opts[1], cf_opts[2], cf_opts[3]});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start a thread that will drop the first column family
+ // and its comparator
+ ROCKSDB_NAMESPACE::port::Thread drop_cf_thread(DropSingleColumnFamily, this,
+ 1, &comparators);
+
+ DropColumnFamilies({2});
+
+ drop_cf_thread.join();
+ Close();
+ Destroy();
+ for (auto* comparator : comparators) {
+ if (comparator) {
+ delete comparator;
+ }
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) {
+ const uint64_t kBaseRate = 800000u;
+ db_options_.delayed_write_rate = kBaseRate;
+ db_options_.max_background_compactions = 6;
+
+ Open({"default"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+
+ mutable_cf_options.level0_slowdown_writes_trigger = 20;
+ mutable_cf_options.level0_stop_writes_trigger = 10000;
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+ mutable_cf_options.disable_auto_compactions = false;
+
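+ // Pending compaction bytes between the soft limit (200) and the hard limit
+ // (2000) introduce a write delay: the delayed write rate is divided by 1.25
+ // for each consecutive increase of the estimate and stepped back up as it
+ // shrinks. Crossing the hard limit stops writes entirely; dropping below
+ // the soft limit removes the delay.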
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(400);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(450);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(205);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(202);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(198);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(399);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(599);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(2001);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(3001);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(390);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(100);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->set_l0_delay_trigger_count(100);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(101);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(101);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(200);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(0);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
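+ // With auto compactions disabled, L0 buildup and pending compaction bytes
+ // no longer produce a write delay; the controller keeps its configured
+ // delayed write rate but does not apply it.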
+ mutable_cf_options.disable_auto_compactions = true;
+ dbfull()->TEST_write_controler().set_delayed_write_rate(kBaseRate);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage->set_l0_delay_trigger_count(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(0, GetDbDelayedWriteRate());
+ ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+ vstorage->set_l0_delay_trigger_count(60);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(0, GetDbDelayedWriteRate());
+ ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate());
+
+ mutable_cf_options.disable_auto_compactions = false;
+ vstorage->set_l0_delay_trigger_count(70);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->set_l0_delay_trigger_count(71);
+ vstorage->TEST_set_estimated_compaction_needed_bytes(501);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) {
+ db_options_.max_background_compactions = 6;
+ Open({"default"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+
+ // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 36;
+ mutable_cf_options.level0_stop_writes_trigger = 50;
+ // Speedup threshold = 200 / 4 = 50
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
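+ // At or above either speed-up threshold from the comments above (pending
+ // compaction bytes >= 50 or L0 file count >= 8), the allowed background
+ // compactions jump from 1 to the configured maximum of 6; falling back
+ // below both thresholds drops them back to 1.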
+ vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(45);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(7);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(9);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(6);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ // Speed up threshold = min(4 * 2, 4 + (16 - 4)/4) = 7
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 16;
+ mutable_cf_options.level0_stop_writes_trigger = 30;
+
+ vstorage->set_l0_delay_trigger_count(5);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(7);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(3);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, WriteStallTwoColumnFamilies) {
+ const uint64_t kBaseRate = 810000u;
+ db_options_.delayed_write_rate = kBaseRate;
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ ColumnFamilyData* cfd1 =
+ static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+ mutable_cf_options.level0_slowdown_writes_trigger = 20;
+ mutable_cf_options.level0_stop_writes_trigger = 10000;
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+ MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+ mutable_cf_options1.soft_pending_compaction_bytes_limit = 500;
+
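+ // Both column families share one write controller: the delay remains in
+ // effect as long as at least one CF exceeds its soft pending-compaction
+ // limit, and the delayed write rate is tightened or relaxed as the CFs'
+ // pressure grows or shrinks.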
+ vstorage->TEST_set_estimated_compaction_needed_bytes(50);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(201);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(70);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(800);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(300);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(700);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(500);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25 / 1.25, GetDbDelayedWriteRate());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(600);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_TRUE(!IsDbWriteStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_EQ(kBaseRate / 1.25, GetDbDelayedWriteRate());
+}
+
+TEST_P(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) {
+ db_options_.max_background_compactions = 6;
+ column_family_options_.soft_pending_compaction_bytes_limit = 200;
+ column_family_options_.hard_pending_compaction_bytes_limit = 2000;
+ Open();
+ CreateColumnFamilies({"one"});
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+
+ ColumnFamilyData* cfd1 =
+ static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ VersionStorageInfo* vstorage1 = cfd1->current()->storage_info();
+
+ MutableCFOptions mutable_cf_options(column_family_options_);
+ // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8
+ mutable_cf_options.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options.level0_slowdown_writes_trigger = 36;
+ mutable_cf_options.level0_stop_writes_trigger = 30;
+ // Speedup threshold = 200 / 4 = 50
+ mutable_cf_options.soft_pending_compaction_bytes_limit = 200;
+ mutable_cf_options.hard_pending_compaction_bytes_limit = 2000;
+
+ MutableCFOptions mutable_cf_options1 = mutable_cf_options;
+ mutable_cf_options1.level0_slowdown_writes_trigger = 16;
+
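+ // Once any CF's recalculated state crosses its speed-up threshold, allowed
+ // background compactions jump from 1 to 6; they drop back to 1 only after
+ // every CF has been recalculated below its threshold.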
+ vstorage->TEST_set_estimated_compaction_needed_bytes(40);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(60);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(30);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(70);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->TEST_set_estimated_compaction_needed_bytes(20);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->TEST_set_estimated_compaction_needed_bytes(3);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(9);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage1->set_l0_delay_trigger_count(2);
+ RecalculateWriteStallConditions(cfd1, mutable_cf_options);
+ ASSERT_EQ(6, dbfull()->TEST_BGCompactionsAllowed());
+
+ vstorage->set_l0_delay_trigger_count(0);
+ RecalculateWriteStallConditions(cfd, mutable_cf_options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_P(ColumnFamilyTest, CreateAndDestroyOptions) {
+ std::unique_ptr<ColumnFamilyOptions> cfo(new ColumnFamilyOptions());
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(*(cfo.get()), "yoyo", &cfh));
+ cfo.reset();
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, CreateDropAndDestroy) {
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) {
+ ColumnFamilyHandle* cfh;
+ Open();
+ ASSERT_OK(db_->CreateColumnFamily(ColumnFamilyOptions(), "yoyo", &cfh));
+ ASSERT_OK(db_->Put(WriteOptions(), cfh, "foo", "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfh));
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
+}
+
+TEST_P(ColumnFamilyTest, FlushCloseWALFiles) {
+ SpecialEnv env(Env::Default());
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BGWorkFlush:done", "FlushCloseWALFiles:0"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Block flush jobs from running
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ // Make sure the task is sleeping. Otherwise, it might start to execute
+ // after sleeping_task.WaitUntilDone() and cause a TSAN warning.
+ sleeping_task.WaitUntilSleeping();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ TEST_SYNC_POINT("FlushCloseWALFiles:0");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
+TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) {
+ SpecialEnv env(Env::Default());
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ // Create an iterator holding the current super version.
+ Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]);
+ ASSERT_OK(it->status());
+ // A flush will make `it` hold the last reference of its super version.
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
+ // Flush jobs will close previous WAL files after finishing. By
+ // blocking flush jobs from running, we trigger a condition where
+ // the iterator destructor should close the WAL files.
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ // Make sure the task is sleeping. Otherwise, it might start to execute
+ // after sleeping_task.WaitUntilDone() and cause a TSAN warning.
+ sleeping_task.WaitUntilSleeping();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Deleting the iterator will clear its super version, triggering
+ // closing all files
+ delete it;
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+ WaitForFlush(1);
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+
+TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) {
+ SpecialEnv env(Env::Default());
+ // Allow both flush and purge jobs to be scheduled.
+ env.SetBackgroundThreads(2, Env::HIGH);
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(2));
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ // Create an iterator holding the current super version.
+ ReadOptions ro;
+ ro.background_purge_on_iterator_cleanup = true;
+ Iterator* it = db_->NewIterator(ro, handles_[1]);
+ ASSERT_OK(it->status());
+ // A flush will make `it` hold the last reference of its super version.
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ColumnFamilyTest::IteratorCloseWALFile2:0",
+ "DBImpl::BGWorkPurge:start"},
+ {"ColumnFamilyTest::IteratorCloseWALFile2:2",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Deleting the iterator releases its super version. With background purge
+ // enabled, the obsolete WAL file is not closed until the purge job runs.
+ delete it;
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0");
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1");
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2");
+ WaitForFlush(1);
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE // TEST functions are not supported in lite
+TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) {
+ SpecialEnv env(Env::Default());
+ // Allow both flush and purge jobs to be scheduled.
+ env.SetBackgroundThreads(2, Env::HIGH);
+ db_options_.env = &env;
+ db_options_.max_background_flushes = 1;
+ column_family_options_.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(3));
+ column_family_options_.level0_file_num_compaction_trigger = 2;
+ Open();
+ CreateColumnFamilies({"one"});
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodar2", "mirko"));
+ ASSERT_OK(Flush(1));
+
+ // Create an iterator holding the current super version, as well as
+ // the SST file just flushed.
+ ReadOptions ro;
+ ro.tailing = true;
+ ro.background_purge_on_iterator_cleanup = true;
+ Iterator* it = db_->NewIterator(ro, handles_[1]);
+ // A flush will make `it` hold the last reference of its super version.
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodar2", "mirko"));
+ ASSERT_OK(Flush(1));
+
+ WaitForCompaction();
+
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+ ASSERT_OK(Put(0, "fodor", "mirko"));
+ ASSERT_OK(Put(1, "fodor", "mirko"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ColumnFamilyTest::IteratorCloseWALFile2:0",
+ "DBImpl::BGWorkPurge:start"},
+ {"ColumnFamilyTest::IteratorCloseWALFile2:2",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BGWorkPurge:end", "ColumnFamilyTest::IteratorCloseWALFile2:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = true;
+ ASSERT_OK(db_->Put(wo, handles_[1], "fodor", "mirko"));
+
+ env.delete_count_.store(0);
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ // Seeking the tailing iterator switches it to the latest super version and
+ // releases the old one. With background purge enabled, the obsolete files
+ // are not closed or deleted until the purge job runs.
+ it->Seek("");
+ ASSERT_OK(it->status());
+
+ ASSERT_EQ(2, env.num_open_wal_file_.load());
+ ASSERT_EQ(0, env.delete_count_.load());
+
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:0");
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:1");
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ASSERT_EQ(1, env.delete_count_.load());
+ TEST_SYNC_POINT("ColumnFamilyTest::IteratorCloseWALFile2:2");
+ WaitForFlush(1);
+ ASSERT_EQ(1, env.num_open_wal_file_.load());
+ ASSERT_EQ(1, env.delete_count_.load());
+
+ delete it;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ Reopen();
+ ASSERT_EQ("mirko", Get(0, "fodor"));
+ ASSERT_EQ("mirko", Get(1, "fodor"));
+ db_options_.env = env_;
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+// Disable on Windows because SyncWAL requires env->IsSyncThreadSafe()
+// to return true, which is not the case in unbuffered mode.
+#ifndef OS_WIN
+TEST_P(ColumnFamilyTest, LogSyncConflictFlush) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+
+ ASSERT_OK(Put(0, "", ""));
+ ASSERT_OK(Put(1, "foo", "bar"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::SyncWAL:BeforeMarkLogsSynced:1",
+ "ColumnFamilyTest::LogSyncConflictFlush:1"},
+ {"ColumnFamilyTest::LogSyncConflictFlush:2",
+ "DBImpl::SyncWAL:BeforeMarkLogsSynced:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread([&] { ASSERT_OK(db_->SyncWAL()); });
+
+ TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1");
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Flush(1));
+
+ TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2");
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Close();
+}
+#endif
+
+// This test is placed here because the Column Family test infrastructure is
+// used to force a roll of the WAL files.
+// The basic idea is to verify that WAL truncation is detected and not
+// ignored.
+TEST_P(ColumnFamilyTest, DISABLED_LogTruncationTest) {
+ Open();
+ CreateColumnFamiliesAndReopen({"one", "two"});
+
+ Build(0, 100);
+
+ // Flush the 0th column family to force a roll of the WAL
+ ASSERT_OK(Flush(0));
+
+ // Add some more entries
+ Build(100, 100);
+
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+
+ // collect wal files
+ std::vector<std::string> logfs;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (!(ParseFileName(filenames[i], &number, &type))) continue;
+
+ if (type != kWalFile) continue;
+
+ logfs.push_back(filenames[i]);
+ }
+
+ std::sort(logfs.begin(), logfs.end());
+ ASSERT_GE(logfs.size(), 2);
+
+ // Take the last but one file, and truncate it
+ std::string fpath = dbname_ + "/" + logfs[logfs.size() - 2];
+ std::vector<std::string> names_save = names_;
+
+ uint64_t fsize;
+ ASSERT_OK(env_->GetFileSize(fpath, &fsize));
+ ASSERT_GT(fsize, 0);
+
+ Close();
+
+ std::string backup_logs = dbname_ + "/backup_logs";
+ std::string t_fpath = backup_logs + "/" + logfs[logfs.size() - 2];
+
+ ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+ // Not sure how easy it is to make this data-driven; we would need to read
+ // back the WAL file and truncate the last 10 entries.
+ CopyFile(fpath, t_fpath, fsize - 9180);
+
+ ASSERT_OK(env_->DeleteFile(fpath));
+ ASSERT_OK(env_->RenameFile(t_fpath, fpath));
+
+ db_options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+ OpenReadOnly(names_save);
+
+ CheckMissed();
+
+ Close();
+
+ Open(names_save);
+
+ CheckMissed();
+
+ Close();
+
+ // cleanup
+ ASSERT_OK(env_->DeleteDir(backup_logs));
+}
+
+TEST_P(ColumnFamilyTest, DefaultCfPathsTest) {
+ Open();
+ // Leave cf_paths empty for one of the column families.
+ // Files for that column family should be generated
+ // according to db_paths.
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+ // Fill Column family 1.
+ PutRandomData(1, 100, 100);
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Fill column family 2
+ PutRandomData(2, 100, 100);
+ ASSERT_OK(Flush(2));
+
+ // The SST file from column family 2 should be generated under
+ // db_paths, which is dbname_ in this case.
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+}
+
+TEST_P(ColumnFamilyTest, MultipleCFPathsTest) {
+ Open();
+ // Configure Column family specific paths.
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname_ + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ cf_opt2.cf_paths.emplace_back(dbname_ + "_two_1",
+ std::numeric_limits<uint64_t>::max());
+ CreateColumnFamilies({"one", "two"}, {cf_opt1, cf_opt2});
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+
+ PutRandomData(1, 100, 100, true /* save */);
+ ASSERT_OK(Flush(1));
+
+ // Check that files are generated in appropriate paths.
+ ASSERT_EQ(1, GetSstFileCount(cf_opt1.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ PutRandomData(2, 100, 100, true /* save */);
+ ASSERT_OK(Flush(2));
+
+ ASSERT_EQ(1, GetSstFileCount(cf_opt2.cf_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Re-open and verify the keys.
+ Reopen({ColumnFamilyOptions(), cf_opt1, cf_opt2});
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ for (int cf = 1; cf != 3; ++cf) {
+ ReadOptions read_options;
+ read_options.readahead_size = 0;
+ auto it = dbi->NewIterator(read_options, handles_[cf]);
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_OK(it->status());
+ Slice key(it->key());
+ ASSERT_NE(keys_[cf].end(), keys_[cf].find(key.ToString()));
+ }
+ ASSERT_OK(it->status());
+ delete it;
+
+ for (const auto& key : keys_[cf]) {
+ ASSERT_NE("NOT_FOUND", Get(cf, key));
+ }
+ }
+}
+
+TEST(ColumnFamilyTest, ValidateBlobGCCutoff) {
+ DBOptions db_options;
+
+ ColumnFamilyOptions cf_options;
+ cf_options.enable_blob_garbage_collection = true;
+
+ cf_options.blob_garbage_collection_age_cutoff = -0.5;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsInvalidArgument());
+
+ cf_options.blob_garbage_collection_age_cutoff = 0.0;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_age_cutoff = 0.5;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_age_cutoff = 1.0;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_age_cutoff = 1.5;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsInvalidArgument());
+}
+
+TEST(ColumnFamilyTest, ValidateBlobGCForceThreshold) {
+ DBOptions db_options;
+
+ ColumnFamilyOptions cf_options;
+ cf_options.enable_blob_garbage_collection = true;
+
+ cf_options.blob_garbage_collection_force_threshold = -0.5;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsInvalidArgument());
+
+ cf_options.blob_garbage_collection_force_threshold = 0.0;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_force_threshold = 0.5;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_force_threshold = 1.0;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.blob_garbage_collection_force_threshold = 1.5;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsInvalidArgument());
+}
+
+TEST(ColumnFamilyTest, ValidateMemtableKVChecksumOption) {
+ DBOptions db_options;
+
+ ColumnFamilyOptions cf_options;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.memtable_protection_bytes_per_key = 5;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsNotSupported());
+
+ cf_options.memtable_protection_bytes_per_key = 1;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+
+ cf_options.memtable_protection_bytes_per_key = 16;
+ ASSERT_TRUE(ColumnFamilyData::ValidateOptions(db_options, cf_options)
+ .IsNotSupported());
+
+ cf_options.memtable_protection_bytes_per_key = 0;
+ ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compact_files_test.cc b/src/rocksdb/db/compact_files_test.cc
new file mode 100644
index 000000000..ef38946f7
--- /dev/null
+++ b/src/rocksdb/db/compact_files_test.cc
@@ -0,0 +1,502 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactFilesTest : public testing::Test {
+ public:
+ CompactFilesTest() {
+ env_ = Env::Default();
+ db_name_ = test::PerThreadDBPath("compact_files_test");
+ }
+
+ std::string db_name_;
+ Env* env_;
+};
+
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+ void ClearFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
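+
+// Typical use in the tests below: register a FlushedFileCollector via
+// options.listeners.emplace_back(collector) before opening the DB, then call
+// GetFlushedFiles() after a flush to obtain the L0 file names to pass to
+// DB::CompactFiles().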
+
+TEST_F(CompactFilesTest, L0ConflictsFiles) {
+ Options options;
+ // to trigger compaction more easily
+ const int kWriteBufferSize = 10000;
+ const int kLevel0Trigger = 2;
+ options.create_if_missing = true;
+ options.compaction_style = kCompactionStyleLevel;
+ // Small slowdown and stop triggers for experimental purposes.
+ options.level0_slowdown_writes_trigger = 20;
+ options.level0_stop_writes_trigger = 20;
+ options.write_buffer_size = kWriteBufferSize;
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ options.compression = kNoCompression;
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0", "BackgroundCallCompaction:0"},
+ {"BackgroundCallCompaction:1", "CompactFilesImpl:1"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Create a couple of files.
+ // Background compaction starts and waits in BackgroundCallCompaction:0
+ for (int i = 0; i < kLevel0Trigger * 4; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), ""));
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(100 - i), ""));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ }
+
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ std::string file1;
+ for (auto& file : meta.levels[0].files) {
+ ASSERT_EQ(0, meta.levels[0].level);
+ if (file1 == "") {
+ file1 = file.db_path + "/" + file.name;
+ } else {
+ std::string file2 = file.db_path + "/" + file.name;
+ // Another thread starts a compact files and creates an L0 compaction
+ // The background compaction then notices that there is an L0 compaction
+ // already in progress and doesn't do an L0 compaction
+ // Once the background compaction finishes, the compact files finishes
+ ASSERT_OK(db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+ {file1, file2}, 0));
+ break;
+ }
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ delete db;
+}
+
+TEST_F(CompactFilesTest, MultipleLevel) {
+ Options options;
+ options.create_if_missing = true;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.num_levels = 6;
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ ASSERT_NE(db, nullptr);
+
+ // Create files in L0, L3, L4, and L5.
+ for (int i = 5; i > 2; --i) {
+ collector->ClearFlushedFiles();
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i), ""));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, i));
+
+ std::string prop;
+ ASSERT_TRUE(db->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(i), &prop));
+ ASSERT_EQ("1", prop);
+ }
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(0), ""));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ // Compact files except the file in L3
+ std::vector<std::string> files;
+ for (int i = 0; i < 6; ++i) {
+ if (i == 3) continue;
+ for (auto& file : meta.levels[i].files) {
+ files.push_back(file.db_path + "/" + file.name);
+ }
+ }
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"CompactionJob::Run():Start", "CompactFilesTest.MultipleLevel:0"},
+ {"CompactFilesTest.MultipleLevel:1", "CompactFilesImpl:3"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::thread thread([&] {
+ TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:0");
+ ASSERT_OK(db->Put(WriteOptions(), "bar", "v2"));
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ TEST_SYNC_POINT("CompactFilesTest.MultipleLevel:1");
+ });
+
+ // Compaction cannot move data up to a lower-numbered level: here we have an
+ // input file from level 5, so the output level has to be >= 5.
+ for (int invalid_output_level = 0; invalid_output_level < 5;
+ invalid_output_level++) {
+ s = db->CompactFiles(CompactionOptions(), files, invalid_output_level);
+ std::cout << s.ToString() << std::endl;
+ ASSERT_TRUE(s.IsInvalidArgument());
+ }
+
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), files, 5));
+ SyncPoint::GetInstance()->DisableProcessing();
+ thread.join();
+
+ delete db;
+}
+
+TEST_F(CompactFilesTest, ObsoleteFiles) {
+ Options options;
+ // to trigger compaction more easily
+ const int kWriteBufferSize = 65536;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.write_buffer_size = kWriteBufferSize;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ ASSERT_NE(db, nullptr);
+
+ // Create a couple of files.
+ for (int i = 1000; i < 2000; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+ std::string(kWriteBufferSize / 10, 'a' + (i % 26))));
+ }
+
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1));
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForCompact());
+
+ // verify all compaction input files are deleted
+ for (auto fname : l0_files) {
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(fname));
+ }
+ delete db;
+}
+
+TEST_F(CompactFilesTest, NotCutOutputOnLevel0) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 1000;
+ options.level0_stop_writes_trigger = 1000;
+ options.write_buffer_size = 65536;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+ options.max_compaction_bytes = 5000;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ assert(s.ok());
+ assert(db);
+
+ // Create a couple of files.
+ for (int i = 0; i < 500; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+ std::string(1000, 'a' + (i % 26))));
+ }
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+ auto l0_files_1 = collector->GetFlushedFiles();
+ collector->ClearFlushedFiles();
+ for (int i = 0; i < 500; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+ std::string(1000, 'a' + (i % 26))));
+ }
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+ auto l0_files_2 = collector->GetFlushedFiles();
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_1, 0));
+ ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files_2, 0));
+ // no assertion failure
+ delete db;
+}
+
+TEST_F(CompactFilesTest, CapturingPendingFiles) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ // Always do full scans for obsolete files (needed to reproduce the issue).
+ options.delete_obsolete_files_period_micros = 0;
+
+ // Add listener.
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ assert(db);
+
+ // Create 5 files.
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), "key" + std::to_string(i), "value"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ }
+
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+ auto l0_files = collector->GetFlushedFiles();
+ EXPECT_EQ(5, l0_files.size());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:2", "CompactFilesTest.CapturingPendingFiles:0"},
+ {"CompactFilesTest.CapturingPendingFiles:1", "CompactFilesImpl:3"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start compacting files.
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread(
+ [&] { EXPECT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); });
+
+ // In the meantime flush another file.
+ TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0");
+ ASSERT_OK(db->Put(WriteOptions(), "key5", "value"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1");
+
+ compaction_thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ delete db;
+
+ // Make sure we can reopen the DB.
+ s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ assert(db);
+ delete db;
+}
+
+TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
+ class FilterWithGet : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ if (db_ == nullptr) {
+ return true;
+ }
+ std::string res;
+ db_->Get(ReadOptions(), "", &res);
+ return true;
+ }
+
+ void SetDB(DB* db) { db_ = db; }
+
+ const char* Name() const override { return "FilterWithGet"; }
+
+ private:
+ DB* db_;
+ };
+
+ std::shared_ptr<FilterWithGet> cf(new FilterWithGet());
+
+ Options options;
+ options.create_if_missing = true;
+ options.compaction_filter = cf.get();
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+
+ cf->SetDB(db);
+
+ // Write one L0 file
+ ASSERT_OK(db->Put(WriteOptions(), "K1", "V1"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ // Compact all L0 files using CompactFiles
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData meta;
+ db->GetColumnFamilyMetaData(&meta);
+ for (auto& file : meta.levels[0].files) {
+ std::string fname = file.db_path + "/" + file.name;
+ ASSERT_OK(
+ db->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), {fname}, 0));
+ }
+
+ delete db;
+}
+
+TEST_F(CompactFilesTest, SentinelCompressionType) {
+ if (!Zlib_Supported()) {
+ fprintf(stderr, "zlib compression not supported, skip this test\n");
+ return;
+ }
+ if (!Snappy_Supported()) {
+ fprintf(stderr, "snappy compression not supported, skip this test\n");
+ return;
+ }
+ // Check that passing `CompressionType::kDisableCompressionOption` to
+ // `CompactFiles` causes it to use the column family compression options.
+ for (auto compaction_style : {CompactionStyle::kCompactionStyleLevel,
+ CompactionStyle::kCompactionStyleUniversal,
+ CompactionStyle::kCompactionStyleNone}) {
+ ASSERT_OK(DestroyDB(db_name_, Options()));
+ Options options;
+ options.compaction_style = compaction_style;
+ // L0: Snappy, L1: Zlib, L2: Snappy
+ options.compression_per_level = {CompressionType::kSnappyCompression,
+ CompressionType::kZlibCompression,
+ CompressionType::kSnappyCompression};
+ options.create_if_missing = true;
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+ DB* db = nullptr;
+ ASSERT_OK(DB::Open(options, db_name_, &db));
+
+ ASSERT_OK(db->Put(WriteOptions(), "key", "val"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForBackgroundWork());
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_EQ(1, l0_files.size());
+
+ // L0->L1 compaction, so output should be Zlib-compressed
+ CompactionOptions compaction_opts;
+ compaction_opts.compression = CompressionType::kDisableCompressionOption;
+ ASSERT_OK(db->CompactFiles(compaction_opts, l0_files, 1));
+
+ ROCKSDB_NAMESPACE::TablePropertiesCollection all_tables_props;
+ ASSERT_OK(db->GetPropertiesOfAllTables(&all_tables_props));
+ for (const auto& name_and_table_props : all_tables_props) {
+ ASSERT_EQ(CompressionTypeToString(CompressionType::kZlibCompression),
+ name_and_table_props.second->compression_name);
+ }
+ delete db;
+ }
+}
+
+TEST_F(CompactFilesTest, GetCompactionJobInfo) {
+ Options options;
+ options.create_if_missing = true;
+ // Disable RocksDB background compaction.
+ options.compaction_style = kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 1000;
+ options.level0_stop_writes_trigger = 1000;
+ options.write_buffer_size = 65536;
+ options.max_write_buffer_number = 2;
+ options.compression = kNoCompression;
+ options.max_compaction_bytes = 5000;
+
+ // Add listener
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DB* db = nullptr;
+ ASSERT_OK(DestroyDB(db_name_, options));
+ Status s = DB::Open(options, db_name_, &db);
+ ASSERT_OK(s);
+ assert(db);
+
+ // Create a couple of files.
+ for (int i = 0; i < 500; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), std::to_string(i),
+ std::string(1000, 'a' + (i % 26))));
+ }
+ ASSERT_OK(static_cast_with_check<DBImpl>(db)->TEST_WaitForFlushMemTable());
+ auto l0_files_1 = collector->GetFlushedFiles();
+ CompactionOptions co;
+ co.compression = CompressionType::kLZ4Compression;
+ CompactionJobInfo compaction_job_info{};
+ ASSERT_OK(
+ db->CompactFiles(co, l0_files_1, 0, -1, nullptr, &compaction_job_info));
+ ASSERT_EQ(compaction_job_info.base_input_level, 0);
+ ASSERT_EQ(compaction_job_info.cf_id, db->DefaultColumnFamily()->GetID());
+ ASSERT_EQ(compaction_job_info.cf_name, db->DefaultColumnFamily()->GetName());
+ ASSERT_EQ(compaction_job_info.compaction_reason,
+ CompactionReason::kManualCompaction);
+ ASSERT_EQ(compaction_job_info.compression, CompressionType::kLZ4Compression);
+ ASSERT_EQ(compaction_job_info.output_level, 0);
+ ASSERT_OK(compaction_job_info.status);
+ // no assertion failure
+ delete db;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/clipping_iterator.h b/src/rocksdb/db/compaction/clipping_iterator.h
new file mode 100644
index 000000000..1ed465c2c
--- /dev/null
+++ b/src/rocksdb/db/compaction/clipping_iterator.h
@@ -0,0 +1,276 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cassert>
+
+#include "rocksdb/comparator.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// An internal iterator that wraps another one and ensures that any keys
+// returned are strictly within a range [start, end). If the underlying
+// iterator has already performed the bounds checking, it relies on that result;
+// otherwise, it performs the necessary key comparisons itself. Both bounds
+// are optional.
+class ClippingIterator : public InternalIterator {
+ public:
+ ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end,
+ const CompareInterface* cmp)
+ : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) {
+ assert(iter_);
+ assert(cmp_);
+ assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0);
+
+ UpdateAndEnforceBounds();
+ }
+
+ bool Valid() const override { return valid_; }
+
+ void SeekToFirst() override {
+ if (start_) {
+ iter_->Seek(*start_);
+ } else {
+ iter_->SeekToFirst();
+ }
+
+ UpdateAndEnforceUpperBound();
+ }
+
+ void SeekToLast() override {
+ if (end_) {
+ iter_->SeekForPrev(*end_);
+
+ // Upper bound is exclusive, so we need a key which is strictly smaller
+ if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+ iter_->Prev();
+ }
+ } else {
+ iter_->SeekToLast();
+ }
+
+ UpdateAndEnforceLowerBound();
+ }
+
+ void Seek(const Slice& target) override {
+ if (start_ && cmp_->Compare(target, *start_) < 0) {
+ iter_->Seek(*start_);
+ UpdateAndEnforceUpperBound();
+ return;
+ }
+
+ if (end_ && cmp_->Compare(target, *end_) >= 0) {
+ valid_ = false;
+ return;
+ }
+
+ iter_->Seek(target);
+ UpdateAndEnforceUpperBound();
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ if (start_ && cmp_->Compare(target, *start_) < 0) {
+ valid_ = false;
+ return;
+ }
+
+ if (end_ && cmp_->Compare(target, *end_) >= 0) {
+ iter_->SeekForPrev(*end_);
+
+ // Upper bound is exclusive, so we need a key which is strictly smaller
+ if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+ iter_->Prev();
+ }
+
+ UpdateAndEnforceLowerBound();
+ return;
+ }
+
+ iter_->SeekForPrev(target);
+ UpdateAndEnforceLowerBound();
+ }
+
+ void Next() override {
+ assert(valid_);
+ iter_->Next();
+ UpdateAndEnforceUpperBound();
+ }
+
+ bool NextAndGetResult(IterateResult* result) override {
+ assert(valid_);
+ assert(result);
+
+ IterateResult res;
+ valid_ = iter_->NextAndGetResult(&res);
+
+ if (!valid_) {
+ return false;
+ }
+
+ if (end_) {
+ EnforceUpperBoundImpl(res.bound_check_result);
+
+ if (!valid_) {
+ return false;
+ }
+ }
+
+ res.bound_check_result = IterBoundCheck::kInbound;
+ *result = res;
+
+ return true;
+ }
+
+ void Prev() override {
+ assert(valid_);
+ iter_->Prev();
+ UpdateAndEnforceLowerBound();
+ }
+
+ Slice key() const override {
+ assert(valid_);
+ return iter_->key();
+ }
+
+ Slice user_key() const override {
+ assert(valid_);
+ return iter_->user_key();
+ }
+
+ Slice value() const override {
+ assert(valid_);
+ return iter_->value();
+ }
+
+ Status status() const override { return iter_->status(); }
+
+ bool PrepareValue() override {
+ assert(valid_);
+
+ if (iter_->PrepareValue()) {
+ return true;
+ }
+
+ assert(!iter_->Valid());
+ valid_ = false;
+ return false;
+ }
+
+ bool MayBeOutOfLowerBound() override {
+ assert(valid_);
+ return false;
+ }
+
+ IterBoundCheck UpperBoundCheckResult() override {
+ assert(valid_);
+ return IterBoundCheck::kInbound;
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ iter_->SetPinnedItersMgr(pinned_iters_mgr);
+ }
+
+ bool IsKeyPinned() const override {
+ assert(valid_);
+ return iter_->IsKeyPinned();
+ }
+
+ bool IsValuePinned() const override {
+ assert(valid_);
+ return iter_->IsValuePinned();
+ }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override {
+ return iter_->GetProperty(prop_name, prop);
+ }
+
+ private:
+ void UpdateValid() {
+ assert(!iter_->Valid() || iter_->status().ok());
+
+ valid_ = iter_->Valid();
+ }
+
+ void EnforceUpperBoundImpl(IterBoundCheck bound_check_result) {
+ if (bound_check_result == IterBoundCheck::kInbound) {
+ return;
+ }
+
+ if (bound_check_result == IterBoundCheck::kOutOfBound) {
+ valid_ = false;
+ return;
+ }
+
+ assert(bound_check_result == IterBoundCheck::kUnknown);
+
+ if (cmp_->Compare(key(), *end_) >= 0) {
+ valid_ = false;
+ }
+ }
+
+ void EnforceUpperBound() {
+ if (!valid_) {
+ return;
+ }
+
+ if (!end_) {
+ return;
+ }
+
+ EnforceUpperBoundImpl(iter_->UpperBoundCheckResult());
+ }
+
+ void EnforceLowerBound() {
+ if (!valid_) {
+ return;
+ }
+
+ if (!start_) {
+ return;
+ }
+
+ if (!iter_->MayBeOutOfLowerBound()) {
+ return;
+ }
+
+ if (cmp_->Compare(key(), *start_) < 0) {
+ valid_ = false;
+ }
+ }
+
+ void AssertBounds() {
+ assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0);
+ assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0);
+ }
+
+ void UpdateAndEnforceBounds() {
+ UpdateValid();
+ EnforceUpperBound();
+ EnforceLowerBound();
+ AssertBounds();
+ }
+
+ void UpdateAndEnforceUpperBound() {
+ UpdateValid();
+ EnforceUpperBound();
+ AssertBounds();
+ }
+
+ void UpdateAndEnforceLowerBound() {
+ UpdateValid();
+ EnforceLowerBound();
+ AssertBounds();
+ }
+
+ InternalIterator* iter_;
+ const Slice* start_;
+ const Slice* end_;
+ const CompareInterface* cmp_;
+ bool valid_;
+};
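+
+// A minimal usage sketch (illustrative only; how `input` is obtained is
+// elided and assumed to be some existing internal iterator): the wrapper
+// guarantees the caller only sees keys in [start, end).
+//
+//   InternalIterator* input = ...;  // e.g. a compaction input iterator
+//   Slice start("key1");
+//   Slice end("key4");
+//   ClippingIterator clip(input, &start, &end, BytewiseComparator());
+//   for (clip.SeekToFirst(); clip.Valid(); clip.Next()) {
+//     // every clip.key() satisfies start <= key < end
+//   }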
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/clipping_iterator_test.cc b/src/rocksdb/db/compaction/clipping_iterator_test.cc
new file mode 100644
index 000000000..b2b167048
--- /dev/null
+++ b/src/rocksdb/db/compaction/clipping_iterator_test.cc
@@ -0,0 +1,259 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/clipping_iterator.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A vector iterator which does its own bounds checking. This is for testing the
+// optimizations in the clipping iterator where we bypass the bounds checking if
+// the input iterator has already performed it.
+class BoundsCheckingVectorIterator : public VectorIterator {
+ public:
+ BoundsCheckingVectorIterator(const std::vector<std::string>& keys,
+ const std::vector<std::string>& values,
+ const Slice* start, const Slice* end,
+ const Comparator* cmp)
+ : VectorIterator(keys, values, cmp), start_(start), end_(end), cmp_(cmp) {
+ assert(cmp_);
+ }
+
+ bool NextAndGetResult(IterateResult* result) override {
+ assert(Valid());
+ assert(result);
+
+ Next();
+
+ if (!Valid()) {
+ return false;
+ }
+
+ result->key = key();
+ result->bound_check_result = UpperBoundCheckResult();
+ result->value_prepared = true;
+
+ return true;
+ }
+
+ bool MayBeOutOfLowerBound() override {
+ assert(Valid());
+
+ if (!start_) {
+ return false;
+ }
+
+ return cmp_->Compare(key(), *start_) < 0;
+ }
+
+ IterBoundCheck UpperBoundCheckResult() override {
+ assert(Valid());
+
+ if (!end_) {
+ return IterBoundCheck::kInbound;
+ }
+
+ return cmp_->Compare(key(), *end_) >= 0 ? IterBoundCheck::kOutOfBound
+ : IterBoundCheck::kInbound;
+ }
+
+ private:
+ const Slice* start_;
+ const Slice* end_;
+ const Comparator* cmp_;
+};
+
+class ClippingIteratorTest
+ : public ::testing::Test,
+ public ::testing::WithParamInterface<std::tuple<bool, size_t, size_t>> {};
+
+TEST_P(ClippingIteratorTest, Clip) {
+ const std::vector<std::string> keys{"key0", "key1", "key2", "key3", "key4",
+ "key5", "key6", "key7", "key8", "key9"};
+ const std::vector<std::string> values{
+ "unused0", "value1", "value2", "value3", "unused4",
+ "unused5", "unused6", "unused7", "unused8", "unused9"};
+
+ assert(keys.size() == values.size());
+
+ // Note: the input always contains key1, key2, and key3; however, the clipping
+ // window is based on the test parameters: its left edge is a value in the
+ // range [0, 4], and its size is a value in the range [0, 5]
+ const std::vector<std::string> input_keys{keys[1], keys[2], keys[3]};
+ const std::vector<std::string> input_values{values[1], values[2], values[3]};
+
+ const bool use_bounds_checking_vec_it = std::get<0>(GetParam());
+
+ const size_t clip_start_idx = std::get<1>(GetParam());
+ const size_t clip_window_size = std::get<2>(GetParam());
+ const size_t clip_end_idx = clip_start_idx + clip_window_size;
+
+ const Slice start(keys[clip_start_idx]);
+ const Slice end(keys[clip_end_idx]);
+
+ std::unique_ptr<InternalIterator> input(
+ use_bounds_checking_vec_it
+ ? new BoundsCheckingVectorIterator(input_keys, input_values, &start,
+ &end, BytewiseComparator())
+ : new VectorIterator(input_keys, input_values, BytewiseComparator()));
+
+ ClippingIterator clip(input.get(), &start, &end, BytewiseComparator());
+
+ // The range the clipping iterator should return values from. This is
+ // essentially the intersection of the input range [1, 4) and the clipping
+ // window [clip_start_idx, clip_end_idx)
+ const size_t data_start_idx =
+ std::max(clip_start_idx, static_cast<size_t>(1));
+ const size_t data_end_idx = std::min(clip_end_idx, static_cast<size_t>(4));
+
+ // Range is empty; all Seeks should fail
+ if (data_start_idx >= data_end_idx) {
+ clip.SeekToFirst();
+ ASSERT_FALSE(clip.Valid());
+
+ clip.SeekToLast();
+ ASSERT_FALSE(clip.Valid());
+
+ for (size_t i = 0; i < keys.size(); ++i) {
+ clip.Seek(keys[i]);
+ ASSERT_FALSE(clip.Valid());
+
+ clip.SeekForPrev(keys[i]);
+ ASSERT_FALSE(clip.Valid());
+ }
+
+ return;
+ }
+
+ // Range is non-empty; call SeekToFirst and iterate forward
+ clip.SeekToFirst();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_start_idx]);
+ ASSERT_EQ(clip.value(), values[data_start_idx]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+ for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) {
+ clip.Next();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+
+ clip.Next();
+ ASSERT_FALSE(clip.Valid());
+
+ // Do it again using NextAndGetResult
+ clip.SeekToFirst();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_start_idx]);
+ ASSERT_EQ(clip.value(), values[data_start_idx]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+ for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) {
+ IterateResult result;
+ ASSERT_TRUE(clip.NextAndGetResult(&result));
+ ASSERT_EQ(result.key, keys[i]);
+ ASSERT_EQ(result.bound_check_result, IterBoundCheck::kInbound);
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+
+ IterateResult result;
+ ASSERT_FALSE(clip.NextAndGetResult(&result));
+ ASSERT_FALSE(clip.Valid());
+
+ // Call SeekToLast and iterate backward
+ clip.SeekToLast();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_end_idx - 1]);
+ ASSERT_EQ(clip.value(), values[data_end_idx - 1]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+
+ for (size_t i = data_end_idx - 2; i >= data_start_idx; --i) {
+ clip.Prev();
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+
+ clip.Prev();
+ ASSERT_FALSE(clip.Valid());
+
+ // Call Seek/SeekForPrev for all keys; Seek should return the smallest key
+ // which is >= the target; SeekForPrev should return the largest key which is
+ // <= the target
+ for (size_t i = 0; i < keys.size(); ++i) {
+ clip.Seek(keys[i]);
+
+ if (i < data_start_idx) {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_start_idx]);
+ ASSERT_EQ(clip.value(), values[data_start_idx]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ } else if (i < data_end_idx) {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ } else {
+ ASSERT_FALSE(clip.Valid());
+ }
+
+ clip.SeekForPrev(keys[i]);
+
+ if (i < data_start_idx) {
+ ASSERT_FALSE(clip.Valid());
+ } else if (i < data_end_idx) {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[i]);
+ ASSERT_EQ(clip.value(), values[i]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ } else {
+ ASSERT_TRUE(clip.Valid());
+ ASSERT_EQ(clip.key(), keys[data_end_idx - 1]);
+ ASSERT_EQ(clip.value(), values[data_end_idx - 1]);
+ ASSERT_FALSE(clip.MayBeOutOfLowerBound());
+ ASSERT_EQ(clip.UpperBoundCheckResult(), IterBoundCheck::kInbound);
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ ClippingIteratorTest, ClippingIteratorTest,
+ ::testing::Combine(
+ ::testing::Bool(),
+ ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(5)),
+ ::testing::Range(static_cast<size_t>(0), static_cast<size_t>(6))));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction.cc b/src/rocksdb/db/compaction/compaction.cc
new file mode 100644
index 000000000..a32b529f7
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.cc
@@ -0,0 +1,855 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction.h"
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/sst_partitioner.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const uint64_t kRangeTombstoneSentinel =
+ PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b) {
+ auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key());
+ if (c != 0) {
+ return c;
+ }
+ auto a_footer = ExtractInternalKeyFooter(a.Encode());
+ auto b_footer = ExtractInternalKeyFooter(b.Encode());
+ if (a_footer == kRangeTombstoneSentinel) {
+ if (b_footer != kRangeTombstoneSentinel) {
+ return -1;
+ }
+ } else if (b_footer == kRangeTombstoneSentinel) {
+ return 1;
+ }
+ return 0;
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b) {
+ if (a == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, *a, b);
+}
+
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b) {
+ if (b == nullptr) {
+ return -1;
+ }
+ return sstableKeyCompare(user_cmp, a, *b);
+}
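+
+// Illustrative example (the keys below are made up, not from the original
+// source): with equal user keys, only the range tombstone sentinel footer
+// affects the ordering, and the sentinel sorts first.
+//
+//   InternalKey a("k", kMaxSequenceNumber, kTypeRangeDeletion);  // sentinel
+//   InternalKey b("k", 100, kTypeValue);
+//   assert(sstableKeyCompare(BytewiseComparator(), a, b) < 0);
+//   assert(sstableKeyCompare(BytewiseComparator(), b, a) > 0);
+//   assert(sstableKeyCompare(BytewiseComparator(), b, b) == 0);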
+
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < files.size() && files[i]; i++) {
+ sum += files[i]->fd.GetFileSize();
+ }
+ return sum;
+}
+
+void Compaction::SetInputVersion(Version* _input_version) {
+ input_version_ = _input_version;
+ cfd_ = input_version_->cfd();
+
+ cfd_->Ref();
+ input_version_->Ref();
+ edit_.SetColumnFamily(cfd_->GetID());
+}
+
+void Compaction::GetBoundaryKeys(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
+ Slice* largest_user_key, int exclude_level) {
+ bool initialized = false;
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i].files.empty() || inputs[i].level == exclude_level) {
+ continue;
+ }
+ if (inputs[i].level == 0) {
+ // we need to consider all files on level 0
+ for (const auto* f : inputs[i].files) {
+ const Slice& start_user_key = f->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = f->largest.user_key();
+ if (!initialized ||
+ ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ } else {
+ // we only need to consider the first and last file
+ const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+ if (!initialized ||
+ ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+ *smallest_user_key = start_user_key;
+ }
+ const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+ if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+ *largest_user_key = end_user_key;
+ }
+ initialized = true;
+ }
+ }
+}
+
+std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
+ const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
+ for (size_t i = 0; i < inputs.size(); i++) {
+ if (inputs[i].level == 0 || inputs[i].files.empty()) {
+ continue;
+ }
+ inputs[i].atomic_compaction_unit_boundaries.reserve(inputs[i].files.size());
+ AtomicCompactionUnitBoundary cur_boundary;
+ size_t first_atomic_idx = 0;
+ auto add_unit_boundary = [&](size_t to) {
+ if (first_atomic_idx == to) return;
+ for (size_t k = first_atomic_idx; k < to; k++) {
+ inputs[i].atomic_compaction_unit_boundaries.push_back(cur_boundary);
+ }
+ first_atomic_idx = to;
+ };
+ for (size_t j = 0; j < inputs[i].files.size(); j++) {
+ const auto* f = inputs[i].files[j];
+ if (j == 0) {
+ // First file in a level.
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ } else if (sstableKeyCompare(ucmp, *cur_boundary.largest, f->smallest) ==
+ 0) {
+ // SSTs overlap but the end key of the previous file was not
+ // artificially extended by a range tombstone. Extend the current
+ // boundary.
+ cur_boundary.largest = &f->largest;
+ } else {
+ // Atomic compaction unit has ended.
+ add_unit_boundary(j);
+ cur_boundary.smallest = &f->smallest;
+ cur_boundary.largest = &f->largest;
+ }
+ }
+ add_unit_boundary(inputs[i].files.size());
+ assert(inputs[i].files.size() ==
+ inputs[i].atomic_compaction_unit_boundaries.size());
+ }
+ return inputs;
+}
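+
+// Worked example for the grouping above (the file layout is hypothetical):
+// suppose an input level holds three files whose boundary keys are
+//   F1: [a .. k@200/Put], F2: [k@100/Put .. p], F3: [q .. z].
+// F1's largest and F2's smallest tie on user key "k" and neither one is a
+// range tombstone sentinel, so sstableKeyCompare() returns 0 and {F1, F2}
+// form one atomic compaction unit, while F3 starts a new unit. If F1's
+// largest were instead the sentinel k@kMaxSequenceNumber/RangeDeletion, the
+// comparison would be non-zero and F1 and F2 would fall into separate units.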
+
+// helper function to determine if compaction is creating files at the
+// bottommost level
+bool Compaction::IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ int output_l0_idx;
+ if (output_level == 0) {
+ output_l0_idx = 0;
+ for (const auto* file : vstorage->LevelFiles(0)) {
+ if (inputs[0].files.back() == file) {
+ break;
+ }
+ ++output_l0_idx;
+ }
+ assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size());
+ } else {
+ output_l0_idx = -1;
+ }
+ Slice smallest_key, largest_key;
+ GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
+ return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
+ output_level, output_l0_idx);
+}
+
+// Test function to validate IsBottommostLevel(): determines whether a
+// compaction with the given inputs and storage is bottommost.
+bool Compaction::TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ return IsBottommostLevel(output_level, vstorage, inputs);
+}
+
+bool Compaction::IsFullCompaction(
+ VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs) {
+ size_t num_files_in_compaction = 0;
+ size_t total_num_files = 0;
+ for (int l = 0; l < vstorage->num_levels(); l++) {
+ total_num_files += vstorage->NumLevelFiles(l);
+ }
+ for (size_t i = 0; i < inputs.size(); i++) {
+ num_files_in_compaction += inputs[i].size();
+ }
+ return num_files_in_compaction == total_num_files;
+}
+
+Compaction::Compaction(
+ VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options,
+ const MutableCFOptions& _mutable_cf_options,
+ const MutableDBOptions& _mutable_db_options,
+ std::vector<CompactionInputFiles> _inputs, int _output_level,
+ uint64_t _target_file_size, uint64_t _max_compaction_bytes,
+ uint32_t _output_path_id, CompressionType _compression,
+ CompressionOptions _compression_opts, Temperature _output_temperature,
+ uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
+ bool _manual_compaction, const std::string& _trim_ts, double _score,
+ bool _deletion_compaction, bool l0_files_might_overlap,
+ CompactionReason _compaction_reason,
+ BlobGarbageCollectionPolicy _blob_garbage_collection_policy,
+ double _blob_garbage_collection_age_cutoff)
+ : input_vstorage_(vstorage),
+ start_level_(_inputs[0].level),
+ output_level_(_output_level),
+ target_output_file_size_(_target_file_size),
+ max_compaction_bytes_(_max_compaction_bytes),
+ max_subcompactions_(_max_subcompactions),
+ immutable_options_(_immutable_options),
+ mutable_cf_options_(_mutable_cf_options),
+ input_version_(nullptr),
+ number_levels_(vstorage->num_levels()),
+ cfd_(nullptr),
+ output_path_id_(_output_path_id),
+ output_compression_(_compression),
+ output_compression_opts_(_compression_opts),
+ output_temperature_(_output_temperature),
+ deletion_compaction_(_deletion_compaction),
+ l0_files_might_overlap_(l0_files_might_overlap),
+ inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
+ grandparents_(std::move(_grandparents)),
+ score_(_score),
+ bottommost_level_(IsBottommostLevel(output_level_, vstorage, inputs_)),
+ is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
+ is_manual_compaction_(_manual_compaction),
+ trim_ts_(_trim_ts),
+ is_trivial_move_(false),
+
+ compaction_reason_(_compaction_reason),
+ notify_on_compaction_completion_(false),
+ enable_blob_garbage_collection_(
+ _blob_garbage_collection_policy == BlobGarbageCollectionPolicy::kForce
+ ? true
+ : (_blob_garbage_collection_policy ==
+ BlobGarbageCollectionPolicy::kDisable
+ ? false
+ : mutable_cf_options()->enable_blob_garbage_collection)),
+ blob_garbage_collection_age_cutoff_(
+ _blob_garbage_collection_age_cutoff < 0 ||
+ _blob_garbage_collection_age_cutoff > 1
+ ? mutable_cf_options()->blob_garbage_collection_age_cutoff
+ : _blob_garbage_collection_age_cutoff),
+ penultimate_level_(EvaluatePenultimateLevel(
+ vstorage, immutable_options_, start_level_, output_level_)) {
+ MarkFilesBeingCompacted(true);
+ if (is_manual_compaction_) {
+ compaction_reason_ = CompactionReason::kManualCompaction;
+ }
+ if (max_subcompactions_ == 0) {
+ max_subcompactions_ = _mutable_db_options.max_subcompactions;
+ }
+
+ // For non-bottommost levels, compaction tries to build output files that
+ // match the target file size, but this is not guaranteed: a file can be up
+ // to 2x the target size.
+ max_output_file_size_ =
+ bottommost_level_ || grandparents_.empty() ||
+ !_immutable_options.level_compaction_dynamic_file_size
+ ? target_output_file_size_
+ : 2 * target_output_file_size_;
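+ // For example, with a 64 MB target, a non-bottommost output level with
+ // non-empty grandparents and level_compaction_dynamic_file_size enabled is
+ // capped at 2 * 64 MB = 128 MB, while a bottommost output level keeps the
+ // 64 MB cap.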
+
+#ifndef NDEBUG
+ for (size_t i = 1; i < inputs_.size(); ++i) {
+ assert(inputs_[i].level > inputs_[i - 1].level);
+ }
+#endif
+
+ // setup input_levels_
+ {
+ input_levels_.resize(num_input_levels());
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
+ &arena_);
+ }
+ }
+
+ GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_);
+
+ // Every compaction, regardless of its compaction reason, may respect the
+ // existing compact cursor in the output level to split output files.
+ output_split_key_ = nullptr;
+ if (immutable_options_.compaction_style == kCompactionStyleLevel &&
+ immutable_options_.compaction_pri == kRoundRobin) {
+ const InternalKey* cursor =
+ &input_vstorage_->GetCompactCursors()[output_level_];
+ if (cursor->size() != 0) {
+ const Slice& cursor_user_key = ExtractUserKey(cursor->Encode());
+ auto ucmp = vstorage->InternalComparator()->user_comparator();
+ // May split output files according to the cursor if it falls within the
+ // user-key range.
+ if (ucmp->CompareWithoutTimestamp(cursor_user_key, smallest_user_key_) >
+ 0 &&
+ ucmp->CompareWithoutTimestamp(cursor_user_key, largest_user_key_) <=
+ 0) {
+ output_split_key_ = cursor;
+ }
+ }
+ }
+
+ PopulatePenultimateLevelOutputRange();
+}
+
+void Compaction::PopulatePenultimateLevelOutputRange() {
+ if (!SupportsPerKeyPlacement()) {
+ return;
+ }
+
+ // Excluding the last level, the key range of all input levels is the safe
+ // range of keys that can be moved up.
+ int exclude_level = number_levels_ - 1;
+ penultimate_output_range_type_ = PenultimateOutputRangeType::kNonLastRange;
+
+ // For universal compaction, the penultimate_output_range can be extended to
+ // the full range if all penultimate-level files are included in the
+ // compaction (including the case where the penultimate level is empty).
+ if (immutable_options_.compaction_style == kCompactionStyleUniversal) {
+ exclude_level = kInvalidLevel;
+ std::set<uint64_t> penultimate_inputs;
+ for (const auto& input_lvl : inputs_) {
+ if (input_lvl.level == penultimate_level_) {
+ for (const auto& file : input_lvl.files) {
+ penultimate_inputs.emplace(file->fd.GetNumber());
+ }
+ }
+ }
+ auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
+ for (const auto& file : penultimate_files) {
+ if (penultimate_inputs.find(file->fd.GetNumber()) ==
+ penultimate_inputs.end()) {
+ exclude_level = number_levels_ - 1;
+ penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
+ break;
+ }
+ }
+ }
+
+ GetBoundaryKeys(input_vstorage_, inputs_,
+ &penultimate_level_smallest_user_key_,
+ &penultimate_level_largest_user_key_, exclude_level);
+
+ // If the penultimate level output range overlaps with existing files that
+ // are not part of the compaction, disable the penultimate level output by
+ // setting the range to empty. One example is a range deletion whose
+ // boundary overlaps with the next file (which is actually a false overlap).
+ // TODO: exclude such false overlaps, so they don't disable the penultimate
+ // output.
+ std::set<uint64_t> penultimate_inputs;
+ for (const auto& input_lvl : inputs_) {
+ if (input_lvl.level == penultimate_level_) {
+ for (const auto& file : input_lvl.files) {
+ penultimate_inputs.emplace(file->fd.GetNumber());
+ }
+ }
+ }
+
+ auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
+ for (const auto& file : penultimate_files) {
+ if (penultimate_inputs.find(file->fd.GetNumber()) ==
+ penultimate_inputs.end() &&
+ OverlapPenultimateLevelOutputRange(file->smallest.user_key(),
+ file->largest.user_key())) {
+ // Basically disable the penultimate range output; this should be rare,
+ // or a false overlap caused by a range deletion.
+ penultimate_level_smallest_user_key_ = "";
+ penultimate_level_largest_user_key_ = "";
+ penultimate_output_range_type_ = PenultimateOutputRangeType::kDisabled;
+ }
+ }
+}
+
+Compaction::~Compaction() {
+ if (input_version_ != nullptr) {
+ input_version_->Unref();
+ }
+ if (cfd_ != nullptr) {
+ cfd_->UnrefAndTryDelete();
+ }
+}
+
+bool Compaction::SupportsPerKeyPlacement() const {
+ return penultimate_level_ != kInvalidLevel;
+}
+
+int Compaction::GetPenultimateLevel() const { return penultimate_level_; }
+
+// smallest_key and largest_key include timestamps if user-defined timestamp is
+// enabled.
+bool Compaction::OverlapPenultimateLevelOutputRange(
+ const Slice& smallest_key, const Slice& largest_key) const {
+ if (!SupportsPerKeyPlacement()) {
+ return false;
+ }
+ const Comparator* ucmp =
+ input_vstorage_->InternalComparator()->user_comparator();
+
+ return ucmp->CompareWithoutTimestamp(
+ smallest_key, penultimate_level_largest_user_key_) <= 0 &&
+ ucmp->CompareWithoutTimestamp(
+ largest_key, penultimate_level_smallest_user_key_) >= 0;
+}
+
+// key includes timestamp if user-defined timestamp is enabled.
+bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const {
+ if (!SupportsPerKeyPlacement()) {
+ return false;
+ }
+
+ if (penultimate_level_smallest_user_key_.empty() ||
+ penultimate_level_largest_user_key_.empty()) {
+ return false;
+ }
+
+ const Comparator* ucmp =
+ input_vstorage_->InternalComparator()->user_comparator();
+
+ return ucmp->CompareWithoutTimestamp(
+ key, penultimate_level_smallest_user_key_) >= 0 &&
+ ucmp->CompareWithoutTimestamp(
+ key, penultimate_level_largest_user_key_) <= 0;
+}
+
+bool Compaction::InputCompressionMatchesOutput() const {
+ int base_level = input_vstorage_->base_level();
+ bool matches =
+ (GetCompressionType(input_vstorage_, mutable_cf_options_, start_level_,
+ base_level) == output_compression_);
+ if (matches) {
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
+ return true;
+ }
+ TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
+ return matches;
+}
+
+bool Compaction::IsTrivialMove() const {
+ // Avoid a move if there is a lot of overlapping grandparent data.
+ // Otherwise, the move could create a parent file that will require
+ // a very expensive merge later on.
+ // If start_level_ == output_level_, the purpose is to force the compaction
+ // filter to be applied to that level, so this cannot be a trivial move.
+
+ // Check if the start level has files with overlapping ranges.
+ if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false &&
+ l0_files_might_overlap_) {
+ // We cannot move files from L0 to L1 if the L0 files in the LSM-tree are
+ // overlapping, unless we are sure that files picked in L0 don't overlap.
+ return false;
+ }
+
+ if (is_manual_compaction_ &&
+ (immutable_options_.compaction_filter != nullptr ||
+ immutable_options_.compaction_filter_factory != nullptr)) {
+ // This is a manual compaction with a compaction filter that should be
+ // executed, so we cannot do a trivial move.
+ return false;
+ }
+
+ if (start_level_ == output_level_) {
+ // It doesn't make sense for the compaction picker to pick files just to
+ // trivially move them to the same level.
+ return false;
+ }
+
+ // Used in universal compaction, where a trivial move can be done if the
+ // input files are non-overlapping.
+ if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
+ (output_level_ != 0) &&
+ (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal)) {
+ return is_trivial_move_;
+ }
+
+ if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
+ input(0, 0)->fd.GetPathId() == output_path_id() &&
+ InputCompressionMatchesOutput())) {
+ return false;
+ }
+
+ // assert inputs_.size() == 1
+
+ std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
+
+ for (const auto& file : inputs_.front().files) {
+ std::vector<FileMetaData*> file_grand_parents;
+ if (output_level_ + 1 >= number_levels_) {
+ continue;
+ }
+ input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
+ &file->largest, &file_grand_parents);
+ const auto compaction_size =
+ file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
+ if (compaction_size > max_compaction_bytes_) {
+ return false;
+ }
+
+ if (partitioner.get() != nullptr) {
+ if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
+ file->largest.user_key())) {
+ return false;
+ }
+ }
+ }
+
+ // PerKeyPlacement compaction should never be a trivial move.
+ if (SupportsPerKeyPlacement()) {
+ return false;
+ }
+
+ return true;
+}
+
+void Compaction::AddInputDeletions(VersionEdit* out_edit) {
+ for (size_t which = 0; which < num_input_levels(); which++) {
+ for (size_t i = 0; i < inputs_[which].size(); i++) {
+ out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber());
+ }
+ }
+}
+
+bool Compaction::KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const {
+ assert(input_version_ != nullptr);
+ assert(level_ptrs != nullptr);
+ assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
+ if (bottommost_level_) {
+ return true;
+ } else if (output_level_ != 0 &&
+ cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ // Maybe use binary search to find right entry instead of linear search?
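+ // level_ptrs caches the scan position for each level beyond the output
+ // level; it relies on the caller passing user keys in non-decreasing order,
+ // so each per-level index only ever needs to move forward.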
+ const Comparator* user_cmp = cfd_->user_comparator();
+ for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+ const std::vector<FileMetaData*>& files =
+ input_vstorage_->LevelFiles(lvl);
+ for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+ auto* f = files[level_ptrs->at(lvl)];
+ if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
+ // We've advanced far enough
+ // In the presence of user-defined timestamp, we may need to handle
+ // the case in which f->smallest.user_key() (including ts) has the
+ // same user key, but the ts part is smaller. If so,
+ // Compare(user_key, f->smallest.user_key()) returns -1.
+ // That's why we need CompareWithoutTimestamp().
+ if (user_cmp->CompareWithoutTimestamp(user_key,
+ f->smallest.user_key()) >= 0) {
+ // Key falls in this file's range, so it may
+ // exist beyond output level
+ return false;
+ }
+ break;
+ }
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
+// Mark (or clear) each file that is being compacted
+void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
+ for (size_t i = 0; i < num_input_levels(); i++) {
+ for (size_t j = 0; j < inputs_[i].size(); j++) {
+ assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
+ : inputs_[i][j]->being_compacted);
+ inputs_[i][j]->being_compacted = mark_as_compacted;
+ }
+ }
+}
+
+// Sample output:
+// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
+// print: "3@0 + 2@3 + 1@4 files to L5"
+const char* Compaction::InputLevelSummary(
+ InputLevelSummaryBuffer* scratch) const {
+ int len = 0;
+ bool is_first = true;
+ for (auto& input_level : inputs_) {
+ if (input_level.empty()) {
+ continue;
+ }
+ if (!is_first) {
+ len +=
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ } else {
+ is_first = false;
+ }
+ len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ "%" ROCKSDB_PRIszt "@%d", input_level.size(),
+ input_level.level);
+ len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
+ }
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ " files to L%d", output_level());
+
+ return scratch->buffer;
+}
+
+uint64_t Compaction::CalculateTotalInputSize() const {
+ uint64_t size = 0;
+ for (auto& input_level : inputs_) {
+ for (auto f : input_level.files) {
+ size += f->fd.GetFileSize();
+ }
+ }
+ return size;
+}
+
+void Compaction::ReleaseCompactionFiles(Status status) {
+ MarkFilesBeingCompacted(false);
+ cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
+}
+
+void Compaction::ResetNextCompactionIndex() {
+ assert(input_version_ != nullptr);
+ input_vstorage_->ResetNextCompactionIndex(start_level_);
+}
+
+namespace {
+int InputSummary(const std::vector<FileMetaData*>& files, char* output,
+ int len) {
+ *output = '\0';
+ int write = 0;
+ for (size_t i = 0; i < files.size(); i++) {
+ int sz = len - write;
+ int ret;
+ char sztxt[16];
+ AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
+ ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
+ files.at(i)->fd.GetNumber(), sztxt);
+ if (ret < 0 || ret >= sz) break;
+ write += ret;
+ }
+ // if files.size() is non-zero, overwrite the last space
+ return write - !!files.size();
+}
+} // namespace
+
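+// Sample output (illustrative; file sizes are formatted by AppendHumanBytes):
+//   "Base version 123 Base level 1, inputs: [27(4MB) 29(2MB)], [31(54MB)]"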
+void Compaction::Summary(char* output, int len) {
+ int write =
+ snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [",
+ input_version_->GetVersionNumber(), start_level_);
+ if (write < 0 || write >= len) {
+ return;
+ }
+
+ for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
+ if (level_iter > 0) {
+ write += snprintf(output + write, len - write, "], [");
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+ write +=
+ InputSummary(inputs_[level_iter].files, output + write, len - write);
+ if (write < 0 || write >= len) {
+ return;
+ }
+ }
+
+ snprintf(output + write, len - write, "]");
+}
+
+uint64_t Compaction::OutputFilePreallocationSize() const {
+ uint64_t preallocation_size = 0;
+
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ preallocation_size += file->fd.GetFileSize();
+ }
+ }
+
+ if (max_output_file_size_ != std::numeric_limits<uint64_t>::max() &&
+ (immutable_options_.compaction_style == kCompactionStyleLevel ||
+ output_level() > 0)) {
+ preallocation_size = std::min(max_output_file_size_, preallocation_size);
+ }
+
+ // Over-estimate slightly so we don't end up just barely crossing the
+ // threshold. There is no point in preallocating more than 1 GB.
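+ // Worked example (illustrative): with level compaction, 200 MB of total
+ // input and a 64 MB max_output_file_size_, the preallocation hint is
+ // min(1 GB, 64 MB + 64 MB / 10), i.e. roughly 70 MB.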
+ return std::min(uint64_t{1073741824},
+ preallocation_size + (preallocation_size / 10));
+}
+
+std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
+ if (!cfd_->ioptions()->compaction_filter_factory) {
+ return nullptr;
+ }
+
+ if (!cfd_->ioptions()
+ ->compaction_filter_factory->ShouldFilterTableFileCreation(
+ TableFileCreationReason::kCompaction)) {
+ return nullptr;
+ }
+
+ CompactionFilter::Context context;
+ context.is_full_compaction = is_full_compaction_;
+ context.is_manual_compaction = is_manual_compaction_;
+ context.column_family_id = cfd_->GetID();
+ context.reason = TableFileCreationReason::kCompaction;
+ return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
+ context);
+}
+
+std::unique_ptr<SstPartitioner> Compaction::CreateSstPartitioner() const {
+ if (!immutable_options_.sst_partitioner_factory) {
+ return nullptr;
+ }
+
+ SstPartitioner::Context context;
+ context.is_full_compaction = is_full_compaction_;
+ context.is_manual_compaction = is_manual_compaction_;
+ context.output_level = output_level_;
+ context.smallest_user_key = smallest_user_key_;
+ context.largest_user_key = largest_user_key_;
+ return immutable_options_.sst_partitioner_factory->CreatePartitioner(context);
+}
+
+bool Compaction::IsOutputLevelEmpty() const {
+ return inputs_.back().level != output_level_ || inputs_.back().empty();
+}
+
+bool Compaction::ShouldFormSubcompactions() const {
+ if (cfd_ == nullptr) {
+ return false;
+ }
+
+ // Round-Robin pri under leveled compaction allows subcompactions by default
+ // and the number of subcompactions can be larger than max_subcompactions_
+ if (cfd_->ioptions()->compaction_pri == kRoundRobin &&
+ cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ return output_level_ > 0;
+ }
+
+ if (max_subcompactions_ <= 1) {
+ return false;
+ }
+
+ if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+ return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0;
+ } else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
+ return number_levels_ > 1 && output_level_ > 0;
+ } else {
+ return false;
+ }
+}
+
+bool Compaction::DoesInputReferenceBlobFiles() const {
+ assert(input_version_);
+
+ const VersionStorageInfo* storage_info = input_version_->storage_info();
+ assert(storage_info);
+
+ if (storage_info->GetBlobFiles().empty()) {
+ return false;
+ }
+
+ for (size_t i = 0; i < inputs_.size(); ++i) {
+ for (const FileMetaData* meta : inputs_[i].files) {
+ assert(meta);
+
+ if (meta->oldest_blob_file_number != kInvalidBlobFileNumber) {
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+uint64_t Compaction::MinInputFileOldestAncesterTime(
+ const InternalKey* start, const InternalKey* end) const {
+ uint64_t min_oldest_ancester_time = std::numeric_limits<uint64_t>::max();
+ const InternalKeyComparator& icmp =
+ column_family_data()->internal_comparator();
+ for (const auto& level_files : inputs_) {
+ for (const auto& file : level_files.files) {
+ if (start != nullptr && icmp.Compare(file->largest, *start) < 0) {
+ continue;
+ }
+ if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) {
+ continue;
+ }
+ uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+ if (oldest_ancester_time != 0) {
+ min_oldest_ancester_time =
+ std::min(min_oldest_ancester_time, oldest_ancester_time);
+ }
+ }
+ }
+ return min_oldest_ancester_time;
+}
+
+int Compaction::EvaluatePenultimateLevel(
+ const VersionStorageInfo* vstorage,
+ const ImmutableOptions& immutable_options, const int start_level,
+ const int output_level) {
+ // TODO: currently the per_key_placement feature only supports level and
+ // universal compaction.
+ if (immutable_options.compaction_style != kCompactionStyleLevel &&
+ immutable_options.compaction_style != kCompactionStyleUniversal) {
+ return kInvalidLevel;
+ }
+ if (output_level != immutable_options.num_levels - 1) {
+ return kInvalidLevel;
+ }
+
+ int penultimate_level = output_level - 1;
+ assert(penultimate_level < immutable_options.num_levels);
+ if (penultimate_level <= 0) {
+ return kInvalidLevel;
+ }
+
+ // If the penultimate level is not within the input-level -> output-level
+ // range, check whether the penultimate output level is empty; if it is
+ // empty, it can still be locked for the penultimate output.
+ // TODO: ideally, this only needs to check whether there's a file within the
+ // compaction output key range. For simplicity, it just checks whether
+ // there's any file on the penultimate level.
+ if (start_level == immutable_options.num_levels - 1 &&
+ (immutable_options.compaction_style != kCompactionStyleUniversal ||
+ !vstorage->LevelFiles(penultimate_level).empty())) {
+ return kInvalidLevel;
+ }
+
+ bool supports_per_key_placement =
+ immutable_options.preclude_last_level_data_seconds > 0;
+
+ // It can be overridden by unit tests.
+ TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
+ &supports_per_key_placement);
+ if (!supports_per_key_placement) {
+ return kInvalidLevel;
+ }
+
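+ // Example: with num_levels = 7 and output_level = 6, the penultimate level
+ // is 5. It is only returned when per-key placement is enabled and, if the
+ // inputs start from the last level, only for universal compaction with an
+ // empty level 5 (checked above).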
+ return penultimate_level;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction.h b/src/rocksdb/db/compaction/compaction.h
new file mode 100644
index 000000000..21d1190ac
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction.h
@@ -0,0 +1,559 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include "db/version_set.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/sst_partitioner.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+// The file contains class Compaction, as well as some helper functions
+// and data structures used by the class.
+
+// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
+// null which provides the property that a==null indicates a key that is less
+// than any key and b==null indicates a key that is greater than any key. Note
+// that the comparison is performed primarily on the user-key portion of the
+// key. If the user-keys compare equal, an additional test is made to sort
+// range tombstone sentinel keys before other keys with the same user-key. The
+// result is that 2 user-keys will compare equal if they differ purely on
+// their sequence number and value, but the range tombstone sentinel for that
+// user-key will compare not equal. This is necessary because the range
+// tombstone sentinel key is set as the largest key for an sstable even though
+// that key never appears in the database. We don't want adjacent sstables to
+// be considered overlapping if they are separated by the range tombstone
+// sentinel.
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
+ const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+ const InternalKey* b);
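+// Example: if file F1's largest key is the range tombstone sentinel for user
+// key "k" and file F2's smallest key is a regular entry for "k", then
+// sstableKeyCompare(ucmp, F1.largest, F2.smallest) < 0, so F1 and F2 are not
+// considered overlapping; passing nullptr for either argument likewise yields
+// a negative result, as described above.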
+
+// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
+// largest] that exactly spans one or more neighbouring SSTs on the same
+// level. Every pair of SSTs in this range "overlap" (i.e., the largest
+// user key of one file is the smallest user key of the next file). These
+// boundaries are propagated down to RangeDelAggregator during compaction
+// to provide safe truncation boundaries for range tombstones.
+struct AtomicCompactionUnitBoundary {
+ const InternalKey* smallest = nullptr;
+ const InternalKey* largest = nullptr;
+};
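+// Example: files on one level with user-key ranges [a, c], [c, e] and [g, h]
+// form two atomic compaction units: [a, e] (the first two files share the
+// boundary user key "c") and [g, h]. The unit boundaries, rather than the
+// per-file boundaries, are used as truncation bounds for range tombstones.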
+
+// The structure that manages compaction input files associated
+// with the same physical level.
+struct CompactionInputFiles {
+ int level;
+ std::vector<FileMetaData*> files;
+ std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
+ inline bool empty() const { return files.empty(); }
+ inline size_t size() const { return files.size(); }
+ inline void clear() { files.clear(); }
+ inline FileMetaData* operator[](size_t i) const { return files[i]; }
+};
+
+class Version;
+class ColumnFamilyData;
+class VersionStorageInfo;
+class CompactionFilter;
+
+// A Compaction encapsulates metadata about a compaction.
+class Compaction {
+ public:
+ Compaction(VersionStorageInfo* input_version,
+ const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ std::vector<CompactionInputFiles> inputs, int output_level,
+ uint64_t target_file_size, uint64_t max_compaction_bytes,
+ uint32_t output_path_id, CompressionType compression,
+ CompressionOptions compression_opts,
+ Temperature output_temperature, uint32_t max_subcompactions,
+ std::vector<FileMetaData*> grandparents,
+ bool manual_compaction = false, const std::string& trim_ts = "",
+ double score = -1, bool deletion_compaction = false,
+ bool l0_files_might_overlap = true,
+ CompactionReason compaction_reason = CompactionReason::kUnknown,
+ BlobGarbageCollectionPolicy blob_garbage_collection_policy =
+ BlobGarbageCollectionPolicy::kUseDefault,
+ double blob_garbage_collection_age_cutoff = -1);
+
+ // The type of the penultimate level output range
+ enum class PenultimateOutputRangeType : int {
+ kNotSupported, // it cannot output to the penultimate level
+ kFullRange, // any data could be output to the penultimate level
+ kNonLastRange, // only the keys within non_last_level compaction inputs can
+ // be outputted to the penultimate level
+ kDisabled, // no data can be outputted to the penultimate level
+ };
+
+ // No copying allowed
+ Compaction(const Compaction&) = delete;
+ void operator=(const Compaction&) = delete;
+
+ ~Compaction();
+
+ // Returns the level associated with the specified compaction input level.
+ // If compaction_input_level is not specified, then input_level is set to 0.
+ int level(size_t compaction_input_level = 0) const {
+ return inputs_[compaction_input_level].level;
+ }
+
+ int start_level() const { return start_level_; }
+
+ // Outputs will go to this level
+ int output_level() const { return output_level_; }
+
+ // Returns the number of input levels in this compaction.
+ size_t num_input_levels() const { return inputs_.size(); }
+
+ // Return the object that holds the edits to the descriptor done
+ // by this compaction.
+ VersionEdit* edit() { return &edit_; }
+
+ // Returns the number of input files associated with the specified
+ // compaction input level.
+ // The function returns 0 when "compaction_input_level" is out of range,
+ // i.e. "compaction_input_level" >= "num_input_levels()".
+ size_t num_input_files(size_t compaction_input_level) const {
+ if (compaction_input_level < inputs_.size()) {
+ return inputs_[compaction_input_level].size();
+ }
+ return 0;
+ }
+
+ // Returns input version of the compaction
+ Version* input_version() const { return input_version_; }
+
+ // Returns the ColumnFamilyData associated with the compaction.
+ ColumnFamilyData* column_family_data() const { return cfd_; }
+
+ // Returns the file meta data of the 'i'th input file at the
+ // specified compaction input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "num_input_levels()"
+ FileMetaData* input(size_t compaction_input_level, size_t i) const {
+ assert(compaction_input_level < inputs_.size());
+ return inputs_[compaction_input_level][i];
+ }
+
+ const std::vector<AtomicCompactionUnitBoundary>* boundaries(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
+ }
+
+ // Returns the list of file meta data of the specified compaction
+ // input level.
+ // REQUIREMENT: "compaction_input_level" must be >= 0 and
+ // < "num_input_levels()"
+ const std::vector<FileMetaData*>* inputs(
+ size_t compaction_input_level) const {
+ assert(compaction_input_level < inputs_.size());
+ return &inputs_[compaction_input_level].files;
+ }
+
+ const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
+
+ // Returns the LevelFilesBrief of the specified compaction input level.
+ const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
+ return &input_levels_[compaction_input_level];
+ }
+
+ // Maximum size of files to build during this compaction.
+ uint64_t max_output_file_size() const { return max_output_file_size_; }
+
+ // Target output file size for this compaction
+ uint64_t target_output_file_size() const { return target_output_file_size_; }
+
+ // What compression for output
+ CompressionType output_compression() const { return output_compression_; }
+
+ // What compression options for output
+ const CompressionOptions& output_compression_opts() const {
+ return output_compression_opts_;
+ }
+
+ // The id of the DB path that output files should be written to; a non-zero
+ // value indicates a secondary DB path.
+ uint32_t output_path_id() const { return output_path_id_; }
+
+ // Is this a trivial compaction that can be implemented by just
+ // moving a single input file to the next level (no merging or splitting)
+ bool IsTrivialMove() const;
+
+ // The split user key in the output level if this compaction is required to
+ // split the output files according to the existing cursor in the output
+ // level under the round-robin compaction policy. nullptr indicates that no
+ // splitting key is required.
+ const InternalKey* GetOutputSplitKey() const { return output_split_key_; }
+
+ // If true, then the compaction can be done by simply deleting input files.
+ bool deletion_compaction() const { return deletion_compaction_; }
+
+ // Add all inputs to this compaction as delete operations to *edit.
+ void AddInputDeletions(VersionEdit* edit);
+
+ // Returns true if the available information we have guarantees that
+ // the input "user_key" does not exist in any level beyond "output_level()".
+ bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
+ std::vector<size_t>* level_ptrs) const;
+
+ // Clear all files to indicate that they are not being compacted
+ // Delete this compaction from the list of running compactions.
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Status status);
+
+ // Returns the summary of the compaction in "output" with maximum "len"
+ // in bytes. The caller is responsible for the memory management of
+ // "output".
+ void Summary(char* output, int len);
+
+ // Return the score that was used to pick this compaction run.
+ double score() const { return score_; }
+
+ // Is this compaction creating a file in the bottom most level?
+ bool bottommost_level() const { return bottommost_level_; }
+
+ // Does the compaction compact to the last level?
+ bool is_last_level() const {
+ return output_level_ == immutable_options_.num_levels - 1;
+ }
+
+ // Does this compaction include all sst files?
+ bool is_full_compaction() const { return is_full_compaction_; }
+
+ // Was this compaction triggered manually by the client?
+ bool is_manual_compaction() const { return is_manual_compaction_; }
+
+ std::string trim_ts() const { return trim_ts_; }
+
+ // Used when the allow_trivial_move option is set for
+ // universal compaction. If all the input files are
+ // non-overlapping, the is_trivial_move_ variable
+ // is set to true; otherwise false.
+ void set_is_trivial_move(bool trivial_move) {
+ is_trivial_move_ = trivial_move;
+ }
+
+ // Used when the allow_trivial_move option is set for
+ // universal compaction. Returns true if the input files
+ // are non-overlapping and can be trivially moved.
+ bool is_trivial_move() const { return is_trivial_move_; }
+
+ // How many total levels are there?
+ int number_levels() const { return number_levels_; }
+
+ // Return the ImmutableOptions that should be used throughout the compaction
+ // procedure
+ const ImmutableOptions* immutable_options() const {
+ return &immutable_options_;
+ }
+
+ // Return the MutableCFOptions that should be used throughout the compaction
+ // procedure
+ const MutableCFOptions* mutable_cf_options() const {
+ return &mutable_cf_options_;
+ }
+
+ // Returns the size in bytes that the output file should be preallocated to.
+ // In level compaction, that is max_output_file_size_. In universal
+ // compaction, that is the sum of all input file sizes.
+ uint64_t OutputFilePreallocationSize() const;
+
+ void SetInputVersion(Version* input_version);
+
+ struct InputLevelSummaryBuffer {
+ char buffer[128];
+ };
+
+ const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
+
+ uint64_t CalculateTotalInputSize() const;
+
+ // In case of compaction error, reset the nextIndex that is used
+ // to pick up the next file to be compacted from files_by_size_
+ void ResetNextCompactionIndex();
+
+ // Create a CompactionFilter from compaction_filter_factory
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
+
+ // Create a SstPartitioner from sst_partitioner_factory
+ std::unique_ptr<SstPartitioner> CreateSstPartitioner() const;
+
+ // Is the input level corresponding to output_level_ empty?
+ bool IsOutputLevelEmpty() const;
+
+ // Should this compaction be broken up into smaller ones run in parallel?
+ bool ShouldFormSubcompactions() const;
+
+ // Returns true iff at least one input file references a blob file.
+ //
+ // PRE: input version has been set.
+ bool DoesInputReferenceBlobFiles() const;
+
+ // Test-only wrapper around IsBottommostLevel(), which determines whether a
+ // compaction with the given inputs and storage is bottommost.
+ static bool TEST_IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ TablePropertiesCollection GetOutputTableProperties() const {
+ return output_table_properties_;
+ }
+
+ void SetOutputTableProperties(TablePropertiesCollection tp) {
+ output_table_properties_ = std::move(tp);
+ }
+
+ Slice GetSmallestUserKey() const { return smallest_user_key_; }
+
+ Slice GetLargestUserKey() const { return largest_user_key_; }
+
+ Slice GetPenultimateLevelSmallestUserKey() const {
+ return penultimate_level_smallest_user_key_;
+ }
+
+ Slice GetPenultimateLevelLargestUserKey() const {
+ return penultimate_level_largest_user_key_;
+ }
+
+ PenultimateOutputRangeType GetPenultimateOutputRangeType() const {
+ return penultimate_output_range_type_;
+ }
+
+ // Return true if the compaction supports per_key_placement
+ bool SupportsPerKeyPlacement() const;
+
+ // Get the per_key_placement penultimate output level, which is
+ // `last_level - 1` if the per_key_placement feature is supported.
+ // Otherwise, returns kInvalidLevel (-1).
+ int GetPenultimateLevel() const;
+
+ // Return true if the given range overlaps with the penultimate level output
+ // range.
+ // Both smallest_key and largest_key include timestamps if user-defined
+ // timestamp is enabled.
+ bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
+ const Slice& largest_key) const;
+
+ // Return true if the key is within the penultimate level output range for
+ // the per_key_placement feature, i.e. it is safe to place the key in the
+ // penultimate level. Different compaction strategies have different rules.
+ // If per_key_placement is not supported, this always returns false.
+ // TODO: currently it doesn't support moving data from the last level to the
+ // penultimate level.
+ // The key includes the timestamp if user-defined timestamps are enabled.
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const;
+
+ CompactionReason compaction_reason() const { return compaction_reason_; }
+
+ const std::vector<FileMetaData*>& grandparents() const {
+ return grandparents_;
+ }
+
+ uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
+
+ Temperature output_temperature() const { return output_temperature_; }
+
+ uint32_t max_subcompactions() const { return max_subcompactions_; }
+
+ bool enable_blob_garbage_collection() const {
+ return enable_blob_garbage_collection_;
+ }
+
+ double blob_garbage_collection_age_cutoff() const {
+ return blob_garbage_collection_age_cutoff_;
+ }
+
+ // start and end are the sub-compaction key range; null means no boundary.
+ // They are used to filter out input files whose key range falls outside the
+ // sub-compaction when computing the minimum oldest ancestor time.
+ uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
+ const InternalKey* end) const;
+
+ // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of
+ // compaction begin and compaction completion callbacks match.
+ void SetNotifyOnCompactionCompleted() {
+ notify_on_compaction_completion_ = true;
+ }
+
+ bool ShouldNotifyOnCompactionCompleted() const {
+ return notify_on_compaction_completion_;
+ }
+
+ static constexpr int kInvalidLevel = -1;
+
+ // Evaluate the penultimate output level. If the compaction supports the
+ // per_key_placement feature, this returns the penultimate level number.
+ // Otherwise, it returns kInvalidLevel (-1), which means
+ // output_to_penultimate_level is not supported.
+ // Note: even when penultimate level output is supported (the returned level
+ // != kInvalidLevel), some key ranges may be unsafe to output to the
+ // penultimate level. The safe key range is populated by
+ // `PopulatePenultimateLevelOutputRange()`, which could potentially disable
+ // all penultimate level output.
+ static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage,
+ const ImmutableOptions& immutable_options,
+ const int start_level,
+ const int output_level);
+
+ private:
+ // mark (or clear) all files that are being compacted
+ void MarkFilesBeingCompacted(bool mark_as_compacted);
+
+ // get the smallest and largest key present in files to be compacted
+ static void GetBoundaryKeys(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs,
+ Slice* smallest_key, Slice* largest_key,
+ int exclude_level = -1);
+
+ // Populate the penultimate level output range, which is used to determine
+ // whether a key is safe to output to the penultimate level (for details see
+ // `Compaction::WithinPenultimateLevelOutputRange()`).
+ void PopulatePenultimateLevelOutputRange();
+
+ // Get the atomic file boundaries for all files in the compaction. Necessary
+ // in order to avoid the scenario described in
+ // https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
+ // plumb down appropriate key boundaries to RangeDelAggregator during
+ // compaction.
+ static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
+ VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
+
+ // helper function to determine if compaction with inputs and storage is
+ // bottommost
+ static bool IsBottommostLevel(
+ int output_level, VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ static bool IsFullCompaction(VersionStorageInfo* vstorage,
+ const std::vector<CompactionInputFiles>& inputs);
+
+ VersionStorageInfo* input_vstorage_;
+
+ const int start_level_; // the lowest level to be compacted
+ const int output_level_; // the level to which output files are stored
+ uint64_t target_output_file_size_;
+ uint64_t max_output_file_size_;
+ uint64_t max_compaction_bytes_;
+ uint32_t max_subcompactions_;
+ const ImmutableOptions immutable_options_;
+ const MutableCFOptions mutable_cf_options_;
+ Version* input_version_;
+ VersionEdit edit_;
+ const int number_levels_;
+ ColumnFamilyData* cfd_;
+ Arena arena_; // Arena used to allocate space for file_levels_
+
+ const uint32_t output_path_id_;
+ CompressionType output_compression_;
+ CompressionOptions output_compression_opts_;
+ Temperature output_temperature_;
+ // If true, then the compaction can be done by simply deleting input files.
+ const bool deletion_compaction_;
+ // should it split the output file using the compact cursor?
+ const InternalKey* output_split_key_;
+
+ // L0 files in the LSM-tree might be overlapping, but the compaction picking
+ // logic might pick a subset of files that aren't overlapping. If that is
+ // the case, this is set to false; otherwise, it is set to true.
+ bool l0_files_might_overlap_;
+
+ // Compaction input files organized by level. Constant after construction
+ const std::vector<CompactionInputFiles> inputs_;
+
+ // A copy of inputs_, organized more closely in memory
+ autovector<LevelFilesBrief, 2> input_levels_;
+
+ // State used to check for number of overlapping grandparent files
+ // (grandparent == "output_level_ + 1")
+ std::vector<FileMetaData*> grandparents_;
+ const double score_; // score that was used to pick this compaction.
+
+ // Is this compaction creating a file in the bottom most level?
+ const bool bottommost_level_;
+ // Does this compaction include all sst files?
+ const bool is_full_compaction_;
+
+ // Is this compaction requested by the client?
+ const bool is_manual_compaction_;
+
+ // The data with timestamp > trim_ts_ will be removed
+ const std::string trim_ts_;
+
+ // True if we can do a trivial move in universal multi-level
+ // compaction.
+ bool is_trivial_move_;
+
+ // Does input compression match the output compression?
+ bool InputCompressionMatchesOutput() const;
+
+ // table properties of output files
+ TablePropertiesCollection output_table_properties_;
+
+ // smallest user keys in compaction
+ // includes timestamp if user-defined timestamp is enabled.
+ Slice smallest_user_key_;
+
+ // largest user keys in compaction
+ // includes timestamp if user-defined timestamp is enabled.
+ Slice largest_user_key_;
+
+ // Reason for compaction
+ CompactionReason compaction_reason_;
+
+ // Notify on compaction completion only if listener was notified on compaction
+ // begin.
+ bool notify_on_compaction_completion_;
+
+ // Enable/disable garbage collection for blobs during compaction.
+ bool enable_blob_garbage_collection_;
+
+ // Blob garbage collection age cutoff.
+ double blob_garbage_collection_age_cutoff_;
+
+ // Only set when the per_key_placement feature is enabled; -1 (kInvalidLevel)
+ // means not supported.
+ const int penultimate_level_;
+
+ // Key range for penultimate level output
+ // includes timestamp if user-defined timestamp is enabled.
+ // penultimate_output_range_type_ shows the range type
+ Slice penultimate_level_smallest_user_key_;
+ Slice penultimate_level_largest_user_key_;
+ PenultimateOutputRangeType penultimate_output_range_type_ =
+ PenultimateOutputRangeType::kNotSupported;
+};
+
+#ifndef NDEBUG
+// Helper struct only for tests, which contains the data to decide if a key
+// should be output to the penultimate level.
+// TODO: remove this when the public feature knob is available
+struct PerKeyPlacementContext {
+ const int level;
+ const Slice key;
+ const Slice value;
+ const SequenceNumber seq_num;
+
+ bool output_to_penultimate_level;
+
+ PerKeyPlacementContext(int _level, Slice _key, Slice _value,
+ SequenceNumber _seq_num)
+ : level(_level), key(_key), value(_value), seq_num(_seq_num) {
+ output_to_penultimate_level = false;
+ }
+};
+#endif /* !NDEBUG */
+
+// Return sum of sizes of all files in `files`.
+extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iteration_stats.h b/src/rocksdb/db/compaction/compaction_iteration_stats.h
new file mode 100644
index 000000000..1b1c28b57
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iteration_stats.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct CompactionIterationStats {
+ // Compaction statistics
+
+ // Doesn't include records skipped because of
+ // CompactionFilter::Decision::kRemoveAndSkipUntil.
+ int64_t num_record_drop_user = 0;
+
+ int64_t num_record_drop_hidden = 0;
+ int64_t num_record_drop_obsolete = 0;
+ int64_t num_record_drop_range_del = 0;
+ int64_t num_range_del_drop_obsolete = 0;
+ // Deletions obsoleted before bottom level due to file gap optimization.
+ int64_t num_optimized_del_drop_obsolete = 0;
+ uint64_t total_filter_time = 0;
+
+ // Input statistics
+ // TODO(noetzli): The stats are incomplete. They are lacking everything
+ // consumed by MergeHelper.
+ uint64_t num_input_records = 0;
+ uint64_t num_input_deletion_records = 0;
+ uint64_t num_input_corrupt_records = 0;
+ uint64_t total_input_raw_key_bytes = 0;
+ uint64_t total_input_raw_value_bytes = 0;
+
+ // Single-Delete diagnostics for exceptional situations
+ uint64_t num_single_del_fallthru = 0;
+ uint64_t num_single_del_mismatch = 0;
+
+ // Blob related statistics
+ uint64_t num_blobs_read = 0;
+ uint64_t total_blob_bytes_read = 0;
+ uint64_t num_blobs_relocated = 0;
+ uint64_t total_blob_bytes_relocated = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.cc b/src/rocksdb/db/compaction/compaction_iterator.cc
new file mode 100644
index 000000000..9f54f7813
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.cc
@@ -0,0 +1,1338 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/compaction_iterator.h"
+
+#include <iterator>
+#include <limits>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_file_builder.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
+#include "db/snapshot_checker.h"
+#include "logging/logging.h"
+#include "port/likely.h"
+#include "rocksdb/listener.h"
+#include "table/internal_iterator.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const Compaction* compaction, const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const std::shared_ptr<Logger> info_log,
+ const std::string* full_history_ts_low,
+ const SequenceNumber preserve_time_min_seqno,
+ const SequenceNumber preclude_last_level_min_seqno)
+ : CompactionIterator(
+ input, cmp, merge_helper, last_sequence, snapshots,
+ earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
+ report_detailed_time, expect_valid_internal_key, range_del_agg,
+ blob_file_builder, allow_data_in_errors, enforce_single_del_contracts,
+ manual_compaction_canceled,
+ std::unique_ptr<CompactionProxy>(
+ compaction ? new RealCompaction(compaction) : nullptr),
+ compaction_filter, shutting_down, info_log, full_history_ts_low,
+ preserve_time_min_seqno, preclude_last_level_min_seqno) {}
+
+CompactionIterator::CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber /*last_sequence*/, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter,
+ const std::atomic<bool>* shutting_down,
+ const std::shared_ptr<Logger> info_log,
+ const std::string* full_history_ts_low,
+ const SequenceNumber preserve_time_min_seqno,
+ const SequenceNumber preclude_last_level_min_seqno)
+ : input_(input, cmp,
+ !compaction || compaction->DoesInputReferenceBlobFiles()),
+ cmp_(cmp),
+ merge_helper_(merge_helper),
+ snapshots_(snapshots),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ job_snapshot_(job_snapshot),
+ snapshot_checker_(snapshot_checker),
+ env_(env),
+ clock_(env_->GetSystemClock().get()),
+ report_detailed_time_(report_detailed_time),
+ expect_valid_internal_key_(expect_valid_internal_key),
+ range_del_agg_(range_del_agg),
+ blob_file_builder_(blob_file_builder),
+ compaction_(std::move(compaction)),
+ compaction_filter_(compaction_filter),
+ shutting_down_(shutting_down),
+ manual_compaction_canceled_(manual_compaction_canceled),
+ bottommost_level_(!compaction_ ? false
+ : compaction_->bottommost_level() &&
+ !compaction_->allow_ingest_behind()),
+ // snapshots_ cannot be nullptr, but we will assert later in the body of
+ // the constructor.
+ visible_at_tip_(snapshots_ ? snapshots_->empty() : false),
+ earliest_snapshot_(!snapshots_ || snapshots_->empty()
+ ? kMaxSequenceNumber
+ : snapshots_->at(0)),
+ info_log_(info_log),
+ allow_data_in_errors_(allow_data_in_errors),
+ enforce_single_del_contracts_(enforce_single_del_contracts),
+ timestamp_size_(cmp_ ? cmp_->timestamp_size() : 0),
+ full_history_ts_low_(full_history_ts_low),
+ current_user_key_sequence_(0),
+ current_user_key_snapshot_(0),
+ merge_out_iter_(merge_helper_),
+ blob_garbage_collection_cutoff_file_number_(
+ ComputeBlobGarbageCollectionCutoffFileNumber(compaction_.get())),
+ blob_fetcher_(CreateBlobFetcherIfNeeded(compaction_.get())),
+ prefetch_buffers_(
+ CreatePrefetchBufferCollectionIfNeeded(compaction_.get())),
+ current_key_committed_(false),
+ cmp_with_history_ts_low_(0),
+ level_(compaction_ == nullptr ? 0 : compaction_->level()),
+ preserve_time_min_seqno_(preserve_time_min_seqno),
+ preclude_last_level_min_seqno_(preclude_last_level_min_seqno) {
+ assert(snapshots_ != nullptr);
+ assert(preserve_time_min_seqno_ <= preclude_last_level_min_seqno_);
+
+ if (compaction_ != nullptr) {
+ level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
+ }
+#ifndef NDEBUG
+ // findEarliestVisibleSnapshot assumes this ordering.
+ for (size_t i = 1; i < snapshots_->size(); ++i) {
+ assert(snapshots_->at(i - 1) < snapshots_->at(i));
+ }
+ assert(timestamp_size_ == 0 || !full_history_ts_low_ ||
+ timestamp_size_ == full_history_ts_low_->size());
+#endif
+ input_.SetPinnedItersMgr(&pinned_iters_mgr_);
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
+}
+
+CompactionIterator::~CompactionIterator() {
+ // input_ Iterator lifetime is longer than pinned_iters_mgr_ lifetime
+ input_.SetPinnedItersMgr(nullptr);
+}
+
+void CompactionIterator::ResetRecordCounts() {
+ iter_stats_.num_record_drop_user = 0;
+ iter_stats_.num_record_drop_hidden = 0;
+ iter_stats_.num_record_drop_obsolete = 0;
+ iter_stats_.num_record_drop_range_del = 0;
+ iter_stats_.num_range_del_drop_obsolete = 0;
+ iter_stats_.num_optimized_del_drop_obsolete = 0;
+}
+
+void CompactionIterator::SeekToFirst() {
+ NextFromInput();
+ PrepareOutput();
+}
+
+void CompactionIterator::Next() {
+ // If there is a merge output, return it before continuing to process the
+ // input.
+ if (merge_out_iter_.Valid()) {
+ merge_out_iter_.Next();
+
+ // Check if we returned all records of the merge output.
+ if (merge_out_iter_.Valid()) {
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ Status s = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+ // MergeUntil stops when it encounters a corrupt key and does not
+ // include it in the result, so we expect the keys here to be valid.
+ if (!s.ok()) {
+ ROCKS_LOG_FATAL(
+ info_log_, "Invalid ikey %s in compaction. %s",
+ allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
+ s.getState());
+ assert(false);
+ }
+
+ // Keep current_key_ in sync.
+ if (0 == timestamp_size_) {
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ } else {
+ Slice ts = ikey_.GetTimestamp(timestamp_size_);
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type, &ts);
+ }
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ validity_info_.SetValid(ValidContext::kMerge1);
+ } else {
+ // We consumed all pinned merge operands, release pinned iterators
+ pinned_iters_mgr_.ReleasePinnedData();
+ // MergeHelper moves the iterator to the first record after the merged
+ // records, so even though we reached the end of the merge output, we do
+ // not want to advance the iterator.
+ NextFromInput();
+ }
+ } else {
+ // Only advance the input iterator if there is no merge output and the
+ // iterator is not already at the next record.
+ if (!at_next_) {
+ AdvanceInputIter();
+ }
+ NextFromInput();
+ }
+
+ if (Valid()) {
+ // Record that we've outputted a record for the current key.
+ has_outputted_key_ = true;
+ }
+
+ PrepareOutput();
+}
+
+bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
+ Slice* skip_until) {
+ // TODO: support compaction filter for wide-column entities
+ if (!compaction_filter_ ||
+ (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) {
+ return true;
+ }
+ bool error = false;
+ // If the user has specified a compaction filter and the sequence
+ // number is greater than any external snapshot, then invoke the
+ // filter. If the return value of the compaction filter is true,
+ // replace the entry with a deletion marker.
+ CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined;
+ compaction_filter_value_.clear();
+ compaction_filter_skip_until_.Clear();
+ CompactionFilter::ValueType value_type =
+ ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
+ : CompactionFilter::ValueType::kBlobIndex;
+ // Hack: pass internal key to BlobIndexCompactionFilter since it needs
+ // to get sequence number.
+ assert(compaction_filter_);
+ Slice& filter_key =
+ (ikey_.type == kTypeValue ||
+ !compaction_filter_->IsStackedBlobDbInternalCompactionFilter())
+ ? ikey_.user_key
+ : key_;
+ {
+ StopWatchNano timer(clock_, report_detailed_time_);
+ if (kTypeBlobIndex == ikey_.type) {
+ filter = compaction_filter_->FilterBlobByKey(
+ level_, filter_key, &compaction_filter_value_,
+ compaction_filter_skip_until_.rep());
+ if (CompactionFilter::Decision::kUndetermined == filter &&
+ !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ if (compaction_ == nullptr) {
+ status_ =
+ Status::Corruption("Unexpected blob index outside of compaction");
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::InvokeFilterIfNeeded::TamperWithBlobIndex",
+ &value_);
+
+ // For integrated BlobDB impl, CompactionIterator reads blob value.
+ // For Stacked BlobDB impl, the corresponding CompactionFilter's
+ // FilterV2 method should read the blob value.
+ BlobIndex blob_index;
+ Status s = blob_index.DecodeFrom(value_);
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ FilePrefetchBuffer* prefetch_buffer =
+ prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer(
+ blob_index.file_number())
+ : nullptr;
+
+ uint64_t bytes_read = 0;
+
+ assert(blob_fetcher_);
+
+ s = blob_fetcher_->FetchBlob(ikey_.user_key, blob_index,
+ prefetch_buffer, &blob_value_,
+ &bytes_read);
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ ++iter_stats_.num_blobs_read;
+ iter_stats_.total_blob_bytes_read += bytes_read;
+
+ value_type = CompactionFilter::ValueType::kValue;
+ }
+ }
+ if (CompactionFilter::Decision::kUndetermined == filter) {
+ filter = compaction_filter_->FilterV2(
+ level_, filter_key, value_type,
+ blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_,
+ compaction_filter_skip_until_.rep());
+ }
+ iter_stats_.total_filter_time +=
+ env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
+ }
+
+ if (CompactionFilter::Decision::kUndetermined == filter) {
+ // Should not reach here, since FilterV2 should never return kUndetermined.
+ status_ =
+ Status::NotSupported("FilterV2() should never return kUndetermined");
+ validity_info_.Invalidate();
+ return false;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil &&
+ cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
+ 0) {
+ // Can't skip to a key smaller than the current one.
+ // Keep the key as per FilterV2 documentation.
+ filter = CompactionFilter::Decision::kKeep;
+ }
+
+ if (filter == CompactionFilter::Decision::kRemove) {
+ // convert the current key to a delete; key_ is pointing into
+ // current_key_ at this point, so updating current_key_ updates key()
+ ikey_.type = kTypeDeletion;
+ current_key_.UpdateInternalKey(ikey_.sequence, kTypeDeletion);
+ // no value associated with delete
+ value_.clear();
+ iter_stats_.num_record_drop_user++;
+ } else if (filter == CompactionFilter::Decision::kPurge) {
+ // convert the current key to a single delete; key_ is pointing into
+ // current_key_ at this point, so updating current_key_ updates key()
+ ikey_.type = kTypeSingleDeletion;
+ current_key_.UpdateInternalKey(ikey_.sequence, kTypeSingleDeletion);
+ // no value associated with single delete
+ value_.clear();
+ iter_stats_.num_record_drop_user++;
+ } else if (filter == CompactionFilter::Decision::kChangeValue) {
+ if (ikey_.type == kTypeBlobIndex) {
+ // value transfer from blob file to inlined data
+ ikey_.type = kTypeValue;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ }
+ value_ = compaction_filter_value_;
+ } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ *need_skip = true;
+ compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+ kValueTypeForSeek);
+ *skip_until = compaction_filter_skip_until_.Encode();
+ } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) {
+ // Only the StackableDB-based BlobDB impl's compaction filter should return
+ // kChangeBlobIndex. Decision about rewriting blob and changing blob index
+ // in the integrated BlobDB impl is made in subsequent call to
+ // PrepareOutput() and its callees.
+ if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ status_ = Status::NotSupported(
+ "Only stacked BlobDB's internal compaction filter can return "
+ "kChangeBlobIndex.");
+ validity_info_.Invalidate();
+ return false;
+ }
+ if (ikey_.type == kTypeValue) {
+ // value transfer from inlined data to blob file
+ ikey_.type = kTypeBlobIndex;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ }
+ value_ = compaction_filter_value_;
+ } else if (filter == CompactionFilter::Decision::kIOError) {
+ if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ status_ = Status::NotSupported(
+ "CompactionFilter for integrated BlobDB should not return kIOError");
+ validity_info_.Invalidate();
+ return false;
+ }
+ status_ = Status::IOError("Failed to access blob during compaction filter");
+ error = true;
+ }
+ return !error;
+}
+
+void CompactionIterator::NextFromInput() {
+ at_next_ = false;
+ validity_info_.Invalidate();
+
+ while (!Valid() && input_.Valid() && !IsPausingManualCompaction() &&
+ !IsShuttingDown()) {
+ key_ = input_.key();
+ value_ = input_.value();
+ blob_value_.Reset();
+ iter_stats_.num_input_records++;
+
+ Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+ if (!pik_status.ok()) {
+ iter_stats_.num_input_corrupt_records++;
+
+ // If `expect_valid_internal_key_` is false, return the corrupted key
+ // and let the caller decide what to do with it.
+ if (expect_valid_internal_key_) {
+ status_ = pik_status;
+ return;
+ }
+ key_ = current_key_.SetInternalKey(key_);
+ has_current_user_key_ = false;
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ validity_info_.SetValid(ValidContext::kParseKeyError);
+ break;
+ }
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
+
+ // Update input statistics
+ if (ikey_.type == kTypeDeletion || ikey_.type == kTypeSingleDeletion ||
+ ikey_.type == kTypeDeletionWithTimestamp) {
+ iter_stats_.num_input_deletion_records++;
+ }
+ iter_stats_.total_input_raw_key_bytes += key_.size();
+ iter_stats_.total_input_raw_value_bytes += value_.size();
+
+ // If need_skip is true, we should seek the input iterator
+ // to internal key skip_until and continue from there.
+ bool need_skip = false;
+ // Points either into compaction_filter_skip_until_ or into
+ // merge_helper_->compaction_filter_skip_until_.
+ Slice skip_until;
+
+ bool user_key_equal_without_ts = false;
+ int cmp_ts = 0;
+ if (has_current_user_key_) {
+ user_key_equal_without_ts =
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, current_user_key_);
+ // if timestamp_size_ > 0, then curr_ts_ has been initialized by a
+ // previous key.
+ cmp_ts = timestamp_size_ ? cmp_->CompareTimestamp(
+ ExtractTimestampFromUserKey(
+ ikey_.user_key, timestamp_size_),
+ curr_ts_)
+ : 0;
+ }
+
+ // Check whether the user key changed. After this if statement current_key_
+ // is a copy of the current input key (maybe converted to a delete by the
+ // compaction filter). ikey_.user_key is pointing to the copy.
+ if (!has_current_user_key_ || !user_key_equal_without_ts || cmp_ts != 0) {
+ // First occurrence of this user key
+ // Copy key for output
+ key_ = current_key_.SetInternalKey(key_, &ikey_);
+
+ int prev_cmp_with_ts_low =
+ !full_history_ts_low_ ? 0
+ : curr_ts_.empty()
+ ? 0
+ : cmp_->CompareTimestamp(curr_ts_, *full_history_ts_low_);
+
+      // If timestamp_size_ > 0, then copy from ikey_ to curr_ts_ for use in
+      // the next iteration to compare with the timestamp of the next key.
+ UpdateTimestampAndCompareWithFullHistoryLow();
+
+ // If
+ // (1) !has_current_user_key_, OR
+ // (2) timestamp is disabled, OR
+ // (3) all history will be preserved, OR
+ // (4) user key (excluding timestamp) is different from previous key, OR
+ // (5) timestamp is NO older than *full_history_ts_low_, OR
+ // (6) timestamp is the largest one older than full_history_ts_low_,
+ // then current_user_key_ must be treated as a different user key.
+ // This means, if a user key (excluding ts) is the same as the previous
+ // user key, and its ts is older than *full_history_ts_low_, then we
+ // consider this key for GC, e.g. it may be dropped if certain conditions
+ // match.
+ if (!has_current_user_key_ || !timestamp_size_ || !full_history_ts_low_ ||
+ !user_key_equal_without_ts || cmp_with_history_ts_low_ >= 0 ||
+ prev_cmp_with_ts_low >= 0) {
+      // Initialize for future comparison for rule (A), etc.
+ current_user_key_sequence_ = kMaxSequenceNumber;
+ current_user_key_snapshot_ = 0;
+ has_current_user_key_ = true;
+ }
+ current_user_key_ = ikey_.user_key;
+
+ has_outputted_key_ = false;
+
+ last_key_seq_zeroed_ = false;
+
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+
+ // Apply the compaction filter to the first committed version of the user
+ // key.
+ if (current_key_committed_ &&
+ !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
+ break;
+ }
+ } else {
+ // Update the current key to reflect the new sequence number/type without
+ // copying the user key.
+ // TODO(rven): Compaction filter does not process keys in this path
+ // Need to have the compaction filter process multiple versions
+ // if we have versions on both sides of a snapshot
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+
+      // Note that a newer version of a key is ordered before older versions.
+      // If a newer version of a key is committed, so is the older version. No
+      // need to query snapshot_checker_ in that case.
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ current_key_committed_ = KeyCommitted(ikey_.sequence);
+ // Apply the compaction filter to the first committed version of the
+ // user key.
+ if (current_key_committed_ &&
+ !InvokeFilterIfNeeded(&need_skip, &skip_until)) {
+ break;
+ }
+ }
+ }
+
+ if (UNLIKELY(!current_key_committed_)) {
+ assert(snapshot_checker_ != nullptr);
+ validity_info_.SetValid(ValidContext::kCurrentKeyUncommitted);
+ break;
+ }
+
+    // If there are no snapshots, then this kv affects visibility at tip.
+    // Otherwise, search through all existing snapshots to find the earliest
+    // snapshot that is affected by this kv.
+ SequenceNumber last_sequence = current_user_key_sequence_;
+ current_user_key_sequence_ = ikey_.sequence;
+ SequenceNumber last_snapshot = current_user_key_snapshot_;
+ SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot
+ current_user_key_snapshot_ =
+ visible_at_tip_
+ ? earliest_snapshot_
+ : findEarliestVisibleSnapshot(ikey_.sequence, &prev_snapshot);
+
+ if (need_skip) {
+ // This case is handled below.
+ } else if (clear_and_output_next_key_) {
+ // In the previous iteration we encountered a single delete that we could
+      // not compact out. We will keep this Put, but can drop its data.
+ // (See Optimization 3, below.)
+ if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex &&
+ ikey_.type != kTypeWideColumnEntity) {
+ ROCKS_LOG_FATAL(info_log_, "Unexpected key %s for compaction output",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str());
+ assert(false);
+ }
+ if (current_user_key_snapshot_ < last_snapshot) {
+ ROCKS_LOG_FATAL(info_log_,
+ "key %s, current_user_key_snapshot_ (%" PRIu64
+ ") < last_snapshot (%" PRIu64 ")",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+ current_user_key_snapshot_, last_snapshot);
+ assert(false);
+ }
+
+ if (ikey_.type == kTypeBlobIndex || ikey_.type == kTypeWideColumnEntity) {
+ ikey_.type = kTypeValue;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ }
+
+ value_.clear();
+ validity_info_.SetValid(ValidContext::kKeepSDAndClearPut);
+ clear_and_output_next_key_ = false;
+ } else if (ikey_.type == kTypeSingleDeletion) {
+ // We can compact out a SingleDelete if:
+ // 1) We encounter the corresponding PUT -OR- we know that this key
+ // doesn't appear past this output level
+ // =AND=
+ // 2) We've already returned a record in this snapshot -OR-
+      //         this key is definitely visible in
+      //         earliest_write_conflict_snapshot.
+ //
+ // A note about 2) above:
+ // we try to determine whether there is any earlier write conflict
+ // checking snapshot by calling DefinitelyInSnapshot() with seq and
+ // earliest_write_conflict_snapshot as arguments. For write-prepared
+ // and write-unprepared transactions, if earliest_write_conflict_snapshot
+ // is evicted from WritePreparedTxnDB::commit_cache, then
+ // DefinitelyInSnapshot(seq, earliest_write_conflict_snapshot) returns
+ // false, even if the seq is actually visible within
+ // earliest_write_conflict_snapshot. Consequently, CompactionIterator
+ // may try to zero out its sequence number, thus hitting assertion error
+ // in debug mode or cause incorrect DBIter return result.
+ // We observe that earliest_write_conflict_snapshot >= earliest_snapshot,
+ // and the seq zeroing logic depends on
+ // DefinitelyInSnapshot(seq, earliest_snapshot). Therefore, if we cannot
+ // determine whether seq is **definitely** in
+ // earliest_write_conflict_snapshot, then we can additionally check if
+ // seq is definitely in earliest_snapshot. If the latter holds, then the
+ // former holds too.
+ //
+ // Rule 1 is needed for SingleDelete correctness. Rule 2 is needed to
+ // allow Transactions to do write-conflict checking (if we compacted away
+ // all keys, then we wouldn't know that a write happened in this
+ // snapshot). If there is no earlier snapshot, then we know that there
+ // are no active transactions that need to know about any writes.
+ //
+ // Optimization 3:
+ // If we encounter a SingleDelete followed by a PUT and Rule 2 is NOT
+ // true, then we must output a SingleDelete. In this case, we will decide
+ // to also output the PUT. While we are compacting less by outputting the
+ // PUT now, hopefully this will lead to better compaction in the future
+      // when Rule 2 is later true (i.e., we are hoping we can later compact out
+ // both the SingleDelete and the Put, while we couldn't if we only
+ // outputted the SingleDelete now).
+ // In this case, we can save space by removing the PUT's value as it will
+ // never be read.
+ //
+ // Deletes and Merges are not supported on the same key that has a
+ // SingleDelete as it is not possible to correctly do any partial
+ // compaction of such a combination of operations. The result of mixing
+ // those operations for a given key is documented as being undefined. So
+ // we can choose how to handle such a combinations of operations. We will
+ // try to compact out as much as we can in these cases.
+ // We will report counts on these anomalous cases.
+ //
+      // Note: If timestamp is enabled, the record is eligible for deletion only
+      // if, in addition to the above conditions (Rule 1 and Rule 2),
+      // full_history_ts_low_ is specified and the key's timestamp is less than
+      // *full_history_ts_low_. If it's not eligible for deletion, we will
+      // output the SingleDelete. Likewise, Optimization 3 is applied only if
+      // full_history_ts_low_ is specified and the key's timestamp is less than
+      // *full_history_ts_low_.
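+      // Worked example for Optimization 3 (no timestamps): the input contains
+      // SD@100 followed by PUT@95 for the same user key, the earliest (and
+      // earliest write-conflict) snapshot is 90, and nothing has been output
+      // yet for this user key. Rule 2 does not hold, so the SingleDelete is
+      // output as-is and the PUT is output on the next iteration with its
+      // value cleared (via clear_and_output_next_key_).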
+
+ // The easiest way to process a SingleDelete during iteration is to peek
+ // ahead at the next key.
+ const bool is_timestamp_eligible_for_gc =
+ (timestamp_size_ == 0 ||
+ (full_history_ts_low_ && cmp_with_history_ts_low_ < 0));
+
+ ParsedInternalKey next_ikey;
+ AdvanceInputIter();
+
+ // Check whether the next key exists, is not corrupt, and is the same key
+ // as the single delete.
+ if (input_.Valid() &&
+ ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+ .ok() &&
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
+#ifndef NDEBUG
+ const Compaction* c =
+ compaction_ ? compaction_->real_compaction() : nullptr;
+#endif
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:SingleDelete:1",
+ const_cast<Compaction*>(c));
+ if (last_key_seq_zeroed_) {
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ assert(bottommost_level_);
+ AdvanceInputIter();
+ } else if (prev_snapshot == 0 ||
+ DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot)) {
+ // Check whether the next key belongs to the same snapshot as the
+ // SingleDelete.
+
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:SingleDelete:2", nullptr);
+ if (next_ikey.type == kTypeSingleDeletion) {
+          // We encountered two SingleDeletes for the same key in a row. This
+          // could be due to unexpected user input. If write-(un)prepared
+          // transactions are used, this could also be due to releasing an old
+ // snapshot between a Put and its matching SingleDelete.
+ // Skip the first SingleDelete and let the next iteration decide
+ // how to handle the second SingleDelete.
+
+ // First SingleDelete has been skipped since we already called
+ // input_.Next().
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_mismatch;
+ } else if (next_ikey.type == kTypeDeletion) {
+ std::ostringstream oss;
+ oss << "Found SD and type: " << static_cast<int>(next_ikey.type)
+ << " on the same key, violating the contract "
+ "of SingleDelete. Check your application to make sure the "
+ "application does not mix SingleDelete and Delete for "
+ "the same key. If you are using "
+ "write-prepared/write-unprepared transactions, and use "
+ "SingleDelete to delete certain keys, then make sure "
+ "TransactionDBOptions::rollback_deletion_type_callback is "
+ "configured properly. Mixing SD and DEL can lead to "
+ "undefined behaviors";
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_mismatch;
+ if (enforce_single_del_contracts_) {
+ ROCKS_LOG_ERROR(info_log_, "%s", oss.str().c_str());
+ validity_info_.Invalidate();
+ status_ = Status::Corruption(oss.str());
+ return;
+ }
+ ROCKS_LOG_WARN(info_log_, "%s", oss.str().c_str());
+ } else if (!is_timestamp_eligible_for_gc) {
+ // We cannot drop the SingleDelete as timestamp is enabled, and
+ // timestamp of this key is greater than or equal to
+ // *full_history_ts_low_. We will output the SingleDelete.
+ validity_info_.SetValid(ValidContext::kKeepTsHistory);
+ } else if (has_outputted_key_ ||
+ DefinitelyInSnapshot(ikey_.sequence,
+ earliest_write_conflict_snapshot_) ||
+ (earliest_snapshot_ < earliest_write_conflict_snapshot_ &&
+ DefinitelyInSnapshot(ikey_.sequence,
+ earliest_snapshot_))) {
+ // Found a matching value, we can drop the single delete and the
+ // value. It is safe to drop both records since we've already
+ // outputted a key in this snapshot, or there is no earlier
+ // snapshot (Rule 2 above).
+
+ // Note: it doesn't matter whether the second key is a Put or if it
+ // is an unexpected Merge or Delete. We will compact it out
+ // either way. We will maintain counts of how many mismatches
+ // happened
+ if (next_ikey.type != kTypeValue &&
+ next_ikey.type != kTypeBlobIndex &&
+ next_ikey.type != kTypeWideColumnEntity) {
+ ++iter_stats_.num_single_del_mismatch;
+ }
+
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ // Already called input_.Next() once. Call it a second time to
+ // skip past the second key.
+ AdvanceInputIter();
+ } else {
+ // Found a matching value, but we cannot drop both keys since
+ // there is an earlier snapshot and we need to leave behind a record
+ // to know that a write happened in this snapshot (Rule 2 above).
+ // Clear the value and output the SingleDelete. (The value will be
+ // outputted on the next iteration.)
+
+ // Setting valid_ to true will output the current SingleDelete
+ validity_info_.SetValid(ValidContext::kKeepSDForConflictCheck);
+
+ // Set up the Put to be outputted in the next iteration.
+ // (Optimization 3).
+ clear_and_output_next_key_ = true;
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:KeepSDForWW",
+ /*arg=*/nullptr);
+ }
+ } else {
+ // We hit the next snapshot without hitting a put, so the iterator
+ // returns the single delete.
+ validity_info_.SetValid(ValidContext::kKeepSDForSnapshot);
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:SingleDelete:3",
+ const_cast<Compaction*>(c));
+ }
+ } else {
+ // We are at the end of the input, could not parse the next key, or hit
+ // a different key. The iterator returns the single delete if the key
+ // possibly exists beyond the current output level. We set
+ // has_current_user_key to false so that if the iterator is at the next
+ // key, we do not compare it again against the previous key at the next
+ // iteration. If the next key is corrupt, we return before the
+ // comparison, so the value of has_current_user_key does not matter.
+ has_current_user_key_ = false;
+ if (compaction_ != nullptr &&
+ DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_) &&
+ is_timestamp_eligible_for_gc) {
+ // Key doesn't exist outside of this range.
+ // Can compact out this SingleDelete.
+ ++iter_stats_.num_record_drop_obsolete;
+ ++iter_stats_.num_single_del_fallthru;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ } else if (last_key_seq_zeroed_) {
+ // Skip.
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_obsolete;
+ assert(bottommost_level_);
+ } else {
+ // Output SingleDelete
+ validity_info_.SetValid(ValidContext::kKeepSD);
+ }
+ }
+
+ if (Valid()) {
+ at_next_ = true;
+ }
+ } else if (last_snapshot == current_user_key_snapshot_ ||
+ (last_snapshot > 0 &&
+ last_snapshot < current_user_key_snapshot_)) {
+      // If the earliest snapshot in which this key is visible is the same
+      // as the earliest visible snapshot of a previous instance of the
+      // same key, then this kv is not visible in any snapshot: it is
+      // hidden by a newer entry for the same user key.
+ //
+ // Note: Dropping this key will not affect TransactionDB write-conflict
+ // checking since there has already been a record returned for this key
+ // in this snapshot.
+ if (last_sequence < current_user_key_sequence_) {
+ ROCKS_LOG_FATAL(info_log_,
+ "key %s, last_sequence (%" PRIu64
+ ") < current_user_key_sequence_ (%" PRIu64 ")",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+ last_sequence, current_user_key_sequence_);
+ assert(false);
+ }
+
+ ++iter_stats_.num_record_drop_hidden; // rule (A)
+ AdvanceInputIter();
+ } else if (compaction_ != nullptr &&
+ (ikey_.type == kTypeDeletion ||
+ (ikey_.type == kTypeDeletionWithTimestamp &&
+ cmp_with_history_ts_low_ < 0)) &&
+ DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+ compaction_->KeyNotExistsBeyondOutputLevel(ikey_.user_key,
+ &level_ptrs_)) {
+ // TODO(noetzli): This is the only place where we use compaction_
+ // (besides the constructor). We should probably get rid of this
+ // dependency and find a way to do similar filtering during flushes.
+ //
+ // For this user key:
+ // (1) there is no data in higher levels
+ // (2) data in lower levels will have larger sequence numbers
+ // (3) data in layers that are being compacted here and have
+ // smaller sequence numbers will be dropped in the next
+ // few iterations of this loop (by rule (A) above).
+ // Therefore this deletion marker is obsolete and can be dropped.
+ //
+ // Note: Dropping this Delete will not affect TransactionDB
+ // write-conflict checking since it is earlier than any snapshot.
+ //
+      // It seems that we could also drop a deletion later than the earliest
+      // snapshot given that:
+      // (1) The deletion is earlier than earliest_write_conflict_snapshot, and
+      // (2) No value exists earlier than the deletion.
+ //
+ // Note also that a deletion marker of type kTypeDeletionWithTimestamp
+ // will be treated as a different user key unless the timestamp is older
+ // than *full_history_ts_low_.
+ ++iter_stats_.num_record_drop_obsolete;
+ if (!bottommost_level_) {
+ ++iter_stats_.num_optimized_del_drop_obsolete;
+ }
+ AdvanceInputIter();
+ } else if ((ikey_.type == kTypeDeletion ||
+ (ikey_.type == kTypeDeletionWithTimestamp &&
+ cmp_with_history_ts_low_ < 0)) &&
+ bottommost_level_) {
+      // Handle the case where we have a delete key at the bottommost level.
+      // We can skip outputting the key iff there are no subsequent puts for
+      // this key.
+ assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel(
+ ikey_.user_key, &level_ptrs_));
+ ParsedInternalKey next_ikey;
+ AdvanceInputIter();
+#ifndef NDEBUG
+ const Compaction* c =
+ compaction_ ? compaction_->real_compaction() : nullptr;
+#endif
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::NextFromInput:BottommostDelete:1",
+ const_cast<Compaction*>(c));
+ // Skip over all versions of this key that happen to occur in the same
+ // snapshot range as the delete.
+ //
+ // Note that a deletion marker of type kTypeDeletionWithTimestamp will be
+ // considered to have a different user key unless the timestamp is older
+ // than *full_history_ts_low_.
+ while (!IsPausingManualCompaction() && !IsShuttingDown() &&
+ input_.Valid() &&
+ (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+ .ok()) &&
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key) &&
+ (prev_snapshot == 0 ||
+ DefinitelyNotInSnapshot(next_ikey.sequence, prev_snapshot))) {
+ AdvanceInputIter();
+ }
+      // If we find we still need to output a row with this key, we need to
+      // output the delete too.
+ if (input_.Valid() &&
+ (ParseInternalKey(input_.key(), &next_ikey, allow_data_in_errors_)
+ .ok()) &&
+ cmp_->EqualWithoutTimestamp(ikey_.user_key, next_ikey.user_key)) {
+ validity_info_.SetValid(ValidContext::kKeepDel);
+ at_next_ = true;
+ }
+ } else if (ikey_.type == kTypeMerge) {
+ if (!merge_helper_->HasOperator()) {
+ status_ = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ return;
+ }
+
+ pinned_iters_mgr_.StartPinning();
+
+ // We know the merge type entry is not hidden, otherwise we would
+      // have hit (A).
+      // We encapsulate the merge-related state machine in a different
+      // object to minimize changes to the existing flow.
+ Status s = merge_helper_->MergeUntil(
+ &input_, range_del_agg_, prev_snapshot, bottommost_level_,
+ allow_data_in_errors_, blob_fetcher_.get(), full_history_ts_low_,
+ prefetch_buffers_.get(), &iter_stats_);
+ merge_out_iter_.SeekToFirst();
+
+ if (!s.ok() && !s.IsMergeInProgress()) {
+ status_ = s;
+ return;
+ } else if (merge_out_iter_.Valid()) {
+ // NOTE: key, value, and ikey_ refer to old entries.
+ // These will be correctly set below.
+ key_ = merge_out_iter_.key();
+ value_ = merge_out_iter_.value();
+ pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
+        // MergeUntil stops when it encounters a corrupt key and does not
+        // include it in the result, so we expect the keys here to be valid.
+ if (!pik_status.ok()) {
+ ROCKS_LOG_FATAL(
+ info_log_, "Invalid key %s in compaction. %s",
+ allow_data_in_errors_ ? key_.ToString(true).c_str() : "hidden",
+ pik_status.getState());
+ assert(false);
+ }
+ // Keep current_key_ in sync.
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+ key_ = current_key_.GetInternalKey();
+ ikey_.user_key = current_key_.GetUserKey();
+ validity_info_.SetValid(ValidContext::kMerge2);
+ } else {
+        // All merge operands were filtered out. Reset the user key, since the
+        // batch consumed by the merge operator should not shadow any keys
+        // coming after the merges.
+ has_current_user_key_ = false;
+ pinned_iters_mgr_.ReleasePinnedData();
+
+ if (merge_helper_->FilteredUntil(&skip_until)) {
+ need_skip = true;
+ }
+ }
+ } else {
+ // 1. new user key -OR-
+ // 2. different snapshot stripe
+ // If user-defined timestamp is enabled, we consider keys for GC if they
+ // are below history_ts_low_. CompactionRangeDelAggregator::ShouldDelete()
+ // only considers range deletions that are at or below history_ts_low_ and
+ // trim_ts_. We drop keys here that are below history_ts_low_ and are
+ // covered by a range tombstone that is at or below history_ts_low_ and
+      // trim_ts_.
+ bool should_delete = false;
+ if (!timestamp_size_ || cmp_with_history_ts_low_ < 0) {
+ should_delete = range_del_agg_->ShouldDelete(
+ key_, RangeDelPositioningMode::kForwardTraversal);
+ }
+ if (should_delete) {
+ ++iter_stats_.num_record_drop_hidden;
+ ++iter_stats_.num_record_drop_range_del;
+ AdvanceInputIter();
+ } else {
+ validity_info_.SetValid(ValidContext::kNewUserKey);
+ }
+ }
+
+ if (need_skip) {
+ SkipUntil(skip_until);
+ }
+ }
+
+ if (!Valid() && IsShuttingDown()) {
+ status_ = Status::ShutdownInProgress();
+ }
+
+ if (IsPausingManualCompaction()) {
+ status_ = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+  // Propagate corruption status from the memtable iterator
+ if (!input_.Valid() && input_.status().IsCorruption()) {
+ status_ = input_.status();
+ }
+}
+
+bool CompactionIterator::ExtractLargeValueIfNeededImpl() {
+ if (!blob_file_builder_) {
+ return false;
+ }
+
+ blob_index_.clear();
+ const Status s = blob_file_builder_->Add(user_key(), value_, &blob_index_);
+
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+
+ return false;
+ }
+
+ if (blob_index_.empty()) {
+ return false;
+ }
+
+ value_ = blob_index_;
+
+ return true;
+}
+
+void CompactionIterator::ExtractLargeValueIfNeeded() {
+ assert(ikey_.type == kTypeValue);
+
+ if (!ExtractLargeValueIfNeededImpl()) {
+ return;
+ }
+
+ ikey_.type = kTypeBlobIndex;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+}
+
+void CompactionIterator::GarbageCollectBlobIfNeeded() {
+ assert(ikey_.type == kTypeBlobIndex);
+
+ if (!compaction_) {
+ return;
+ }
+
+ // GC for integrated BlobDB
+ if (compaction_->enable_blob_garbage_collection()) {
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex",
+ &value_);
+
+ BlobIndex blob_index;
+
+ {
+ const Status s = blob_index.DecodeFrom(value_);
+
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+
+ return;
+ }
+ }
+
+ if (blob_index.file_number() >=
+ blob_garbage_collection_cutoff_file_number_) {
+ return;
+ }
+
+ FilePrefetchBuffer* prefetch_buffer =
+ prefetch_buffers_ ? prefetch_buffers_->GetOrCreatePrefetchBuffer(
+ blob_index.file_number())
+ : nullptr;
+
+ uint64_t bytes_read = 0;
+
+ {
+ assert(blob_fetcher_);
+
+ const Status s = blob_fetcher_->FetchBlob(
+ user_key(), blob_index, prefetch_buffer, &blob_value_, &bytes_read);
+
+ if (!s.ok()) {
+ status_ = s;
+ validity_info_.Invalidate();
+
+ return;
+ }
+ }
+
+ ++iter_stats_.num_blobs_read;
+ iter_stats_.total_blob_bytes_read += bytes_read;
+
+ ++iter_stats_.num_blobs_relocated;
+ iter_stats_.total_blob_bytes_relocated += blob_index.size();
+
+ value_ = blob_value_;
+
+ if (ExtractLargeValueIfNeededImpl()) {
+ return;
+ }
+
+ ikey_.type = kTypeValue;
+ current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+
+ return;
+ }
+
+ // GC for stacked BlobDB
+ if (compaction_filter_ &&
+ compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
+ const auto blob_decision = compaction_filter_->PrepareBlobOutput(
+ user_key(), value_, &compaction_filter_value_);
+
+ if (blob_decision == CompactionFilter::BlobDecision::kCorruption) {
+ status_ =
+ Status::Corruption("Corrupted blob reference encountered during GC");
+ validity_info_.Invalidate();
+
+ return;
+ }
+
+ if (blob_decision == CompactionFilter::BlobDecision::kIOError) {
+ status_ = Status::IOError("Could not relocate blob during GC");
+ validity_info_.Invalidate();
+
+ return;
+ }
+
+ if (blob_decision == CompactionFilter::BlobDecision::kChangeValue) {
+ value_ = compaction_filter_value_;
+
+ return;
+ }
+ }
+}
+
+void CompactionIterator::DecideOutputLevel() {
+ assert(compaction_->SupportsPerKeyPlacement());
+#ifndef NDEBUG
+ // Could be overridden by unittest
+ PerKeyPlacementContext context(level_, ikey_.user_key, value_,
+ ikey_.sequence);
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
+ &context);
+ output_to_penultimate_level_ = context.output_to_penultimate_level;
+#else
+ output_to_penultimate_level_ = false;
+#endif // NDEBUG
+
+  // If the key is newer than the cutoff sequence or newer than the earliest
+  // snapshot, it should be output to the penultimate level.
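+  // For example, with preclude_last_level_min_seqno_ == 500 and
+  // earliest_snapshot_ == 800, a key at sequence 600 is routed to the
+  // penultimate level (subject to the safety check below), while a key at
+  // sequence 100 is not forced there by this rule.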
+ if (ikey_.sequence > preclude_last_level_min_seqno_ ||
+ ikey_.sequence > earliest_snapshot_) {
+ output_to_penultimate_level_ = true;
+ }
+
+ if (output_to_penultimate_level_) {
+ // If it's decided to output to the penultimate level, but unsafe to do so,
+ // still output to the last level. For example, moving the data from a lower
+ // level to a higher level outside of the higher-level input key range is
+ // considered unsafe, because the key may conflict with higher-level SSTs
+ // not from this compaction.
+ // TODO: add statistic for declined output_to_penultimate_level
+ bool safe_to_penultimate_level =
+ compaction_->WithinPenultimateLevelOutputRange(ikey_.user_key);
+ if (!safe_to_penultimate_level) {
+ output_to_penultimate_level_ = false;
+      // This can happen when `last_level_temperature` is disabled or enabled
+      // while holding a snapshot. When `last_level_temperature` is not set
+      // (== kUnknown), the data newer than any snapshot is pushed to the last
+      // level, but when the per_key_placement feature is enabled on the fly,
+      // the data newer than the snapshot has to be moved to the penultimate
+      // level, which may or may not be safe. So the user needs to make sure
+      // all snapshots are released before enabling the `last_level_temperature`
+      // feature. We will migrate the feature to `last_level_temperature` and
+      // maybe make it not dynamically changeable.
+ if (ikey_.sequence > earliest_snapshot_) {
+ status_ = Status::Corruption(
+ "Unsafe to store Seq later than snapshot in the last level if "
+ "per_key_placement is enabled");
+ }
+ }
+ }
+}
+
+void CompactionIterator::PrepareOutput() {
+ if (Valid()) {
+ if (ikey_.type == kTypeValue) {
+ ExtractLargeValueIfNeeded();
+ } else if (ikey_.type == kTypeBlobIndex) {
+ GarbageCollectBlobIfNeeded();
+ }
+
+ if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
+ DecideOutputLevel();
+ }
+
+ // Zeroing out the sequence number leads to better compression.
+ // If this is the bottommost level (no files in lower levels)
+ // and the earliest snapshot is larger than this seqno
+ // and the userkey differs from the last userkey in compaction
+ // then we can squash the seqno to zero.
+ //
+ // This is safe for TransactionDB write-conflict checking since transactions
+    // only care about sequence numbers larger than any active snapshot.
+    //
+    // Can we do the same for levels above the bottom level as long as
+    // KeyNotExistsBeyondOutputLevel() returns true?
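+    // For example, with no live snapshots (earliest_snapshot_ ==
+    // kMaxSequenceNumber), a committed PUT@42 at the bottommost level that
+    // satisfies the checks below has its sequence number rewritten to 0.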
+ if (Valid() && compaction_ != nullptr &&
+ !compaction_->allow_ingest_behind() && bottommost_level_ &&
+ DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
+ ikey_.type != kTypeMerge && current_key_committed_ &&
+ !output_to_penultimate_level_ &&
+ ikey_.sequence < preserve_time_min_seqno_) {
+ if (ikey_.type == kTypeDeletion ||
+ (ikey_.type == kTypeSingleDeletion && timestamp_size_ == 0)) {
+ ROCKS_LOG_FATAL(
+ info_log_,
+ "Unexpected key %s for seq-zero optimization. "
+ "earliest_snapshot %" PRIu64
+ ", earliest_write_conflict_snapshot %" PRIu64
+ " job_snapshot %" PRIu64
+ ". timestamp_size: %d full_history_ts_low_ %s. validity %x",
+ ikey_.DebugString(allow_data_in_errors_, true).c_str(),
+ earliest_snapshot_, earliest_write_conflict_snapshot_,
+ job_snapshot_, static_cast<int>(timestamp_size_),
+ full_history_ts_low_ != nullptr
+ ? Slice(*full_history_ts_low_).ToString(true).c_str()
+ : "null",
+ validity_info_.rep);
+ assert(false);
+ }
+ ikey_.sequence = 0;
+ last_key_seq_zeroed_ = true;
+ TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput:ZeroingSeq",
+ &ikey_);
+ if (!timestamp_size_) {
+ current_key_.UpdateInternalKey(0, ikey_.type);
+ } else if (full_history_ts_low_ && cmp_with_history_ts_low_ < 0) {
+ // We can also zero out timestamp for better compression.
+ // For the same user key (excluding timestamp), the timestamp-based
+ // history can be collapsed to save some space if the timestamp is
+ // older than *full_history_ts_low_.
+ const std::string kTsMin(timestamp_size_, static_cast<char>(0));
+ const Slice ts_slice = kTsMin;
+ ikey_.SetTimestamp(ts_slice);
+ current_key_.UpdateInternalKey(0, ikey_.type, &ts_slice);
+ }
+ }
+ }
+}
+
+inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot) {
+ assert(snapshots_->size());
+ if (snapshots_->size() == 0) {
+ ROCKS_LOG_FATAL(info_log_,
+ "No snapshot left in findEarliestVisibleSnapshot");
+ }
+ auto snapshots_iter =
+ std::lower_bound(snapshots_->begin(), snapshots_->end(), in);
+ assert(prev_snapshot != nullptr);
+ if (snapshots_iter == snapshots_->begin()) {
+ *prev_snapshot = 0;
+ } else {
+ *prev_snapshot = *std::prev(snapshots_iter);
+ if (*prev_snapshot >= in) {
+ ROCKS_LOG_FATAL(info_log_,
+ "*prev_snapshot (%" PRIu64 ") >= in (%" PRIu64
+ ") in findEarliestVisibleSnapshot",
+ *prev_snapshot, in);
+ assert(false);
+ }
+ }
+ if (snapshot_checker_ == nullptr) {
+ return snapshots_iter != snapshots_->end() ? *snapshots_iter
+ : kMaxSequenceNumber;
+ }
+ bool has_released_snapshot = !released_snapshots_.empty();
+ for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) {
+ auto cur = *snapshots_iter;
+ if (in > cur) {
+ ROCKS_LOG_FATAL(info_log_,
+ "in (%" PRIu64 ") > cur (%" PRIu64
+ ") in findEarliestVisibleSnapshot",
+ in, cur);
+ assert(false);
+ }
+ // Skip if cur is in released_snapshots.
+ if (has_released_snapshot && released_snapshots_.count(cur) > 0) {
+ continue;
+ }
+ auto res = snapshot_checker_->CheckInSnapshot(in, cur);
+ if (res == SnapshotCheckerResult::kInSnapshot) {
+ return cur;
+ } else if (res == SnapshotCheckerResult::kSnapshotReleased) {
+ released_snapshots_.insert(cur);
+ }
+ *prev_snapshot = cur;
+ }
+ return kMaxSequenceNumber;
+}
+
+uint64_t CompactionIterator::ComputeBlobGarbageCollectionCutoffFileNumber(
+ const CompactionProxy* compaction) {
+ if (!compaction) {
+ return 0;
+ }
+
+ if (!compaction->enable_blob_garbage_collection()) {
+ return 0;
+ }
+
+ const Version* const version = compaction->input_version();
+ assert(version);
+
+ const VersionStorageInfo* const storage_info = version->storage_info();
+ assert(storage_info);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+
+ const size_t cutoff_index = static_cast<size_t>(
+ compaction->blob_garbage_collection_age_cutoff() * blob_files.size());
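+  // For example, with 10 blob files and an age cutoff of 0.3, cutoff_index
+  // is 3, so blobs referencing the 3 oldest files fall below the returned
+  // cutoff file number and become eligible for relocation.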
+
+ if (cutoff_index >= blob_files.size()) {
+ return std::numeric_limits<uint64_t>::max();
+ }
+
+ const auto& meta = blob_files[cutoff_index];
+ assert(meta);
+
+ return meta->GetBlobFileNumber();
+}
+
+std::unique_ptr<BlobFetcher> CompactionIterator::CreateBlobFetcherIfNeeded(
+ const CompactionProxy* compaction) {
+ if (!compaction) {
+ return nullptr;
+ }
+
+ const Version* const version = compaction->input_version();
+ if (!version) {
+ return nullptr;
+ }
+
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+
+ return std::unique_ptr<BlobFetcher>(new BlobFetcher(version, read_options));
+}
+
+std::unique_ptr<PrefetchBufferCollection>
+CompactionIterator::CreatePrefetchBufferCollectionIfNeeded(
+ const CompactionProxy* compaction) {
+ if (!compaction) {
+ return nullptr;
+ }
+
+ if (!compaction->input_version()) {
+ return nullptr;
+ }
+
+ if (compaction->allow_mmap_reads()) {
+ return nullptr;
+ }
+
+ const uint64_t readahead_size = compaction->blob_compaction_readahead_size();
+ if (!readahead_size) {
+ return nullptr;
+ }
+
+ return std::unique_ptr<PrefetchBufferCollection>(
+ new PrefetchBufferCollection(readahead_size));
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator.h b/src/rocksdb/db/compaction/compaction_iterator.h
new file mode 100644
index 000000000..c215d2bbb
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator.h
@@ -0,0 +1,513 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <deque>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "options/cf_options.h"
+#include "rocksdb/compaction_filter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class BlobFileBuilder;
+class BlobFetcher;
+class PrefetchBufferCollection;
+
+// A wrapper of an internal iterator whose purpose is to count how
+// many entries there are in the iterator.
+class SequenceIterWrapper : public InternalIterator {
+ public:
+ SequenceIterWrapper(InternalIterator* iter, const Comparator* cmp,
+ bool need_count_entries)
+ : icmp_(cmp),
+ inner_iter_(iter),
+ need_count_entries_(need_count_entries) {}
+ bool Valid() const override { return inner_iter_->Valid(); }
+ Status status() const override { return inner_iter_->status(); }
+ void Next() override {
+ num_itered_++;
+ inner_iter_->Next();
+ }
+ void Seek(const Slice& target) override {
+ if (!need_count_entries_) {
+ inner_iter_->Seek(target);
+ } else {
+      // For flush cases, we need to count the total number of entries, so we
+ // do Next() rather than Seek().
+ while (inner_iter_->Valid() &&
+ icmp_.Compare(inner_iter_->key(), target) < 0) {
+ Next();
+ }
+ }
+ }
+ Slice key() const override { return inner_iter_->key(); }
+ Slice value() const override { return inner_iter_->value(); }
+
+ // Unused InternalIterator methods
+ void SeekToFirst() override { assert(false); }
+ void Prev() override { assert(false); }
+ void SeekForPrev(const Slice& /* target */) override { assert(false); }
+ void SeekToLast() override { assert(false); }
+
+ uint64_t num_itered() const { return num_itered_; }
+
+ private:
+ InternalKeyComparator icmp_;
+ InternalIterator* inner_iter_; // not owned
+ uint64_t num_itered_ = 0;
+ bool need_count_entries_;
+};
+
+class CompactionIterator {
+ public:
+ // A wrapper around Compaction. Has a much smaller interface, only what
+ // CompactionIterator uses. Tests can override it.
+ class CompactionProxy {
+ public:
+ virtual ~CompactionProxy() = default;
+
+ virtual int level() const = 0;
+
+ virtual bool KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const = 0;
+
+ virtual bool bottommost_level() const = 0;
+
+ virtual int number_levels() const = 0;
+
+ // Result includes timestamp if user-defined timestamp is enabled.
+ virtual Slice GetLargestUserKey() const = 0;
+
+ virtual bool allow_ingest_behind() const = 0;
+
+ virtual bool allow_mmap_reads() const = 0;
+
+ virtual bool enable_blob_garbage_collection() const = 0;
+
+ virtual double blob_garbage_collection_age_cutoff() const = 0;
+
+ virtual uint64_t blob_compaction_readahead_size() const = 0;
+
+ virtual const Version* input_version() const = 0;
+
+ virtual bool DoesInputReferenceBlobFiles() const = 0;
+
+ virtual const Compaction* real_compaction() const = 0;
+
+ virtual bool SupportsPerKeyPlacement() const = 0;
+
+ // `key` includes timestamp if user-defined timestamp is enabled.
+ virtual bool WithinPenultimateLevelOutputRange(const Slice& key) const = 0;
+ };
+
+ class RealCompaction : public CompactionProxy {
+ public:
+ explicit RealCompaction(const Compaction* compaction)
+ : compaction_(compaction) {
+ assert(compaction_);
+ assert(compaction_->immutable_options());
+ assert(compaction_->mutable_cf_options());
+ }
+
+ int level() const override { return compaction_->level(); }
+
+ bool KeyNotExistsBeyondOutputLevel(
+ const Slice& user_key, std::vector<size_t>* level_ptrs) const override {
+ return compaction_->KeyNotExistsBeyondOutputLevel(user_key, level_ptrs);
+ }
+
+ bool bottommost_level() const override {
+ return compaction_->bottommost_level();
+ }
+
+ int number_levels() const override { return compaction_->number_levels(); }
+
+ // Result includes timestamp if user-defined timestamp is enabled.
+ Slice GetLargestUserKey() const override {
+ return compaction_->GetLargestUserKey();
+ }
+
+ bool allow_ingest_behind() const override {
+ return compaction_->immutable_options()->allow_ingest_behind;
+ }
+
+ bool allow_mmap_reads() const override {
+ return compaction_->immutable_options()->allow_mmap_reads;
+ }
+
+ bool enable_blob_garbage_collection() const override {
+ return compaction_->enable_blob_garbage_collection();
+ }
+
+ double blob_garbage_collection_age_cutoff() const override {
+ return compaction_->blob_garbage_collection_age_cutoff();
+ }
+
+ uint64_t blob_compaction_readahead_size() const override {
+ return compaction_->mutable_cf_options()->blob_compaction_readahead_size;
+ }
+
+ const Version* input_version() const override {
+ return compaction_->input_version();
+ }
+
+ bool DoesInputReferenceBlobFiles() const override {
+ return compaction_->DoesInputReferenceBlobFiles();
+ }
+
+ const Compaction* real_compaction() const override { return compaction_; }
+
+ bool SupportsPerKeyPlacement() const override {
+ return compaction_->SupportsPerKeyPlacement();
+ }
+
+ // Check if key is within penultimate level output range, to see if it's
+ // safe to output to the penultimate level for per_key_placement feature.
+ // `key` includes timestamp if user-defined timestamp is enabled.
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
+ return compaction_->WithinPenultimateLevelOutputRange(key);
+ }
+
+ private:
+ const Compaction* compaction_;
+ };
+
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const Compaction* compaction = nullptr,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr,
+ const std::string* full_history_ts_low = nullptr,
+ const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
+ const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+
+ // Constructor with custom CompactionProxy, used for tests.
+ CompactionIterator(
+ InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
+ SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SequenceNumber job_snapshot, const SnapshotChecker* snapshot_checker,
+ Env* env, bool report_detailed_time, bool expect_valid_internal_key,
+ CompactionRangeDelAggregator* range_del_agg,
+ BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
+ bool enforce_single_del_contracts,
+ const std::atomic<bool>& manual_compaction_canceled,
+ std::unique_ptr<CompactionProxy> compaction,
+ const CompactionFilter* compaction_filter = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr,
+ const std::shared_ptr<Logger> info_log = nullptr,
+ const std::string* full_history_ts_low = nullptr,
+ const SequenceNumber preserve_time_min_seqno = kMaxSequenceNumber,
+ const SequenceNumber preclude_last_level_min_seqno = kMaxSequenceNumber);
+
+ ~CompactionIterator();
+
+ void ResetRecordCounts();
+
+ // Seek to the beginning of the compaction iterator output.
+ //
+ // REQUIRED: Call only once.
+ void SeekToFirst();
+
+ // Produces the next record in the compaction.
+ //
+ // REQUIRED: SeekToFirst() has been called.
+ void Next();
+
+ // Getters
+ const Slice& key() const { return key_; }
+ const Slice& value() const { return value_; }
+ const Status& status() const { return status_; }
+ const ParsedInternalKey& ikey() const { return ikey_; }
+ inline bool Valid() const { return validity_info_.IsValid(); }
+ const Slice& user_key() const { return current_user_key_; }
+ const CompactionIterationStats& iter_stats() const { return iter_stats_; }
+ uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
+  // Whether the current key should be placed on the penultimate level; only
+  // valid if per_key_placement is supported.
+ bool output_to_penultimate_level() const {
+ return output_to_penultimate_level_;
+ }
+ Status InputStatus() const { return input_.status(); }
+
+ private:
+ // Processes the input stream to find the next output
+ void NextFromInput();
+
+  // Do final preparations before presenting the output to the caller.
+ void PrepareOutput();
+
+  // Decide whether the current key should be output to the last level or the
+  // penultimate level; only called for compactions that support per-key
+  // placement.
+ void DecideOutputLevel();
+
+ // Passes the output value to the blob file builder (if any), and replaces it
+ // with the corresponding blob reference if it has been actually written to a
+ // blob file (i.e. if it passed the value size check). Returns true if the
+ // value got extracted to a blob file, false otherwise.
+ bool ExtractLargeValueIfNeededImpl();
+
+ // Extracts large values as described above, and updates the internal key's
+ // type to kTypeBlobIndex if the value got extracted. Should only be called
+ // for regular values (kTypeValue).
+ void ExtractLargeValueIfNeeded();
+
+ // Relocates valid blobs residing in the oldest blob files if garbage
+ // collection is enabled. Relocated blobs are written to new blob files or
+ // inlined in the LSM tree depending on the current settings (i.e.
+ // enable_blob_files and min_blob_size). Should only be called for blob
+ // references (kTypeBlobIndex).
+ //
+ // Note: the stacked BlobDB implementation's compaction filter based GC
+ // algorithm is also called from here.
+ void GarbageCollectBlobIfNeeded();
+
+ // Invoke compaction filter if needed.
+ // Return true on success, false on failures (e.g.: kIOError).
+ bool InvokeFilterIfNeeded(bool* need_skip, Slice* skip_until);
+
+ // Given a sequence number, return the sequence number of the
+ // earliest snapshot that this sequence number is visible in.
+ // The snapshots themselves are arranged in ascending order of
+ // sequence numbers.
+ // Employ a sequential search because the total number of
+  // snapshots is typically small.
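+  // For example, with snapshots {10, 20, 30} and in == 15, this returns 20
+  // and sets *prev_snapshot to 10 (assuming there is no snapshot_checker_).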
+ inline SequenceNumber findEarliestVisibleSnapshot(
+ SequenceNumber in, SequenceNumber* prev_snapshot);
+
+ inline bool KeyCommitted(SequenceNumber sequence) {
+ return snapshot_checker_ == nullptr ||
+ snapshot_checker_->CheckInSnapshot(sequence, job_snapshot_) ==
+ SnapshotCheckerResult::kInSnapshot;
+ }
+
+ bool DefinitelyInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
+
+ bool DefinitelyNotInSnapshot(SequenceNumber seq, SequenceNumber snapshot);
+
+ // Extract user-defined timestamp from user key if possible and compare it
+ // with *full_history_ts_low_ if applicable.
+ inline void UpdateTimestampAndCompareWithFullHistoryLow() {
+ if (!timestamp_size_) {
+ return;
+ }
+ Slice ts = ExtractTimestampFromUserKey(ikey_.user_key, timestamp_size_);
+ curr_ts_.assign(ts.data(), ts.size());
+ if (full_history_ts_low_) {
+ cmp_with_history_ts_low_ =
+ cmp_->CompareTimestamp(ts, *full_history_ts_low_);
+ }
+ }
+
+ static uint64_t ComputeBlobGarbageCollectionCutoffFileNumber(
+ const CompactionProxy* compaction);
+ static std::unique_ptr<BlobFetcher> CreateBlobFetcherIfNeeded(
+ const CompactionProxy* compaction);
+ static std::unique_ptr<PrefetchBufferCollection>
+ CreatePrefetchBufferCollectionIfNeeded(const CompactionProxy* compaction);
+
+ SequenceIterWrapper input_;
+ const Comparator* cmp_;
+ MergeHelper* merge_helper_;
+ const std::vector<SequenceNumber>* snapshots_;
+ // List of snapshots released during compaction.
+  // findEarliestVisibleSnapshot() finds them from the return value of
+  // snapshot_checker_, and makes sure they will not be returned as the
+  // earliest visible snapshot of an older value.
+ // See WritePreparedTransactionTest::ReleaseSnapshotDuringCompaction3.
+ std::unordered_set<SequenceNumber> released_snapshots_;
+ const SequenceNumber earliest_write_conflict_snapshot_;
+ const SequenceNumber job_snapshot_;
+ const SnapshotChecker* const snapshot_checker_;
+ Env* env_;
+ SystemClock* clock_;
+ const bool report_detailed_time_;
+ const bool expect_valid_internal_key_;
+ CompactionRangeDelAggregator* range_del_agg_;
+ BlobFileBuilder* blob_file_builder_;
+ std::unique_ptr<CompactionProxy> compaction_;
+ const CompactionFilter* compaction_filter_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>& manual_compaction_canceled_;
+ const bool bottommost_level_;
+ const bool visible_at_tip_;
+ const SequenceNumber earliest_snapshot_;
+
+ std::shared_ptr<Logger> info_log_;
+
+ const bool allow_data_in_errors_;
+
+ const bool enforce_single_del_contracts_;
+
+ // Comes from comparator.
+ const size_t timestamp_size_;
+
+ // Lower bound timestamp to retain full history in terms of user-defined
+ // timestamp. If a key's timestamp is older than full_history_ts_low_, then
+ // the key *may* be eligible for garbage collection (GC). The skipping logic
+ // is in `NextFromInput()` and `PrepareOutput()`.
+ // If nullptr, NO GC will be performed and all history will be preserved.
+ const std::string* const full_history_ts_low_;
+
+ // State
+ //
+ enum ValidContext : uint8_t {
+ kMerge1 = 0,
+ kMerge2 = 1,
+ kParseKeyError = 2,
+ kCurrentKeyUncommitted = 3,
+ kKeepSDAndClearPut = 4,
+ kKeepTsHistory = 5,
+ kKeepSDForConflictCheck = 6,
+ kKeepSDForSnapshot = 7,
+ kKeepSD = 8,
+ kKeepDel = 9,
+ kNewUserKey = 10,
+ };
+
+ struct ValidityInfo {
+ inline bool IsValid() const { return rep & 1; }
+ ValidContext GetContext() const {
+ return static_cast<ValidContext>(rep >> 1);
+ }
+ inline void SetValid(uint8_t ctx) { rep = (ctx << 1) | 1; }
+ inline void Invalidate() { rep = 0; }
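+    // For example, SetValid(kKeepSD) stores (8 << 1) | 1 == 0x11 in rep, and
+    // GetContext() recovers kKeepSD by shifting out the low validity bit.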
+
+ uint8_t rep{0};
+ } validity_info_;
+
+ // Points to a copy of the current compaction iterator output (current_key_)
+ // if valid.
+ Slice key_;
+ // Points to the value in the underlying iterator that corresponds to the
+ // current output.
+ Slice value_;
+ // The status is OK unless compaction iterator encounters a merge operand
+ // while not having a merge operator defined.
+ Status status_;
+ // Stores the user key, sequence number and type of the current compaction
+ // iterator output (or current key in the underlying iterator during
+ // NextFromInput()).
+ ParsedInternalKey ikey_;
+ // Stores whether ikey_.user_key is valid. If set to false, the user key is
+ // not compared against the current key in the underlying iterator.
+ bool has_current_user_key_ = false;
+ // If false, the iterator holds a copy of the current compaction iterator
+ // output (or current key in the underlying iterator during NextFromInput()).
+ bool at_next_ = false;
+
+ IterKey current_key_;
+ Slice current_user_key_;
+ std::string curr_ts_;
+ SequenceNumber current_user_key_sequence_;
+ SequenceNumber current_user_key_snapshot_;
+
+ // True if the iterator has already returned a record for the current key.
+ bool has_outputted_key_ = false;
+
+  // If true, clear the value of the next key and output it without applying
+  // any compaction rules. This is used for outputting a put after a single
+  // delete.
+ bool clear_and_output_next_key_ = false;
+
+ MergeOutputIterator merge_out_iter_;
+ // PinnedIteratorsManager used to pin input_ Iterator blocks while reading
+  // merge operands and then release them after consuming them.
+ PinnedIteratorsManager pinned_iters_mgr_;
+
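+  // Blob files with a file number strictly below this cutoff are considered
+  // old enough for blob garbage collection; see
+  // ComputeBlobGarbageCollectionCutoffFileNumber().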
+ uint64_t blob_garbage_collection_cutoff_file_number_;
+
+ std::unique_ptr<BlobFetcher> blob_fetcher_;
+ std::unique_ptr<PrefetchBufferCollection> prefetch_buffers_;
+
+ std::string blob_index_;
+ PinnableSlice blob_value_;
+ std::string compaction_filter_value_;
+ InternalKey compaction_filter_skip_until_;
+ // "level_ptrs" holds indices that remember which file of an associated
+ // level we were last checking during the last call to compaction->
+ // KeyNotExistsBeyondOutputLevel(). This allows future calls to the function
+  // to pick up where it left off, since each subcompaction's key range is
+  // increasing, so a later call to the function must be looking for a key
+  // that is in or beyond the last file checked during the previous call.
+ std::vector<size_t> level_ptrs_;
+ CompactionIterationStats iter_stats_;
+
+ // Used to avoid purging uncommitted values. The application can specify
+ // uncommitted values by providing a SnapshotChecker object.
+ bool current_key_committed_;
+
+ // Saved result of ucmp->CompareTimestamp(current_ts_, *full_history_ts_low_)
+ int cmp_with_history_ts_low_;
+
+ const int level_;
+
+ // True if the previous internal key (same user key)'s sequence number has
+ // just been zeroed out during bottommost compaction.
+ bool last_key_seq_zeroed_{false};
+
+ // True if the current key should be output to the penultimate level if
+ // possible, compaction logic makes the final decision on which level to
+ // output to.
+ bool output_to_penultimate_level_{false};
+
+ // min seqno for preserving the time information.
+ const SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+
+  // Min seqno to preclude the data from the last level. If the key's seqno is
+  // larger than this, it will be output to the penultimate level.
+ const SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+
+ void AdvanceInputIter() { input_.Next(); }
+
+ void SkipUntil(const Slice& skip_until) { input_.Seek(skip_until); }
+
+ bool IsShuttingDown() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+ }
+
+ bool IsPausingManualCompaction() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return manual_compaction_canceled_.load(std::memory_order_relaxed);
+ }
+};
+
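+// Note that these two checks are not exact complements when a
+// snapshot_checker_ is present: a CheckInSnapshot() result that is neither
+// kInSnapshot nor kNotInSnapshot (e.g., kSnapshotReleased) makes both
+// "definitely" predicates return false for the same (seq, snapshot) pair.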
+inline bool CompactionIterator::DefinitelyInSnapshot(SequenceNumber seq,
+ SequenceNumber snapshot) {
+ return ((seq) <= (snapshot) &&
+ (snapshot_checker_ == nullptr ||
+ LIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
+ SnapshotCheckerResult::kInSnapshot)));
+}
+
+inline bool CompactionIterator::DefinitelyNotInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot) {
+ return ((seq) > (snapshot) ||
+ (snapshot_checker_ != nullptr &&
+ UNLIKELY(snapshot_checker_->CheckInSnapshot((seq), (snapshot)) ==
+ SnapshotCheckerResult::kNotInSnapshot)));
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_iterator_test.cc b/src/rocksdb/db/compaction/compaction_iterator_test.cc
new file mode 100644
index 000000000..81362d792
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_iterator_test.cc
@@ -0,0 +1,1618 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/compaction/compaction_iterator.h"
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "util/vector_iterator.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Expects no merging attempts.
+class NoMergingMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* /*merge_out*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ bool PartialMergeMulti(const Slice& /*key*/,
+ const std::deque<Slice>& /*operand_list*/,
+ std::string* /*new_value*/,
+ Logger* /*logger*/) const override {
+ ADD_FAILURE();
+ return false;
+ }
+ const char* Name() const override {
+ return "CompactionIteratorTest NoMergingMergeOp";
+ }
+};
+
+// Compaction filter that gets stuck when it sees a particular key,
+// then gets unstuck when told to.
+// Always returns Decision::kRemove.
+class StallingFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ int k = std::atoi(key.ToString().c_str());
+ last_seen.store(k);
+ while (k >= stall_at.load()) {
+ std::this_thread::yield();
+ }
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest StallingFilter";
+ }
+
+ // Wait until the filter sees a key >= k and stalls at that key.
+ // If `exact`, asserts that the seen key is equal to k.
+ void WaitForStall(int k, bool exact = true) {
+ stall_at.store(k);
+ while (last_seen.load() < k) {
+ std::this_thread::yield();
+ }
+ if (exact) {
+ EXPECT_EQ(k, last_seen.load());
+ }
+ }
+
+ // Filter will stall on key >= stall_at. Advance stall_at to unstall.
+ mutable std::atomic<int> stall_at{0};
+ // Last key the filter was called with.
+ mutable std::atomic<int> last_seen{0};
+};
+
+// Compaction filter that filters out all keys.
+class FilterAllKeysCompactionFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& /*key*/, ValueType /*type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override { return "AllKeysCompactionFilter"; }
+};
+
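+// Forward-only iterator over an in-memory key/value vector that logs every
+// SeekToFirst()/Seek()/Next() call, so tests can assert the exact access
+// pattern the compaction iterator performs on its input.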
+class LoggingForwardVectorIterator : public VectorIterator {
+ public:
+ struct Action {
+ enum class Type {
+ SEEK_TO_FIRST,
+ SEEK,
+ NEXT,
+ };
+
+ Type type;
+ std::string arg;
+
+ explicit Action(Type _type, std::string _arg = "")
+ : type(_type), arg(_arg) {}
+
+ bool operator==(const Action& rhs) const {
+ return std::tie(type, arg) == std::tie(rhs.type, rhs.arg);
+ }
+ };
+
+ LoggingForwardVectorIterator(const std::vector<std::string>& keys,
+ const std::vector<std::string>& values)
+ : VectorIterator(keys, values) {
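+ // Start out invalid; tests position the input explicitly via SeekToFirst()
+ // or Seek().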
+ current_ = keys_.size();
+ }
+
+ void SeekToFirst() override {
+ log.emplace_back(Action::Type::SEEK_TO_FIRST);
+ VectorIterator::SeekToFirst();
+ }
+ void SeekToLast() override { assert(false); }
+
+ void Seek(const Slice& target) override {
+ log.emplace_back(Action::Type::SEEK, target.ToString());
+ VectorIterator::Seek(target);
+ }
+
+ void SeekForPrev(const Slice& /*target*/) override { assert(false); }
+
+ void Next() override {
+ assert(Valid());
+ log.emplace_back(Action::Type::NEXT);
+ VectorIterator::Next();
+ }
+ void Prev() override { assert(false); }
+
+ Slice key() const override {
+ assert(Valid());
+ return VectorIterator::key();
+ }
+ Slice value() const override {
+ assert(Valid());
+ return VectorIterator::value();
+ }
+
+ std::vector<Action> log;
+};
+
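+// Minimal CompactionProxy stub. Tests toggle the public flags below
+// (bottommost level, ingest-behind, per-key placement, ...) to emulate
+// different compaction configurations.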
+class FakeCompaction : public CompactionIterator::CompactionProxy {
+ public:
+ int level() const override { return 0; }
+
+ bool KeyNotExistsBeyondOutputLevel(
+ const Slice& /*user_key*/,
+ std::vector<size_t>* /*level_ptrs*/) const override {
+ return is_bottommost_level || key_not_exists_beyond_output_level;
+ }
+
+ bool bottommost_level() const override { return is_bottommost_level; }
+
+ int number_levels() const override { return 1; }
+
+ Slice GetLargestUserKey() const override {
+ return "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ }
+
+ bool allow_ingest_behind() const override { return is_allow_ingest_behind; }
+
+ bool allow_mmap_reads() const override { return false; }
+
+ bool enable_blob_garbage_collection() const override { return false; }
+
+ double blob_garbage_collection_age_cutoff() const override { return 0.0; }
+
+ uint64_t blob_compaction_readahead_size() const override { return 0; }
+
+ const Version* input_version() const override { return nullptr; }
+
+ bool DoesInputReferenceBlobFiles() const override { return false; }
+
+ const Compaction* real_compaction() const override { return nullptr; }
+
+ bool SupportsPerKeyPlacement() const override {
+ return supports_per_key_placement;
+ }
+
+ bool WithinPenultimateLevelOutputRange(const Slice& key) const override {
+ return (!key.starts_with("unsafe_pb"));
+ }
+
+ bool key_not_exists_beyond_output_level = false;
+
+ bool is_bottommost_level = false;
+
+ bool is_allow_ingest_behind = false;
+
+ bool supports_per_key_placement = false;
+};
+
+// A simplified snapshot checker which assumes each snapshot has a global
+// last visible sequence.
+class TestSnapshotChecker : public SnapshotChecker {
+ public:
+ explicit TestSnapshotChecker(
+ SequenceNumber last_committed_sequence,
+ const std::unordered_map<SequenceNumber, SequenceNumber>& snapshots =
+ {{}})
+ : last_committed_sequence_(last_committed_sequence),
+ snapshots_(snapshots) {}
+
+ SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+ if (snapshot_seq == kMaxSequenceNumber) {
+ return seq <= last_committed_sequence_
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+ assert(snapshots_.count(snapshot_seq) > 0);
+ return seq <= snapshots_.at(snapshot_seq)
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+
+ private:
+ SequenceNumber last_committed_sequence_;
+ // A map from each valid snapshot to the last sequence visible to it.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshots_;
+};
+
+// Test param:
+// bool: whether to pass snapshot_checker to compaction iterator.
+class CompactionIteratorTest : public testing::TestWithParam<bool> {
+ public:
+ CompactionIteratorTest()
+ : cmp_(BytewiseComparator()), icmp_(cmp_), snapshots_({}) {}
+
+ explicit CompactionIteratorTest(const Comparator* ucmp)
+ : cmp_(ucmp), icmp_(cmp_), snapshots_({}) {}
+
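+ // Builds the range-tombstone aggregator, optional compaction proxy and
+ // snapshot checker, the merge helper, and the CompactionIterator under test
+ // over the given keys/values and range tombstones.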
+ void InitIterators(
+ const std::vector<std::string>& ks, const std::vector<std::string>& vs,
+ const std::vector<std::string>& range_del_ks,
+ const std::vector<std::string>& range_del_vs,
+ SequenceNumber last_sequence,
+ SequenceNumber last_committed_sequence = kMaxSequenceNumber,
+ MergeOperator* merge_op = nullptr, CompactionFilter* filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ bool key_not_exists_beyond_output_level = false,
+ const std::string* full_history_ts_low = nullptr) {
+ std::unique_ptr<InternalIterator> unfragmented_range_del_iter(
+ new VectorIterator(range_del_ks, range_del_vs, &icmp_));
+ auto tombstone_list = std::make_shared<FragmentedRangeTombstoneList>(
+ std::move(unfragmented_range_del_iter), icmp_);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ new FragmentedRangeTombstoneIterator(tombstone_list, icmp_,
+ kMaxSequenceNumber));
+ range_del_agg_.reset(new CompactionRangeDelAggregator(&icmp_, snapshots_));
+ range_del_agg_->AddTombstones(std::move(range_del_iter));
+
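+ // Only create a compaction proxy when the test needs compaction context:
+ // a compaction filter, a bottommost-level output, or output-level info.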
+ std::unique_ptr<CompactionIterator::CompactionProxy> compaction;
+ if (filter || bottommost_level || key_not_exists_beyond_output_level) {
+ compaction_proxy_ = new FakeCompaction();
+ compaction_proxy_->is_bottommost_level = bottommost_level;
+ compaction_proxy_->is_allow_ingest_behind = AllowIngestBehind();
+ compaction_proxy_->key_not_exists_beyond_output_level =
+ key_not_exists_beyond_output_level;
+ compaction_proxy_->supports_per_key_placement = SupportsPerKeyPlacement();
+ compaction.reset(compaction_proxy_);
+ }
+ bool use_snapshot_checker = UseSnapshotChecker() || GetParam();
+ if (use_snapshot_checker || last_committed_sequence < kMaxSequenceNumber) {
+ snapshot_checker_.reset(
+ new TestSnapshotChecker(last_committed_sequence, snapshot_map_));
+ }
+ merge_helper_.reset(
+ new MergeHelper(Env::Default(), cmp_, merge_op, filter, nullptr, false,
+ 0 /*latest_snapshot*/, snapshot_checker_.get(),
+ 0 /*level*/, nullptr /*statistics*/, &shutting_down_));
+
+ if (c_iter_) {
+ // Since iter_ is still used in ~CompactionIterator(), we call
+ // ~CompactionIterator() first.
+ c_iter_.reset();
+ }
+ iter_.reset(new LoggingForwardVectorIterator(ks, vs));
+ iter_->SeekToFirst();
+ c_iter_.reset(new CompactionIterator(
+ iter_.get(), cmp_, merge_helper_.get(), last_sequence, &snapshots_,
+ earliest_write_conflict_snapshot, kMaxSequenceNumber,
+ snapshot_checker_.get(), Env::Default(),
+ false /* report_detailed_time */, false, range_del_agg_.get(),
+ nullptr /* blob_file_builder */, true /*allow_data_in_errors*/,
+ true /*enforce_single_del_contracts*/,
+ /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_,
+ std::move(compaction), filter, &shutting_down_, /*info_log=*/nullptr,
+ full_history_ts_low));
+ }
+
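+ // Registers a snapshot; for the snapshot-checker tests, `last_visible_seq`
+ // is the last sequence number visible to that snapshot.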
+ void AddSnapshot(SequenceNumber snapshot,
+ SequenceNumber last_visible_seq = kMaxSequenceNumber) {
+ snapshots_.push_back(snapshot);
+ snapshot_map_[snapshot] = last_visible_seq;
+ }
+
+ virtual bool UseSnapshotChecker() const { return false; }
+
+ virtual bool AllowIngestBehind() const { return false; }
+
+ virtual bool SupportsPerKeyPlacement() const { return false; }
+
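+ // Feeds `input_keys`/`input_values` through a fresh CompactionIterator and
+ // asserts that it emits exactly `expected_keys`/`expected_values`, in order.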
+ void RunTest(
+ const std::vector<std::string>& input_keys,
+ const std::vector<std::string>& input_values,
+ const std::vector<std::string>& expected_keys,
+ const std::vector<std::string>& expected_values,
+ SequenceNumber last_committed_seq = kMaxSequenceNumber,
+ MergeOperator* merge_operator = nullptr,
+ CompactionFilter* compaction_filter = nullptr,
+ bool bottommost_level = false,
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ bool key_not_exists_beyond_output_level = false,
+ const std::string* full_history_ts_low = nullptr) {
+ InitIterators(input_keys, input_values, {}, {}, kMaxSequenceNumber,
+ last_committed_seq, merge_operator, compaction_filter,
+ bottommost_level, earliest_write_conflict_snapshot,
+ key_not_exists_beyond_output_level, full_history_ts_low);
+ c_iter_->SeekToFirst();
+ for (size_t i = 0; i < expected_keys.size(); i++) {
+ std::string info = "i = " + std::to_string(i);
+ ASSERT_TRUE(c_iter_->Valid()) << info;
+ ASSERT_OK(c_iter_->status()) << info;
+ ASSERT_EQ(expected_keys[i], c_iter_->key().ToString()) << info;
+ ASSERT_EQ(expected_values[i], c_iter_->value().ToString()) << info;
+ c_iter_->Next();
+ }
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+ }
+
+ void ClearSnapshots() {
+ snapshots_.clear();
+ snapshot_map_.clear();
+ }
+
+ const Comparator* cmp_;
+ const InternalKeyComparator icmp_;
+ std::vector<SequenceNumber> snapshots_;
+ // A map from each valid snapshot to the last sequence visible to it.
+ std::unordered_map<SequenceNumber, SequenceNumber> snapshot_map_;
+ std::unique_ptr<MergeHelper> merge_helper_;
+ std::unique_ptr<LoggingForwardVectorIterator> iter_;
+ std::unique_ptr<CompactionIterator> c_iter_;
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_;
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+ std::atomic<bool> shutting_down_{false};
+ const std::atomic<bool> kManualCompactionCanceledFalse_{false};
+ FakeCompaction* compaction_proxy_ = nullptr;
+};
+
+// It is possible that the output of the compaction iterator is empty even if
+// the input is not.
+TEST_P(CompactionIteratorTest, EmptyResult) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue)},
+ {"", "val"}, {}, {}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+// If there is a corruption after a single deletion, the corrupted key should
+// be preserved.
+TEST_P(CompactionIteratorTest, CorruptionAfterSingleDeletion) {
+ InitIterators({test::KeyStr("a", 5, kTypeSingleDeletion),
+ test::KeyStr("a", 3, kTypeValue, true),
+ test::KeyStr("b", 10, kTypeValue)},
+ {"", "val", "val2"}, {}, {}, 10);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 5, kTypeSingleDeletion),
+ c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 3, kTypeValue, true), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 10, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, SimpleRangeDeletion) {
+ InitIterators({test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("morning", 2, kTypeValue),
+ test::KeyStr("night", 3, kTypeValue)},
+ {"zao", "zao", "wan"},
+ {test::KeyStr("ma", 4, kTypeRangeDeletion)}, {"mz"}, 5);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 3, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, RangeDeletionWithSnapshots) {
+ AddSnapshot(10);
+ std::vector<std::string> ks1;
+ ks1.push_back(test::KeyStr("ma", 28, kTypeRangeDeletion));
+ std::vector<std::string> vs1{"mz"};
+ std::vector<std::string> ks2{test::KeyStr("morning", 15, kTypeValue),
+ test::KeyStr("morning", 5, kTypeValue),
+ test::KeyStr("night", 40, kTypeValue),
+ test::KeyStr("night", 20, kTypeValue)};
+ std::vector<std::string> vs2{"zao 15", "zao 5", "wan 40", "wan 20"};
+ InitIterators(ks2, vs2, ks1, vs1, 40);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("morning", 5, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("night", 40, kTypeValue), c_iter_->key().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(CompactionIteratorTest, CompactionFilterSkipUntil) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* skip_until) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+ // See the InitIterators() call below for the sequence of keys and their
+ // filtering decisions. Here we assert that the compaction filter is called
+ // only with the expected keys, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("av50", v);
+ return Decision::kKeep;
+ }
+ if (k == "b") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("bv60", v);
+ *skip_until = "d+";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "e") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("em71", v);
+ return Decision::kKeep;
+ }
+ if (k == "f") {
+ if (v == "fm65") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "f";
+ } else {
+ EXPECT_EQ("fm30", v);
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ *skip_until = "g+";
+ }
+ return Decision::kRemoveAndSkipUntil;
+ }
+ if (k == "h") {
+ EXPECT_EQ(ValueType::kValue, t);
+ EXPECT_EQ("hv91", v);
+ return Decision::kKeep;
+ }
+ if (k == "i") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("im95", v);
+ *skip_until = "z";
+ return Decision::kRemoveAndSkipUntil;
+ }
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.CompactionFilterSkipUntil::Filter";
+ }
+ };
+
+ NoMergingMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ {test::KeyStr("a", 50, kTypeValue), // keep
+ test::KeyStr("a", 45, kTypeMerge),
+ test::KeyStr("b", 60, kTypeValue), // skip to "d+"
+ test::KeyStr("b", 40, kTypeValue), test::KeyStr("c", 35, kTypeValue),
+ test::KeyStr("d", 70, kTypeMerge),
+ test::KeyStr("e", 71, kTypeMerge), // keep
+ test::KeyStr("f", 65, kTypeMerge), // skip to "f", aka keep
+ test::KeyStr("f", 30, kTypeMerge), // skip to "g+"
+ test::KeyStr("f", 25, kTypeValue), test::KeyStr("g", 90, kTypeValue),
+ test::KeyStr("h", 91, kTypeValue), // keep
+ test::KeyStr("i", 95, kTypeMerge), // skip to "z"
+ test::KeyStr("j", 99, kTypeValue)},
+ {"av50", "am45", "bv60", "bv40", "cv35", "dm70", "em71", "fm65", "fm30",
+ "fv25", "gv90", "hv91", "im95", "jv99"},
+ {}, {}, kMaxSequenceNumber, kMaxSequenceNumber, &merge_op, &filter);
+
+ // Compaction should output just "a", "e" and "h" keys.
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("av50", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("e", 71, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("em71", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("h", 91, kTypeValue), c_iter_->key().ToString());
+ ASSERT_EQ("hv91", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+
+ // Check that the compaction iterator did the correct sequence of calls on
+ // the underlying iterator.
+ using A = LoggingForwardVectorIterator::Action;
+ using T = A::Type;
+ std::vector<A> expected_actions = {
+ A(T::SEEK_TO_FIRST),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("d+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("g+", kMaxSequenceNumber, kValueTypeForSeek)),
+ A(T::NEXT),
+ A(T::SEEK, test::KeyStr("z", kMaxSequenceNumber, kValueTypeForSeek))};
+ ASSERT_EQ(expected_actions, iter_->log);
+}
+
+TEST_P(CompactionIteratorTest, ShuttingDownInFilter) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeValue),
+ test::KeyStr("3", 3, kTypeValue), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ // Don't leave tombstones (kTypeDeletion) for filtered keys.
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ EXPECT_FALSE(c_iter_->Valid());
+ EXPECT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+// Same as ShuttingDownInFilter, but shutdown happens during filter call for
+// a merge operand, not for a value.
+TEST_P(CompactionIteratorTest, ShuttingDownInMerge) {
+ NoMergingMergeOp merge_op;
+ StallingFilter filter;
+ InitIterators(
+ {test::KeyStr("1", 1, kTypeValue), test::KeyStr("2", 2, kTypeMerge),
+ test::KeyStr("3", 3, kTypeMerge), test::KeyStr("4", 4, kTypeValue)},
+ {"v1", "v2", "v3", "v4"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ &merge_op, &filter);
+ compaction_proxy_->key_not_exists_beyond_output_level = true;
+
+ std::atomic<bool> seek_done{false};
+ ROCKSDB_NAMESPACE::port::Thread compaction_thread([&] {
+ c_iter_->SeekToFirst();
+ ASSERT_FALSE(c_iter_->Valid());
+ ASSERT_TRUE(c_iter_->status().IsShutdownInProgress());
+ seek_done.store(true);
+ });
+
+ // Let key 1 through.
+ filter.WaitForStall(1);
+
+ // Shutdown during compaction filter call for key 2.
+ filter.WaitForStall(2);
+ shutting_down_.store(true);
+ EXPECT_FALSE(seek_done.load());
+
+ // Unstall filter and wait for SeekToFirst() to return.
+ filter.stall_at.store(3);
+ compaction_thread.join();
+ assert(seek_done.load());
+
+ // Check that filter was never called again.
+ EXPECT_EQ(2, filter.last_seen.load());
+}
+
+TEST_P(CompactionIteratorTest, SingleMergeOperand) {
+ class Filter : public CompactionFilter {
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType t,
+ const Slice& existing_value, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ std::string k = key.ToString();
+ std::string v = existing_value.ToString();
+
+ // See the InitIterators() call below for the sequence of keys and their
+ // filtering decisions. Here we assert that the compaction filter is called
+ // only with the expected keys, and with the right values.
+ if (k == "a") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ EXPECT_EQ("av1", v);
+ return Decision::kKeep;
+ } else if (k == "b") {
+ EXPECT_EQ(ValueType::kMergeOperand, t);
+ return Decision::kKeep;
+ } else if (k == "c") {
+ return Decision::kKeep;
+ }
+
+ ADD_FAILURE();
+ return Decision::kKeep;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest.SingleMergeOperand::Filter";
+ }
+ };
+
+ class SingleMergeOp : public MergeOperator {
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ // See InitIterators() call below for why "c" is the only key for which
+ // FullMergeV2 should be called.
+ EXPECT_EQ("c", merge_in.key.ToString());
+
+ std::string temp_value;
+ if (merge_in.existing_value != nullptr) {
+ temp_value = merge_in.existing_value->ToString();
+ }
+
+ for (auto& operand : merge_in.operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ merge_out->new_value = temp_value;
+
+ return true;
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* /*logger*/) const override {
+ std::string string_key = key.ToString();
+ EXPECT_TRUE(string_key == "a" || string_key == "b");
+
+ if (string_key == "a") {
+ EXPECT_EQ(1, operand_list.size());
+ } else if (string_key == "b") {
+ EXPECT_EQ(2, operand_list.size());
+ }
+
+ std::string temp_value;
+ for (auto& operand : operand_list) {
+ temp_value.append(operand.ToString());
+ }
+ swap(temp_value, *new_value);
+
+ return true;
+ }
+
+ const char* Name() const override {
+ return "CompactionIteratorTest SingleMergeOp";
+ }
+
+ bool AllowSingleOperand() const override { return true; }
+ };
+
+ SingleMergeOp merge_op;
+ Filter filter;
+ InitIterators(
+ // a should invoke PartialMergeMulti with a single merge operand.
+ {test::KeyStr("a", 50, kTypeMerge),
+ // b should invoke PartialMergeMulti with two operands.
+ test::KeyStr("b", 70, kTypeMerge), test::KeyStr("b", 60, kTypeMerge),
+ // c should invoke FullMergeV2 because its merge chain hits a kTypeValue base.
+ test::KeyStr("c", 90, kTypeMerge), test::KeyStr("c", 80, kTypeValue)},
+ {"av1", "bv2", "bv1", "cv2", "cv1"}, {}, {}, kMaxSequenceNumber,
+ kMaxSequenceNumber, &merge_op, &filter);
+
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), c_iter_->key().ToString());
+ ASSERT_EQ("av1", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ("bv1bv2", c_iter_->value().ToString());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_EQ("cv1cv2", c_iter_->value().ToString());
+}
+
+// In the bottommost level, values earlier than the earliest snapshot can be
+// output with sequence number 0.
+TEST_P(CompactionIteratorTest, ZeroOutSequenceAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+ {"v1", "v2"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue)},
+ {"v1", "v2"}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// In the bottommost level, deletions earlier than the earliest snapshot can be
+// removed permanently.
+TEST_P(CompactionIteratorTest, RemoveDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest(
+ {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 3, kTypeDeletion),
+ test::KeyStr("b", 1, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("b", 3, kTypeDeletion), test::KeyStr("b", 0, kTypeValue)},
+ {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// In the bottommost level, single deletions earlier than the earliest snapshot
+// can be removed permanently.
+TEST_P(CompactionIteratorTest, RemoveSingleDeletionAtBottomLevel) {
+ AddSnapshot(1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion)},
+ {"", ""}, {test::KeyStr("b", 2, kTypeSingleDeletion)}, {""},
+ kMaxSequenceNumber /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, true /*bottommost_level*/);
+}
+
+TEST_P(CompactionIteratorTest, ConvertToPutAtBottom) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+ test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+ {"a4", "a3", "a2", "b1"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 0, kTypeValue)},
+ {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+ merge_op.get(), nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTestInstance, CompactionIteratorTest,
+ testing::Values(true, false));
+
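+// Tests per-key placement: with SupportsPerKeyPlacement() == true, the
+// compaction iterator decides per key whether it should be output to the
+// penultimate level or to the last level.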
+class PerKeyPlacementCompIteratorTest : public CompactionIteratorTest {
+ public:
+ bool SupportsPerKeyPlacement() const override { return true; }
+};
+
+TEST_P(PerKeyPlacementCompIteratorTest, SplitLastLevelData) {
+ std::atomic_uint64_t latest_cold_seq = 0;
+
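+ // Route keys with seq num > latest_cold_seq ("hot" data) to the penultimate
+ // level via the PrepareOutput sync point.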
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ latest_cold_seq = 5;
+
+ InitIterators(
+ {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeValue),
+ test::KeyStr("c", 5, kTypeValue)},
+ {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ nullptr, nullptr, true);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+
+ // The first 2 keys are hot, so they should have
+ // `output_to_penultimate_level()==true` and their seq nums not zeroed out.
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 6, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ // `c` is cold data, so it should be output to the bottommost level.
+ ASSERT_EQ(test::KeyStr("c", 0, kTypeValue), c_iter_->key().ToString());
+ ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, SnapshotData) {
+ AddSnapshot(5);
+
+ InitIterators(
+ {test::KeyStr("a", 7, kTypeValue), test::KeyStr("b", 6, kTypeDeletion),
+ test::KeyStr("b", 5, kTypeValue)},
+ {"vala", "", "valb"}, {}, {}, kMaxSequenceNumber, kMaxSequenceNumber,
+ nullptr, nullptr, true);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+
+ // The first key and the tombstone are within the snapshot, so they should be
+ // output to the penultimate level (and their seq nums cannot be zeroed out).
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ ASSERT_EQ(test::KeyStr("b", 6, kTypeDeletion), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->Valid());
+ // `b` at seq 5 is not protected by the snapshot; its sequence number is
+ // zeroed out and it should be output to the bottommost level.
+ ASSERT_EQ(test::KeyStr("b", 0, kTypeValue), c_iter_->key().ToString());
+ ASSERT_FALSE(c_iter_->output_to_penultimate_level());
+ c_iter_->Next();
+ ASSERT_OK(c_iter_->status());
+ ASSERT_FALSE(c_iter_->Valid());
+}
+
+TEST_P(PerKeyPlacementCompIteratorTest, ConflictWithSnapshot) {
+ std::atomic_uint64_t latest_cold_seq = 0;
+
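+ // Same hot/cold routing as in SplitLastLevelData, driven by latest_cold_seq.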
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ latest_cold_seq = 6;
+
+ AddSnapshot(5);
+
+ InitIterators({test::KeyStr("a", 7, kTypeValue),
+ test::KeyStr("unsafe_pb", 6, kTypeValue),
+ test::KeyStr("c", 5, kTypeValue)},
+ {"vala", "valb", "valc"}, {}, {}, kMaxSequenceNumber,
+ kMaxSequenceNumber, nullptr, nullptr, true);
+ c_iter_->SeekToFirst();
+ ASSERT_TRUE(c_iter_->Valid());
+
+ ASSERT_EQ(test::KeyStr("a", 7, kTypeValue), c_iter_->key().ToString());
+ ASSERT_TRUE(c_iter_->output_to_penultimate_level());
+ // The 2nd key is unsafe to output to the penultimate level, but it is within
+ // a snapshot, so the per_key_placement feature would have to output it to the
+ // penultimate level anyway, which is reported as a corruption. We should
+ // never see such a case in practice: data with a nonzero seq num (within a
+ // snapshot) always comes from a higher compaction input level, which makes it
+ // safe to output to the penultimate level.
+ c_iter_->Next();
+ ASSERT_TRUE(c_iter_->status().IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompIteratorTest,
+ PerKeyPlacementCompIteratorTest,
+ testing::Values(true, false));
+
+// Tests how CompactionIterator works together with SnapshotChecker.
+class CompactionIteratorWithSnapshotCheckerTest
+ : public CompactionIteratorTest {
+ public:
+ bool UseSnapshotChecker() const override { return true; }
+};
+
+// Uncommitted keys (keys with seq > last_committed_seq) should be output
+// as-is, while committed versions of these keys should get compacted as usual.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Value) {
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeValue), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Deletion) {
+ RunTest({test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_Merge) {
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeValue)},
+ {"v3", "v1,v2"}, 2 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_SingleDelete) {
+ RunTest({test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("foo", 2, kTypeSingleDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"", "v1"}, 1 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ PreserveUncommittedKeys_BlobIndex) {
+ RunTest({test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex)},
+ {"v3", "v2"}, 2 /*last_committed_seq*/);
+}
+
+// Test that the compaction iterator dedups keys visible to the same snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Value) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 3, kTypeValue),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Deletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeDeletion),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_Merge) {
+ AddSnapshot(2, 1);
+ AddSnapshot(4, 3);
+ auto merge_op = MergeOperators::CreateStringAppendOperator();
+ RunTest(
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 2, kTypeMerge),
+ test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 5, kTypeMerge), test::KeyStr("foo", 4, kTypeMerge),
+ test::KeyStr("foo", 3, kTypeMerge), test::KeyStr("foo", 1, kTypeValue)},
+ {"v5", "v4", "v2,v3", "v1"}, 4 /*last_committed_seq*/, merge_op.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ DedupSameSnapshot_SingleDeletion) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("foo", 4, kTypeValue),
+ test::KeyStr("foo", 3, kTypeSingleDeletion),
+ test::KeyStr("foo", 2, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeValue), test::KeyStr("foo", 1, kTypeValue)},
+ {"v4", "v1"}, 3 /*last_committed_seq*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, DedupSameSnapshot_BlobIndex) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 2, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v2", "v1"},
+ {test::KeyStr("foo", 4, kTypeBlobIndex),
+ test::KeyStr("foo", 3, kTypeBlobIndex),
+ test::KeyStr("foo", 1, kTypeBlobIndex)},
+ {"v4", "v3", "v1"}, 3 /*last_committed_seq*/);
+}
+
+// At the bottommost level, sequence numbers can be zeroed out and deletions
+// can be removed, but only when they are visible to the earliest snapshot.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotZeroOutSequenceIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+ {"v1", "v2", "v3"},
+ {test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 2, kTypeValue),
+ test::KeyStr("c", 3, kTypeValue)},
+ {"v1", "v2", "v3"}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest(
+ {test::KeyStr("a", 1, kTypeDeletion), test::KeyStr("b", 2, kTypeDeletion),
+ test::KeyStr("c", 3, kTypeDeletion)},
+ {"", "", ""}, {}, {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveDeletionIfValuePresentToEarlierSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 4, kTypeDeletion),
+ test::KeyStr("a", 1, kTypeValue), test::KeyStr("b", 3, kTypeValue)},
+ {"", "", ""},
+ {test::KeyStr("a", 4, kTypeDeletion),
+ test::KeyStr("a", 0, kTypeValue), test::KeyStr("b", 3, kTypeValue)},
+ {"", "", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ NotRemoveSingleDeletionIfNotVisibleToEarliestSnapshot) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 1, kTypeSingleDeletion),
+ test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+ {"", "", ""},
+ {test::KeyStr("b", 2, kTypeSingleDeletion),
+ test::KeyStr("c", 3, kTypeSingleDeletion)},
+ {"", ""}, kMaxSequenceNumber /*last_committed_seq*/,
+ nullptr /*merge_operator*/, nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+// A single delete should not cancel out values that are not visible to the
+// same set of snapshots.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ SingleDeleteAcrossSnapshotBoundary) {
+ AddSnapshot(2, 1);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"}, 2 /*last_committed_seq*/);
+}
+
+// A single delete should be kept if it is not visible to the earliest write
+// conflict snapshot. If a single delete is kept for this reason, the
+// corresponding value can be trimmed to save space.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, false /*bottommost_level*/,
+ 2 /*earliest_write_conflict_snapshot*/);
+}
+
+// Same as above but with a blob index. In addition to the value getting
+// trimmed, the type of the KV is changed to kTypeValue.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking_BlobIndex) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeBlobIndex)},
+ {"", "fake_blob_index"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ nullptr /*compaction_filter*/, false /*bottommost_level*/,
+ 2 /*earliest_write_conflict_snapshot*/);
+}
+
+// Same as above but with a wide-column entity. In addition to the value getting
+// trimmed, the type of the KV is changed to kTypeValue.
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ KeepSingleDeletionForWriteConflictChecking_WideColumnEntity) {
+ AddSnapshot(2, 0);
+ RunTest({test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeWideColumnEntity)},
+ {"", "fake_entity"},
+ {test::KeyStr("a", 2, kTypeSingleDeletion),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"", ""}, 2 /* last_committed_seq */, nullptr /* merge_operator */,
+ nullptr /* compaction_filter */, false /* bottommost_level */,
+ 2 /* earliest_write_conflict_snapshot */);
+}
+
+// The compaction filter should keep uncommitted keys as-is, and
+// * convert the latest value to a deletion, and/or
+// * if the latest value is a merge, apply the filter to all subsequent merges.
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Value) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeValue),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeValue)},
+ {"v2", "v1", "v3", "v4"},
+ {test::KeyStr("a", 2, kTypeValue), test::KeyStr("a", 1, kTypeDeletion),
+ test::KeyStr("b", 3, kTypeValue), test::KeyStr("c", 1, kTypeDeletion)},
+ {"v2", "", "v3", ""}, 1 /*last_committed_seq*/,
+ nullptr /*merge_operator*/, compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_Deletion) {
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 2, kTypeDeletion), test::KeyStr("a", 1, kTypeValue)},
+ {"", "v1"},
+ {test::KeyStr("a", 2, kTypeDeletion),
+ test::KeyStr("a", 1, kTypeDeletion)},
+ {"", ""}, 1 /*last_committed_seq*/, nullptr /*merge_operator*/,
+ compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest,
+ CompactionFilter_PartialMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest({test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeMerge)},
+ {"v3", "v2", "v1"}, {test::KeyStr("a", 3, kTypeMerge)}, {"v3"},
+ 2 /*last_committed_seq*/, merge_op.get(), compaction_filter.get());
+}
+
+TEST_F(CompactionIteratorWithSnapshotCheckerTest, CompactionFilter_FullMerge) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ std::unique_ptr<CompactionFilter> compaction_filter(
+ new FilterAllKeysCompactionFilter());
+ RunTest(
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 2, kTypeMerge),
+ test::KeyStr("a", 1, kTypeValue)},
+ {"v3", "v2", "v1"},
+ {test::KeyStr("a", 3, kTypeMerge), test::KeyStr("a", 1, kTypeDeletion)},
+ {"v3", ""}, 2 /*last_committed_seq*/, merge_op.get(),
+ compaction_filter.get());
+}
+
+// Tests how CompactionIterator works together with AllowIngestBehind.
+class CompactionIteratorWithAllowIngestBehindTest
+ : public CompactionIteratorTest {
+ public:
+ bool AllowIngestBehind() const override { return true; }
+};
+
+// When allow_ingest_behind is set, the compaction iterator does not treat the
+// output as the bottommost level, since there is no guarantee that further
+// data won't be ingested underneath the compaction output in the future.
+TEST_P(CompactionIteratorWithAllowIngestBehindTest, NoConvertToPutAtBottom) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+ test::KeyStr("a", 2, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+ {"a4", "a3", "a2", "b1"},
+ {test::KeyStr("a", 4, kTypeMerge), test::KeyStr("b", 1, kTypeValue)},
+ {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+ merge_op.get(), nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+TEST_P(CompactionIteratorWithAllowIngestBehindTest,
+ MergeToPutIfEncounteredPutAtBottom) {
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendOperator();
+ RunTest({test::KeyStr("a", 4, kTypeMerge), test::KeyStr("a", 3, kTypeMerge),
+ test::KeyStr("a", 2, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+ {"a4", "a3", "a2", "b1"},
+ {test::KeyStr("a", 4, kTypeValue), test::KeyStr("b", 1, kTypeValue)},
+ {"a2,a3,a4", "b1"}, kMaxSequenceNumber /*last_committed_seq*/,
+ merge_op.get(), nullptr /*compaction_filter*/,
+ true /*bottommost_level*/);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorWithAllowIngestBehindTestInstance,
+ CompactionIteratorWithAllowIngestBehindTest,
+ testing::Values(true, false));
+
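+// Tests garbage collection of keys with user-defined timestamps: entries with
+// timestamps older than full_history_ts_low may have their timestamps (and
+// sequence numbers) stripped, or may be dropped entirely.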
+class CompactionIteratorTsGcTest : public CompactionIteratorTest {
+ public:
+ CompactionIteratorTsGcTest()
+ : CompactionIteratorTest(test::BytewiseComparatorWithU64TsWrapper()) {}
+};
+
+TEST_P(CompactionIteratorTsGcTest, NoKeyEligibleForGC) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeValue),
+ test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3,
+ kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+ const std::vector<std::string> input_values = {"a3", "", "b2"};
+ std::string full_history_ts_low;
+ // All keys' timestamps are newer than or equal to 102, thus none of them
+ // will be eligible for GC.
+ PutFixed64(&full_history_ts_low, 102);
+ const std::vector<std::string>& expected_keys = input_keys;
+ const std::vector<std::string>& expected_values = input_values;
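+ // Each pair is (bottommost_level, key_not_exists_beyond_output_level).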
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const std::pair<bool, bool>& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, NoMergeEligibleForGc) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(10002, user_key, 102, kTypeMerge),
+ test::KeyStr(10001, user_key, 101, kTypeMerge),
+ test::KeyStr(10000, user_key, 100, kTypeValue)};
+ const std::vector<std::string> input_values = {"2", "1", "a0"};
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendTESTOperator();
+ const std::vector<std::string>& expected_keys = input_keys;
+ const std::vector<std::string>& expected_values = input_values;
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const auto& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+ /*compaction_filter=*/nullptr, bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level,
+ /*full_history_ts_low=*/nullptr);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, AllKeysOlderThanThreshold) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4,
+ kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key[0], /*seq=*/2, kTypeValue),
+ test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "a1", "b5"};
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+ {
+ // With a snapshot at seq 3, both the deletion marker and the key at 3 must
+ // be preserved.
+ AddSnapshot(3);
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[3]};
+ const std::vector<std::string> expected_values = {"", "a2", "b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ ClearSnapshots();
+ }
+ {
+ // No snapshot; the deletion marker should be preserved because the user
+ // key may exist beyond the output level.
+ const std::vector<std::string> expected_keys = {input_keys[0],
+ input_keys[3]};
+ const std::vector<std::string> expected_values = {"", "b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+ {
+ // No snapshot; the deletion marker can be dropped because the user key
+ // does not exist beyond the output level.
+ const std::vector<std::string> expected_keys = {input_keys[3]};
+ const std::vector<std::string> expected_values = {"b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SomeMergesOlderThanThreshold) {
+ constexpr char user_key[][2] = {"a", "f"};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge),
+ test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge),
+ test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge),
+ test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue),
+ test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge),
+ test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge),
+ test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600,
+ kTypeDeletionWithTimestamp)};
+ const std::vector<std::string> input_values = {"25", "19", "18", "16",
+ "19", "17", ""};
+ std::shared_ptr<MergeOperator> merge_op =
+ MergeOperators::CreateStringAppendTESTOperator();
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 20000);
+
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+
+ {
+ AddSnapshot(1600);
+ AddSnapshot(1900);
+ const std::vector<std::string> expected_keys = {
+ test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeMerge),
+ test::KeyStr(/*ts=*/19000, user_key[0], /*seq=*/2300, kTypeMerge),
+ test::KeyStr(/*ts=*/18000, user_key[0], /*seq=*/1800, kTypeMerge),
+ test::KeyStr(/*ts=*/16000, user_key[0], /*seq=*/1600, kTypeValue),
+ test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeMerge),
+ test::KeyStr(/*ts=*/17000, user_key[1], /*seq=*/1700, kTypeMerge),
+ test::KeyStr(/*ts=*/15000, user_key[1], /*seq=*/1600,
+ kTypeDeletionWithTimestamp)};
+ const std::vector<std::string> expected_values = {"25", "19", "18", "16",
+ "19", "17", ""};
+ for (const auto& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ auto expected_keys_copy = expected_keys;
+ auto expected_values_copy = expected_values;
+ if (bottommost_level || key_not_exists_beyond_output_level) {
+ // The kTypeDeletionWithTimestamp entry will be dropped.
+ expected_keys_copy.pop_back();
+ expected_values_copy.pop_back();
+ if (bottommost_level) {
+ // ts and seq are zeroed out at the bottommost level
+ expected_keys_copy[3] =
+ test::KeyStr(/*ts=*/0, user_key[0], /*seq=*/0, kTypeValue);
+ }
+ }
+ RunTest(input_keys, input_values, expected_keys_copy,
+ expected_values_copy,
+ /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+ /*compaction_filter=*/nullptr, bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+ ClearSnapshots();
+ }
+
+ // No snapshots
+ {
+ const std::vector<std::string> expected_keys = {
+ test::KeyStr(/*ts=*/25000, user_key[0], /*seq=*/2500, kTypeValue),
+ test::KeyStr(/*ts=*/19000, user_key[1], /*seq=*/2000, kTypeValue)};
+ const std::vector<std::string> expected_values = {"16,18,19,25", "17,19"};
+ for (const auto& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ auto expected_keys_copy = expected_keys;
+ auto expected_values_copy = expected_values;
+ if (bottommost_level) {
+ expected_keys_copy[1] =
+ test::KeyStr(/*ts=*/0, user_key[1], /*seq=*/0, kTypeValue);
+ }
+ RunTest(input_keys, input_values, expected_keys_copy,
+ expected_values_copy,
+ /*last_committed_seq=*/kMaxSequenceNumber, merge_op.get(),
+ /*compaction_filter=*/nullptr, bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, NewHidesOldSameSnapshot) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeValue),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "a1", "a0"};
+ {
+ std::string full_history_ts_low;
+ // Keys whose timestamps are larger than or equal to 102 will be preserved.
+ PutFixed64(&full_history_ts_low, 102);
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[2]};
+ const std::vector<std::string> expected_values = {"", input_values[1],
+ input_values[2]};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, DropTombstones) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+ const std::vector<std::string> expected_keys = {input_keys[0], input_keys[1]};
+ const std::vector<std::string> expected_values = {"", "a2"};
+
+ // Take a snapshot at seq 2.
+ AddSnapshot(2);
+
+ {
+ // Non-bottommost level, but the key does not exist beyond the output level.
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+ }
+ {
+ // Bottommost level
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/true,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, RewriteTs) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeDeletionWithTimestamp),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[2],
+ test::KeyStr(/*ts=*/0, user_key, /*seq=*/0, kTypeValue)};
+ const std::vector<std::string> expected_values = {"", "a2", "", "a0"};
+
+ AddSnapshot(1);
+ AddSnapshot(2);
+
+ {
+ // Bottommost level; both ts and seq need to be rewritten.
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/true,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/true, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteNoKeyEligibleForGC) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/104, user_key[0], /*seq=*/4, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/102, user_key[1], /*seq=*/2, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a3", "b2"};
+ std::string full_history_ts_low;
+ // All keys' timestamps are newer than or equal to 102, thus none of them
+ // will be eligible for GC.
+ PutFixed64(&full_history_ts_low, 102);
+ const std::vector<std::string>& expected_keys = input_keys;
+ const std::vector<std::string>& expected_values = input_values;
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const std::pair<bool, bool>& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteDropTombstones) {
+ constexpr char user_key[] = "a";
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key, /*seq=*/4, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/102, user_key, /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/101, user_key, /*seq=*/2, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/100, user_key, /*seq=*/1, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "", "a0"};
+ const std::vector<std::string> expected_keys = {input_keys[0], input_keys[1]};
+ const std::vector<std::string> expected_values = {"", "a2"};
+
+ // Take a snapshot at seq 2.
+ AddSnapshot(2);
+ {
+ const std::vector<std::pair<bool, bool>> params = {
+ {false, false}, {false, true}, {true, true}};
+ for (const std::pair<bool, bool>& param : params) {
+ const bool bottommost_level = param.first;
+ const bool key_not_exists_beyond_output_level = param.second;
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 102);
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ bottommost_level,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ key_not_exists_beyond_output_level, &full_history_ts_low);
+ }
+ }
+}
+
+TEST_P(CompactionIteratorTsGcTest, SingleDeleteAllKeysOlderThanThreshold) {
+ constexpr char user_key[][2] = {{'a', '\0'}, {'b', '\0'}};
+ const std::vector<std::string> input_keys = {
+ test::KeyStr(/*ts=*/103, user_key[0], /*seq=*/4, kTypeSingleDeletion),
+ test::KeyStr(/*ts=*/102, user_key[0], /*seq=*/3, kTypeValue),
+ test::KeyStr(/*ts=*/104, user_key[1], /*seq=*/5, kTypeValue)};
+ const std::vector<std::string> input_values = {"", "a2", "b5"};
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+ {
+ // With a snapshot at seq 3, both the deletion marker and the key at 3 must
+ // be preserved.
+ AddSnapshot(3);
+ const std::vector<std::string> expected_keys = {
+ input_keys[0], input_keys[1], input_keys[2]};
+ const std::vector<std::string> expected_values = {"", "a2", "b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ ClearSnapshots();
+ }
+ {
+ // No snapshot.
+ const std::vector<std::string> expected_keys = {input_keys[2]};
+ const std::vector<std::string> expected_values = {"b5"};
+ RunTest(input_keys, input_values, expected_keys, expected_values,
+ /*last_committed_seq=*/kMaxSequenceNumber,
+ /*merge_operator=*/nullptr, /*compaction_filter=*/nullptr,
+ /*bottommost_level=*/false,
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ /*key_not_exists_beyond_output_level=*/false, &full_history_ts_low);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionIteratorTsGcTestInstance,
+ CompactionIteratorTsGcTest,
+ testing::Values(true, false));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_job.cc b/src/rocksdb/db/compaction/compaction_job.cc
new file mode 100644
index 000000000..1da1bcda8
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.cc
@@ -0,0 +1,2060 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <optional>
+#include <set>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_counting_iterator.h"
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_builder.h"
+#include "db/builder.h"
+#include "db/compaction/clipping_iterator.h"
+#include "db/compaction/compaction_state.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/history_trimming_iterator.h"
+#include "db/log_writer.h"
+#include "db/merge_helper.h"
+#include "db/range_del_aggregator.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/configurable_helper.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/options_type.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetCompactionReasonString(CompactionReason compaction_reason) {
+ switch (compaction_reason) {
+ case CompactionReason::kUnknown:
+ return "Unknown";
+ case CompactionReason::kLevelL0FilesNum:
+ return "LevelL0FilesNum";
+ case CompactionReason::kLevelMaxLevelSize:
+ return "LevelMaxLevelSize";
+ case CompactionReason::kUniversalSizeAmplification:
+ return "UniversalSizeAmplification";
+ case CompactionReason::kUniversalSizeRatio:
+ return "UniversalSizeRatio";
+ case CompactionReason::kUniversalSortedRunNum:
+ return "UniversalSortedRunNum";
+ case CompactionReason::kFIFOMaxSize:
+ return "FIFOMaxSize";
+ case CompactionReason::kFIFOReduceNumFiles:
+ return "FIFOReduceNumFiles";
+ case CompactionReason::kFIFOTtl:
+ return "FIFOTtl";
+ case CompactionReason::kManualCompaction:
+ return "ManualCompaction";
+ case CompactionReason::kFilesMarkedForCompaction:
+ return "FilesMarkedForCompaction";
+ case CompactionReason::kBottommostFiles:
+ return "BottommostFiles";
+ case CompactionReason::kTtl:
+ return "Ttl";
+ case CompactionReason::kFlush:
+ return "Flush";
+ case CompactionReason::kExternalSstIngestion:
+ return "ExternalSstIngestion";
+ case CompactionReason::kPeriodicCompaction:
+ return "PeriodicCompaction";
+ case CompactionReason::kChangeTemperature:
+ return "ChangeTemperature";
+ case CompactionReason::kForcedBlobGC:
+ return "ForcedBlobGC";
+ case CompactionReason::kRoundRobinTtl:
+ return "RoundRobinTtl";
+ case CompactionReason::kNumOfReasons:
+ // fall through
+ default:
+ assert(false);
+ return "Invalid";
+ }
+}
+
+const char* GetCompactionPenultimateOutputRangeTypeString(
+ Compaction::PenultimateOutputRangeType range_type) {
+ switch (range_type) {
+ case Compaction::PenultimateOutputRangeType::kNotSupported:
+ return "NotSupported";
+ case Compaction::PenultimateOutputRangeType::kFullRange:
+ return "FullRange";
+ case Compaction::PenultimateOutputRangeType::kNonLastRange:
+ return "NonLastRange";
+ case Compaction::PenultimateOutputRangeType::kDisabled:
+ return "Disabled";
+ default:
+ assert(false);
+ return "Invalid";
+ }
+}
+
+CompactionJob::CompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
+ VersionSet* versions, const std::atomic<bool>* shutting_down,
+ LogBuffer* log_buffer, FSDirectory* db_directory,
+ FSDirectory* output_directory, FSDirectory* blob_output_directory,
+ Statistics* stats, InstrumentedMutex* db_mutex,
+ ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, JobContext* job_context,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname,
+ CompactionJobStats* compaction_job_stats, Env::Priority thread_pri,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id, const std::string& db_session_id,
+ std::string full_history_ts_low, std::string trim_ts,
+ BlobFileCompletionCallback* blob_callback, int* bg_compaction_scheduled,
+ int* bg_bottom_compaction_scheduled)
+ : compact_(new CompactionState(compaction)),
+ compaction_stats_(compaction->compaction_reason(), 1),
+ db_options_(db_options),
+ mutable_db_options_copy_(mutable_db_options),
+ log_buffer_(log_buffer),
+ output_directory_(output_directory),
+ stats_(stats),
+ bottommost_level_(false),
+ write_hint_(Env::WLTH_NOT_SET),
+ compaction_job_stats_(compaction_job_stats),
+ job_id_(job_id),
+ dbname_(dbname),
+ db_id_(db_id),
+ db_session_id_(db_session_id),
+ file_options_(file_options),
+ env_(db_options.env),
+ io_tracer_(io_tracer),
+ fs_(db_options.fs, io_tracer),
+ file_options_for_read_(
+ fs_->OptimizeForCompactionTableRead(file_options, db_options_)),
+ versions_(versions),
+ shutting_down_(shutting_down),
+ manual_compaction_canceled_(manual_compaction_canceled),
+ db_directory_(db_directory),
+ blob_output_directory_(blob_output_directory),
+ db_mutex_(db_mutex),
+ db_error_handler_(db_error_handler),
+ existing_snapshots_(std::move(existing_snapshots)),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ job_context_(job_context),
+ table_cache_(std::move(table_cache)),
+ event_logger_(event_logger),
+ paranoid_file_checks_(paranoid_file_checks),
+ measure_io_stats_(measure_io_stats),
+ thread_pri_(thread_pri),
+ full_history_ts_low_(std::move(full_history_ts_low)),
+ trim_ts_(std::move(trim_ts)),
+ blob_callback_(blob_callback),
+ extra_num_subcompaction_threads_reserved_(0),
+ bg_compaction_scheduled_(bg_compaction_scheduled),
+ bg_bottom_compaction_scheduled_(bg_bottom_compaction_scheduled) {
+ assert(compaction_job_stats_ != nullptr);
+ assert(log_buffer_ != nullptr);
+
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+ ReportStartedCompaction(compaction);
+}
+
+CompactionJob::~CompactionJob() {
+ assert(compact_ == nullptr);
+ ThreadStatusUtil::ResetThreadStatus();
+}
+
+void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
+ const auto* cfd = compact_->compaction->column_family_data();
+ ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
+ db_options_.enable_thread_tracking);
+
+ ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+ job_id_);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
+ (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
+ compact_->compaction->output_level());
+
+ // In the current design, a CompactionJob is always created
+ // for non-trivial compaction.
+ assert(compaction->IsTrivialMove() == false ||
+ compaction->is_manual_compaction() == true);
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_PROP_FLAGS,
+ compaction->is_manual_compaction() +
+ (compaction->deletion_compaction() << 1));
+
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
+ compaction->CalculateTotalInputSize());
+
+ IOSTATS_RESET(bytes_written);
+ IOSTATS_RESET(bytes_read);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
+ ThreadStatusUtil::SetThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, 0);
+
+ // Set the thread operation after operation properties
+ // to ensure GetThreadList() can always show them all together.
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+ compaction_job_stats_->is_manual_compaction =
+ compaction->is_manual_compaction();
+ compaction_job_stats_->is_full_compaction = compaction->is_full_compaction();
+}
+
+void CompactionJob::Prepare() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PREPARE);
+
+ // Generate file_levels_ for compaction before making Iterator
+ auto* c = compact_->compaction;
+ ColumnFamilyData* cfd = c->column_family_data();
+ assert(cfd != nullptr);
+ assert(cfd->current()->storage_info()->NumLevelFiles(
+ compact_->compaction->level()) > 0);
+
+ write_hint_ = cfd->CalculateSSTWriteHint(c->output_level());
+ bottommost_level_ = c->bottommost_level();
+
+ if (c->ShouldFormSubcompactions()) {
+ StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
+ GenSubcompactionBoundaries();
+ }
+ if (boundaries_.size() > 1) {
+ for (size_t i = 0; i <= boundaries_.size(); i++) {
+ compact_->sub_compact_states.emplace_back(
+ c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
+ (i != boundaries_.size()) ? std::optional<Slice>(boundaries_[i])
+ : std::nullopt,
+ static_cast<uint32_t>(i));
+      // Assert that the boundaries do not have the same user keys (ignoring
+      // the timestamp part).
+ assert(i == 0 || i == boundaries_.size() ||
+ cfd->user_comparator()->CompareWithoutTimestamp(
+ boundaries_[i - 1], boundaries_[i]) < 0);
+ }
+ RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
+ compact_->sub_compact_states.size());
+ } else {
+ compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt,
+ /*sub_job_id*/ 0);
+ }
+
+  // Collect all seqno->time information from the input files, which will be
+  // used to encode seqno->time in the output files.
+ uint64_t preserve_time_duration =
+ std::max(c->immutable_options()->preserve_internal_time_seconds,
+ c->immutable_options()->preclude_last_level_data_seconds);
+
+ if (preserve_time_duration > 0) {
+ // setup seqno_time_mapping_
+ seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration);
+ for (const auto& each_level : *c->inputs()) {
+ for (const auto& fmd : each_level.files) {
+ std::shared_ptr<const TableProperties> tp;
+ Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr);
+ if (s.ok()) {
+ seqno_time_mapping_.Add(tp->seqno_to_time_mapping)
+ .PermitUncheckedError();
+ seqno_time_mapping_.Add(fmd->fd.smallest_seqno,
+ fmd->oldest_ancester_time);
+ }
+ }
+ }
+
+ auto status = seqno_time_mapping_.Sort();
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Invalid sequence number to time mapping: Status: %s",
+ status.ToString().c_str());
+ }
+ int64_t _current_time = 0;
+ status = db_options_.clock->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to get current time in compaction: Status: %s",
+ status.ToString().c_str());
+ // preserve all time information
+ preserve_time_min_seqno_ = 0;
+ preclude_last_level_min_seqno_ = 0;
+ } else {
+ seqno_time_mapping_.TruncateOldEntries(_current_time);
+ uint64_t preserve_time =
+ static_cast<uint64_t>(_current_time) > preserve_time_duration
+ ? _current_time - preserve_time_duration
+ : 0;
+ preserve_time_min_seqno_ =
+ seqno_time_mapping_.GetOldestSequenceNum(preserve_time);
+ if (c->immutable_options()->preclude_last_level_data_seconds > 0) {
+ uint64_t preclude_last_level_time =
+ static_cast<uint64_t>(_current_time) >
+ c->immutable_options()->preclude_last_level_data_seconds
+ ? _current_time -
+ c->immutable_options()->preclude_last_level_data_seconds
+ : 0;
+ preclude_last_level_min_seqno_ =
+ seqno_time_mapping_.GetOldestSequenceNum(preclude_last_level_time);
+ }
+ }
+ }
+}
+
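+// Returns the subcompaction limit for this job: the configured
+// max_subcompactions (at least 1) plus any extra subcompaction threads
+// already reserved for this job.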
+uint64_t CompactionJob::GetSubcompactionsLimit() {
+ return extra_num_subcompaction_threads_reserved_ +
+ std::max(
+ std::uint64_t(1),
+ static_cast<uint64_t>(compact_->compaction->max_subcompactions()));
+}
+
+void CompactionJob::AcquireSubcompactionResources(
+ int num_extra_required_subcompactions) {
+ TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:0");
+ TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:1");
+ int max_db_compactions =
+ DBImpl::GetBGJobLimits(
+ mutable_db_options_copy_.max_background_flushes,
+ mutable_db_options_copy_.max_background_compactions,
+ mutable_db_options_copy_.max_background_jobs,
+ versions_->GetColumnFamilySet()
+ ->write_controller()
+ ->NeedSpeedupCompaction())
+ .max_compactions;
+ InstrumentedMutexLock l(db_mutex_);
+  // Apply the min function first since we need to compute the number of extra
+  // subcompactions against the compaction limits, and then try to reserve
+  // threads for the extra subcompactions. The actual number of reserved
+  // threads could be less than the desired number.
+ int available_bg_compactions_against_db_limit =
+ std::max(max_db_compactions - *bg_compaction_scheduled_ -
+ *bg_bottom_compaction_scheduled_,
+ 0);
+  // Reservation only supports background threads whose priority is between
+  // BOTTOM and HIGH. The priority needs to be degraded to HIGH if the original
+  // thread_pri_ is higher than that. Similar to ReleaseThreads().
+ extra_num_subcompaction_threads_reserved_ =
+ env_->ReserveThreads(std::min(num_extra_required_subcompactions,
+ available_bg_compactions_against_db_limit),
+ std::min(thread_pri_, Env::Priority::HIGH));
+
+ // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+  // depending on whether this compaction has the bottommost priority
+ if (thread_pri_ == Env::Priority::BOTTOM) {
+ *bg_bottom_compaction_scheduled_ +=
+ extra_num_subcompaction_threads_reserved_;
+ } else {
+ *bg_compaction_scheduled_ += extra_num_subcompaction_threads_reserved_;
+ }
+}
+
+void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) {
+ // Do nothing when we have zero resources to shrink
+ if (num_extra_resources == 0) return;
+ db_mutex_->Lock();
+  // We cannot release more threads than we reserved before
+ int extra_num_subcompaction_threads_released = env_->ReleaseThreads(
+ (int)num_extra_resources, std::min(thread_pri_, Env::Priority::HIGH));
+ // Update the number of reserved threads and the number of background
+ // scheduled compactions for this compaction job
+ extra_num_subcompaction_threads_reserved_ -=
+ extra_num_subcompaction_threads_released;
+ // TODO (zichen): design a test case with new subcompaction partitioning
+ // when the number of actual partitions is less than the number of planned
+ // partitions
+ assert(extra_num_subcompaction_threads_released == (int)num_extra_resources);
+ // Update bg_compaction_scheduled_ or bg_bottom_compaction_scheduled_
+  // depending on whether this compaction has the bottommost priority
+ if (thread_pri_ == Env::Priority::BOTTOM) {
+ *bg_bottom_compaction_scheduled_ -=
+ extra_num_subcompaction_threads_released;
+ } else {
+ *bg_compaction_scheduled_ -= extra_num_subcompaction_threads_released;
+ }
+ db_mutex_->Unlock();
+ TEST_SYNC_POINT("CompactionJob::ShrinkSubcompactionResources:0");
+}
+
+void CompactionJob::ReleaseSubcompactionResources() {
+ if (extra_num_subcompaction_threads_reserved_ == 0) {
+ return;
+ }
+ {
+ InstrumentedMutexLock l(db_mutex_);
+    // The number of reserved threads becomes larger than 0 only if the
+    // compaction priority is round-robin and there are not enough
+    // subcompactions available
+
+    // The number of scheduled compactions must be at least 1 plus the number
+    // of extra subcompactions using acquired resources, since this compaction
+    // job has not finished yet
+ assert(*bg_bottom_compaction_scheduled_ >=
+ 1 + extra_num_subcompaction_threads_reserved_ ||
+ *bg_compaction_scheduled_ >=
+ 1 + extra_num_subcompaction_threads_reserved_);
+ }
+ ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_);
+}
+
+struct RangeWithSize {
+ Range range;
+ uint64_t size;
+
+ RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
+ : range(a, b), size(s) {}
+};
+
+void CompactionJob::GenSubcompactionBoundaries() {
+ // The goal is to find some boundary keys so that we can evenly partition
+ // the compaction input data into max_subcompactions ranges.
+  // For every input file, we ask the TableReader to estimate 128 anchor points
+  // that evenly partition the input file into 128 ranges, along with the range
+  // sizes. This can be calculated by scanning the index blocks of the file.
+  // Once we have the anchor points for all the input files, we merge them
+  // together and try to find keys that divide the ranges evenly.
+  // For example, if we have two input files, and each returns the following
+  // ranges:
+ // File1: (a1, 1000), (b1, 1200), (c1, 1100)
+ // File2: (a2, 1100), (b2, 1000), (c2, 1000)
+  // We sort all the keys globally as follows:
+ // (a1, 1000), (a2, 1100), (b1, 1200), (b2, 1000), (c1, 1100), (c2, 1000)
+ // We calculate the total size by adding up all ranges' size, which is 6400.
+ // If we would like to partition into 2 subcompactions, the target of the
+ // range size is 3200. Based on the size, we take "b1" as the partition key
+ // since the first three ranges would hit 3200.
+ //
+ // Note that the ranges are actually overlapping. For example, in the example
+ // above, the range ending with "b1" is overlapping with the range ending with
+ // "b2". So the size 1000+1100+1200 is an underestimation of data size up to
+ // "b1". In extreme cases where we only compact N L0 files, a range can
+ // overlap with N-1 other ranges. Since we requested a relatively large number
+  // (128) of ranges from each input file, even N-way range overlap would
+  // cause only a relatively small inaccuracy.
+
+ auto* c = compact_->compaction;
+ if (c->max_subcompactions() <= 1 &&
+ !(c->immutable_options()->compaction_pri == kRoundRobin &&
+ c->immutable_options()->compaction_style == kCompactionStyleLevel)) {
+ return;
+ }
+ auto* cfd = c->column_family_data();
+ const Comparator* cfd_comparator = cfd->user_comparator();
+ const InternalKeyComparator& icomp = cfd->internal_comparator();
+
+ auto* v = compact_->compaction->input_version();
+ int base_level = v->storage_info()->base_level();
+ InstrumentedMutexUnlock unlock_guard(db_mutex_);
+
+ uint64_t total_size = 0;
+ std::vector<TableReader::Anchor> all_anchors;
+ int start_lvl = c->start_level();
+ int out_lvl = c->output_level();
+
+ for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
+ int lvl = c->level(lvl_idx);
+ if (lvl >= start_lvl && lvl <= out_lvl) {
+ const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
+ size_t num_files = flevel->num_files;
+
+ if (num_files == 0) {
+ continue;
+ }
+
+ for (size_t i = 0; i < num_files; i++) {
+ FileMetaData* f = flevel->files[i].file_metadata;
+ std::vector<TableReader::Anchor> my_anchors;
+ Status s = cfd->table_cache()->ApproximateKeyAnchors(
+ ReadOptions(), icomp, *f, my_anchors);
+ if (!s.ok() || my_anchors.empty()) {
+ my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
+ }
+ for (auto& ac : my_anchors) {
+          // Can be optimized to avoid this loop.
+ total_size += ac.range_size;
+ }
+
+ all_anchors.insert(all_anchors.end(), my_anchors.begin(),
+ my_anchors.end());
+ }
+ }
+ }
+  // Here we sort all the anchor points across all files and go through them
+  // in sorted order to find the partitioning boundaries.
+  // This is not the most efficient implementation. A much more efficient
+  // algorithm probably exists, but it would be more complex. If performance
+  // turns out to be a problem, we can optimize.
+ std::sort(
+ all_anchors.begin(), all_anchors.end(),
+ [cfd_comparator](TableReader::Anchor& a, TableReader::Anchor& b) -> bool {
+ return cfd_comparator->CompareWithoutTimestamp(a.user_key, b.user_key) <
+ 0;
+ });
+
+ // Remove duplicated entries from boundaries.
+ all_anchors.erase(
+ std::unique(all_anchors.begin(), all_anchors.end(),
+ [cfd_comparator](TableReader::Anchor& a,
+ TableReader::Anchor& b) -> bool {
+ return cfd_comparator->CompareWithoutTimestamp(
+ a.user_key, b.user_key) == 0;
+ }),
+ all_anchors.end());
+
+  // Get the number of planned subcompactions; this may reserve extra threads
+  // and update extra_num_subcompaction_threads_reserved_ for round-robin
+  // priority
+ uint64_t num_planned_subcompactions;
+ if (c->immutable_options()->compaction_pri == kRoundRobin &&
+ c->immutable_options()->compaction_style == kCompactionStyleLevel) {
+    // For round-robin compaction priority, we need to employ more
+    // subcompactions (which may exceed the max_subcompactions limit). The
+    // extra subcompactions will be executed using reserved threads and counted
+    // towards bg_compaction_scheduled or bg_bottom_compaction_scheduled.
+
+ // Initialized by the number of input files
+ num_planned_subcompactions = static_cast<uint64_t>(c->num_input_files(0));
+ uint64_t max_subcompactions_limit = GetSubcompactionsLimit();
+ if (max_subcompactions_limit < num_planned_subcompactions) {
+      // Assert that the two pointers are not null so that we can count the
+      // extra subcompactions against the db compaction limits
+ assert(bg_bottom_compaction_scheduled_ != nullptr);
+ assert(bg_compaction_scheduled_ != nullptr);
+      // Reserve resources when max_subcompactions is not sufficient
+ AcquireSubcompactionResources(
+ (int)(num_planned_subcompactions - max_subcompactions_limit));
+      // The subcompaction limit changes after acquiring additional resources;
+      // GetSubcompactionsLimit() needs to be called again to update the number
+      // of planned subcompactions
+ num_planned_subcompactions =
+ std::min(num_planned_subcompactions, GetSubcompactionsLimit());
+ } else {
+ num_planned_subcompactions = max_subcompactions_limit;
+ }
+ } else {
+ num_planned_subcompactions = GetSubcompactionsLimit();
+ }
+
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0",
+ &num_planned_subcompactions);
+ if (num_planned_subcompactions == 1) return;
+
+ // Group the ranges into subcompactions
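+  // The per-range target size is the larger of an even split of the total
+  // estimated input size and the maximum output file size for the output
+  // level, so no subcompaction targets less than one full output file.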
+ uint64_t target_range_size = std::max(
+ total_size / num_planned_subcompactions,
+ MaxFileSizeForLevel(
+ *(c->mutable_cf_options()), out_lvl,
+ c->immutable_options()->compaction_style, base_level,
+ c->immutable_options()->level_compaction_dynamic_level_bytes));
+
+ if (target_range_size >= total_size) {
+ return;
+ }
+
+ uint64_t next_threshold = target_range_size;
+ uint64_t cumulative_size = 0;
+ uint64_t num_actual_subcompactions = 1U;
+ for (TableReader::Anchor& anchor : all_anchors) {
+ cumulative_size += anchor.range_size;
+ if (cumulative_size > next_threshold) {
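+      // Crossed the current size threshold: record this anchor as a
+      // subcompaction boundary and advance the threshold by one target range.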
+ next_threshold += target_range_size;
+ num_actual_subcompactions++;
+ boundaries_.push_back(anchor.user_key);
+ }
+ if (num_actual_subcompactions == num_planned_subcompactions) {
+ break;
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:1",
+ &num_actual_subcompactions);
+  // Shrink extra subcompaction resources when extra resources were acquired
+ ShrinkSubcompactionResources(
+ std::min((int)(num_planned_subcompactions - num_actual_subcompactions),
+ extra_num_subcompaction_threads_reserved_));
+}
+
+Status CompactionJob::Run() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_RUN);
+ TEST_SYNC_POINT("CompactionJob::Run():Start");
+ log_buffer_->FlushBufferToLog();
+ LogCompaction();
+
+ const size_t num_threads = compact_->sub_compact_states.size();
+ assert(num_threads > 0);
+ const uint64_t start_micros = db_options_.clock->NowMicros();
+
+ // Launch a thread for each of subcompactions 1...num_threads-1
+ std::vector<port::Thread> thread_pool;
+ thread_pool.reserve(num_threads - 1);
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this,
+ &compact_->sub_compact_states[i]);
+ }
+
+ // Always schedule the first subcompaction (whether or not there are also
+ // others) in the current thread to be efficient with resources
+ ProcessKeyValueCompaction(&compact_->sub_compact_states[0]);
+
+ // Wait for all other threads (if there are any) to finish execution
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+
+ compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
+
+ for (auto& state : compact_->sub_compact_states) {
+ compaction_stats_.AddCpuMicros(state.compaction_job_stats.cpu_micros);
+ state.RemoveLastEmptyOutput();
+ }
+
+ RecordTimeToHistogram(stats_, COMPACTION_TIME,
+ compaction_stats_.stats.micros);
+ RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+ compaction_stats_.stats.cpu_micros);
+
+ TEST_SYNC_POINT("CompactionJob::Run:BeforeVerify");
+
+ // Check if any thread encountered an error during execution
+ Status status;
+ IOStatus io_s;
+ bool wrote_new_blob_files = false;
+
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ io_s = state.io_status;
+ break;
+ }
+
+ if (state.Current().HasBlobFileAdditions()) {
+ wrote_new_blob_files = true;
+ }
+ }
+
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+ if (status.ok()) {
+ constexpr IODebugContext* dbg = nullptr;
+
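+    // Fsync the output directory (and the blob directory below, if distinct)
+    // so that the newly created files are durable before results are
+    // installed.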
+ if (output_directory_) {
+ io_s = output_directory_->FsyncWithDirOptions(
+ IOOptions(), dbg,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+
+ if (io_s.ok() && wrote_new_blob_files && blob_output_directory_ &&
+ blob_output_directory_ != output_directory_) {
+ io_s = blob_output_directory_->FsyncWithDirOptions(
+ IOOptions(), dbg,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+ }
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+ if (status.ok()) {
+ status = io_s;
+ }
+ if (status.ok()) {
+ thread_pool.clear();
+ std::vector<const CompactionOutputs::Output*> files_output;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.GetOutputs()) {
+ files_output.emplace_back(&output);
+ }
+ }
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ auto& prefix_extractor =
+ compact_->compaction->mutable_cf_options()->prefix_extractor;
+ std::atomic<size_t> next_file_idx(0);
+ auto verify_table = [&](Status& output_status) {
+ while (true) {
+ size_t file_idx = next_file_idx.fetch_add(1);
+ if (file_idx >= files_output.size()) {
+ break;
+ }
+        // Verify that the table is usable.
+        // We set for_compaction to false and don't call
+        // OptimizeForCompactionTableRead here because this is a special case
+        // after we finish building the table. No matter whether
+        // use_direct_io_for_flush_and_compaction is true, we regard this
+        // verification as user reads since the goal is to cache the table here
+        // for further user reads.
+ ReadOptions read_options;
+ InternalIterator* iter = cfd->table_cache()->NewIterator(
+ read_options, file_options_, cfd->internal_comparator(),
+ files_output[file_idx]->meta, /*range_del_agg=*/nullptr,
+ prefix_extractor,
+ /*table_reader_ptr=*/nullptr,
+ cfd->internal_stats()->GetFileReadHist(
+ compact_->compaction->output_level()),
+ TableReaderCaller::kCompactionRefill, /*arena=*/nullptr,
+ /*skip_filters=*/false, compact_->compaction->output_level(),
+ MaxFileSizeForL0MetaPin(
+ *compact_->compaction->mutable_cf_options()),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr,
+ /*allow_unprepared_value=*/false);
+ auto s = iter->status();
+
+ if (s.ok() && paranoid_file_checks_) {
+ OutputValidator validator(cfd->internal_comparator(),
+ /*_enable_order_check=*/true,
+ /*_enable_hash=*/true);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ s = validator.Add(iter->key(), iter->value());
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (s.ok()) {
+ s = iter->status();
+ }
+ if (s.ok() &&
+ !validator.CompareValidator(files_output[file_idx]->validator)) {
+ s = Status::Corruption("Paranoid checksums do not match");
+ }
+ }
+
+ delete iter;
+
+ if (!s.ok()) {
+ output_status = s;
+ break;
+ }
+ }
+ };
+ for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
+ thread_pool.emplace_back(
+ verify_table, std::ref(compact_->sub_compact_states[i].status));
+ }
+ verify_table(compact_->sub_compact_states[0].status);
+ for (auto& thread : thread_pool) {
+ thread.join();
+ }
+
+ for (const auto& state : compact_->sub_compact_states) {
+ if (!state.status.ok()) {
+ status = state.status;
+ break;
+ }
+ }
+ }
+
+ ReleaseSubcompactionResources();
+ TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:0");
+ TEST_SYNC_POINT("CompactionJob::ReleaseSubcompactionResources:1");
+
+ TablePropertiesCollection tp;
+ for (const auto& state : compact_->sub_compact_states) {
+ for (const auto& output : state.GetOutputs()) {
+ auto fn =
+ TableFileName(state.compaction->immutable_options()->cf_paths,
+ output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
+ tp[fn] = output.table_properties;
+ }
+ }
+ compact_->compaction->SetOutputTableProperties(std::move(tp));
+
+ // Finish up all book-keeping to unify the subcompaction results
+ compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
+ UpdateCompactionStats();
+
+ RecordCompactionIOStats();
+ LogFlush(db_options_.info_log);
+ TEST_SYNC_POINT("CompactionJob::Run():End");
+
+ compact_->status = status;
+ return status;
+}
+
+Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
+ assert(compact_);
+
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_INSTALL);
+ db_mutex_->AssertHeld();
+ Status status = compact_->status;
+
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ assert(cfd);
+
+ int output_level = compact_->compaction->output_level();
+ cfd->internal_stats()->AddCompactionStats(output_level, thread_pri_,
+ compaction_stats_);
+
+ if (status.ok()) {
+ status = InstallCompactionResults(mutable_cf_options);
+ }
+ if (!versions_->io_status().ok()) {
+ io_status_ = versions_->io_status();
+ }
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ auto vstorage = cfd->current()->storage_info();
+ const auto& stats = compaction_stats_.stats;
+
+ double read_write_amp = 0.0;
+ double write_amp = 0.0;
+ double bytes_read_per_sec = 0;
+ double bytes_written_per_sec = 0;
+
+ const uint64_t bytes_read_non_output_and_blob =
+ stats.bytes_read_non_output_levels + stats.bytes_read_blob;
+ const uint64_t bytes_read_all =
+ stats.bytes_read_output_level + bytes_read_non_output_and_blob;
+ const uint64_t bytes_written_all =
+ stats.bytes_written + stats.bytes_written_blob;
+
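+  // read-write-amplify = (all bytes written + all bytes read) / bytes read
+  // from non-output levels and blob files; write-amplify uses only the bytes
+  // written in the numerator.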
+ if (bytes_read_non_output_and_blob > 0) {
+ read_write_amp = (bytes_written_all + bytes_read_all) /
+ static_cast<double>(bytes_read_non_output_and_blob);
+ write_amp =
+ bytes_written_all / static_cast<double>(bytes_read_non_output_and_blob);
+ }
+ if (stats.micros > 0) {
+ bytes_read_per_sec = bytes_read_all / static_cast<double>(stats.micros);
+ bytes_written_per_sec =
+ bytes_written_all / static_cast<double>(stats.micros);
+ }
+
+ const std::string& column_family_name = cfd->GetName();
+
+ constexpr double kMB = 1048576.0;
+
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
+ "files in(%d, %d) out(%d +%d blob) "
+ "MB in(%.1f, %.1f +%.1f blob) out(%.1f +%.1f blob), "
+ "read-write-amplify(%.1f) write-amplify(%.1f) %s, records in: %" PRIu64
+ ", records dropped: %" PRIu64 " output_compression: %s\n",
+ column_family_name.c_str(), vstorage->LevelSummary(&tmp),
+ bytes_read_per_sec, bytes_written_per_sec,
+ compact_->compaction->output_level(),
+ stats.num_input_files_in_non_output_levels,
+ stats.num_input_files_in_output_level, stats.num_output_files,
+ stats.num_output_files_blob, stats.bytes_read_non_output_levels / kMB,
+ stats.bytes_read_output_level / kMB, stats.bytes_read_blob / kMB,
+ stats.bytes_written / kMB, stats.bytes_written_blob / kMB, read_write_amp,
+ write_amp, status.ToString().c_str(), stats.num_input_records,
+ stats.num_dropped_records,
+ CompressionTypeToString(compact_->compaction->output_compression())
+ .c_str());
+
+ const auto& blob_files = vstorage->GetBlobFiles();
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ assert(blob_files.back());
+
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+ column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+ blob_files.back()->GetBlobFileNumber());
+ }
+
+ if (compaction_stats_.has_penultimate_level_output) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] has Penultimate Level output: %" PRIu64
+ ", level %d, number of files: %" PRIu64 ", number of records: %" PRIu64,
+ column_family_name.c_str(),
+ compaction_stats_.penultimate_level_stats.bytes_written,
+ compact_->compaction->GetPenultimateLevel(),
+ compaction_stats_.penultimate_level_stats.num_output_files,
+ compaction_stats_.penultimate_level_stats.num_output_records);
+ }
+
+ UpdateCompactionJobStats(stats);
+
+ auto stream = event_logger_->LogToBuffer(log_buffer_, 8192);
+ stream << "job" << job_id_ << "event"
+ << "compaction_finished"
+ << "compaction_time_micros" << stats.micros
+ << "compaction_time_cpu_micros" << stats.cpu_micros << "output_level"
+ << compact_->compaction->output_level() << "num_output_files"
+ << stats.num_output_files << "total_output_size"
+ << stats.bytes_written;
+
+ if (stats.num_output_files_blob > 0) {
+ stream << "num_blob_output_files" << stats.num_output_files_blob
+ << "total_blob_output_size" << stats.bytes_written_blob;
+ }
+
+ stream << "num_input_records" << stats.num_input_records
+ << "num_output_records" << stats.num_output_records
+ << "num_subcompactions" << compact_->sub_compact_states.size()
+ << "output_compression"
+ << CompressionTypeToString(compact_->compaction->output_compression());
+
+ stream << "num_single_delete_mismatches"
+ << compaction_job_stats_->num_single_del_mismatch;
+ stream << "num_single_delete_fallthrough"
+ << compaction_job_stats_->num_single_del_fallthru;
+
+ if (measure_io_stats_) {
+ stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
+ stream << "file_range_sync_nanos"
+ << compaction_job_stats_->file_range_sync_nanos;
+ stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
+ stream << "file_prepare_write_nanos"
+ << compaction_job_stats_->file_prepare_write_nanos;
+ }
+
+ stream << "lsm_state";
+ stream.StartArray();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber();
+
+ assert(blob_files.back());
+ stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
+ }
+
+ if (compaction_stats_.has_penultimate_level_output) {
+ InternalStats::CompactionStats& pl_stats =
+ compaction_stats_.penultimate_level_stats;
+ stream << "penultimate_level_num_output_files" << pl_stats.num_output_files;
+ stream << "penultimate_level_bytes_written" << pl_stats.bytes_written;
+ stream << "penultimate_level_num_output_records"
+ << pl_stats.num_output_records;
+ stream << "penultimate_level_num_output_files_blob"
+ << pl_stats.num_output_files_blob;
+ stream << "penultimate_level_bytes_written_blob"
+ << pl_stats.bytes_written_blob;
+ }
+
+ CleanupCompaction();
+ return status;
+}
+
+void CompactionJob::NotifyOnSubcompactionBegin(
+ SubcompactionState* sub_compact) {
+#ifndef ROCKSDB_LITE
+ Compaction* c = compact_->compaction;
+
+ if (db_options_.listeners.empty()) {
+ return;
+ }
+ if (shutting_down_->load(std::memory_order_acquire)) {
+ return;
+ }
+ if (c->is_manual_compaction() &&
+ manual_compaction_canceled_.load(std::memory_order_acquire)) {
+ return;
+ }
+
+ sub_compact->notify_on_subcompaction_completion = true;
+
+ SubcompactionJobInfo info{};
+ sub_compact->BuildSubcompactionJobInfo(info);
+ info.job_id = static_cast<int>(job_id_);
+ info.thread_id = env_->GetThreadID();
+
+ for (const auto& listener : db_options_.listeners) {
+ listener->OnSubcompactionBegin(info);
+ }
+ info.status.PermitUncheckedError();
+
+#else
+ (void)sub_compact;
+#endif // ROCKSDB_LITE
+}
+
+void CompactionJob::NotifyOnSubcompactionCompleted(
+ SubcompactionState* sub_compact) {
+#ifndef ROCKSDB_LITE
+
+ if (db_options_.listeners.empty()) {
+ return;
+ }
+ if (shutting_down_->load(std::memory_order_acquire)) {
+ return;
+ }
+
+ if (sub_compact->notify_on_subcompaction_completion == false) {
+ return;
+ }
+
+ SubcompactionJobInfo info{};
+ sub_compact->BuildSubcompactionJobInfo(info);
+ info.job_id = static_cast<int>(job_id_);
+ info.thread_id = env_->GetThreadID();
+
+ for (const auto& listener : db_options_.listeners) {
+ listener->OnSubcompactionCompleted(info);
+ }
+#else
+ (void)sub_compact;
+#endif // ROCKSDB_LITE
+}
+
+void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
+ assert(sub_compact);
+ assert(sub_compact->compaction);
+
+#ifndef ROCKSDB_LITE
+ if (db_options_.compaction_service) {
+ CompactionServiceJobStatus comp_status =
+ ProcessKeyValueCompactionWithCompactionService(sub_compact);
+ if (comp_status == CompactionServiceJobStatus::kSuccess ||
+ comp_status == CompactionServiceJobStatus::kFailure) {
+ return;
+ }
+ // fallback to local compaction
+ assert(comp_status == CompactionServiceJobStatus::kUseLocal);
+ }
+#endif // !ROCKSDB_LITE
+
+ uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+
+  // Create the compaction filter and fail the compaction if
+  // IgnoreSnapshots() == false, because that is not supported anymore
+ const CompactionFilter* compaction_filter =
+ cfd->ioptions()->compaction_filter;
+ std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
+ if (compaction_filter == nullptr) {
+ compaction_filter_from_factory =
+ sub_compact->compaction->CreateCompactionFilter();
+ compaction_filter = compaction_filter_from_factory.get();
+ }
+ if (compaction_filter != nullptr && !compaction_filter->IgnoreSnapshots()) {
+ sub_compact->status = Status::NotSupported(
+ "CompactionFilter::IgnoreSnapshots() = false is not supported "
+ "anymore.");
+ return;
+ }
+
+ NotifyOnSubcompactionBegin(sub_compact);
+
+ auto range_del_agg = std::make_unique<CompactionRangeDelAggregator>(
+ &cfd->internal_comparator(), existing_snapshots_, &full_history_ts_low_,
+ &trim_ts_);
+
+ // TODO: since we already use C++17, should use
+ // std::optional<const Slice> instead.
+ const std::optional<Slice> start = sub_compact->start;
+ const std::optional<Slice> end = sub_compact->end;
+
+ std::optional<Slice> start_without_ts;
+ std::optional<Slice> end_without_ts;
+
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+ read_options.fill_cache = false;
+ read_options.rate_limiter_priority = GetRateLimiterPriority();
+ // Compaction iterators shouldn't be confined to a single prefix.
+ // Compactions use Seek() for
+ // (a) concurrent compactions,
+ // (b) CompactionFilter::Decision::kRemoveAndSkipUntil.
+ read_options.total_order_seek = true;
+
+  // Remove the timestamps from the boundaries because the boundaries created
+  // in GenSubcompactionBoundaries don't strip away the timestamp.
+ size_t ts_sz = cfd->user_comparator()->timestamp_size();
+ if (start.has_value()) {
+ read_options.iterate_lower_bound = &start.value();
+ if (ts_sz > 0) {
+ start_without_ts = StripTimestampFromUserKey(start.value(), ts_sz);
+ read_options.iterate_lower_bound = &start_without_ts.value();
+ }
+ }
+ if (end.has_value()) {
+ read_options.iterate_upper_bound = &end.value();
+ if (ts_sz > 0) {
+ end_without_ts = StripTimestampFromUserKey(end.value(), ts_sz);
+ read_options.iterate_upper_bound = &end_without_ts.value();
+ }
+ }
+
+ // Although the v2 aggregator is what the level iterator(s) know about,
+ // the AddTombstones calls will be propagated down to the v1 aggregator.
+ std::unique_ptr<InternalIterator> raw_input(versions_->MakeInputIterator(
+ read_options, sub_compact->compaction, range_del_agg.get(),
+ file_options_for_read_, start, end));
+ InternalIterator* input = raw_input.get();
+
+ IterKey start_ikey;
+ IterKey end_ikey;
+ Slice start_slice;
+ Slice end_slice;
+
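+  // Prepare a maximum-value timestamp of the comparator's timestamp width; it
+  // is used below when forming the internal keys for the subcompaction
+  // boundaries.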
+ static constexpr char kMaxTs[] =
+ "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ Slice ts_slice;
+ std::string max_ts;
+ if (ts_sz > 0) {
+ if (ts_sz <= strlen(kMaxTs)) {
+ ts_slice = Slice(kMaxTs, ts_sz);
+ } else {
+ max_ts = std::string(ts_sz, '\xff');
+ ts_slice = Slice(max_ts);
+ }
+ }
+
+ if (start.has_value()) {
+ start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber,
+ kValueTypeForSeek);
+ if (ts_sz > 0) {
+ start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
+ &ts_slice);
+ }
+ start_slice = start_ikey.GetInternalKey();
+ }
+ if (end.has_value()) {
+ end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
+ if (ts_sz > 0) {
+ end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
+ &ts_slice);
+ }
+ end_slice = end_ikey.GetInternalKey();
+ }
+
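+  // Clip the input iterator to the subcompaction's boundary range so that this
+  // subcompaction only processes keys within its assigned range.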
+ std::unique_ptr<InternalIterator> clip;
+ if (start.has_value() || end.has_value()) {
+ clip = std::make_unique<ClippingIterator>(
+ raw_input.get(), start.has_value() ? &start_slice : nullptr,
+ end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
+ input = clip.get();
+ }
+
+ std::unique_ptr<InternalIterator> blob_counter;
+
+ if (sub_compact->compaction->DoesInputReferenceBlobFiles()) {
+ BlobGarbageMeter* meter = sub_compact->Current().CreateBlobGarbageMeter();
+ blob_counter = std::make_unique<BlobCountingIterator>(input, meter);
+ input = blob_counter.get();
+ }
+
+ std::unique_ptr<InternalIterator> trim_history_iter;
+ if (ts_sz > 0 && !trim_ts_.empty()) {
+ trim_history_iter = std::make_unique<HistoryTrimmingIterator>(
+ input, cfd->user_comparator(), trim_ts_);
+ input = trim_history_iter.get();
+ }
+
+ input->SeekToFirst();
+
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
+
+ // I/O measurement variables
+ PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+ const uint64_t kRecordStatsEvery = 1000;
+ uint64_t prev_write_nanos = 0;
+ uint64_t prev_fsync_nanos = 0;
+ uint64_t prev_range_sync_nanos = 0;
+ uint64_t prev_prepare_write_nanos = 0;
+ uint64_t prev_cpu_write_nanos = 0;
+ uint64_t prev_cpu_read_nanos = 0;
+ if (measure_io_stats_) {
+ prev_perf_level = GetPerfLevel();
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ prev_write_nanos = IOSTATS(write_nanos);
+ prev_fsync_nanos = IOSTATS(fsync_nanos);
+ prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+ prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+ prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+ }
+
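+  // MergeHelper is used by the compaction iterator to combine merge operands
+  // for the same user key using the column family's merge operator.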
+ MergeHelper merge(
+ env_, cfd->user_comparator(), cfd->ioptions()->merge_operator.get(),
+ compaction_filter, db_options_.info_log.get(),
+ false /* internal key corruption is expected */,
+ existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+ snapshot_checker_, compact_->compaction->level(), db_options_.stats);
+
+ const MutableCFOptions* mutable_cf_options =
+ sub_compact->compaction->mutable_cf_options();
+ assert(mutable_cf_options);
+
+ std::vector<std::string> blob_file_paths;
+
+  // TODO: support output_to_penultimate_level compaction with BlobDB, which
+  // needs 2 builders, so this may need to move to `CompactionOutputs`
+ std::unique_ptr<BlobFileBuilder> blob_file_builder(
+ (mutable_cf_options->enable_blob_files &&
+ sub_compact->compaction->output_level() >=
+ mutable_cf_options->blob_file_starting_level)
+ ? new BlobFileBuilder(
+ versions_, fs_.get(),
+ sub_compact->compaction->immutable_options(),
+ mutable_cf_options, &file_options_, db_id_, db_session_id_,
+ job_id_, cfd->GetID(), cfd->GetName(), Env::IOPriority::IO_LOW,
+ write_hint_, io_tracer_, blob_callback_,
+ BlobFileCreationReason::kCompaction, &blob_file_paths,
+ sub_compact->Current().GetBlobFileAdditionsPtr())
+ : nullptr);
+
+ TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(&manual_compaction_canceled_)));
+
+ const std::string* const full_history_ts_low =
+ full_history_ts_low_.empty() ? nullptr : &full_history_ts_low_;
+ const SequenceNumber job_snapshot_seq =
+ job_context_ ? job_context_->GetJobSnapshotSequence()
+ : kMaxSequenceNumber;
+
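+  // The compaction iterator wraps the merged input: it applies the compaction
+  // filter and merge operator and decides which entries can be dropped based
+  // on snapshots, the earliest write conflict snapshot, and
+  // full_history_ts_low.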
+ auto c_iter = std::make_unique<CompactionIterator>(
+ input, cfd->user_comparator(), &merge, versions_->LastSequence(),
+ &existing_snapshots_, earliest_write_conflict_snapshot_, job_snapshot_seq,
+ snapshot_checker_, env_, ShouldReportDetailedTime(env_, stats_),
+ /*expect_valid_internal_key=*/true, range_del_agg.get(),
+ blob_file_builder.get(), db_options_.allow_data_in_errors,
+ db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
+ sub_compact->compaction, compaction_filter, shutting_down_,
+ db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_,
+ preclude_last_level_min_seqno_);
+ c_iter->SeekToFirst();
+
+  // Assign the range delete aggregator to the target output level, which makes
+  // sure it only outputs to a single level
+ sub_compact->AssignRangeDelAggregator(std::move(range_del_agg));
+
+ const auto& c_iter_stats = c_iter->iter_stats();
+
+  // Define the open and close functions for the compaction files, which will
+  // be used to open/close output files when needed.
+ const CompactionFileOpenFunc open_file_func =
+ [this, sub_compact](CompactionOutputs& outputs) {
+ return this->OpenCompactionOutputFile(sub_compact, outputs);
+ };
+ const CompactionFileCloseFunc close_file_func =
+ [this, sub_compact](CompactionOutputs& outputs, const Status& status,
+ const Slice& next_table_min_key) {
+ return this->FinishCompactionOutputFile(status, sub_compact, outputs,
+ next_table_min_key);
+ };
+
+ Status status;
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::ProcessKeyValueCompaction()::Processing",
+ reinterpret_cast<void*>(
+ const_cast<Compaction*>(sub_compact->compaction)));
+ while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
+ // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
+ // returns true.
+
+ assert(!end.has_value() || cfd->user_comparator()->Compare(
+ c_iter->user_key(), end.value()) < 0);
+
+ if (c_iter_stats.num_input_records % kRecordStatsEvery ==
+ kRecordStatsEvery - 1) {
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ c_iter->ResetRecordCounts();
+ RecordCompactionIOStats();
+ }
+
+    // Add the current compaction_iterator key to the target compaction output;
+    // if an output file needs to be closed or opened, `open_file_func` and
+    // `close_file_func` will be called.
+    // TODO: it would be better to have the compaction file open/close moved
+    // into `CompactionOutputs`, which has the output file information.
+ status = sub_compact->AddToOutput(*c_iter, open_file_func, close_file_func);
+ if (!status.ok()) {
+ break;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "CompactionJob::Run():PausingManualCompaction:2",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<bool>*>(&manual_compaction_canceled_)));
+ c_iter->Next();
+ if (c_iter->status().IsManualCompactionPaused()) {
+ break;
+ }
+ }
+
+ sub_compact->compaction_job_stats.num_blobs_read =
+ c_iter_stats.num_blobs_read;
+ sub_compact->compaction_job_stats.total_blob_bytes_read =
+ c_iter_stats.total_blob_bytes_read;
+ sub_compact->compaction_job_stats.num_input_deletion_records =
+ c_iter_stats.num_input_deletion_records;
+ sub_compact->compaction_job_stats.num_corrupt_keys =
+ c_iter_stats.num_input_corrupt_records;
+ sub_compact->compaction_job_stats.num_single_del_fallthru =
+ c_iter_stats.num_single_del_fallthru;
+ sub_compact->compaction_job_stats.num_single_del_mismatch =
+ c_iter_stats.num_single_del_mismatch;
+ sub_compact->compaction_job_stats.total_input_raw_key_bytes +=
+ c_iter_stats.total_input_raw_key_bytes;
+ sub_compact->compaction_job_stats.total_input_raw_value_bytes +=
+ c_iter_stats.total_input_raw_value_bytes;
+
+ RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME,
+ c_iter_stats.total_filter_time);
+
+ if (c_iter_stats.num_blobs_relocated > 0) {
+ RecordTick(stats_, BLOB_DB_GC_NUM_KEYS_RELOCATED,
+ c_iter_stats.num_blobs_relocated);
+ }
+ if (c_iter_stats.total_blob_bytes_relocated > 0) {
+ RecordTick(stats_, BLOB_DB_GC_BYTES_RELOCATED,
+ c_iter_stats.total_blob_bytes_relocated);
+ }
+
+ RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
+ RecordCompactionIOStats();
+
+ if (status.ok() && cfd->IsDropped()) {
+ status =
+ Status::ColumnFamilyDropped("Column family dropped during compaction");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ shutting_down_->load(std::memory_order_relaxed)) {
+ status = Status::ShutdownInProgress("Database shutdown");
+ }
+ if ((status.ok() || status.IsColumnFamilyDropped()) &&
+ (manual_compaction_canceled_.load(std::memory_order_relaxed))) {
+ status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ if (status.ok()) {
+ status = input->status();
+ }
+ if (status.ok()) {
+ status = c_iter->status();
+ }
+
+  // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+  // close the output files. The open file function is also passed in because,
+  // if there are only range deletions and no file has been opened yet, a new
+  // output file needs to be created to store the range deletions.
+ status = sub_compact->CloseCompactionFiles(status, open_file_func,
+ close_file_func);
+
+ if (blob_file_builder) {
+ if (status.ok()) {
+ status = blob_file_builder->Finish();
+ } else {
+ blob_file_builder->Abandon(status);
+ }
+ blob_file_builder.reset();
+ sub_compact->Current().UpdateBlobStats();
+ }
+
+ sub_compact->compaction_job_stats.cpu_micros =
+ db_options_.clock->CPUMicros() - prev_cpu_micros;
+
+ if (measure_io_stats_) {
+ sub_compact->compaction_job_stats.file_write_nanos +=
+ IOSTATS(write_nanos) - prev_write_nanos;
+ sub_compact->compaction_job_stats.file_fsync_nanos +=
+ IOSTATS(fsync_nanos) - prev_fsync_nanos;
+ sub_compact->compaction_job_stats.file_range_sync_nanos +=
+ IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
+ sub_compact->compaction_job_stats.file_prepare_write_nanos +=
+ IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
+ sub_compact->compaction_job_stats.cpu_micros -=
+ (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos +
+ IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos) /
+ 1000;
+ if (prev_perf_level != PerfLevel::kEnableTimeAndCPUTimeExceptForMutex) {
+ SetPerfLevel(prev_perf_level);
+ }
+ }
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ if (!status.ok()) {
+ if (c_iter) {
+ c_iter->status().PermitUncheckedError();
+ }
+ if (input) {
+ input->status().PermitUncheckedError();
+ }
+ }
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+
+ blob_counter.reset();
+ clip.reset();
+ raw_input.reset();
+ sub_compact->status = status;
+ NotifyOnSubcompactionCompleted(sub_compact);
+}
+
+uint64_t CompactionJob::GetCompactionId(SubcompactionState* sub_compact) const {
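+  // Combine the job id (upper 32 bits) with the subcompaction id (lower bits)
+  // to form an id that is unique across the subcompactions of this job.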
+ return (uint64_t)job_id_ << 32 | sub_compact->sub_job_id;
+}
+
+void CompactionJob::RecordDroppedKeys(
+ const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats) {
+ if (c_iter_stats.num_record_drop_user > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_USER,
+ c_iter_stats.num_record_drop_user);
+ }
+ if (c_iter_stats.num_record_drop_hidden > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
+ c_iter_stats.num_record_drop_hidden);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_records_replaced +=
+ c_iter_stats.num_record_drop_hidden;
+ }
+ }
+ if (c_iter_stats.num_record_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE,
+ c_iter_stats.num_record_drop_obsolete);
+ if (compaction_job_stats) {
+ compaction_job_stats->num_expired_deletion_records +=
+ c_iter_stats.num_record_drop_obsolete;
+ }
+ }
+ if (c_iter_stats.num_record_drop_range_del > 0) {
+ RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL,
+ c_iter_stats.num_record_drop_range_del);
+ }
+ if (c_iter_stats.num_range_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_range_del_drop_obsolete);
+ }
+ if (c_iter_stats.num_optimized_del_drop_obsolete > 0) {
+ RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
+ c_iter_stats.num_optimized_del_drop_obsolete);
+ }
+}
+
+Status CompactionJob::FinishCompactionOutputFile(
+ const Status& input_status, SubcompactionState* sub_compact,
+ CompactionOutputs& outputs, const Slice& next_table_min_key) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
+ assert(sub_compact != nullptr);
+ assert(outputs.HasBuilder());
+
+ FileMetaData* meta = outputs.GetMetaData();
+ uint64_t output_number = meta->fd.GetNumber();
+ assert(output_number != 0);
+
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+ std::string file_checksum = kUnknownFileChecksum;
+ std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+ // Check for iterator errors
+ Status s = input_status;
+
+ // Add range tombstones
+ auto earliest_snapshot = kMaxSequenceNumber;
+ if (existing_snapshots_.size() > 0) {
+ earliest_snapshot = existing_snapshots_[0];
+ }
+ if (s.ok()) {
+ CompactionIterationStats range_del_out_stats;
+ // If the compaction supports per_key_placement, only output range dels to
+ // the penultimate level.
+ // Note: Use `bottommost_level_ = true` for both bottommost and
+ // output_to_penultimate_level compaction here, as it's only used to decide
+ // whether range dels can be dropped.
+ if (outputs.HasRangeDel()) {
+ s = outputs.AddRangeDels(
+ sub_compact->start.has_value() ? &(sub_compact->start.value())
+ : nullptr,
+ sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr,
+ range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
+ earliest_snapshot, next_table_min_key, full_history_ts_low_);
+ }
+ RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
+ TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
+ }
+
+ const uint64_t current_entries = outputs.NumEntries();
+
+ s = outputs.Finish(s, seqno_time_mapping_);
+
+ if (s.ok()) {
+ // With an accurate smallest and largest key, we can get a slightly more
+ // accurate oldest ancester time. This makes the oldest ancester time in the
+ // manifest more accurate than the one in the table properties; it is not
+ // clear how to resolve that discrepancy.
+ if (meta->smallest.size() > 0 && meta->largest.size() > 0) {
+ uint64_t refined_oldest_ancester_time;
+ Slice new_smallest = meta->smallest.user_key();
+ Slice new_largest = meta->largest.user_key();
+ if (!new_largest.empty() && !new_smallest.empty()) {
+ refined_oldest_ancester_time =
+ sub_compact->compaction->MinInputFileOldestAncesterTime(
+ &(meta->smallest), &(meta->largest));
+ if (refined_oldest_ancester_time !=
+ std::numeric_limits<uint64_t>::max()) {
+ meta->oldest_ancester_time = refined_oldest_ancester_time;
+ }
+ }
+ }
+ }
+
+ // Finish and check for file errors
+ IOStatus io_s = outputs.WriterSyncClose(s, db_options_.clock, stats_,
+ db_options_.use_fsync);
+
+ if (s.ok() && io_s.ok()) {
+ file_checksum = meta->file_checksum;
+ file_checksum_func_name = meta->file_checksum_func_name;
+ }
+
+ if (s.ok()) {
+ s = io_s;
+ }
+ if (sub_compact->io_status.ok()) {
+ sub_compact->io_status = io_s;
+ // Since this error is really a copy of the
+ // "normal" status, it does not also need to be checked
+ sub_compact->io_status.PermitUncheckedError();
+ }
+
+ TableProperties tp;
+ if (s.ok()) {
+ tp = outputs.GetTableProperties();
+ }
+
+ if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
+ // If there is nothing to output, there is no need to generate an SST file.
+ // This happens when the output level is the bottom level and, at the same
+ // time, the sub_compact produced no output.
+ std::string fname =
+ TableFileName(sub_compact->compaction->immutable_options()->cf_paths,
+ meta->fd.GetNumber(), meta->fd.GetPathId());
+
+ // TODO(AR) it is not clear if there are any larger implications if
+ // DeleteFile fails here
+ Status ds = env_->DeleteFile(fname);
+ if (!ds.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "[%s] [JOB %d] Unable to remove SST file for table #%" PRIu64
+ " at bottom level%s",
+ cfd->GetName().c_str(), job_id_, output_number,
+ meta->marked_for_compaction ? " (need compaction)" : "");
+ }
+
+ // Also need to remove the file from outputs, or it will be added to the
+ // VersionEdit.
+ outputs.RemoveLastOutput();
+ meta = nullptr;
+ }
+
+ if (s.ok() && (current_entries > 0 || tp.num_range_deletions > 0)) {
+ // Output to event logger and fire events.
+ outputs.UpdateTableProperties();
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
+ " keys, %" PRIu64 " bytes%s, temperature: %s",
+ cfd->GetName().c_str(), job_id_, output_number,
+ current_entries, meta->fd.file_size,
+ meta->marked_for_compaction ? " (need compaction)" : "",
+ temperature_to_string[meta->temperature].c_str());
+ }
+ std::string fname;
+ FileDescriptor output_fd;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ Status status_for_listener = s;
+ if (meta != nullptr) {
+ fname = GetTableFileName(meta->fd.GetNumber());
+ output_fd = meta->fd;
+ oldest_blob_file_number = meta->oldest_blob_file_number;
+ } else {
+ fname = "(nil)";
+ if (s.ok()) {
+ status_for_listener = Status::Aborted("Empty SST file not kept");
+ }
+ }
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
+ job_id_, output_fd, oldest_blob_file_number, tp,
+ TableFileCreationReason::kCompaction, status_for_listener, file_checksum,
+ file_checksum_func_name);
+
+#ifndef ROCKSDB_LITE
+ // Report new file to SstFileManagerImpl
+ auto sfm =
+ static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+ if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) {
+ Status add_s = sfm->OnAddFile(fname);
+ if (!add_s.ok() && s.ok()) {
+ s = add_s;
+ }
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ // TODO(ajkr): should we return OK() if max space was reached by the final
+ // compaction output file (similarly to how flush works when full)?
+ s = Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT(
+ "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached");
+ InstrumentedMutexLock l(db_mutex_);
+ db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
+ }
+ }
+#endif
+
+ outputs.ResetBuilder();
+ return s;
+}
+
+Status CompactionJob::InstallCompactionResults(
+ const MutableCFOptions& mutable_cf_options) {
+ assert(compact_);
+
+ db_mutex_->AssertHeld();
+
+ auto* compaction = compact_->compaction;
+ assert(compaction);
+
+ {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ if (compaction_stats_.has_penultimate_level_output) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] [JOB %d] Compacted %s => output_to_penultimate_level: %" PRIu64
+ " bytes + last: %" PRIu64 " bytes. Total: %" PRIu64 " bytes",
+ compaction->column_family_data()->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary),
+ compaction_stats_.penultimate_level_stats.bytes_written,
+ compaction_stats_.stats.bytes_written,
+ compaction_stats_.TotalBytesWritten());
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
+ compaction->column_family_data()->GetName().c_str(),
+ job_id_, compaction->InputLevelSummary(&inputs_summary),
+ compaction_stats_.TotalBytesWritten());
+ }
+ }
+
+ VersionEdit* const edit = compaction->edit();
+ assert(edit);
+
+ // Add compaction inputs
+ compaction->AddInputDeletions(edit);
+
+ std::unordered_map<uint64_t, BlobGarbageMeter::BlobStats> blob_total_garbage;
+
+ for (const auto& sub_compact : compact_->sub_compact_states) {
+ sub_compact.AddOutputsEdit(edit);
+
+ for (const auto& blob : sub_compact.Current().GetBlobFileAdditions()) {
+ edit->AddBlobFile(blob);
+ }
+
+ if (sub_compact.Current().GetBlobGarbageMeter()) {
+ const auto& flows = sub_compact.Current().GetBlobGarbageMeter()->flows();
+
+ for (const auto& pair : flows) {
+ const uint64_t blob_file_number = pair.first;
+ const BlobGarbageMeter::BlobInOutFlow& flow = pair.second;
+
+ assert(flow.IsValid());
+ if (flow.HasGarbage()) {
+ blob_total_garbage[blob_file_number].Add(flow.GetGarbageCount(),
+ flow.GetGarbageBytes());
+ }
+ }
+ }
+ }
+
+ for (const auto& pair : blob_total_garbage) {
+ const uint64_t blob_file_number = pair.first;
+ const BlobGarbageMeter::BlobStats& stats = pair.second;
+
+ edit->AddBlobFileGarbage(blob_file_number, stats.GetCount(),
+ stats.GetBytes());
+ }
+
+ if ((compaction->compaction_reason() ==
+ CompactionReason::kLevelMaxLevelSize ||
+ compaction->compaction_reason() == CompactionReason::kRoundRobinTtl) &&
+ compaction->immutable_options()->compaction_pri == kRoundRobin) {
+ int start_level = compaction->start_level();
+ if (start_level > 0) {
+ auto vstorage = compaction->input_version()->storage_info();
+ edit->AddCompactCursor(start_level,
+ vstorage->GetNextCompactCursor(
+ start_level, compaction->num_input_files(0)));
+ }
+ }
+
+ return versions_->LogAndApply(compaction->column_family_data(),
+ mutable_cf_options, edit, db_mutex_,
+ db_directory_);
+}
+
+void CompactionJob::RecordCompactionIOStats() {
+ RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written));
+ CompactionReason compaction_reason =
+ compact_->compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kFilesMarkedForCompaction) {
+ RecordTick(stats_, COMPACT_READ_BYTES_MARKED, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES_MARKED, IOSTATS(bytes_written));
+ } else if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ RecordTick(stats_, COMPACT_READ_BYTES_PERIODIC, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES_PERIODIC, IOSTATS(bytes_written));
+ } else if (compaction_reason == CompactionReason::kTtl) {
+ RecordTick(stats_, COMPACT_READ_BYTES_TTL, IOSTATS(bytes_read));
+ RecordTick(stats_, COMPACT_WRITE_BYTES_TTL, IOSTATS(bytes_written));
+ }
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read));
+ IOSTATS_RESET(bytes_read);
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written));
+ IOSTATS_RESET(bytes_written);
+}
+
+Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
+ CompactionOutputs& outputs) {
+ assert(sub_compact != nullptr);
+
+ // no need to lock because VersionSet::next_file_number_ is atomic
+ uint64_t file_number = versions_->NewFileNumber();
+ std::string fname = GetTableFileName(file_number);
+ // Fire events.
+ ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
+#ifndef ROCKSDB_LITE
+ EventHelpers::NotifyTableFileCreationStarted(
+ cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
+ TableFileCreationReason::kCompaction);
+#endif // !ROCKSDB_LITE
+ // Make the output file
+ std::unique_ptr<FSWritableFile> writable_file;
+#ifndef NDEBUG
+ bool syncpoint_arg = file_options_.use_direct_writes;
+ TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile",
+ &syncpoint_arg);
+#endif
+
+ // Pass temperature of the last level files to FileSystem.
+ FileOptions fo_copy = file_options_;
+ Temperature temperature = sub_compact->compaction->output_temperature();
+ // Only set for last-level compaction whose output is not going to the
+ // penultimate level (relevant when the preclude_last_level feature is
+ // enabled).
+ if (temperature == Temperature::kUnknown &&
+ sub_compact->compaction->is_last_level() &&
+ !sub_compact->IsCurrentPenultimateLevel()) {
+ temperature =
+ sub_compact->compaction->mutable_cf_options()->last_level_temperature;
+ }
+ fo_copy.temperature = temperature;
+
+ Status s;
+ IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy);
+ s = io_s;
+ if (sub_compact->io_status.ok()) {
+ sub_compact->io_status = io_s;
+ // Since this error is really a copy of the io_s that is checked below as s,
+ // it does not also need to be checked.
+ sub_compact->io_status.PermitUncheckedError();
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
+ " fails at NewWritableFile with status %s",
+ sub_compact->compaction->column_family_data()->GetName().c_str(),
+ job_id_, file_number, s.ToString().c_str());
+ LogFlush(db_options_.info_log);
+ EventHelpers::LogAndNotifyTableFileCreationFinished(
+ event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(),
+ fname, job_id_, FileDescriptor(), kInvalidBlobFileNumber,
+ TableProperties(), TableFileCreationReason::kCompaction, s,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName);
+ return s;
+ }
+
+ // Try to figure out the output file's oldest ancester time.
+ int64_t temp_current_time = 0;
+ auto get_time_status = db_options_.clock->GetCurrentTime(&temp_current_time);
+ // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+ if (!get_time_status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to get current time. Status: %s",
+ get_time_status.ToString().c_str());
+ }
+ uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+ InternalKey tmp_start, tmp_end;
+ if (sub_compact->start.has_value()) {
+ tmp_start.SetMinPossibleForUserKey(sub_compact->start.value());
+ }
+ if (sub_compact->end.has_value()) {
+ tmp_end.SetMinPossibleForUserKey(sub_compact->end.value());
+ }
+ uint64_t oldest_ancester_time =
+ sub_compact->compaction->MinInputFileOldestAncesterTime(
+ sub_compact->start.has_value() ? &tmp_start : nullptr,
+ sub_compact->end.has_value() ? &tmp_end : nullptr);
+ if (oldest_ancester_time == std::numeric_limits<uint64_t>::max()) {
+ oldest_ancester_time = current_time;
+ }
+
+ // Initialize a SubcompactionState::Output and add it to sub_compact->outputs
+ {
+ FileMetaData meta;
+ meta.fd = FileDescriptor(file_number,
+ sub_compact->compaction->output_path_id(), 0);
+ meta.oldest_ancester_time = oldest_ancester_time;
+ meta.file_creation_time = current_time;
+ meta.temperature = temperature;
+ assert(!db_id_.empty());
+ assert(!db_session_id_.empty());
+ s = GetSstInternalUniqueId(db_id_, db_session_id_, meta.fd.GetNumber(),
+ &meta.unique_id);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "[%s] [JOB %d] file #%" PRIu64
+ " failed to generate unique id: %s.",
+ cfd->GetName().c_str(), job_id_, meta.fd.GetNumber(),
+ s.ToString().c_str());
+ return s;
+ }
+
+ outputs.AddOutput(std::move(meta), cfd->internal_comparator(),
+ sub_compact->compaction->mutable_cf_options()
+ ->check_flush_compaction_key_order,
+ paranoid_file_checks_);
+ }
+
+ writable_file->SetIOPriority(GetRateLimiterPriority());
+ writable_file->SetWriteLifeTimeHint(write_hint_);
+ FileTypeSet tmp_set = db_options_.checksum_handoff_file_types;
+ writable_file->SetPreallocationBlockSize(static_cast<size_t>(
+ sub_compact->compaction->OutputFilePreallocationSize()));
+ const auto& listeners =
+ sub_compact->compaction->immutable_options()->listeners;
+ outputs.AssignFileWriter(new WritableFileWriter(
+ std::move(writable_file), fname, fo_copy, db_options_.clock, io_tracer_,
+ db_options_.stats, listeners, db_options_.file_checksum_gen_factory.get(),
+ tmp_set.Contains(FileType::kTableFile), false));
+
+ TableBuilderOptions tboptions(
+ *cfd->ioptions(), *(sub_compact->compaction->mutable_cf_options()),
+ cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+ sub_compact->compaction->output_compression(),
+ sub_compact->compaction->output_compression_opts(), cfd->GetID(),
+ cfd->GetName(), sub_compact->compaction->output_level(),
+ bottommost_level_, TableFileCreationReason::kCompaction,
+ 0 /* oldest_key_time */, current_time, db_id_, db_session_id_,
+ sub_compact->compaction->max_output_file_size(), file_number);
+
+ outputs.NewBuilder(tboptions);
+
+ LogFlush(db_options_.info_log);
+ return s;
+}
+
+void CompactionJob::CleanupCompaction() {
+ for (SubcompactionState& sub_compact : compact_->sub_compact_states) {
+ sub_compact.Cleanup(table_cache_.get());
+ }
+ delete compact_;
+ compact_ = nullptr;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
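+// CopyPrefix() copies at most `prefix_length` leading bytes of `src` into
+// `dst`; shorter inputs are copied whole. For example (illustrative only),
+// CopyPrefix("abcdef", 4, &dst) leaves dst == "abcd", while
+// CopyPrefix("ab", 4, &dst) leaves dst == "ab".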
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+} // namespace
+
+#endif // !ROCKSDB_LITE
+
+void CompactionJob::UpdateCompactionStats() {
+ assert(compact_);
+
+ Compaction* compaction = compact_->compaction;
+ compaction_stats_.stats.num_input_files_in_non_output_levels = 0;
+ compaction_stats_.stats.num_input_files_in_output_level = 0;
+ for (int input_level = 0;
+ input_level < static_cast<int>(compaction->num_input_levels());
+ ++input_level) {
+ if (compaction->level(input_level) != compaction->output_level()) {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.stats.num_input_files_in_non_output_levels,
+ &compaction_stats_.stats.bytes_read_non_output_levels, input_level);
+ } else {
+ UpdateCompactionInputStatsHelper(
+ &compaction_stats_.stats.num_input_files_in_output_level,
+ &compaction_stats_.stats.bytes_read_output_level, input_level);
+ }
+ }
+
+ assert(compaction_job_stats_);
+ compaction_stats_.stats.bytes_read_blob =
+ compaction_job_stats_->total_blob_bytes_read;
+
+ compaction_stats_.stats.num_dropped_records =
+ compaction_stats_.DroppedRecords();
+}
+
+void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files,
+ uint64_t* bytes_read,
+ int input_level) {
+ const Compaction* compaction = compact_->compaction;
+ auto num_input_files = compaction->num_input_files(input_level);
+ *num_files += static_cast<int>(num_input_files);
+
+ for (size_t i = 0; i < num_input_files; ++i) {
+ const auto* file_meta = compaction->input(input_level, i);
+ *bytes_read += file_meta->fd.GetFileSize();
+ compaction_stats_.stats.num_input_records +=
+ static_cast<uint64_t>(file_meta->num_entries);
+ }
+}
+
+void CompactionJob::UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const {
+#ifndef ROCKSDB_LITE
+ compaction_job_stats_->elapsed_micros = stats.micros;
+
+ // input information
+ compaction_job_stats_->total_input_bytes =
+ stats.bytes_read_non_output_levels + stats.bytes_read_output_level;
+ compaction_job_stats_->num_input_records = stats.num_input_records;
+ compaction_job_stats_->num_input_files =
+ stats.num_input_files_in_non_output_levels +
+ stats.num_input_files_in_output_level;
+ compaction_job_stats_->num_input_files_at_output_level =
+ stats.num_input_files_in_output_level;
+
+ // output information
+ compaction_job_stats_->total_output_bytes = stats.bytes_written;
+ compaction_job_stats_->total_output_bytes_blob = stats.bytes_written_blob;
+ compaction_job_stats_->num_output_records = stats.num_output_records;
+ compaction_job_stats_->num_output_files = stats.num_output_files;
+ compaction_job_stats_->num_output_files_blob = stats.num_output_files_blob;
+
+ if (stats.num_output_files > 0) {
+ CopyPrefix(compact_->SmallestUserKey(),
+ CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->smallest_output_key_prefix);
+ CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength,
+ &compaction_job_stats_->largest_output_key_prefix);
+ }
+#else
+ (void)stats;
+#endif // !ROCKSDB_LITE
+}
+
+void CompactionJob::LogCompaction() {
+ Compaction* compaction = compact_->compaction;
+ ColumnFamilyData* cfd = compaction->column_family_data();
+
+ // Let's check if anything will get logged. Don't prepare all the info if
+ // we're not logging
+ if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
+ Compaction::InputLevelSummaryBuffer inputs_summary;
+ ROCKS_LOG_INFO(
+ db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f",
+ cfd->GetName().c_str(), job_id_,
+ compaction->InputLevelSummary(&inputs_summary), compaction->score());
+ char scratch[2345];
+ compaction->Summary(scratch, sizeof(scratch));
+ ROCKS_LOG_INFO(db_options_.info_log, "[%s]: Compaction start summary: %s\n",
+ cfd->GetName().c_str(), scratch);
+ // build event logger report
+ auto stream = event_logger_->Log();
+ stream << "job" << job_id_ << "event"
+ << "compaction_started"
+ << "compaction_reason"
+ << GetCompactionReasonString(compaction->compaction_reason());
+ for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
+ stream << ("files_L" + std::to_string(compaction->level(i)));
+ stream.StartArray();
+ for (auto f : *compaction->inputs(i)) {
+ stream << f->fd.GetNumber();
+ }
+ stream.EndArray();
+ }
+ stream << "score" << compaction->score() << "input_data_size"
+ << compaction->CalculateTotalInputSize() << "oldest_snapshot_seqno"
+ << (existing_snapshots_.empty()
+ ? int64_t{-1} // Use -1 for "none"
+ : static_cast<int64_t>(existing_snapshots_[0]));
+ if (compaction->SupportsPerKeyPlacement()) {
+ stream << "preclude_last_level_min_seqno"
+ << preclude_last_level_min_seqno_;
+ stream << "penultimate_output_level" << compaction->GetPenultimateLevel();
+ stream << "penultimate_output_range"
+ << GetCompactionPenultimateOutputRangeTypeString(
+ compaction->GetPenultimateOutputRangeType());
+
+ if (compaction->GetPenultimateOutputRangeType() ==
+ Compaction::PenultimateOutputRangeType::kDisabled) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "[%s] [JOB %d] Penultimate level output is disabled, likely "
+ "because of the range conflict in the penultimate level",
+ cfd->GetName().c_str(), job_id_);
+ }
+ }
+ }
+}
+
+std::string CompactionJob::GetTableFileName(uint64_t file_number) {
+ return TableFileName(compact_->compaction->immutable_options()->cf_paths,
+ file_number, compact_->compaction->output_path_id());
+}
+
+Env::IOPriority CompactionJob::GetRateLimiterPriority() {
+ if (versions_ && versions_->GetColumnFamilySet() &&
+ versions_->GetColumnFamilySet()->write_controller()) {
+ WriteController* write_controller =
+ versions_->GetColumnFamilySet()->write_controller();
+ if (write_controller->NeedsDelay() || write_controller->IsStopped()) {
+ return Env::IO_USER;
+ }
+ }
+
+ return Env::IO_LOW;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job.h b/src/rocksdb/db/compaction/compaction_job.h
new file mode 100644
index 000000000..bfbce1011
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job.h
@@ -0,0 +1,500 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/memtable_list.h"
+#include "db/range_del_aggregator.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "options/cf_options.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/compaction_job_stats.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class CompactionState;
+class ErrorHandler;
+class MemTable;
+class SnapshotChecker;
+class SystemClock;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+
+class SubcompactionState;
+
+// CompactionJob is responsible for executing the compaction. Each (manual or
+// automated) compaction corresponds to a CompactionJob object, and usually
+// goes through the stages of `Prepare()`->`Run()`->`Install()`. CompactionJob
+// will divide the compaction into subcompactions and execute them in parallel
+// if needed.
+//
+// CompactionJob has 2 main stats:
+// 1. CompactionJobStats compaction_job_stats_
+// CompactionJobStats is a public data structure that is part of the
+// compaction event listener, through which RocksDB shares the job stats with
+// the user. Internally it's an aggregation of all the compaction_job_stats
+// from each `SubcompactionState`:
+// +------------------------+
+// | SubcompactionState |
+// | |
+// +--------->| compaction_job_stats |
+// | | |
+// | +------------------------+
+// +------------------------+ |
+// | CompactionJob | | +------------------------+
+// | | | | SubcompactionState |
+// | compaction_job_stats +-----+ | |
+// | | +--------->| compaction_job_stats |
+// | | | | |
+// +------------------------+ | +------------------------+
+// |
+// | +------------------------+
+// | | SubcompactionState |
+// | | |
+// +--------->+ compaction_job_stats |
+// | | |
+// | +------------------------+
+// |
+// | +------------------------+
+// | | ... |
+// +--------->+ |
+// +------------------------+
+//
+// 2. CompactionStatsFull compaction_stats_
+// `CompactionStatsFull` holds internal stats about the compaction, which
+// are eventually sent to `ColumnFamilyData::internal_stats_` and used for
+// logging and public metrics.
+// Internally, it's an aggregation of stats_ from each `SubcompactionState`.
+// It has 2 parts: normal stats about the main compaction and the
+// penultimate level output stats.
+// `SubcompactionState` maintains the CompactionOutputs for the normal output
+// and, if it exists, for the penultimate level output; the per-level stats
+// are stored with the outputs.
+// +---------------------------+
+// | SubcompactionState |
+// | |
+// | +----------------------+ |
+// | | CompactionOutputs | |
+// | | (normal output) | |
+// +---->| stats_ | |
+// | | +----------------------+ |
+// | | |
+// | | +----------------------+ |
+// +--------------------------------+ | | | CompactionOutputs | |
+// | CompactionJob | | | | (penultimate_level) | |
+// | | +--------->| stats_ | |
+// | compaction_stats_ | | | | +----------------------+ |
+// | +-------------------------+ | | | | |
+// | |stats (normal) |------|----+ +---------------------------+
+// | +-------------------------+ | | |
+// | | | |
+// | +-------------------------+ | | | +---------------------------+
+// | |penultimate_level_stats +------+ | | SubcompactionState |
+// | +-------------------------+ | | | | |
+// | | | | | +----------------------+ |
+// | | | | | | CompactionOutputs | |
+// +--------------------------------+ | | | | (normal output) | |
+// | +---->| stats_ | |
+// | | +----------------------+ |
+// | | |
+// | | +----------------------+ |
+// | | | CompactionOutputs | |
+// | | | (penultimate_level) | |
+// +--------->| stats_ | |
+// | +----------------------+ |
+// | |
+// +---------------------------+
+
+class CompactionJob {
+ public:
+ CompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+ FSDirectory* db_directory, FSDirectory* output_directory,
+ FSDirectory* blob_output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ const SnapshotChecker* snapshot_checker, JobContext* job_context,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ bool paranoid_file_checks, bool measure_io_stats,
+ const std::string& dbname, CompactionJobStats* compaction_job_stats,
+ Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id = "", const std::string& db_session_id = "",
+ std::string full_history_ts_low = "", std::string trim_ts = "",
+ BlobFileCompletionCallback* blob_callback = nullptr,
+ int* bg_compaction_scheduled = nullptr,
+ int* bg_bottom_compaction_scheduled = nullptr);
+
+ virtual ~CompactionJob();
+
+ // no copy/move
+ CompactionJob(CompactionJob&& job) = delete;
+ CompactionJob(const CompactionJob& job) = delete;
+ CompactionJob& operator=(const CompactionJob& job) = delete;
+
+ // REQUIRED: mutex held
+ // Prepare for the compaction by setting up boundaries for each subcompaction
+ void Prepare();
+ // REQUIRED: mutex not held
+ // Launch threads for each subcompaction and wait for them to finish. After
+ // that, verify that the output tables are usable, and finally do the
+ // bookkeeping to unify the subcompaction results.
+ Status Run();
+
+ // REQUIRED: mutex held
+ // Add compaction input/output to the current version
+ Status Install(const MutableCFOptions& mutable_cf_options);
+
+ // Return the IO status
+ IOStatus io_status() const { return io_status_; }
+
+ protected:
+ void UpdateCompactionStats();
+ void LogCompaction();
+ virtual void RecordCompactionIOStats();
+ void CleanupCompaction();
+
+ // Call compaction filter. Then iterate through input and compact the
+ // kv-pairs
+ void ProcessKeyValueCompaction(SubcompactionState* sub_compact);
+
+ CompactionState* compact_;
+ InternalStats::CompactionStatsFull compaction_stats_;
+ const ImmutableDBOptions& db_options_;
+ const MutableDBOptions mutable_db_options_copy_;
+ LogBuffer* log_buffer_;
+ FSDirectory* output_directory_;
+ Statistics* stats_;
+ // Is this compaction creating a file in the bottom most level?
+ bool bottommost_level_;
+
+ Env::WriteLifeTimeHint write_hint_;
+
+ IOStatus io_status_;
+
+ CompactionJobStats* compaction_job_stats_;
+
+ private:
+ friend class CompactionJobTestBase;
+
+ // Generates a histogram representing potential divisions of key ranges from
+ // the input. It adds the starting and/or ending keys of certain input files
+ // to the working set and then finds the approximate size of data in between
+ // each consecutive pair of slices. Then it divides these ranges into
+ // consecutive groups such that each group has a similar size.
+ void GenSubcompactionBoundaries();
+
+ // Get the number of planned subcompactions based on max_subcompactions and
+ // extra reserved resources
+ uint64_t GetSubcompactionsLimit();
+
+ // Additional threads may be reserved, and their number is stored in
+ // extra_num_subcompaction_threads_reserved_. For now, this happens only if
+ // the compaction priority is round-robin and max_subcompactions is not
+ // sufficient (extra resources may be needed).
+ void AcquireSubcompactionResources(int num_extra_required_subcompactions);
+
+ // Additional threads may be reserved during IncreaseSubcompactionResources()
+ // if num_actual_subcompactions is less than num_planned_subcompactions.
+ // Additional threads will be released and the bg_compaction_scheduled_ or
+ // bg_bottom_compaction_scheduled_ will be updated if they are used.
+ // DB Mutex lock is required.
+ void ShrinkSubcompactionResources(uint64_t num_extra_resources);
+
+ // Release all reserved threads and update the compaction limits.
+ void ReleaseSubcompactionResources();
+
+ CompactionServiceJobStatus ProcessKeyValueCompactionWithCompactionService(
+ SubcompactionState* sub_compact);
+
+ // update the thread status for starting a compaction.
+ void ReportStartedCompaction(Compaction* compaction);
+
+ Status FinishCompactionOutputFile(const Status& input_status,
+ SubcompactionState* sub_compact,
+ CompactionOutputs& outputs,
+ const Slice& next_table_min_key);
+ Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
+ Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
+ CompactionOutputs& outputs);
+ void UpdateCompactionJobStats(
+ const InternalStats::CompactionStats& stats) const;
+ void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
+ CompactionJobStats* compaction_job_stats = nullptr);
+
+ void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read,
+ int input_level);
+
+ void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);
+
+ void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);
+
+ uint32_t job_id_;
+
+ // DBImpl state
+ const std::string& dbname_;
+ const std::string db_id_;
+ const std::string db_session_id_;
+ const FileOptions file_options_;
+
+ Env* env_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ FileSystemPtr fs_;
+ // env_option optimized for compaction table reads
+ FileOptions file_options_for_read_;
+ VersionSet* versions_;
+ const std::atomic<bool>* shutting_down_;
+ const std::atomic<bool>& manual_compaction_canceled_;
+ FSDirectory* db_directory_;
+ FSDirectory* blob_output_directory_;
+ InstrumentedMutex* db_mutex_;
+ ErrorHandler* db_error_handler_;
+ // If there were two snapshots with seq numbers s1 and s2, with s1 < s2, and
+ // we find two instances of a key k1 that both lie entirely between s1 and
+ // s2, then the earlier version of k1 can be safely deleted because that
+ // version is not visible in any snapshot.
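+ // For example (illustrative only): with snapshots at sequence numbers 10 and
+ // 20, and versions of k1 at sequence numbers 12 and 15, the version at 12 is
+ // visible to neither snapshot (snapshot 10 predates both, snapshot 20 sees
+ // the one at 15), so it can be dropped.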
+ std::vector<SequenceNumber> existing_snapshots_;
+
+ // This is the earliest snapshot that could be used for write-conflict
+ // checking by a transaction. For any user-key newer than this snapshot, we
+ // should make sure not to remove evidence that a write occurred.
+ SequenceNumber earliest_write_conflict_snapshot_;
+
+ const SnapshotChecker* const snapshot_checker_;
+
+ JobContext* job_context_;
+
+ std::shared_ptr<Cache> table_cache_;
+
+ EventLogger* event_logger_;
+
+ bool paranoid_file_checks_;
+ bool measure_io_stats_;
+ // Stores the Slices that designate the boundaries for each subcompaction
+ std::vector<std::string> boundaries_;
+ Env::Priority thread_pri_;
+ std::string full_history_ts_low_;
+ std::string trim_ts_;
+ BlobFileCompletionCallback* blob_callback_;
+
+ uint64_t GetCompactionId(SubcompactionState* sub_compact) const;
+ // Stores the number of threads reserved in the shared env_ for extra
+ // subcompactions under the kRoundRobin compaction priority.
+ int extra_num_subcompaction_threads_reserved_;
+
+ // Stores the pointers to DBImpl's bg_compaction_scheduled_ and
+ // bg_bottom_compaction_scheduled_. The DB mutex is required when accessing
+ // or updating them.
+ int* bg_compaction_scheduled_;
+ int* bg_bottom_compaction_scheduled_;
+
+ // Stores the sequence-number-to-time mapping gathered from all input files;
+ // it also collects the smallest_seqno -> oldest_ancester_time from the SSTs.
+ SeqnoToTimeMapping seqno_time_mapping_;
+
+ // Minimal sequence number for preserving the time information. Time info
+ // older than this sequence number won't be preserved after the compaction,
+ // and if it's a bottommost compaction, the seq num will be zeroed out.
+ SequenceNumber preserve_time_min_seqno_ = kMaxSequenceNumber;
+
+ // Minimal sequence number to preclude data from the last level. If a key has
+ // a bigger (newer) sequence number than this, it will be precluded from the
+ // last level (i.e. output to the penultimate level instead).
+ SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+
+ // Get the table file name at the location it is being output to, which
+ // should also be under `output_directory_`.
+ virtual std::string GetTableFileName(uint64_t file_number);
+ // The rate limiter priority (io_priority) is determined dynamically here.
+ // Compaction reads and writes share the same priority, which is elevated in
+ // scenarios such as a write stall.
+ Env::IOPriority GetRateLimiterPriority();
+};
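+
+// Typical driver sequence (illustrative sketch only; the real call sites in
+// DBImpl pass many more arguments, and the names db_mutex and
+// mutable_cf_options below are placeholders):
+//
+//   CompactionJob job(/* job_id, compaction, db_options, ... */);
+//   {
+//     InstrumentedMutexLock l(db_mutex);  // Prepare() requires the DB mutex
+//     job.Prepare();
+//   }
+//   Status s = job.Run();                 // Run() executes without the mutex
+//   {
+//     InstrumentedMutexLock l(db_mutex);  // Install() requires the DB mutex
+//     if (s.ok()) {
+//       s = job.Install(mutable_cf_options);
+//     }
+//   }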
+
+// CompactionServiceInput is used to pass compaction information between two
+// db instances. It contains the information needed to do a compaction. It
+// doesn't contain the LSM tree information, which is passed through the
+// MANIFEST file.
+struct CompactionServiceInput {
+ ColumnFamilyDescriptor column_family;
+
+ DBOptions db_options;
+
+ std::vector<SequenceNumber> snapshots;
+
+ // SST files for compaction; the list should already be expanded to include
+ // all the files needed for this compaction, for both input level files and
+ // output level files.
+ std::vector<std::string> input_files;
+ int output_level;
+
+ // db_id is used to generate unique id of sst on the remote compactor
+ std::string db_id;
+
+ // information for subcompaction
+ bool has_begin = false;
+ std::string begin;
+ bool has_end = false;
+ std::string end;
+
+ // serialization interface to read and write the object
+ static Status Read(const std::string& data_str, CompactionServiceInput* obj);
+ Status Write(std::string* output);
+
+ // Initialize a dummy ColumnFamilyDescriptor
+ CompactionServiceInput() : column_family("", ColumnFamilyOptions()) {}
+
+#ifndef NDEBUG
+ bool TEST_Equals(CompactionServiceInput* other);
+ bool TEST_Equals(CompactionServiceInput* other, std::string* mismatch);
+#endif // NDEBUG
+};
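+
+// Illustrative round-trip sketch (not part of the upstream code; the local
+// variable names below are placeholders): the primary instance serializes the
+// input with Write() and the remote compactor rebuilds it with Read().
+//
+//   CompactionServiceInput input;
+//   input.output_level = 2;
+//   std::string payload;
+//   Status s = input.Write(&payload);
+//   CompactionServiceInput decoded;
+//   if (s.ok()) {
+//     s = CompactionServiceInput::Read(payload, &decoded);
+//   }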
+
+// CompactionServiceOutputFile is the metadata for the output SST file
+struct CompactionServiceOutputFile {
+ std::string file_name;
+ SequenceNumber smallest_seqno;
+ SequenceNumber largest_seqno;
+ std::string smallest_internal_key;
+ std::string largest_internal_key;
+ uint64_t oldest_ancester_time;
+ uint64_t file_creation_time;
+ uint64_t paranoid_hash;
+ bool marked_for_compaction;
+ UniqueId64x2 unique_id;
+
+ CompactionServiceOutputFile() = default;
+ CompactionServiceOutputFile(
+ const std::string& name, SequenceNumber smallest, SequenceNumber largest,
+ std::string _smallest_internal_key, std::string _largest_internal_key,
+ uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+ uint64_t _paranoid_hash, bool _marked_for_compaction,
+ UniqueId64x2 _unique_id)
+ : file_name(name),
+ smallest_seqno(smallest),
+ largest_seqno(largest),
+ smallest_internal_key(std::move(_smallest_internal_key)),
+ largest_internal_key(std::move(_largest_internal_key)),
+ oldest_ancester_time(_oldest_ancester_time),
+ file_creation_time(_file_creation_time),
+ paranoid_hash(_paranoid_hash),
+ marked_for_compaction(_marked_for_compaction),
+ unique_id(std::move(_unique_id)) {}
+};
+
+// CompactionServiceResult contains the compaction result from a different db
+// instance. With this information, the primary db instance, which has write
+// permission, is able to install the result into the DB.
+struct CompactionServiceResult {
+ Status status;
+ std::vector<CompactionServiceOutputFile> output_files;
+ int output_level;
+
+ // location of the output files
+ std::string output_path;
+
+ // some statistics about the compaction
+ uint64_t num_output_records = 0;
+ uint64_t total_bytes = 0;
+ uint64_t bytes_read = 0;
+ uint64_t bytes_written = 0;
+ CompactionJobStats stats;
+
+ // serialization interface to read and write the object
+ static Status Read(const std::string& data_str, CompactionServiceResult* obj);
+ Status Write(std::string* output);
+
+#ifndef NDEBUG
+ bool TEST_Equals(CompactionServiceResult* other);
+ bool TEST_Equals(CompactionServiceResult* other, std::string* mismatch);
+#endif // NDEBUG
+};
+
+// CompactionServiceCompactionJob is a read-only compaction job: it takes
+// input information from `compaction_service_input` and puts result
+// information in `compaction_service_result`. The SST files are generated
+// under `output_path`.
+class CompactionServiceCompactionJob : private CompactionJob {
+ public:
+ CompactionServiceCompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options,
+ const FileOptions& file_options, VersionSet* versions,
+ const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
+ FSDirectory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id, const std::string& db_session_id,
+ std::string output_path,
+ const CompactionServiceInput& compaction_service_input,
+ CompactionServiceResult* compaction_service_result);
+
+ // Run the compaction in current thread and return the result
+ Status Run();
+
+ void CleanupCompaction();
+
+ IOStatus io_status() const { return CompactionJob::io_status(); }
+
+ protected:
+ void RecordCompactionIOStats() override;
+
+ private:
+ // Get table file name in output_path
+ std::string GetTableFileName(uint64_t file_number) override;
+ // Specifies the compaction output path; otherwise the default DB path is used
+ const std::string output_path_;
+
+ // Compaction job input
+ const CompactionServiceInput& compaction_input_;
+
+ // Compaction job result
+ CompactionServiceResult* compaction_result_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_job_stats_test.cc b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
new file mode 100644
index 000000000..930270778
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_stats_test.cc
@@ -0,0 +1,975 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <algorithm>
+#include <cinttypes>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#if !defined(IOS_CROSS_COMPILE)
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+
+static std::string RandomString(Random* rnd, int len, double ratio) {
+ std::string r;
+ test::CompressibleString(rnd, ratio, len, &r);
+ return r;
+}
+
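+// Formats `key` as a zero-padded decimal string of width `length` (capped at
+// kBufSize), e.g. Key(42, 8) returns "00000042".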
+std::string Key(uint64_t key, int length) {
+ const int kBufSize = 1000;
+ char buf[kBufSize];
+ if (length > kBufSize) {
+ length = kBufSize;
+ }
+ snprintf(buf, kBufSize, "%0*" PRIu64, length, key);
+ return std::string(buf);
+}
+
+class CompactionJobStatsTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ public:
+ std::string dbname_;
+ std::string alternative_wal_dir_;
+ Env* env_;
+ DB* db_;
+ std::vector<ColumnFamilyHandle*> handles_;
+ uint32_t max_subcompactions_;
+
+ Options last_options_;
+
+ CompactionJobStatsTest() : env_(Env::Default()) {
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ dbname_ = test::PerThreadDBPath("compaction_job_stats_test");
+ alternative_wal_dir_ = dbname_ + "/wal";
+ Options options;
+ options.create_if_missing = true;
+ max_subcompactions_ = GetParam();
+ options.max_subcompactions = max_subcompactions_;
+ auto delete_options = options;
+ delete_options.wal_dir = alternative_wal_dir_;
+ EXPECT_OK(DestroyDB(dbname_, delete_options));
+ // Destroy it again in case the alternative WAL dir was not used.
+ EXPECT_OK(DestroyDB(dbname_, options));
+ db_ = nullptr;
+ Reopen(options);
+ }
+
+ ~CompactionJobStatsTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+ Options options;
+ options.db_paths.emplace_back(dbname_, 0);
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ options.db_paths.emplace_back(dbname_ + "_3", 0);
+ options.db_paths.emplace_back(dbname_ + "_4", 0);
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+ void CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ColumnFamilyOptions cf_opts(options);
+ size_t cfi = handles_.size();
+ handles_.resize(cfi + cfs.size());
+ for (auto cf : cfs) {
+ ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]));
+ }
+ }
+
+ void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options) {
+ CreateColumnFamilies(cfs, options);
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+ ReopenWithColumnFamilies(cfs_plus_default, options);
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ }
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ Close();
+ EXPECT_EQ(cfs.size(), options.size());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+ }
+ DBOptions db_opts = DBOptions(options[0]);
+ return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+ }
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ Close();
+ std::vector<Options> v_opts(cfs.size(), options);
+ return TryReopenWithColumnFamilies(cfs, v_opts);
+ }
+
+ void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); }
+
+ void Close() {
+ for (auto h : handles_) {
+ delete h;
+ }
+ handles_.clear();
+ delete db_;
+ db_ = nullptr;
+ }
+
+ void DestroyAndReopen(const Options& options) {
+ // Destroy using last options
+ Destroy(last_options_);
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Destroy(const Options& options) {
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ }
+
+ Status ReadOnlyReopen(const Options& options) {
+ return DB::OpenForReadOnly(options, dbname_, &db_);
+ }
+
+ Status TryReopen(const Options& options) {
+ Close();
+ last_options_ = options;
+ return DB::Open(options, dbname_, &db_);
+ }
+
+ Status Flush(int cf = 0) {
+ if (cf == 0) {
+ return db_->Flush(FlushOptions());
+ } else {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+ }
+
+ Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, k, v);
+ }
+
+ Status Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions()) {
+ return db_->Put(wo, handles_[cf], k, v);
+ }
+
+ Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+ Status Delete(int cf, const std::string& k) {
+ return db_->Delete(WriteOptions(), handles_[cf], k);
+ }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ std::string Get(int cf, const std::string& k,
+ const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level, int cf = 0) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(level), &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level),
+ &property));
+ }
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
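+ // e.g. "0,2,7" means 0 files at L0, 2 at L1 and 7 at L2; trailing levels
+ // with no files are omitted.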
+ std::string FilesPerLevel(int cf = 0) {
+ int num_levels =
+ (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < num_levels; level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+
+ Status Size(uint64_t* size, const Slice& start, const Slice& limit,
+ int cf = 0) {
+ Range r(start, limit);
+ if (cf == 0) {
+ return db_->GetApproximateSizes(&r, 1, size);
+ } else {
+ return db_->GetApproximateSizes(handles_[1], &r, 1, size);
+ }
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id) {
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = target_path_id;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+ }
+
+ void Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+ }
+
+ void Compact(const Slice& start, const Slice& limit) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+ }
+
+ void TEST_Compact(int level, int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(level, &start, &limit, handles_[cf],
+ true /* disallow trivial move */));
+ }
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int n, const std::string& small, const std::string& large,
+ int cf = 0) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(Flush(cf));
+ }
+ }
+
+ static void SetDeletionCompactionStats(CompactionJobStats* stats,
+ uint64_t input_deletions,
+ uint64_t expired_deletions,
+ uint64_t records_replaced) {
+ stats->num_input_deletion_records = input_deletions;
+ stats->num_expired_deletion_records = expired_deletions;
+ stats->num_records_replaced = records_replaced;
+ }
+
+ void MakeTableWithKeyValues(Random* rnd, uint64_t smallest, uint64_t largest,
+ int key_size, int value_size, uint64_t interval,
+ double ratio, int cf = 0) {
+ for (auto key = smallest; key < largest; key += interval) {
+ ASSERT_OK(Put(cf, Slice(Key(key, key_size)),
+ Slice(RandomString(rnd, value_size, ratio))));
+ }
+ ASSERT_OK(Flush(cf));
+ }
+
+ // This function behaves with the implicit understanding that two
+ // rounds of keys are inserted into the database, as per the behavior
+ // of the DeletionStatsTest.
+ void SelectivelyDeleteKeys(uint64_t smallest, uint64_t largest,
+ uint64_t interval, int deletion_interval,
+ int key_size, uint64_t cutoff_key_num,
+ CompactionJobStats* stats, int cf = 0) {
+ // interval needs to be >= 2 so that we can insert deletion entries that are
+ // intended not to result in an actual key deletion, by using an offset of 1
+ // from an existing key.
+ ASSERT_GE(interval, 2);
+
+ uint64_t ctr = 1;
+ uint32_t deletions_made = 0;
+ uint32_t num_deleted = 0;
+ uint32_t num_expired = 0;
+ for (auto key = smallest; key <= largest; key += interval, ctr++) {
+ if (ctr % deletion_interval == 0) {
+ ASSERT_OK(Delete(cf, Key(key, key_size)));
+ deletions_made++;
+ num_deleted++;
+
+ if (key > cutoff_key_num) {
+ num_expired++;
+ }
+ }
+ }
+
+ // Insert some deletions for keys that don't exist that
+ // are both in and out of the key range
+ ASSERT_OK(Delete(cf, Key(smallest + 1, key_size)));
+ deletions_made++;
+
+ ASSERT_OK(Delete(cf, Key(smallest - 1, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Delete(cf, Key(smallest - 9, key_size)));
+ deletions_made++;
+ num_expired++;
+
+ ASSERT_OK(Flush(cf));
+ SetDeletionCompactionStats(stats, deletions_made, num_expired, num_deleted);
+ }
+};
+
+// An EventListener which helps verify the compaction results in
+// test CompactionJobStatsTest.
+class CompactionJobStatsChecker : public EventListener {
+ public:
+ CompactionJobStatsChecker()
+ : compression_enabled_(false), verify_next_comp_io_stats_(false) {}
+
+ size_t NumberOfUnverifiedStats() { return expected_stats_.size(); }
+
+ void set_verify_next_comp_io_stats(bool v) { verify_next_comp_io_stats_ = v; }
+
+ // Once a compaction has completed, this function verifies the returned
+ // CompactionJobInfo against the oldest CompactionJobInfo added earlier
+ // to "expected_stats_" that has not yet been used for verification.
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ if (verify_next_comp_io_stats_) {
+ ASSERT_GT(ci.stats.file_write_nanos, 0);
+ ASSERT_GT(ci.stats.file_range_sync_nanos, 0);
+ ASSERT_GT(ci.stats.file_fsync_nanos, 0);
+ ASSERT_GT(ci.stats.file_prepare_write_nanos, 0);
+ verify_next_comp_io_stats_ = false;
+ }
+
+ std::lock_guard<std::mutex> lock(mutex_);
+ if (expected_stats_.size()) {
+ Verify(ci.stats, expected_stats_.front());
+ expected_stats_.pop();
+ }
+ }
+
+ // A helper function which verifies whether two CompactionJobStats
+ // match. The verification of all compaction stats is done with
+ // ASSERT_EQ, except for the total input / output bytes, for which we
+ // use ASSERT_GE and ASSERT_LE with a reasonable bias ---
+ // 10% in the uncompressed case and 20% when compression is used.
+ virtual void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) {
+ // time
+ ASSERT_GT(current_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(current_stats.num_input_records, stats.num_input_records);
+ ASSERT_EQ(current_stats.num_input_files, stats.num_input_files);
+ ASSERT_EQ(current_stats.num_input_files_at_output_level,
+ stats.num_input_files_at_output_level);
+
+ ASSERT_EQ(current_stats.num_output_records, stats.num_output_records);
+ ASSERT_EQ(current_stats.num_output_files, stats.num_output_files);
+
+ ASSERT_EQ(current_stats.is_full_compaction, stats.is_full_compaction);
+ ASSERT_EQ(current_stats.is_manual_compaction, stats.is_manual_compaction);
+
+ // file size
+ double kFileSizeBias = compression_enabled_ ? 0.20 : 0.10;
+ ASSERT_GE(current_stats.total_input_bytes * (1.00 + kFileSizeBias),
+ stats.total_input_bytes);
+ ASSERT_LE(current_stats.total_input_bytes,
+ stats.total_input_bytes * (1.00 + kFileSizeBias));
+ ASSERT_GE(current_stats.total_output_bytes * (1.00 + kFileSizeBias),
+ stats.total_output_bytes);
+ ASSERT_LE(current_stats.total_output_bytes,
+ stats.total_output_bytes * (1.00 + kFileSizeBias));
+ ASSERT_EQ(current_stats.total_input_raw_key_bytes,
+ stats.total_input_raw_key_bytes);
+ ASSERT_EQ(current_stats.total_input_raw_value_bytes,
+ stats.total_input_raw_value_bytes);
+
+ ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys);
+
+ ASSERT_EQ(std::string(current_stats.smallest_output_key_prefix),
+ std::string(stats.smallest_output_key_prefix));
+ ASSERT_EQ(std::string(current_stats.largest_output_key_prefix),
+ std::string(stats.largest_output_key_prefix));
+ }
+
+ // Adds expected compaction stats, which will be used to
+ // verify the CompactionJobStats returned by the OnCompactionCompleted()
+ // callback.
+ void AddExpectedStats(const CompactionJobStats& stats) {
+ std::lock_guard<std::mutex> lock(mutex_);
+ expected_stats_.push(stats);
+ }
+
+ void EnableCompression(bool flag) { compression_enabled_ = flag; }
+
+ bool verify_next_comp_io_stats() const { return verify_next_comp_io_stats_; }
+
+ private:
+ std::mutex mutex_;
+ std::queue<CompactionJobStats> expected_stats_;
+ bool compression_enabled_;
+ bool verify_next_comp_io_stats_;
+};
+
+// An EventListener which helps verify the compaction statistics in
+// the test DeletionStatsTest.
+class CompactionJobDeletionStatsChecker : public CompactionJobStatsChecker {
+ public:
+ // Verifies whether two CompactionJobStats match.
+ void Verify(const CompactionJobStats& current_stats,
+ const CompactionJobStats& stats) override {
+ ASSERT_EQ(current_stats.num_input_deletion_records,
+ stats.num_input_deletion_records);
+ ASSERT_EQ(current_stats.num_expired_deletion_records,
+ stats.num_expired_deletion_records);
+ ASSERT_EQ(current_stats.num_records_replaced, stats.num_records_replaced);
+
+ ASSERT_EQ(current_stats.num_corrupt_keys, stats.num_corrupt_keys);
+ }
+};
+
+namespace {
+
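+// Rough estimate of an SST file's size for num_records keys: data blocks
+// (key + compressed value + a fixed per-key overhead), a bloom filter block,
+// an index block, and a fixed-size footer. Used only to derive the expected
+// total input/output bytes, which the checker compares with a tolerance.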
+uint64_t EstimatedFileSize(uint64_t num_records, size_t key_size,
+ size_t value_size, double compression_ratio = 1.0,
+ size_t block_size = 4096,
+ int bloom_bits_per_key = 10) {
+ const size_t kPerKeyOverhead = 8;
+ const size_t kFooterSize = 512;
+
+ uint64_t data_size = static_cast<uint64_t>(
+ num_records *
+ (key_size + value_size * compression_ratio + kPerKeyOverhead));
+
+ return data_size + kFooterSize +
+ num_records * bloom_bits_per_key / 8 // filter block
+ + data_size * (key_size + 8) / block_size; // index block
+}
+
+namespace {
+
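+// Copies the first prefix_length bytes of src (or all of src, if shorter)
+// into *dst.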
+void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
+ assert(prefix_length > 0);
+ size_t length = src.size() > prefix_length ? prefix_length : src.size();
+ dst->assign(src.data(), length);
+}
+
+} // namespace
+
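+// Builds the CompactionJobStats a manual compaction is expected to report,
+// using EstimatedFileSize() to approximate the total input and output bytes.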
+CompactionJobStats NewManualCompactionJobStats(
+ const std::string& smallest_key, const std::string& largest_key,
+ size_t num_input_files, size_t num_input_files_at_output_level,
+ uint64_t num_input_records, size_t key_size, size_t value_size,
+ size_t num_output_files, uint64_t num_output_records,
+ double compression_ratio, uint64_t num_records_replaced,
+ bool is_full = false, bool is_manual = true) {
+ CompactionJobStats stats;
+ stats.Reset();
+
+ stats.num_input_records = num_input_records;
+ stats.num_input_files = num_input_files;
+ stats.num_input_files_at_output_level = num_input_files_at_output_level;
+
+ stats.num_output_records = num_output_records;
+ stats.num_output_files = num_output_files;
+
+ stats.total_input_bytes =
+ EstimatedFileSize(num_input_records / num_input_files, key_size,
+ value_size, compression_ratio) *
+ num_input_files;
+ stats.total_output_bytes =
+ EstimatedFileSize(num_output_records / num_output_files, key_size,
+ value_size, compression_ratio) *
+ num_output_files;
+ stats.total_input_raw_key_bytes = num_input_records * (key_size + 8);
+ stats.total_input_raw_value_bytes = num_input_records * value_size;
+
+ stats.is_full_compaction = is_full;
+ stats.is_manual_compaction = is_manual;
+
+ stats.num_records_replaced = num_records_replaced;
+
+ CopyPrefix(smallest_key, CompactionJobStats::kMaxPrefixLength,
+ &stats.smallest_output_key_prefix);
+ CopyPrefix(largest_key, CompactionJobStats::kMaxPrefixLength,
+ &stats.largest_output_key_prefix);
+
+ return stats;
+}
+
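+// Returns the first compression type supported by this build, or
+// kNoCompression if none of the candidate libraries is available.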
+CompressionType GetAnyCompression() {
+ if (Snappy_Supported()) {
+ return kSnappyCompression;
+ } else if (Zlib_Supported()) {
+ return kZlibCompression;
+ } else if (BZip2_Supported()) {
+ return kBZip2Compression;
+ } else if (LZ4_Supported()) {
+ return kLZ4Compression;
+ } else if (XPRESS_Supported()) {
+ return kXpressCompression;
+ }
+
+ return kNoCompression;
+}
+
+} // namespace
+
+TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
+ Random rnd(301);
+ const int kBufSize = 100;
+ char buf[kBufSize];
+ uint64_t key_base = 100000000l;
+ // Note: key_base must be a multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 100;
+ const int kTestScale = 8;
+ const int kKeySize = 10;
+ const int kValueSize = 1000;
+ const double kCompressionRatio = 0.5;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect. The expected CompactionJobStats is added
+ // via AddExpectedStats().
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ // Just enough settings to hold off auto-compaction.
+ options.level0_file_num_compaction_trigger = kTestScale + 1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+ options.bytes_per_sync = 512 * 1024;
+
+ options.report_bg_io_stats = true;
+ for (int test = 0; test < 2; ++test) {
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // 1st Phase: generate "num_L0_files" L0 files.
+ int num_L0_files = 0;
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1,
+ kKeySize, kValueSize, key_interval,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d", ++num_L0_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+ ASSERT_EQ(std::to_string(num_L0_files), FilesPerLevel(1));
+
+ // 2nd Phase: perform L0 -> L1 compaction.
+ int L0_compaction_count = 6;
+ int count = 1;
+ std::string smallest_key;
+ std::string largest_key;
+ for (uint64_t start_key = key_base;
+ start_key <= key_base * L0_compaction_count;
+ start_key += key_base, count++) {
+ smallest_key = Key(start_key, 10);
+ largest_key = Key(start_key + key_base - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ smallest_key, largest_key, 1, 0, num_keys_per_L0_file, kKeySize,
+ kValueSize, 1, num_keys_per_L0_file, compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files - count, count);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // compact two files into one in the last L0 -> L1 compaction
+ int num_remaining_L0 = num_L0_files - L0_compaction_count;
+ smallest_key = Key(key_base * (L0_compaction_count + 1), 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ smallest_key, largest_key, num_remaining_L0, 0,
+ num_keys_per_L0_file * num_remaining_L0, kKeySize, kValueSize, 1,
+ num_keys_per_L0_file * num_remaining_L0, compression_ratio, 0));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ TEST_Compact(0, 1, smallest_key, largest_key);
+
+ int num_L1_files = num_L0_files - num_remaining_L0 + 1;
+ num_L0_files = 0;
+ snprintf(buf, kBufSize, "%d,%d", num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+
+ // 3rd Phase: generate sparse L0 files (wider key-range, same num of keys)
+ int sparseness = 2;
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base * sparseness) {
+ MakeTableWithKeyValues(
+ &rnd, start_key, start_key + key_base * sparseness - 1, kKeySize,
+ kValueSize, key_base * sparseness / num_keys_per_L0_file,
+ compression_ratio, 1);
+ snprintf(buf, kBufSize, "%d,%d", ++num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // 4th Phase: perform the L0 -> L1 compaction again, expecting higher write
+ // amplification. When subcompactions are enabled, the number of output
+ // files increases by 1 because multiple threads consume the input and
+ // generate output files without coordinating to see whether the output
+ // could fit into a smaller number of files, as it does when the compaction
+ // runs sequentially.
+ int num_output_files = options.max_subcompactions > 1 ? 2 : 1;
+ for (uint64_t start_key = key_base; num_L0_files > 1;
+ start_key += key_base * sparseness) {
+ smallest_key = Key(start_key, 10);
+ largest_key = Key(start_key + key_base * sparseness - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ smallest_key, largest_key, 3, 2, num_keys_per_L0_file * 3, kKeySize,
+ kValueSize, num_output_files,
+ num_keys_per_L0_file * 2, // 1/3 of the data will be updated.
+ compression_ratio, num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+ if (options.max_subcompactions == 1) {
+ --num_L1_files;
+ }
+ snprintf(buf, kBufSize, "%d,%d", --num_L0_files, num_L1_files);
+ ASSERT_EQ(std::string(buf), FilesPerLevel(1));
+ }
+
+ // 5th Phase: Do a full compaction, which involves two sub-compactions.
+ // Here we expect to have 1 L0 file and 4 L1 files.
+ // In the first sub-compaction, we expect an L0 compaction.
+ smallest_key = Key(key_base, 10);
+ largest_key = Key(key_base * (kTestScale + 1) - key_interval, 10);
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ Key(key_base * (kTestScale + 1 - sparseness), 10), largest_key, 2, 1,
+ num_keys_per_L0_file * 3, kKeySize, kValueSize, 1,
+ num_keys_per_L0_file * 2, compression_ratio, num_keys_per_L0_file));
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 1U);
+ Compact(1, smallest_key, largest_key);
+
+ num_L1_files = options.max_subcompactions > 1 ? 7 : 4;
+ char L1_buf[4];
+ snprintf(L1_buf, sizeof(L1_buf), "0,%d", num_L1_files);
+ std::string L1_files(L1_buf);
+ ASSERT_EQ(L1_files, FilesPerLevel(1));
+ options.compression = GetAnyCompression();
+ if (options.compression == kNoCompression) {
+ break;
+ }
+ stats_checker->EnableCompression(true);
+ compression_ratio = kCompressionRatio;
+
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(1, Slice(Key(key_base + i, 10)),
+ Slice(RandomString(&rnd, 512 * 1024, 1))));
+ }
+
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+
+ stats_checker->set_verify_next_comp_io_stats(true);
+ std::atomic<bool> first_prepare_write(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void* /*arg*/) {
+ if (first_prepare_write.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_prepare_write.store(false);
+ }
+ });
+
+ std::atomic<bool> first_flush(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Flush:BeforeAppend", [&](void* /*arg*/) {
+ if (first_flush.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_flush.store(false);
+ }
+ });
+
+ std::atomic<bool> first_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::SyncInternal:0", [&](void* /*arg*/) {
+ if (first_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_sync.store(false);
+ }
+ });
+
+ std::atomic<bool> first_range_sync(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) {
+ if (first_range_sync.load()) {
+ options.env->SleepForMicroseconds(3);
+ first_range_sync.store(false);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Compact(1, smallest_key, largest_key);
+
+ ASSERT_TRUE(!stats_checker->verify_next_comp_io_stats());
+ ASSERT_TRUE(!first_prepare_write.load());
+ ASSERT_TRUE(!first_flush.load());
+ ASSERT_TRUE(!first_sync.load());
+ ASSERT_TRUE(!first_range_sync.load());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000l;
+ // Note: key_base must be a multiple of num_keys_per_L0_file
+ int num_keys_per_L0_file = 20;
+ const int kTestScale = 8; // make sure this is even
+ const int kKeySize = 10;
+ const int kValueSize = 100;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_L0_file;
+ uint64_t largest_key_num = key_base * (kTestScale + 1) - key_interval;
+ uint64_t cutoff_key_num = key_base * (kTestScale / 2 + 1) - key_interval;
+ const std::string smallest_key = Key(key_base - 10, kKeySize);
+ const std::string largest_key = Key(largest_key_num + 10, kKeySize);
+
+ // Whenever a compaction completes, this listener will try to
+ // verify whether the returned CompactionJobStats matches
+ // what we expect.
+ auto* stats_checker = new CompactionJobDeletionStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = kTestScale + 1;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Stage 1: Generate several L0 files and then send them to L2 by
+ // using CompactRangeOptions and CompactRange(). These files will
+ // have a strict subset of the keys from the full key range.
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale / 2;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+ kValueSize, key_interval, compression_ratio, 1);
+ }
+
+ CompactRangeOptions cr_options;
+ cr_options.change_level = true;
+ cr_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(cr_options, handles_[1], nullptr, nullptr));
+ ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+ // Stage 2: Generate files including keys from the entire key range
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+ kValueSize, key_interval, compression_ratio, 1);
+ }
+
+ // Send these L0 files to L1
+ TEST_Compact(0, 1, smallest_key, largest_key);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+ // Add a new record and flush so that there is now an L0 file with a
+ // value too (not just the deletions from the next step).
+ ASSERT_OK(Put(1, Key(key_base - 6, kKeySize), "test"));
+ ASSERT_OK(Flush(1));
+
+ // Stage 3: Generate L0 files with some deletions so now
+ // there are files with the same key range in L0, L1, and L2
+ int deletion_interval = 3;
+ CompactionJobStats first_compaction_stats;
+ SelectivelyDeleteKeys(key_base, largest_key_num, key_interval,
+ deletion_interval, kKeySize, cutoff_key_num,
+ &first_compaction_stats, 1);
+
+ stats_checker->AddExpectedStats(first_compaction_stats);
+
+ // Stage 4: Trigger compaction and verify the stats
+ TEST_Compact(0, 1, smallest_key, largest_key);
+}
+
+namespace {
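+// Computes, for this test's settings, the number of newly flushed files that
+// the size-ratio-based universal compaction is expected to merge after
+// num_flushes flushes; 0 means no multi-file compaction is expected. The loop
+// effectively returns the lowest power-of-two bit set in num_flushes,
+// treating a result of 1 as "no compaction".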
+int GetUniversalCompactionInputUnits(uint32_t num_flushes) {
+ uint32_t compaction_input_units;
+ for (compaction_input_units = 1; num_flushes >= compaction_input_units;
+ compaction_input_units *= 2) {
+ if ((num_flushes & compaction_input_units) != 0) {
+ return compaction_input_units > 1 ? compaction_input_units : 0;
+ }
+ }
+ return 0;
+}
+} // namespace
+
+TEST_P(CompactionJobStatsTest, UniversalCompactionTest) {
+ Random rnd(301);
+ uint64_t key_base = 100000000l;
+ // Note: key_base must be a multiple of num_keys_per_table
+ int num_keys_per_table = 100;
+ const uint32_t kTestScale = 6;
+ const int kKeySize = 10;
+ const int kValueSize = 900;
+ double compression_ratio = 1.0;
+ uint64_t key_interval = key_base / num_keys_per_table;
+
+ auto* stats_checker = new CompactionJobStatsChecker();
+ Options options;
+ options.listeners.emplace_back(stats_checker);
+ options.create_if_missing = true;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = num_keys_per_table * 1000;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 1;
+ options.compaction_options_universal.max_size_amplification_percent = 1000;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Generates the expected CompactionJobStats for each compaction
+ for (uint32_t num_flushes = 2; num_flushes <= kTestScale; num_flushes++) {
+ // Here we treat one newly flushed file as a unit.
+ //
+ // For example, if a newly flushed file is 100k, and a compaction has
+ // 4 input units, then this compaction inputs 400k.
+ uint32_t num_input_units = GetUniversalCompactionInputUnits(num_flushes);
+ if (num_input_units == 0) {
+ continue;
+ }
+ // A full compaction only happens when the number of flushes equals
+ // the number of compaction input runs.
+ bool is_full = num_flushes == num_input_units;
+ // The following statement determines the expected smallest key
+ // based on whether it is a full compaction.
+ uint64_t smallest_key = is_full ? key_base : key_base * (num_flushes - 1);
+
+ stats_checker->AddExpectedStats(NewManualCompactionJobStats(
+ Key(smallest_key, 10),
+ Key(smallest_key + key_base * num_input_units - key_interval, 10),
+ num_input_units, num_input_units > 2 ? num_input_units / 2 : 0,
+ num_keys_per_table * num_input_units, kKeySize, kValueSize,
+ num_input_units, num_keys_per_table * num_input_units, 1.0, 0, is_full,
+ false));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 3U);
+
+ for (uint64_t start_key = key_base; start_key <= key_base * kTestScale;
+ start_key += key_base) {
+ MakeTableWithKeyValues(&rnd, start_key, start_key + key_base - 1, kKeySize,
+ kValueSize, key_interval, compression_ratio, 1);
+ ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(stats_checker->NumberOfUnverifiedStats(), 0U);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobStatsTest, CompactionJobStatsTest,
+ ::testing::Values(1, 4));
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
+
+#else
+
+int main(int /*argc*/, char** /*argv*/) { return 0; }
+#endif // !defined(IOS_CROSS_COMPILE)
diff --git a/src/rocksdb/db/compaction/compaction_job_test.cc b/src/rocksdb/db/compaction/compaction_job_test.cc
new file mode 100644
index 000000000..c87871100
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_job_test.cc
@@ -0,0 +1,2451 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_job.h"
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <map>
+#include <string>
+#include <tuple>
+
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/version_set.h"
+#include "file/random_access_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/options_helper.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+void VerifyInitializationOfCompactionJobStats(
+ const CompactionJobStats& compaction_job_stats) {
+#if !defined(IOS_CROSS_COMPILE)
+ ASSERT_EQ(compaction_job_stats.elapsed_micros, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files, 0U);
+ ASSERT_EQ(compaction_job_stats.num_input_files_at_output_level, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_output_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_output_files, 0U);
+
+ ASSERT_EQ(compaction_job_stats.is_manual_compaction, true);
+
+ ASSERT_EQ(compaction_job_stats.total_input_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_output_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.total_input_raw_key_bytes, 0U);
+ ASSERT_EQ(compaction_job_stats.total_input_raw_value_bytes, 0U);
+
+ ASSERT_EQ(compaction_job_stats.smallest_output_key_prefix[0], 0);
+ ASSERT_EQ(compaction_job_stats.largest_output_key_prefix[0], 0);
+
+ ASSERT_EQ(compaction_job_stats.num_records_replaced, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_input_deletion_records, 0U);
+ ASSERT_EQ(compaction_job_stats.num_expired_deletion_records, 0U);
+
+ ASSERT_EQ(compaction_job_stats.num_corrupt_keys, 0U);
+#endif // !defined(IOS_CROSS_COMPILE)
+}
+
+// Mock FSWritableFile for testing io priority.
+// Only override the essential functions for testing compaction io priority.
+class MockTestWritableFile : public FSWritableFileOwnerWrapper {
+ public:
+ MockTestWritableFile(std::unique_ptr<FSWritableFile>&& file,
+ Env::IOPriority io_priority)
+ : FSWritableFileOwnerWrapper(std::move(file)),
+ write_io_priority_(io_priority) {}
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Append(data, options, dbg);
+ }
+ IOStatus Append(const Slice& data, const IOOptions& options,
+ const DataVerificationInfo& verification_info,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Append(data, options, verification_info, dbg);
+ }
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Close(options, dbg);
+ }
+ IOStatus Flush(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Flush(options, dbg);
+ }
+ IOStatus Sync(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Sync(options, dbg);
+ }
+ IOStatus Fsync(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Fsync(options, dbg);
+ }
+ uint64_t GetFileSize(const IOOptions& options, IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->GetFileSize(options, dbg);
+ }
+ IOStatus RangeSync(uint64_t offset, uint64_t nbytes, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->RangeSync(offset, nbytes, options, dbg);
+ }
+
+ void PrepareWrite(size_t offset, size_t len, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ target()->PrepareWrite(offset, len, options, dbg);
+ }
+
+ IOStatus Allocate(uint64_t offset, uint64_t len, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, write_io_priority_);
+ return target()->Allocate(offset, len, options, dbg);
+ }
+
+ private:
+ Env::IOPriority write_io_priority_;
+};
+
+// Mock FSRandomAccessFile for testing io priority.
+// Only override the essential functions for testing compaction io priority.
+class MockTestRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+ MockTestRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& file,
+ Env::IOPriority io_priority)
+ : FSRandomAccessFileOwnerWrapper(std::move(file)),
+ read_io_priority_(io_priority) {}
+
+ IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override {
+ EXPECT_EQ(options.rate_limiter_priority, read_io_priority_);
+ return target()->Read(offset, n, options, result, scratch, dbg);
+ }
+ IOStatus Prefetch(uint64_t offset, size_t n, const IOOptions& options,
+ IODebugContext* dbg) override {
+ EXPECT_EQ(options.rate_limiter_priority, read_io_priority_);
+ return target()->Prefetch(offset, n, options, dbg);
+ }
+
+ private:
+ Env::IOPriority read_io_priority_;
+};
+
+// Mock FileSystem for testing io priority.
+class MockTestFileSystem : public FileSystemWrapper {
+ public:
+ explicit MockTestFileSystem(const std::shared_ptr<FileSystem>& base,
+ Env::IOPriority read_io_priority,
+ Env::IOPriority write_io_priority)
+ : FileSystemWrapper(base),
+ read_io_priority_(read_io_priority),
+ write_io_priority_(write_io_priority) {}
+
+ static const char* kClassName() { return "MockTestFileSystem"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) override {
+ IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg);
+ EXPECT_OK(s);
+ result->reset(
+ new MockTestRandomAccessFile(std::move(*result), read_io_priority_));
+ return s;
+ }
+ IOStatus NewWritableFile(const std::string& fname,
+ const FileOptions& file_opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override {
+ IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg);
+ EXPECT_OK(s);
+ result->reset(
+ new MockTestWritableFile(std::move(*result), write_io_priority_));
+ return s;
+ }
+
+ private:
+ Env::IOPriority read_io_priority_;
+ Env::IOPriority write_io_priority_;
+};
+
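+// Selects which table implementation the tests write: in-memory mock tables
+// or real block-based tables created on disk.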
+enum TableTypeForTest : uint8_t { kMockTable = 0, kBlockBasedTable = 1 };
+
+} // namespace
+
+class CompactionJobTestBase : public testing::Test {
+ protected:
+ CompactionJobTestBase(std::string dbname, const Comparator* ucmp,
+ std::function<std::string(uint64_t)> encode_u64_ts,
+ bool test_io_priority, TableTypeForTest table_type)
+ : dbname_(std::move(dbname)),
+ ucmp_(ucmp),
+ db_options_(),
+ mutable_cf_options_(cf_options_),
+ mutable_db_options_(),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ versions_(new VersionSet(
+ dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr,
+ /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ "")),
+ shutting_down_(false),
+ mock_table_factory_(new mock::MockTableFactory()),
+ error_handler_(nullptr, db_options_, &mutex_),
+ encode_u64_ts_(std::move(encode_u64_ts)),
+ test_io_priority_(test_io_priority),
+ table_type_(table_type) {
+ Env* base_env = Env::Default();
+ EXPECT_OK(
+ test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_));
+ env_ = base_env;
+ fs_ = env_->GetFileSystem();
+ // Set defaults for the tests.
+ mutable_cf_options_.target_file_size_base = 1024 * 1024;
+ mutable_cf_options_.max_compaction_bytes = 10 * 1024 * 1024;
+ }
+
+ void SetUp() override {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ cf_options_.comparator = ucmp_;
+ if (table_type_ == TableTypeForTest::kBlockBasedTable) {
+ BlockBasedTableOptions table_options;
+ cf_options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ } else if (table_type_ == TableTypeForTest::kMockTable) {
+ cf_options_.table_factory = mock_table_factory_;
+ } else {
+ assert(false);
+ }
+ }
+
+ std::string GenerateFileName(uint64_t file_number) {
+ FileMetaData meta;
+ std::vector<DbPath> db_paths;
+ db_paths.emplace_back(dbname_, std::numeric_limits<uint64_t>::max());
+ meta.fd = FileDescriptor(file_number, 0, 0);
+ return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId());
+ }
+
+ std::string KeyStr(const std::string& user_key, const SequenceNumber seq_num,
+ const ValueType t, uint64_t ts = 0) {
+ std::string user_key_with_ts = user_key + encode_u64_ts_(ts);
+ return InternalKey(user_key_with_ts, seq_num, t).Encode().ToString();
+ }
+
+ static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size) {
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrTTL(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size, uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+ size, kNoCompression);
+ return blob_index;
+ }
+
+ static std::string BlobStrInlinedTTL(const Slice& value,
+ uint64_t expiration) {
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+ return blob_index;
+ }
+
+ // Creates a table with the specified key-value pairs.
+ void CreateTable(const std::string& table_name,
+ const mock::KVVector& contents, uint64_t& file_size) {
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(fs_, table_name, FileOptions(),
+ &file_writer, nullptr);
+ ASSERT_OK(s);
+ std::unique_ptr<TableBuilder> table_builder(
+ cf_options_.table_factory->NewTableBuilder(
+ TableBuilderOptions(*cfd_->ioptions(), mutable_cf_options_,
+ cfd_->internal_comparator(),
+ cfd_->int_tbl_prop_collector_factories(),
+ CompressionType::kNoCompression,
+ CompressionOptions(), 0 /* column_family_id */,
+ kDefaultColumnFamilyName, -1 /* level */),
+ file_writer.get()));
+ // Build table.
+ for (auto kv : contents) {
+ std::string key;
+ std::string value;
+ std::tie(key, value) = kv;
+ table_builder->Add(key, value);
+ }
+ ASSERT_OK(table_builder->Finish());
+ file_size = table_builder->FileSize();
+ }
+
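+ // Adds a file with the given contents at `level`: derives its key range,
+ // sequence-number bounds, and oldest blob file number from the contents,
+ // writes either a mock table or a block-based table depending on
+ // table_type_, and registers the file with the VersionSet.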
+ void AddMockFile(const mock::KVVector& contents, int level = 0) {
+ assert(contents.size() > 0);
+
+ bool first_key = true;
+ std::string smallest, largest;
+ InternalKey smallest_key, largest_key;
+ SequenceNumber smallest_seqno = kMaxSequenceNumber;
+ SequenceNumber largest_seqno = 0;
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+ for (auto kv : contents) {
+ ParsedInternalKey key;
+ std::string skey;
+ std::string value;
+ std::tie(skey, value) = kv;
+ const Status pik_status =
+ ParseInternalKey(skey, &key, true /* log_err_key */);
+
+ smallest_seqno = std::min(smallest_seqno, key.sequence);
+ largest_seqno = std::max(largest_seqno, key.sequence);
+
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, smallest) < 0) {
+ smallest.assign(key.user_key.data(), key.user_key.size());
+ smallest_key.DecodeFrom(skey);
+ }
+ if (first_key ||
+ cfd_->user_comparator()->Compare(key.user_key, largest) > 0) {
+ largest.assign(key.user_key.data(), key.user_key.size());
+ largest_key.DecodeFrom(skey);
+ }
+
+ first_key = false;
+
+ if (pik_status.ok() && key.type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ continue;
+ }
+
+ if (blob_index.IsInlined() || blob_index.HasTTL() ||
+ blob_index.file_number() == kInvalidBlobFileNumber) {
+ continue;
+ }
+
+ if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+ oldest_blob_file_number > blob_index.file_number()) {
+ oldest_blob_file_number = blob_index.file_number();
+ }
+ }
+ }
+
+ uint64_t file_number = versions_->NewFileNumber();
+
+ uint64_t file_size = 0;
+ if (table_type_ == TableTypeForTest::kBlockBasedTable) {
+ CreateTable(GenerateFileName(file_number), contents, file_size);
+ } else if (table_type_ == TableTypeForTest::kMockTable) {
+ file_size = 10;
+ EXPECT_OK(mock_table_factory_->CreateMockTable(
+ env_, GenerateFileName(file_number), std::move(contents)));
+ } else {
+ assert(false);
+ }
+
+ VersionEdit edit;
+ edit.AddFile(level, file_number, 0, file_size, smallest_key, largest_key,
+ smallest_seqno, largest_seqno, false, Temperature::kUnknown,
+ oldest_blob_file_number, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ mutex_.Lock();
+ EXPECT_OK(
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, &edit, &mutex_, nullptr));
+ mutex_.Unlock();
+ }
+
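+ // Verifies the number of output files at `output_level` against
+ // `expected_results`; for block-based tables it also re-reads each output
+ // file and compares its contents key by key, while for mock tables it
+ // checks the latest files and the first output file's oldest blob file
+ // number.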
+ void VerifyTables(int output_level,
+ const std::vector<mock::KVVector>& expected_results,
+ std::vector<uint64_t> expected_oldest_blob_file_numbers) {
+ if (expected_results.empty()) {
+ ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
+ return;
+ }
+ int expected_output_file_num = 0;
+ for (const auto& e : expected_results) {
+ if (!e.empty()) {
+ ++expected_output_file_num;
+ }
+ }
+ ASSERT_EQ(expected_output_file_num, compaction_job_stats_.num_output_files);
+ if (expected_output_file_num == 0) {
+ return;
+ }
+
+ if (expected_oldest_blob_file_numbers.empty()) {
+ expected_oldest_blob_file_numbers.resize(expected_output_file_num,
+ kInvalidBlobFileNumber);
+ }
+
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ if (table_type_ == TableTypeForTest::kMockTable) {
+ ASSERT_EQ(compaction_job_stats_.num_output_files,
+ expected_results.size());
+ mock_table_factory_->AssertLatestFiles(expected_results);
+ } else {
+ assert(table_type_ == TableTypeForTest::kBlockBasedTable);
+ }
+
+ auto output_files =
+ cfd->current()->storage_info()->LevelFiles(output_level);
+ ASSERT_EQ(expected_output_file_num, output_files.size());
+
+ if (table_type_ == TableTypeForTest::kMockTable) {
+ assert(output_files.size() ==
+ static_cast<size_t>(expected_output_file_num));
+ const FileMetaData* const output_file = output_files[0];
+ ASSERT_EQ(output_file->oldest_blob_file_number,
+ expected_oldest_blob_file_numbers[0]);
+ return;
+ }
+
+ for (size_t i = 0; i < expected_results.size(); ++i) {
+ const FileMetaData* const output_file = output_files[i];
+ std::string file_name = GenerateFileName(output_file->fd.GetNumber());
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<RandomAccessFileReader> freader;
+ IOStatus ios = RandomAccessFileReader::Create(
+ fs, file_name, FileOptions(), &freader, nullptr);
+ ASSERT_OK(ios);
+ std::unique_ptr<TableReader> table_reader;
+ uint64_t file_size = output_file->fd.GetFileSize();
+ ReadOptions read_opts;
+ Status s = cf_options_.table_factory->NewTableReader(
+ read_opts,
+ TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(),
+ cfd_->internal_comparator()),
+ std::move(freader), file_size, &table_reader, false);
+ ASSERT_OK(s);
+ assert(table_reader);
+ std::unique_ptr<InternalIterator> iiter(
+ table_reader->NewIterator(read_opts, nullptr, nullptr, true,
+ TableReaderCaller::kUncategorized));
+ assert(iiter);
+
+ mock::KVVector from_db;
+ for (iiter->SeekToFirst(); iiter->Valid(); iiter->Next()) {
+ const Slice key = iiter->key();
+ const Slice value = iiter->value();
+ from_db.emplace_back(
+ make_pair(key.ToString(false), value.ToString(false)));
+ }
+ ASSERT_EQ(expected_results[i], from_db);
+ }
+ }
+
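+ // Advances the VersionSet's allocated, published, and last sequence
+ // numbers past `sequence_number`.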
+ void SetLastSequence(const SequenceNumber sequence_number) {
+ versions_->SetLastAllocatedSequence(sequence_number + 1);
+ versions_->SetLastPublishedSequence(sequence_number + 1);
+ versions_->SetLastSequence(sequence_number + 1);
+ }
+
+ // Returns the expected result after compaction.
+ mock::KVVector CreateTwoFiles(bool gen_corrupted_keys) {
+ stl_wrappers::KVMap expected_results;
+ constexpr int kKeysPerFile = 10000;
+ constexpr int kCorruptKeysPerFile = 200;
+ constexpr int kMatchingKeys = kKeysPerFile / 2;
+ SequenceNumber sequence_number = 0;
+
+ auto corrupt_id = [&](int id) {
+ return gen_corrupted_keys && id > 0 && id <= kCorruptKeysPerFile;
+ };
+
+ for (int i = 0; i < 2; ++i) {
+ auto contents = mock::MakeMockFile();
+ for (int k = 0; k < kKeysPerFile; ++k) {
+ auto key = std::to_string(i * kMatchingKeys + k);
+ auto value = std::to_string(i * kKeysPerFile + k);
+ InternalKey internal_key(key, ++sequence_number, kTypeValue);
+
+ // This is how the key will look once it's written to the bottommost
+ // file.
+ InternalKey bottommost_internal_key(key, 0, kTypeValue);
+
+ if (corrupt_id(k)) {
+ test::CorruptKeyType(&internal_key);
+ test::CorruptKeyType(&bottommost_internal_key);
+ }
+ contents.push_back({internal_key.Encode().ToString(), value});
+ if (i == 1 || k < kMatchingKeys || corrupt_id(k - kMatchingKeys)) {
+ expected_results.insert(
+ {bottommost_internal_key.Encode().ToString(), value});
+ }
+ }
+ mock::SortKVVector(&contents, ucmp_);
+
+ AddMockFile(contents);
+ }
+
+ SetLastSequence(sequence_number);
+
+ mock::KVVector expected_results_kvvector;
+ for (auto& kv : expected_results) {
+ expected_results_kvvector.push_back({kv.first, kv.second});
+ }
+
+ return expected_results_kvvector;
+ }
+
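+ // Destroys any existing DB under dbname_, writes a fresh MANIFEST and
+ // CURRENT file, and recovers a new VersionSet containing only the default
+ // column family.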
+ void NewDB() {
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+
+ std::shared_ptr<Logger> info_log;
+ DBOptions db_opts = BuildDBOptions(db_options_, mutable_db_options_);
+ Status s = CreateLoggerFromOptions(dbname_, db_opts, &info_log);
+ ASSERT_OK(s);
+ db_options_.info_log = info_log;
+
+ versions_.reset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ compaction_job_stats_.Reset();
+ ASSERT_OK(SetIdentityFile(env_, dbname_));
+
+ VersionEdit new_db;
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ std::unique_ptr<WritableFileWriter> file_writer;
+ const auto& fs = env_->GetFileSystem();
+ s = WritableFileWriter::Create(fs, manifest,
+ fs->OptimizeForManifestWrite(env_options_),
+ &file_writer, nullptr);
+
+ ASSERT_OK(s);
+ {
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ }
+ ASSERT_OK(s);
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+
+ ASSERT_OK(s);
+
+ cf_options_.merge_operator = merge_op_;
+ cf_options_.compaction_filter = compaction_filter_.get();
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+
+ ASSERT_OK(versions_->Recover(column_families, false));
+ cfd_ = versions_->GetColumnFamilySet()->GetDefault();
+ }
+
+ // input_files[i] on input_levels[i]
+ void RunLastLevelCompaction(
+ const std::vector<std::vector<FileMetaData*>>& input_files,
+ const std::vector<int> input_levels,
+ std::function<void(Compaction& comp)>&& verify_func,
+ const std::vector<SequenceNumber>& snapshots = {}) {
+ const int kLastLevel = cf_options_.num_levels - 1;
+ verify_per_key_placement_ = std::move(verify_func);
+ mock::KVVector empty_map;
+ RunCompaction(input_files, input_levels, {empty_map}, snapshots,
+ kMaxSequenceNumber, kLastLevel, false);
+ }
+
+ // input_files[i] on input_levels[i]
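+ // Builds a Compaction over the given input files, runs a CompactionJob on
+ // it, installs the result, and optionally verifies the produced tables,
+ // the expected oldest blob file numbers, the rate limiter priority, and
+ // per-key placement.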
+ void RunCompaction(
+ const std::vector<std::vector<FileMetaData*>>& input_files,
+ const std::vector<int>& input_levels,
+ const std::vector<mock::KVVector>& expected_results,
+ const std::vector<SequenceNumber>& snapshots = {},
+ SequenceNumber earliest_write_conflict_snapshot = kMaxSequenceNumber,
+ int output_level = 1, bool verify = true,
+ std::vector<uint64_t> expected_oldest_blob_file_numbers = {},
+ bool check_get_priority = false,
+ Env::IOPriority read_io_priority = Env::IO_TOTAL,
+ Env::IOPriority write_io_priority = Env::IO_TOTAL,
+ int max_subcompactions = 0) {
+ // For compaction, set fs as MockTestFileSystem to check the io_priority.
+ if (test_io_priority_) {
+ db_options_.fs.reset(
+ new MockTestFileSystem(fs_, read_io_priority, write_io_priority));
+ }
+
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+
+ size_t num_input_files = 0;
+ std::vector<CompactionInputFiles> compaction_input_files;
+ for (size_t i = 0; i < input_files.size(); ++i) {
+ auto level_files = input_files[i];
+ CompactionInputFiles compaction_level;
+ compaction_level.level = input_levels[i];
+ compaction_level.files.insert(compaction_level.files.end(),
+ level_files.begin(), level_files.end());
+ compaction_input_files.push_back(compaction_level);
+ num_input_files += level_files.size();
+ }
+
+ std::vector<FileMetaData*> grandparents;
+ // it should actually be the next non-empty level
+ const int kGrandparentsLevel = output_level + 1;
+ if (kGrandparentsLevel < cf_options_.num_levels) {
+ grandparents =
+ cfd_->current()->storage_info()->LevelFiles(kGrandparentsLevel);
+ }
+
+ Compaction compaction(
+ cfd->current()->storage_info(), *cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions(), mutable_db_options_,
+ compaction_input_files, output_level,
+ mutable_cf_options_.target_file_size_base,
+ mutable_cf_options_.max_compaction_bytes, 0, kNoCompression,
+ cfd->GetLatestMutableCFOptions()->compression_opts,
+ Temperature::kUnknown, max_subcompactions, grandparents, true);
+ compaction.SetInputVersion(cfd->current());
+
+ assert(db_options_.info_log);
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
+ mutex_.Lock();
+ EventLogger event_logger(db_options_.info_log.get());
+ // TODO(yiwu) add a mock snapshot checker and add test for it.
+ SnapshotChecker* snapshot_checker = nullptr;
+ ASSERT_TRUE(full_history_ts_low_.empty() ||
+ ucmp_->timestamp_size() == full_history_ts_low_.size());
+ const std::atomic<bool> kManualCompactionCanceledFalse{false};
+ CompactionJob compaction_job(
+ 0, &compaction, db_options_, mutable_db_options_, env_options_,
+ versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
+ nullptr, nullptr, &mutex_, &error_handler_, snapshots,
+ earliest_write_conflict_snapshot, snapshot_checker, nullptr,
+ table_cache_, &event_logger, false, false, dbname_,
+ &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */,
+ /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
+ env_->GenerateUniqueId(), DBImpl::GenerateDbSessionId(nullptr),
+ full_history_ts_low_);
+ VerifyInitializationOfCompactionJobStats(compaction_job_stats_);
+
+ compaction_job.Prepare();
+ mutex_.Unlock();
+ Status s = compaction_job.Run();
+ ASSERT_OK(s);
+ ASSERT_OK(compaction_job.io_status());
+ mutex_.Lock();
+ ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions()));
+ ASSERT_OK(compaction_job.io_status());
+ mutex_.Unlock();
+ log_buffer.FlushBufferToLog();
+
+ if (verify) {
+ ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+ ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+
+ VerifyTables(output_level, expected_results,
+ expected_oldest_blob_file_numbers);
+ }
+
+ if (check_get_priority) {
+ CheckGetRateLimiterPriority(compaction_job);
+ }
+
+ if (verify_per_key_placement_) {
+ // Verify per_key_placement compaction
+ assert(compaction.SupportsPerKeyPlacement());
+ verify_per_key_placement_(compaction);
+ }
+ }
+
+ void CheckGetRateLimiterPriority(CompactionJob& compaction_job) {
+ // When the state from WriteController is normal.
+ ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_LOW);
+
+ WriteController* write_controller =
+ compaction_job.versions_->GetColumnFamilySet()->write_controller();
+
+ {
+ // When the state from WriteController is Delayed.
+ std::unique_ptr<WriteControllerToken> delay_token =
+ write_controller->GetDelayToken(1000000);
+ ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER);
+ }
+
+ {
+ // When the state from WriteController is Stopped.
+ std::unique_ptr<WriteControllerToken> stop_token =
+ write_controller->GetStopToken();
+ ASSERT_EQ(compaction_job.GetRateLimiterPriority(), Env::IO_USER);
+ }
+ }
+
+ std::shared_ptr<Env> env_guard_;
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string dbname_;
+ const Comparator* const ucmp_;
+ EnvOptions env_options_;
+ ImmutableDBOptions db_options_;
+ ColumnFamilyOptions cf_options_;
+ MutableCFOptions mutable_cf_options_;
+ MutableDBOptions mutable_db_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ std::unique_ptr<VersionSet> versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+ CompactionJobStats compaction_job_stats_;
+ ColumnFamilyData* cfd_;
+ std::unique_ptr<CompactionFilter> compaction_filter_;
+ std::shared_ptr<MergeOperator> merge_op_;
+ ErrorHandler error_handler_;
+ std::string full_history_ts_low_;
+ const std::function<std::string(uint64_t)> encode_u64_ts_;
+ const bool test_io_priority_;
+ std::function<void(Compaction& comp)> verify_per_key_placement_;
+ const TableTypeForTest table_type_ = kMockTable;
+};
+
+// TODO(icanadi) Make it simpler once we mock out VersionSet
+class CompactionJobTest : public CompactionJobTestBase {
+ public:
+ CompactionJobTest()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_test"), BytewiseComparator(),
+ [](uint64_t /*ts*/) { return ""; }, /*test_io_priority=*/false,
+ TableTypeForTest::kMockTable) {}
+};
+
+TEST_F(CompactionJobTest, Simple) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, DISABLED_SimpleCorrupted) {
+ NewDB();
+
+ auto expected_results = CreateTwoFiles(true);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+ ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U);
+}
+
+TEST_F(CompactionJobTest, SimpleDeletion) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("c", 4U, kTypeDeletion), ""},
+ {KeyStr("c", 3U, kTypeValue), "val"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("b", 2U, kTypeValue), "val"},
+ {KeyStr("b", 1U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("b", 0U, kTypeValue), "val"}});
+
+ SetLastSequence(4U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, OutputNothing) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"}});
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 2U, kTypeDeletion), ""}});
+
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile();
+
+ SetLastSequence(4U);
+
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleOverwrite) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 3U, kTypeValue), "val2"},
+ {KeyStr("b", 4U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "val2"},
+ {KeyStr("b", 0U, kTypeValue), "val3"}});
+
+ SetLastSequence(4U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleNonLastLevel) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ // Because level 1 is not the last level, the sequence numbers of a and b
+ // cannot be set to 0
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("b", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files =
+ cfd_->current()->storage_info()->LevelFiles(input_levels[0]);
+ auto lvl1_files =
+ cfd_->current()->storage_info()->LevelFiles(input_levels[1]);
+ RunCompaction({lvl0_files, lvl1_files}, input_levels, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SimpleMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeValue), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeValue), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, NonAssocMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeMerge), "5"},
+ {KeyStr("a", 4U, kTypeMerge), "4"},
+ {KeyStr("a", 3U, kTypeMerge), "3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 2U, kTypeMerge), "2"}, {KeyStr("b", 1U, kTypeMerge), "1"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), "3,4,5"},
+ {KeyStr("b", 0U, kTypeValue), "1,2"}});
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+// Filters merge operands with value 10.
+TEST_F(CompactionJobTest, MergeOperandFilter) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)} // Filtered
+ });
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)},
+ {KeyStr("b", 0U, kTypeValue), test::EncodeInt(2U)}});
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, FilterSomeMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+ {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
+ {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)},
+ {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)},
+ {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}});
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)},
+ {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}
+ // b does not appear because the operands are filtered
+ });
+
+ SetLastSequence(5U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+// Test where all operands/merge results are filtered out.
+TEST_F(CompactionJobTest, FilterAllMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ compaction_filter_.reset(new test::FilterNumber(10U));
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file2);
+
+ auto file3 =
+ mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)},
+ {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}});
+ AddMockFile(file3, 2);
+
+ SetLastSequence(11U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ mock::KVVector empty_map;
+ RunCompaction({files}, {input_level}, {empty_map});
+}
+
+TEST_F(CompactionJobTest, SimpleSingleDelete) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeDeletion), ""},
+ {KeyStr("b", 6U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 3U, kTypeValue), "val"},
+ {KeyStr("b", 4U, kTypeValue), "val"}});
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("a", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeDeletion), ""}});
+
+ SetLastSequence(6U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, SingleDeleteSnapshots) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("d", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 9U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("l", 2U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("0", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), "val1"},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 21U, kTypeValue), "val3"},
+ {KeyStr("d", 8U, kTypeValue), "val4"},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("h", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("A", 1U, kTypeValue), "val"},
+ {KeyStr("e", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 11U, kTypeValue), ""},
+ {KeyStr("b", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("b", 11U, kTypeValue), "val2"},
+ {KeyStr("c", 22U, kTypeSingleDeletion), ""},
+ {KeyStr("c", 21U, kTypeValue), ""},
+ {KeyStr("e", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 21U, kTypeSingleDeletion), ""},
+ {KeyStr("f", 1U, kTypeValue), "val1"},
+ {KeyStr("g", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("j", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("k", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 12U, kTypeValue), "val1"},
+ {KeyStr("m", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("m", 8U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U}, 10U);
+}
+
+TEST_F(CompactionJobTest, EarliestWriteConflictSnapshot) {
+ NewDB();
+
+ // Test multiple snapshots where the earliest snapshot is not a
+ // write-conflict snapshot.
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), "val"},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), "val"},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), "val"},
+ {KeyStr("G", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 23U, kTypeValue), "val2"},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("H", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 23U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), "val2"},
+ {KeyStr("I", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 32U, kTypeValue), "val3"},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 13U, kTypeValue), "val"},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val3"},
+ {KeyStr("H", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 13U, kTypeValue), "val2"},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val5"},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("A", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 23U, kTypeValue), ""},
+ {KeyStr("B", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 23U, kTypeValue), ""},
+ {KeyStr("D", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 32U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 31U, kTypeValue), ""},
+ {KeyStr("H", 31U, kTypeValue), "val"},
+ {KeyStr("I", 35U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 34U, kTypeValue), ""},
+ {KeyStr("I", 31U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val4"},
+ {KeyStr("J", 34U, kTypeValue), "val"},
+ {KeyStr("J", 33U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 25U, kTypeValue), "val2"},
+ {KeyStr("J", 24U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 15U, kTypeValue), "val3"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ });
+
+ SetLastSequence(24U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {10U, 20U, 30U},
+ 20U);
+}
+
+TEST_F(CompactionJobTest, SingleDeleteZeroSeq) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("dummy", 5U, kTypeValue), "val2"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 0U, kTypeValue), "val"},
+ });
+ AddMockFile(file2);
+
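+ // With no snapshots, the single deletion of "A" at seq 10 cancels out the
+ // put of "A" at seq 0, while "dummy" is kept with its sequence number zeroed.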
+ auto expected_results = mock::MakeMockFile({
+ {KeyStr("dummy", 0U, kTypeValue), "val2"},
+ });
+
+ SetLastSequence(22U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {});
+}
+
+TEST_F(CompactionJobTest, MultiSingleDelete) {
+ // Tests three scenarios involving multiple single delete/put pairs:
+ //
+ // A: Put Snapshot SDel Put SDel -> Put Snapshot SDel
+ // B: Snapshot Put SDel Put SDel Snapshot -> Snapshot SDel Snapshot
+ // C: SDel Put SDel Snapshot Put -> Snapshot Put
+ // D: (Put) SDel Snapshot Put SDel -> (Put) SDel Snapshot SDel
+ // E: Put SDel Snapshot Put SDel -> Snapshot SDel
+ // F: Put SDel Put SDel Snapshot -> removed
+ // G: Snapshot SDel Put SDel Put -> Snapshot Put SDel
+ // H: (Put) Put SDel Put SDel Snapshot -> removed
+ // I: (Put) Snapshot Put SDel Put SDel -> SDel
+ // J: Put Put SDel Put SDel SDel Snapshot Put Put SDel SDel Put
+ // -> Snapshot Put
+ // K: SDel SDel Put SDel Put Put Snapshot SDel Put SDel SDel Put SDel
+ // -> Snapshot Put Snapshot SDel
+ // L: SDel Put SDel Put SDel Snapshot SDel Put SDel SDel Put SDel
+ // -> Snapshot SDel Put SDel
+ // M: (Put) SDel Put SDel Put SDel Snapshot Put SDel SDel Put SDel SDel
+ // -> SDel Snapshot Put SDel
+ NewDB();
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), "val5"},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), "val2"},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), "val4"},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("G", 13U, kTypeValue), "val"},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), "val"},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("J", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 12U, kTypeValue), "val"},
+ {KeyStr("J", 11U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), "val1"},
+ {KeyStr("K", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 12U, kTypeValue), "val2"},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), "val"},
+ {KeyStr("L", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 12U, kTypeValue), "val"},
+ {KeyStr("L", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 15U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 14U, kTypeValue), "val"},
+ {KeyStr("M", 13U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 11U, kTypeValue), "val"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 11U, kTypeValue), "val2"},
+ {KeyStr("C", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("C", 9U, kTypeValue), "val6"},
+ {KeyStr("C", 8U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), "val"},
+ {KeyStr("E", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 4U, kTypeValue), "val"},
+ {KeyStr("F", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 5U, kTypeValue), "val"},
+ {KeyStr("F", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("F", 3U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 5U, kTypeValue), "val"},
+ {KeyStr("H", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("H", 3U, kTypeValue), "val"},
+ {KeyStr("I", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 11U, kTypeValue), "val"},
+ {KeyStr("J", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 4U, kTypeValue), "val"},
+ {KeyStr("J", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("J", 2U, kTypeValue), "val"},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("K", 7U, kTypeValue), "val4"},
+ {KeyStr("K", 6U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 5U, kTypeValue), "val5"},
+ {KeyStr("K", 2U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 4U, kTypeValue), "val"},
+ {KeyStr("L", 3U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 2U, kTypeValue), "val"},
+ {KeyStr("L", 1U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 7U, kTypeValue), "val"},
+ {KeyStr("M", 5U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 4U, kTypeValue), "val"},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""},
+ });
+ AddMockFile(file2);
+
+ auto file3 = mock::MakeMockFile({
+ {KeyStr("D", 1U, kTypeValue), "val"},
+ {KeyStr("H", 1U, kTypeValue), "val"},
+ {KeyStr("I", 2U, kTypeValue), "val"},
+ });
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({
+ {KeyStr("M", 1U, kTypeValue), "val"},
+ });
+ AddMockFile(file4, 2);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("A", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 13U, kTypeValue), ""},
+ {KeyStr("A", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("A", 10U, kTypeValue), "val"},
+ {KeyStr("B", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("B", 13U, kTypeValue), ""},
+ {KeyStr("C", 14U, kTypeValue), "val3"},
+ {KeyStr("D", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("D", 11U, kTypeValue), ""},
+ {KeyStr("D", 10U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("E", 11U, kTypeValue), ""},
+ {KeyStr("G", 15U, kTypeValue), "val"},
+ {KeyStr("G", 12U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 14U, kTypeSingleDeletion), ""},
+ {KeyStr("I", 13U, kTypeValue), ""},
+ {KeyStr("J", 15U, kTypeValue), "val"},
+ {KeyStr("K", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 15U, kTypeValue), ""},
+ {KeyStr("K", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("K", 8U, kTypeValue), "val3"},
+ {KeyStr("L", 16U, kTypeSingleDeletion), ""},
+ {KeyStr("L", 15U, kTypeValue), ""},
+ {KeyStr("L", 11U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 15U, kTypeSingleDeletion), ""},
+ {KeyStr("M", 14U, kTypeValue), ""},
+ {KeyStr("M", 3U, kTypeSingleDeletion), ""}});
+
+ SetLastSequence(22U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results}, {10U}, 10U);
+}
+
+// This test documents the behavior where a corrupt key follows a deletion or a
+// single deletion and the (single) deletion gets removed while the corrupt key
+// gets written out. TODO(noetzli): We probably want a better way to treat
+// corrupt keys.
+TEST_F(CompactionJobTest, DISABLED_CorruptionAfterDeletion) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{test::KeyStr("A", 6U, kTypeValue), "val3"},
+ {test::KeyStr("a", 5U, kTypeDeletion), ""},
+ {test::KeyStr("a", 4U, kTypeValue, true), "val"}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{test::KeyStr("b", 3U, kTypeSingleDeletion), ""},
+ {test::KeyStr("b", 2U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 1U, kTypeValue), "val2"}});
+ AddMockFile(file2);
+
+ auto expected_results =
+ mock::MakeMockFile({{test::KeyStr("A", 0U, kTypeValue), "val3"},
+ {test::KeyStr("a", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("b", 0U, kTypeValue, true), "val"},
+ {test::KeyStr("c", 0U, kTypeValue), "val2"}});
+
+ SetLastSequence(6U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, OldestBlobFileNumber) {
+ NewDB();
+
+ // Note: blob1 is inlined TTL, so it will not be considered for the purposes
+ // of identifying the oldest referenced blob file. Similarly, blob6 will be
+ // ignored because it has TTL and hence refers to a TTL blob file.
+ const stl_wrappers::KVMap::value_type blob1(
+ KeyStr("a", 1U, kTypeBlobIndex), BlobStrInlinedTTL("foo", 1234567890ULL));
+ const stl_wrappers::KVMap::value_type blob2(KeyStr("b", 2U, kTypeBlobIndex),
+ BlobStr(59, 123456, 999));
+ const stl_wrappers::KVMap::value_type blob3(KeyStr("c", 3U, kTypeBlobIndex),
+ BlobStr(138, 1000, 1 << 8));
+ auto file1 = mock::MakeMockFile({blob1, blob2, blob3});
+ AddMockFile(file1);
+
+ const stl_wrappers::KVMap::value_type blob4(KeyStr("d", 4U, kTypeBlobIndex),
+ BlobStr(199, 3 << 10, 1 << 20));
+ const stl_wrappers::KVMap::value_type blob5(KeyStr("e", 5U, kTypeBlobIndex),
+ BlobStr(19, 6789, 333));
+ const stl_wrappers::KVMap::value_type blob6(
+ KeyStr("f", 6U, kTypeBlobIndex),
+ BlobStrTTL(5, 2048, 1 << 7, 1234567890ULL));
+ auto file2 = mock::MakeMockFile({blob4, blob5, blob6});
+ AddMockFile(file2);
+
+ const stl_wrappers::KVMap::value_type expected_blob1(
+ KeyStr("a", 0U, kTypeBlobIndex), blob1.second);
+ const stl_wrappers::KVMap::value_type expected_blob2(
+ KeyStr("b", 0U, kTypeBlobIndex), blob2.second);
+ const stl_wrappers::KVMap::value_type expected_blob3(
+ KeyStr("c", 0U, kTypeBlobIndex), blob3.second);
+ const stl_wrappers::KVMap::value_type expected_blob4(
+ KeyStr("d", 0U, kTypeBlobIndex), blob4.second);
+ const stl_wrappers::KVMap::value_type expected_blob5(
+ KeyStr("e", 0U, kTypeBlobIndex), blob5.second);
+ const stl_wrappers::KVMap::value_type expected_blob6(
+ KeyStr("f", 0U, kTypeBlobIndex), blob6.second);
+ auto expected_results =
+ mock::MakeMockFile({expected_blob1, expected_blob2, expected_blob3,
+ expected_blob4, expected_blob5, expected_blob6});
+
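+ // Among the non-TTL blob references, blob5 points to blob file 19, which is
+ // the smallest (and therefore oldest) file number, so the compaction output
+ // is expected to record 19 as its oldest referenced blob file.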
+ SetLastSequence(6U);
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results},
+ std::vector<SequenceNumber>(), kMaxSequenceNumber,
+ /* output_level */ 1, /* verify */ true,
+ /* expected_oldest_blob_file_numbers */ {19});
+}
+
+TEST_F(CompactionJobTest, VerifyPenultimateLevelOutput) {
+ cf_options_.bottommost_temperature = Temperature::kCold;
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = true;
+ });
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ NewDB();
+
+ // Add files on different levels that may overlap
+ auto file0_1 = mock::MakeMockFile({{KeyStr("z", 12U, kTypeValue), "val"}});
+ AddMockFile(file0_1);
+
+ auto file1_1 = mock::MakeMockFile({{KeyStr("b", 10U, kTypeValue), "val"},
+ {KeyStr("f", 11U, kTypeValue), "val"}});
+ AddMockFile(file1_1, 1);
+ auto file1_2 = mock::MakeMockFile({{KeyStr("j", 12U, kTypeValue), "val"},
+ {KeyStr("k", 13U, kTypeValue), "val"}});
+ AddMockFile(file1_2, 1);
+ auto file1_3 = mock::MakeMockFile({{KeyStr("p", 14U, kTypeValue), "val"},
+ {KeyStr("u", 15U, kTypeValue), "val"}});
+ AddMockFile(file1_3, 1);
+
+ auto file2_1 = mock::MakeMockFile({{KeyStr("f", 8U, kTypeValue), "val"},
+ {KeyStr("h", 9U, kTypeValue), "val"}});
+ AddMockFile(file2_1, 2);
+ auto file2_2 = mock::MakeMockFile({{KeyStr("m", 6U, kTypeValue), "val"},
+ {KeyStr("p", 7U, kTypeValue), "val"}});
+ AddMockFile(file2_2, 2);
+
+ auto file3_1 = mock::MakeMockFile({{KeyStr("g", 2U, kTypeValue), "val"},
+ {KeyStr("k", 3U, kTypeValue), "val"}});
+ AddMockFile(file3_1, 3);
+ auto file3_2 = mock::MakeMockFile({{KeyStr("v", 4U, kTypeValue), "val"},
+ {KeyStr("x", 5U, kTypeValue), "val"}});
+ AddMockFile(file3_2, 3);
+
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ const std::vector<int> input_levels = {0, 1, 2, 3};
+ auto files0 = cfd->current()->storage_info()->LevelFiles(input_levels[0]);
+ auto files1 = cfd->current()->storage_info()->LevelFiles(input_levels[1]);
+ auto files2 = cfd->current()->storage_info()->LevelFiles(input_levels[2]);
+ auto files3 = cfd->current()->storage_info()->LevelFiles(input_levels[3]);
+
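+ // The smallest key among the compaction inputs is "b", so only "a" is
+ // expected to fall outside the penultimate-level output range.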
+ RunLastLevelCompaction(
+ {files0, files1, files2, files3}, input_levels,
+ /*verify_func=*/[&](Compaction& comp) {
+ for (char c = 'a'; c <= 'z'; c++) {
+ std::string c_str;
+ c_str = c;
+ const Slice key(c_str);
+ if (c == 'a') {
+ ASSERT_FALSE(comp.WithinPenultimateLevelOutputRange(key));
+ } else {
+ ASSERT_TRUE(comp.WithinPenultimateLevelOutputRange(key));
+ }
+ }
+ });
+}
+
+TEST_F(CompactionJobTest, NoEnforceSingleDeleteContract) {
+ db_options_.enforce_single_del_contracts = false;
+ NewDB();
+
+ auto file =
+ mock::MakeMockFile({{KeyStr("a", 4U, kTypeSingleDeletion), ""},
+ {KeyStr("a", 3U, kTypeDeletion), "dontcare"}});
+ AddMockFile(file);
+ SetLastSequence(4U);
+
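+ // With enforce_single_del_contracts disabled, a single deletion that lands
+ // on a regular deletion is not treated as a contract violation; both
+ // tombstones are simply compacted away, leaving an empty output.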
+ auto expected_results = mock::MakeMockFile();
+ constexpr int input_level = 0;
+ auto files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTest, InputSerialization) {
+ // Setup a random CompactionServiceInput
+ CompactionServiceInput input;
+ const int kStrMaxLen = 1000;
+ Random rnd(static_cast<uint32_t>(time(nullptr)));
+ Random64 rnd64(time(nullptr));
+ input.column_family.name = rnd.RandomString(rnd.Uniform(kStrMaxLen));
+ input.column_family.options.comparator = ReverseBytewiseComparator();
+ input.column_family.options.max_bytes_for_level_base =
+ rnd64.Uniform(UINT64_MAX);
+ input.column_family.options.disable_auto_compactions = rnd.OneIn(2);
+ input.column_family.options.compression = kZSTD;
+ input.column_family.options.compression_opts.level = 4;
+ input.db_options.max_background_flushes = 10;
+ input.db_options.paranoid_checks = rnd.OneIn(2);
+ input.db_options.statistics = CreateDBStatistics();
+ input.db_options.env = env_;
+ while (!rnd.OneIn(10)) {
+ input.snapshots.emplace_back(rnd64.Uniform(UINT64_MAX));
+ }
+ while (!rnd.OneIn(10)) {
+ input.input_files.emplace_back(rnd.RandomString(
+ rnd.Uniform(kStrMaxLen - 1) +
+ 1)); // input file name should have at least one character
+ }
+ input.output_level = 4;
+ input.has_begin = rnd.OneIn(2);
+ if (input.has_begin) {
+ input.begin = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
+ }
+ input.has_end = rnd.OneIn(2);
+ if (input.has_end) {
+ input.end = rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen));
+ }
+
+ std::string output;
+ ASSERT_OK(input.Write(&output));
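+ // The serialized form appears to start with a fixed32 data version followed
+ // by semicolon-terminated name=value fields (e.g. "output_level=4;"); the
+ // manipulations below rely on that layout.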
+
+ // Test deserialization
+ CompactionServiceInput deserialized1;
+ ASSERT_OK(CompactionServiceInput::Read(output, &deserialized1));
+ ASSERT_TRUE(deserialized1.TEST_Equals(&input));
+
+ // Test mismatch
+ deserialized1.db_options.max_background_flushes += 10;
+ std::string mismatch;
+ ASSERT_FALSE(deserialized1.TEST_Equals(&input, &mismatch));
+ ASSERT_EQ(mismatch, "db_options.max_background_flushes");
+
+ // Test unknown field
+ CompactionServiceInput deserialized2;
+ output.clear();
+ ASSERT_OK(input.Write(&output));
+ output.append("new_field=123;");
+
+ ASSERT_OK(CompactionServiceInput::Read(output, &deserialized2));
+ ASSERT_TRUE(deserialized2.TEST_Equals(&input));
+
+ // Test missing field
+ CompactionServiceInput deserialized3;
+ deserialized3.output_level = 0;
+ std::string to_remove = "output_level=4;";
+ size_t pos = output.find(to_remove);
+ ASSERT_TRUE(pos != std::string::npos);
+ output.erase(pos, to_remove.length());
+ ASSERT_OK(CompactionServiceInput::Read(output, &deserialized3));
+ mismatch.clear();
+ ASSERT_FALSE(deserialized3.TEST_Equals(&input, &mismatch));
+ ASSERT_EQ(mismatch, "output_level");
+
+ // manually set the value back, should match the original structure
+ deserialized3.output_level = 4;
+ ASSERT_TRUE(deserialized3.TEST_Equals(&input));
+
+ // Test invalid version
+ output.clear();
+ ASSERT_OK(input.Write(&output));
+
+ uint32_t data_version = DecodeFixed32(output.data());
+ const size_t kDataVersionSize = sizeof(data_version);
+ ASSERT_EQ(data_version,
+ 1U); // Update once the default data version is changed
+ char buf[kDataVersionSize];
+ EncodeFixed32(buf, data_version + 10); // make sure it's not valid
+ output.replace(0, kDataVersionSize, buf, kDataVersionSize);
+ Status s = CompactionServiceInput::Read(output, &deserialized3);
+ ASSERT_TRUE(s.IsNotSupported());
+}
+
+TEST_F(CompactionJobTest, ResultSerialization) {
+ // Setup a random CompactionServiceResult
+ CompactionServiceResult result;
+ const int kStrMaxLen = 1000;
+ Random rnd(static_cast<uint32_t>(time(nullptr)));
+ Random64 rnd64(time(nullptr));
+ std::vector<Status> status_list = {
+ Status::OK(),
+ Status::InvalidArgument("invalid option"),
+ Status::Aborted("failed to run"),
+ Status::NotSupported("not supported option"),
+ };
+ result.status =
+ status_list.at(rnd.Uniform(static_cast<int>(status_list.size())));
+ while (!rnd.OneIn(10)) {
+ UniqueId64x2 id{rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX)};
+ result.output_files.emplace_back(
+ rnd.RandomString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX),
+ rnd64.Uniform(UINT64_MAX),
+ rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
+ rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)),
+ rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX),
+ rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id);
+ }
+ result.output_level = rnd.Uniform(10);
+ result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen));
+ result.num_output_records = rnd64.Uniform(UINT64_MAX);
+ result.total_bytes = rnd64.Uniform(UINT64_MAX);
+ result.bytes_read = 123;
+ result.bytes_written = rnd64.Uniform(UINT64_MAX);
+ result.stats.elapsed_micros = rnd64.Uniform(UINT64_MAX);
+ result.stats.num_output_files = rnd.Uniform(1000);
+ result.stats.is_full_compaction = rnd.OneIn(2);
+ result.stats.num_single_del_mismatch = rnd64.Uniform(UINT64_MAX);
+ result.stats.num_input_files = 9;
+
+ std::string output;
+ ASSERT_OK(result.Write(&output));
+
+ // Test deserialization
+ CompactionServiceResult deserialized1;
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized1));
+ ASSERT_TRUE(deserialized1.TEST_Equals(&result));
+
+ // Test mismatch
+ deserialized1.stats.num_input_files += 10;
+ std::string mismatch;
+ ASSERT_FALSE(deserialized1.TEST_Equals(&result, &mismatch));
+ ASSERT_EQ(mismatch, "stats.num_input_files");
+
+ // Test unique id mismatch
+ if (!result.output_files.empty()) {
+ CompactionServiceResult deserialized_tmp;
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized_tmp));
+ deserialized_tmp.output_files[0].unique_id[0] += 1;
+ ASSERT_FALSE(deserialized_tmp.TEST_Equals(&result, &mismatch));
+ ASSERT_EQ(mismatch, "output_files.unique_id");
+ deserialized_tmp.status.PermitUncheckedError();
+ }
+
+ // Test unknown field
+ CompactionServiceResult deserialized2;
+ output.clear();
+ ASSERT_OK(result.Write(&output));
+ output.append("new_field=123;");
+
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized2));
+ ASSERT_TRUE(deserialized2.TEST_Equals(&result));
+
+ // Test missing field
+ CompactionServiceResult deserialized3;
+ deserialized3.bytes_read = 0;
+ std::string to_remove = "bytes_read=123;";
+ size_t pos = output.find(to_remove);
+ ASSERT_TRUE(pos != std::string::npos);
+ output.erase(pos, to_remove.length());
+ ASSERT_OK(CompactionServiceResult::Read(output, &deserialized3));
+ mismatch.clear();
+ ASSERT_FALSE(deserialized3.TEST_Equals(&result, &mismatch));
+ ASSERT_EQ(mismatch, "bytes_read");
+
+ deserialized3.bytes_read = 123;
+ ASSERT_TRUE(deserialized3.TEST_Equals(&result));
+
+ // Test invalid version
+ output.clear();
+ ASSERT_OK(result.Write(&output));
+
+ uint32_t data_version = DecodeFixed32(output.data());
+ const size_t kDataVersionSize = sizeof(data_version);
+ ASSERT_EQ(data_version,
+ 1U); // Update once the default data version is changed
+ char buf[kDataVersionSize];
+ EncodeFixed32(buf, data_version + 10); // make sure it's not valid
+ output.replace(0, kDataVersionSize, buf, kDataVersionSize);
+ Status s = CompactionServiceResult::Read(output, &deserialized3);
+ ASSERT_TRUE(s.IsNotSupported());
+ for (const auto& item : status_list) {
+ item.PermitUncheckedError();
+ }
+}
+
+class CompactionJobDynamicFileSizeTest
+ : public CompactionJobTestBase,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ CompactionJobDynamicFileSizeTest()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_dynamic_file_size_test"),
+ BytewiseComparator(), [](uint64_t /*ts*/) { return ""; },
+ /*test_io_priority=*/false, TableTypeForTest::kMockTable) {}
+};
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytes) {
+ // dynamic_file_size option should have no impact on cutting for max
+ // compaction bytes.
+ bool enable_dynamic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+
+ NewDB();
+ mutable_cf_options_.target_file_size_base = 80;
+ mutable_cf_options_.max_compaction_bytes = 21;
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("c", 5U, kTypeValue), "val2"},
+ {KeyStr("n", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("h", 3U, kTypeValue), "val"},
+ {KeyStr("j", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ // Create three L2 files, each of size 10.
+ // max_compaction_bytes 21 means the compaction output in L1 will
+ // be cut into at least two files.
+ auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"},
+ {KeyStr("c", 1U, kTypeValue), "val"},
+ {KeyStr("c1", 1U, kTypeValue), "val"},
+ {KeyStr("c2", 1U, kTypeValue), "val"},
+ {KeyStr("c3", 1U, kTypeValue), "val"},
+ {KeyStr("c4", 1U, kTypeValue), "val"},
+ {KeyStr("d", 1U, kTypeValue), "val"},
+ {KeyStr("e", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+ {KeyStr("i", 1U, kTypeValue), "val"},
+ {KeyStr("i1", 1U, kTypeValue), "val"},
+ {KeyStr("i2", 1U, kTypeValue), "val"},
+ {KeyStr("i3", 1U, kTypeValue), "val"},
+ {KeyStr("i4", 1U, kTypeValue), "val"},
+ {KeyStr("j", 1U, kTypeValue), "val"},
+ {KeyStr("k", 2U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("l", 1U, kTypeValue), "val"},
+ {KeyStr("m", 1U, kTypeValue), "val"},
+ {KeyStr("m1", 1U, kTypeValue), "val"},
+ {KeyStr("m2", 1U, kTypeValue), "val"},
+ {KeyStr("m3", 1U, kTypeValue), "val"},
+ {KeyStr("m4", 1U, kTypeValue), "val"},
+ {KeyStr("n", 1U, kTypeValue), "val"},
+ {KeyStr("o", 2U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ // The expected output should be:
+ // L1: [c, h, j] [n]
+ // L2: [b ... e] [h ... k] [l ... o]
+ // It's better to have "j" in the first file, because it overlaps with the
+ // second file on L2 anyway.
+ // (Note: before this PR, the output was cut at "h" because the internal
+ // comparator considers L1 "h" with seqno 3 smaller than L2 "h" with seqno 1,
+ // even though the compaction picker treats them as overlapping.)
+
+ auto expected_file1 =
+ mock::MakeMockFile({{KeyStr("c", 5U, kTypeValue), "val2"},
+ {KeyStr("h", 3U, kTypeValue), "val"},
+ {KeyStr("j", 4U, kTypeValue), "val"}});
+ auto expected_file2 =
+ mock::MakeMockFile({{KeyStr("n", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2});
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToSkipGrandparentFile) {
+ bool enable_dynamic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+
+ NewDB();
+ // Make sure the grandparent level file size (10) qualifies for skipping.
+ // Currently, it has to be > 1/8 of target file size.
+ mutable_cf_options_.target_file_size_base = 70;
+
+ auto file1 = mock::MakeMockFile({
+ {KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("z", 6U, kTypeValue), "val3"},
+ });
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("c", 3U, kTypeValue), "val"},
+ {KeyStr("x", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("b", 1U, kTypeValue), "val"},
+ {KeyStr("d", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+ {KeyStr("i", 2U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("v", 1U, kTypeValue), "val"},
+ {KeyStr("y", 2U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ auto expected_file1 =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("c", 3U, kTypeValue), "val"}});
+ auto expected_file2 =
+ mock::MakeMockFile({{KeyStr("x", 4U, kTypeValue), "val"},
+ {KeyStr("z", 6U, kTypeValue), "val3"}});
+
+ auto expected_file_disable_dynamic_file_size =
+ mock::MakeMockFile({{KeyStr("a", 5U, kTypeValue), "val2"},
+ {KeyStr("c", 3U, kTypeValue), "val"},
+ {KeyStr("x", 4U, kTypeValue), "val"},
+ {KeyStr("z", 6U, kTypeValue), "val3"}});
+
+ SetLastSequence(6U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+ if (enable_dynamic_file_size) {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2});
+ } else {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file_disable_dynamic_file_size});
+ }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundary) {
+ bool enable_dynamic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+ NewDB();
+
+ // MockTable has 1 byte per entry by default and each file is 10 bytes.
+ // When the file size is smaller than 100, the compaction won't cut the file
+ // early to align it with its grandparent boundary.
+ const size_t kKeyValueSize = 10000;
+ mock_table_factory_->SetKeyValueSize(kKeyValueSize);
+
+ mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize;
+
+ mock::KVVector file1;
+ char ch = 'd';
+ // Add value from d -> o
+ for (char i = 0; i < 12; i++) {
+ file1.emplace_back(KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("e", 3U, kTypeValue), "val"},
+ {KeyStr("s", 4U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ // the 1st grandparent file should be skipped
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 1U, kTypeValue), "val"},
+ {KeyStr("b", 2U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("c", 1U, kTypeValue), "val"},
+ {KeyStr("e", 2U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("h", 1U, kTypeValue), "val"},
+ {KeyStr("j", 2U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ auto file6 = mock::MakeMockFile({{KeyStr("k", 1U, kTypeValue), "val"},
+ {KeyStr("n", 2U, kTypeValue), "val"}});
+ AddMockFile(file6, 2);
+
+ auto file7 = mock::MakeMockFile({{KeyStr("q", 1U, kTypeValue), "val"},
+ {KeyStr("t", 2U, kTypeValue), "val"}});
+ AddMockFile(file7, 2);
+
+ // The expected outputs are:
+ // L1: [d,e,f,g,h,i,j] [k,l,m,n,o,s]
+ // L2: [a, b] [c, e] [h, j] [k, n] [q, t]
+ // The first output is cut early at "j" so that it aligns with the L2 files.
+ // If dynamic_file_size is not enabled, the output is cut based on
+ // target_file_size instead.
+ mock::KVVector expected_file1;
+ for (char i = 0; i < 7; i++) {
+ expected_file1.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ mock::KVVector expected_file2;
+ for (char i = 7; i < 12; i++) {
+ expected_file2.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+ expected_file2.emplace_back(KeyStr("s", 4U, kTypeValue), "val");
+
+ mock::KVVector expected_file_disable_dynamic_file_size1;
+ for (char i = 0; i < 10; i++) {
+ expected_file_disable_dynamic_file_size1.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ mock::KVVector expected_file_disable_dynamic_file_size2;
+ for (char i = 10; i < 12; i++) {
+ expected_file_disable_dynamic_file_size2.emplace_back(
+ KeyStr(std::string(1, ch + i), i + 10, kTypeValue),
+ "val" + std::to_string(i));
+ }
+
+ expected_file_disable_dynamic_file_size2.emplace_back(
+ KeyStr("s", 4U, kTypeValue), "val");
+
+ SetLastSequence(22U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+ if (enable_dynamic_file_size) {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2});
+ } else {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file_disable_dynamic_file_size1,
+ expected_file_disable_dynamic_file_size2});
+ }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutToAlignGrandparentBoundarySameKey) {
+ bool enable_dynamic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+ NewDB();
+
+ // MockTable has 1 byte per entry by default and each file is 10 bytes.
+ // When the file size is smaller than 100, the compaction won't cut the file
+ // early to align it with its grandparent boundary.
+ const size_t kKeyValueSize = 10000;
+ mock_table_factory_->SetKeyValueSize(kKeyValueSize);
+
+ mutable_cf_options_.target_file_size_base = 10 * kKeyValueSize;
+
+ mock::KVVector file1;
+ for (int i = 0; i < 7; i++) {
+ file1.emplace_back(KeyStr("a", 100 - i, kTypeValue),
+ "val" + std::to_string(100 - i));
+ }
+ file1.emplace_back(KeyStr("b", 90, kTypeValue), "valb");
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 93U, kTypeValue), "val93"},
+ {KeyStr("b", 90U, kTypeValue), "valb"}});
+ AddMockFile(file2, 1);
+
+ auto file3 = mock::MakeMockFile({{KeyStr("a", 89U, kTypeValue), "val"},
+ {KeyStr("a", 88U, kTypeValue), "val"}});
+ AddMockFile(file3, 2);
+
+ auto file4 = mock::MakeMockFile({{KeyStr("a", 87U, kTypeValue), "val"},
+ {KeyStr("a", 86U, kTypeValue), "val"}});
+ AddMockFile(file4, 2);
+
+ auto file5 = mock::MakeMockFile({{KeyStr("b", 85U, kTypeValue), "val"},
+ {KeyStr("b", 84U, kTypeValue), "val"}});
+ AddMockFile(file5, 2);
+
+ mock::KVVector expected_file1;
+ mock::KVVector expected_file_disable_dynamic_file_size;
+
+ for (int i = 0; i < 8; i++) {
+ expected_file1.emplace_back(KeyStr("a", 100 - i, kTypeValue),
+ "val" + std::to_string(100 - i));
+ expected_file_disable_dynamic_file_size.emplace_back(
+ KeyStr("a", 100 - i, kTypeValue), "val" + std::to_string(100 - i));
+ }
+
+ // Make sure `b` is cut into a separate file (i.e. the cut does not rely on
+ // the internal comparator, which would consider "b:90" (seqno 90) here
+ // smaller than "b:85" on L2).
+ auto expected_file2 =
+ mock::MakeMockFile({{KeyStr("b", 90U, kTypeValue), "valb"}});
+
+ expected_file_disable_dynamic_file_size.emplace_back(
+ KeyStr("b", 90U, kTypeValue), "valb");
+
+ SetLastSequence(122U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+ // Just keep all the history
+ std::vector<SequenceNumber> snapshots;
+ for (int i = 80; i <= 100; i++) {
+ snapshots.emplace_back(i);
+ }
+ if (enable_dynamic_file_size) {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2}, snapshots);
+ } else {
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file_disable_dynamic_file_size}, snapshots);
+ }
+}
+
+TEST_P(CompactionJobDynamicFileSizeTest, CutForMaxCompactionBytesSameKey) {
+ // dynamic_file_size option should have no impact on cutting for max
+ // compaction bytes.
+ bool enable_dynamic_file_size = GetParam();
+ cf_options_.level_compaction_dynamic_file_size = enable_dynamic_file_size;
+
+ NewDB();
+ mutable_cf_options_.target_file_size_base = 80;
+ mutable_cf_options_.max_compaction_bytes = 20;
+
+ auto file1 = mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"},
+ {KeyStr("b", 103U, kTypeValue), "val"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile({{KeyStr("a", 102U, kTypeValue), "val2"},
+ {KeyStr("c", 101U, kTypeValue), "val"}});
+ AddMockFile(file2, 1);
+
+ for (int i = 0; i < 10; i++) {
+ auto file =
+ mock::MakeMockFile({{KeyStr("a", 100 - (i * 2), kTypeValue), "val"},
+ {KeyStr("a", 99 - (i * 2), kTypeValue), "val"}});
+ AddMockFile(file, 2);
+ }
+
+ for (int i = 0; i < 10; i++) {
+ auto file =
+ mock::MakeMockFile({{KeyStr("b", 80 - (i * 2), kTypeValue), "val"},
+ {KeyStr("b", 79 - (i * 2), kTypeValue), "val"}});
+ AddMockFile(file, 2);
+ }
+
+ auto file5 = mock::MakeMockFile({{KeyStr("c", 60U, kTypeValue), "valc"},
+ {KeyStr("c", 59U, kTypeValue), "valc"}});
+
+ // "a" overlaps 10 grandparent files (each of size 10), which far exceeds
+ // `max_compaction_bytes`, but make sure the two "a" entries are not
+ // separated, as splitting them won't help reduce the compaction size.
+ // Also make sure "b" and "c" are cut into separate files.
+ mock::KVVector expected_file1 =
+ mock::MakeMockFile({{KeyStr("a", 104U, kTypeValue), "val1"},
+ {KeyStr("a", 102U, kTypeValue), "val2"}});
+ mock::KVVector expected_file2 =
+ mock::MakeMockFile({{KeyStr("b", 103U, kTypeValue), "val"}});
+ mock::KVVector expected_file3 =
+ mock::MakeMockFile({{KeyStr("c", 101U, kTypeValue), "val"}});
+
+ SetLastSequence(122U);
+ const std::vector<int> input_levels = {0, 1};
+ auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
+ auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
+
+ // Just keep all the history
+ std::vector<SequenceNumber> snapshots;
+ for (int i = 80; i <= 105; i++) {
+ snapshots.emplace_back(i);
+ }
+ RunCompaction({lvl0_files, lvl1_files}, input_levels,
+ {expected_file1, expected_file2, expected_file3}, snapshots);
+}
+
+INSTANTIATE_TEST_CASE_P(CompactionJobDynamicFileSizeTest,
+ CompactionJobDynamicFileSizeTest, testing::Bool());
+
+class CompactionJobTimestampTest : public CompactionJobTestBase {
+ public:
+ CompactionJobTimestampTest()
+ : CompactionJobTestBase(test::PerThreadDBPath("compaction_job_ts_test"),
+ test::BytewiseComparatorWithU64TsWrapper(),
+ test::EncodeInt, /*test_io_priority=*/false,
+ TableTypeForTest::kMockTable) {}
+};
+
+TEST_F(CompactionJobTimestampTest, GCDisabled) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"},
+ {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"},
+ {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"},
+ {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"}});
+
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""},
+ {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+ {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"},
+ {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}});
+ AddMockFile(file2);
+
+ SetLastSequence(10);
+
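+ // With no full_history_ts_low set, garbage collection of old timestamped
+ // versions is effectively disabled, so every version of every key, including
+ // the tombstones, is expected to survive the compaction.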
+ auto expected_results = mock::MakeMockFile(
+ {{KeyStr("a", 10, ValueType::kTypeValue, 100), "a10"},
+ {KeyStr("a", 9, ValueType::kTypeValue, 99), "a9"},
+ {KeyStr("b", 8, ValueType::kTypeValue, 98), "b8"},
+ {KeyStr("b", 6, ValueType::kTypeDeletionWithTimestamp, 96), ""},
+ {KeyStr("c", 5, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+ {KeyStr("c", 4, ValueType::kTypeValue, 94), "c5"},
+ {KeyStr("d", 7, ValueType::kTypeValue, 97), "d7"},
+ {KeyStr("d", 3, ValueType::kTypeSingleDeletion, 93), ""}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, NoKeyExpired) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"},
+ {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"},
+ {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"}});
+ AddMockFile(file1);
+
+ auto file2 =
+ mock::MakeMockFile({{KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"},
+ {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}});
+ AddMockFile(file2);
+
+ SetLastSequence(101);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 6, ValueType::kTypeValue, 100), "a6"},
+ {KeyStr("a", 4, ValueType::kTypeValue, 98), "a4"},
+ {KeyStr("b", 7, ValueType::kTypeValue, 101), "b7"},
+ {KeyStr("c", 5, ValueType::kTypeValue, 99), "c5"},
+ {KeyStr("c", 3, ValueType::kTypeValue, 97), "c3"}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ full_history_ts_low_ = encode_u64_ts_(0);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, AllKeysExpired) {
+ NewDB();
+
+ auto file1 = mock::MakeMockFile(
+ {{KeyStr("a", 5, ValueType::kTypeDeletionWithTimestamp, 100), ""},
+ {KeyStr("b", 6, ValueType::kTypeSingleDeletion, 99), ""},
+ {KeyStr("c", 7, ValueType::kTypeValue, 98), "c7"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("a", 4, ValueType::kTypeValue, 97), "a4"},
+ {KeyStr("b", 3, ValueType::kTypeValue, 96), "b3"},
+ {KeyStr("c", 2, ValueType::kTypeDeletionWithTimestamp, 95), ""},
+ {KeyStr("c", 1, ValueType::kTypeValue, 94), "c1"}});
+ AddMockFile(file2);
+
+ SetLastSequence(7);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("c", 0, ValueType::kTypeValue, 0), "c7"}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ full_history_ts_low_ = encode_u64_ts_(std::numeric_limits<uint64_t>::max());
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+TEST_F(CompactionJobTimestampTest, SomeKeysExpired) {
+ NewDB();
+
+ auto file1 =
+ mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"},
+ {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}});
+ AddMockFile(file1);
+
+ auto file2 = mock::MakeMockFile(
+ {{KeyStr("a", 3, ValueType::kTypeValue, 48), "a3"},
+ {KeyStr("a", 2, ValueType::kTypeValue, 46), "a2"},
+ {KeyStr("b", 4, ValueType::kTypeDeletionWithTimestamp, 47), ""}});
+ AddMockFile(file2);
+
+ SetLastSequence(6);
+
+ auto expected_results =
+ mock::MakeMockFile({{KeyStr("a", 5, ValueType::kTypeValue, 50), "a5"},
+ {KeyStr("a", 0, ValueType::kTypeValue, 0), "a3"},
+ {KeyStr("b", 6, ValueType::kTypeValue, 49), "b6"}});
+ constexpr int input_level = 0;
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ full_history_ts_low_ = encode_u64_ts_(49);
+ RunCompaction({files}, {input_level}, {expected_results});
+}
+
+class CompactionJobTimestampTestWithBbTable : public CompactionJobTestBase {
+ public:
+ // Block-based table is needed if we want to test subcompaction partitioning
+ // with anchors.
+ explicit CompactionJobTimestampTestWithBbTable()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_ts_bbt_test"),
+ test::BytewiseComparatorWithU64TsWrapper(), test::EncodeInt,
+ /*test_io_priority=*/false, TableTypeForTest::kBlockBasedTable) {}
+};
+
+TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionAnchorL1) {
+ cf_options_.target_file_size_base = 20;
+ mutable_cf_options_.target_file_size_base = 20;
+ NewDB();
+
+ const std::vector<std::string> keys = {
+ KeyStr("a", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 21, ValueType::kTypeValue, 210),
+ KeyStr("b", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 18, ValueType::kTypeValue, 180),
+ KeyStr("c", 17, ValueType::kTypeValue, 170),
+ KeyStr("c", 16, ValueType::kTypeValue, 160),
+ KeyStr("c", 15, ValueType::kTypeValue, 150)};
+ const std::vector<std::string> values = {"a20", "b21", "b20", "b18",
+ "c17", "c16", "c15"};
+
+ constexpr int input_level = 1;
+
+ auto file1 = mock::MakeMockFile(
+ {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}});
+ AddMockFile(file1, input_level);
+
+ auto file2 = mock::MakeMockFile(
+ {{keys[3], values[3]}, {keys[4], values[4]}, {keys[5], values[5]}});
+ AddMockFile(file2, input_level);
+
+ auto file3 = mock::MakeMockFile({{keys[6], values[6]}});
+ AddMockFile(file3, input_level);
+
+ SetLastSequence(20);
+
+ auto output1 = mock::MakeMockFile({{keys[0], values[0]}});
+ auto output2 = mock::MakeMockFile(
+ {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}});
+ auto output3 = mock::MakeMockFile(
+ {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}});
+
+ auto expected_results =
+ std::vector<mock::KVVector>{output1, output2, output3};
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ constexpr int output_level = 2;
+ constexpr int max_subcompactions = 4;
+ RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{},
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ output_level, /*verify=*/true, {kInvalidBlobFileNumber},
+ /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL,
+ max_subcompactions);
+}
+
+TEST_F(CompactionJobTimestampTestWithBbTable, SubcompactionL0) {
+ cf_options_.target_file_size_base = 20;
+ mutable_cf_options_.target_file_size_base = 20;
+ NewDB();
+
+ const std::vector<std::string> keys = {
+ KeyStr("a", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 20, ValueType::kTypeValue, 200),
+ KeyStr("b", 19, ValueType::kTypeValue, 190),
+ KeyStr("b", 18, ValueType::kTypeValue, 180),
+ KeyStr("c", 17, ValueType::kTypeValue, 170),
+ KeyStr("c", 16, ValueType::kTypeValue, 160),
+ KeyStr("c", 15, ValueType::kTypeValue, 150)};
+ const std::vector<std::string> values = {"a20", "b20", "b19", "b18",
+ "c17", "c16", "c15"};
+
+ constexpr int input_level = 0;
+
+ auto file1 = mock::MakeMockFile({{keys[5], values[5]}, {keys[6], values[6]}});
+ AddMockFile(file1, input_level);
+
+ auto file2 = mock::MakeMockFile({{keys[3], values[3]}, {keys[4], values[4]}});
+ AddMockFile(file2, input_level);
+
+ auto file3 = mock::MakeMockFile(
+ {{keys[0], values[0]}, {keys[1], values[1]}, {keys[2], values[2]}});
+ AddMockFile(file3, input_level);
+
+ SetLastSequence(20);
+
+ auto output1 = mock::MakeMockFile({{keys[0], values[0]}});
+ auto output2 = mock::MakeMockFile(
+ {{keys[1], values[1]}, {keys[2], values[2]}, {keys[3], values[3]}});
+ auto output3 = mock::MakeMockFile(
+ {{keys[4], values[4]}, {keys[5], values[5]}, {keys[6], values[6]}});
+
+ auto expected_results =
+ std::vector<mock::KVVector>{output1, output2, output3};
+ const auto& files = cfd_->current()->storage_info()->LevelFiles(input_level);
+
+ constexpr int output_level = 1;
+ constexpr int max_subcompactions = 4;
+ RunCompaction({files}, {input_level}, expected_results, /*snapshots=*/{},
+ /*earliest_write_conflict_snapshot=*/kMaxSequenceNumber,
+ output_level, /*verify=*/true, {kInvalidBlobFileNumber},
+ /*check_get_priority=*/false, Env::IO_TOTAL, Env::IO_TOTAL,
+ max_subcompactions);
+}
+
+// The io priority of the compaction reads and writes is different from that
+// of other DB reads and writes. To prepare the compaction input files, the
+// default filesystem from Env is used. To test the io priority of the
+// compaction reads and writes, db_options_.fs is set to MockTestFileSystem.
+class CompactionJobIOPriorityTest : public CompactionJobTestBase {
+ public:
+ CompactionJobIOPriorityTest()
+ : CompactionJobTestBase(
+ test::PerThreadDBPath("compaction_job_io_priority_test"),
+ BytewiseComparator(), [](uint64_t /*ts*/) { return ""; },
+ /*test_io_priority=*/true, TableTypeForTest::kBlockBasedTable) {}
+};
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateNormal) {
+ // When the state from WriteController is normal.
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+ Env::IO_LOW, Env::IO_LOW);
+}
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateDelayed) {
+ // When the state from WriteController is Delayed.
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ {
+ std::unique_ptr<WriteControllerToken> delay_token =
+ write_controller_.GetDelayToken(1000000);
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+ Env::IO_USER, Env::IO_USER);
+ }
+}
+
+TEST_F(CompactionJobIOPriorityTest, WriteControllerStateStalled) {
+ // When the state from WriteController is Stalled.
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ {
+ std::unique_ptr<WriteControllerToken> stop_token =
+ write_controller_.GetStopToken();
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, false,
+ Env::IO_USER, Env::IO_USER);
+ }
+}
+
+TEST_F(CompactionJobIOPriorityTest, GetRateLimiterPriority) {
+ NewDB();
+ mock::KVVector expected_results = CreateTwoFiles(false);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ constexpr int input_level = 0;
+ auto files = cfd->current()->storage_info()->LevelFiles(input_level);
+ ASSERT_EQ(2U, files.size());
+ RunCompaction({files}, {input_level}, {expected_results}, {},
+ kMaxSequenceNumber, 1, false, {kInvalidBlobFileNumber}, true,
+ Env::IO_LOW, Env::IO_LOW);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_outputs.cc b/src/rocksdb/db/compaction/compaction_outputs.cc
new file mode 100644
index 000000000..e74378e2a
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_outputs.cc
@@ -0,0 +1,646 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_outputs.h"
+
+#include "db/builder.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) {
+ builder_.reset(NewTableBuilder(tboptions, file_writer_.get()));
+}
+
+Status CompactionOutputs::Finish(const Status& input_status,
+ const SeqnoToTimeMapping& seqno_time_mapping) {
+ FileMetaData* meta = GetMetaData();
+ assert(meta != nullptr);
+ Status s = input_status;
+ if (s.ok()) {
+ std::string seqno_time_mapping_str;
+ seqno_time_mapping.Encode(seqno_time_mapping_str, meta->fd.smallest_seqno,
+ meta->fd.largest_seqno, meta->file_creation_time);
+ builder_->SetSeqnoTimeTableProperties(seqno_time_mapping_str,
+ meta->oldest_ancester_time);
+ s = builder_->Finish();
+
+ } else {
+ builder_->Abandon();
+ }
+ Status io_s = builder_->io_status();
+ if (s.ok()) {
+ s = io_s;
+ } else {
+ io_s.PermitUncheckedError();
+ }
+ const uint64_t current_bytes = builder_->FileSize();
+ if (s.ok()) {
+ meta->fd.file_size = current_bytes;
+ meta->marked_for_compaction = builder_->NeedCompact();
+ }
+ current_output().finished = true;
+ stats_.bytes_written += current_bytes;
+ stats_.num_output_files = outputs_.size();
+
+ return s;
+}
+
+IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
+ SystemClock* clock,
+ Statistics* statistics,
+ bool use_fsync) {
+ IOStatus io_s;
+ if (input_status.ok()) {
+ StopWatch sw(clock, statistics, COMPACTION_OUTFILE_SYNC_MICROS);
+ io_s = file_writer_->Sync(use_fsync);
+ }
+ if (input_status.ok() && io_s.ok()) {
+ io_s = file_writer_->Close();
+ }
+
+ if (input_status.ok() && io_s.ok()) {
+ FileMetaData* meta = GetMetaData();
+ meta->file_checksum = file_writer_->GetFileChecksum();
+ meta->file_checksum_func_name = file_writer_->GetFileChecksumFuncName();
+ }
+
+ file_writer_.reset();
+
+ return io_s;
+}
+
+size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
+ const Slice& internal_key) {
+ size_t curr_key_boundary_switched_num = 0;
+ const std::vector<FileMetaData*>& grandparents = compaction_->grandparents();
+
+ if (grandparents.empty()) {
+ return curr_key_boundary_switched_num;
+ }
+ assert(!internal_key.empty());
+ InternalKey ikey;
+ ikey.DecodeFrom(internal_key);
+ assert(ikey.Valid());
+
+ const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
+
+ // Move the grandparent_index_ to the file containing the current user_key.
+ // If there are multiple files containing the same user_key, make sure the
+ // index points to the last file containing the key.
+ while (grandparent_index_ < grandparents.size()) {
+ if (being_grandparent_gap_) {
+ if (sstableKeyCompare(ucmp, ikey,
+ grandparents[grandparent_index_]->smallest) < 0) {
+ break;
+ }
+ if (seen_key_) {
+ curr_key_boundary_switched_num++;
+ grandparent_overlapped_bytes_ +=
+ grandparents[grandparent_index_]->fd.GetFileSize();
+ grandparent_boundary_switched_num_++;
+ }
+ being_grandparent_gap_ = false;
+ } else {
+ int cmp_result = sstableKeyCompare(
+ ucmp, ikey, grandparents[grandparent_index_]->largest);
+ // If it's the same key, make sure grandparent_index_ points to the last
+ // one.
+ if (cmp_result < 0 ||
+ (cmp_result == 0 &&
+ (grandparent_index_ == grandparents.size() - 1 ||
+ sstableKeyCompare(ucmp, ikey,
+ grandparents[grandparent_index_ + 1]->smallest) <
+ 0))) {
+ break;
+ }
+ if (seen_key_) {
+ curr_key_boundary_switched_num++;
+ grandparent_boundary_switched_num_++;
+ }
+ being_grandparent_gap_ = true;
+ grandparent_index_++;
+ }
+ }
+
+ // If the first key is in the middle of a grandparent file, add it to the
+ // overlapped bytes.
+ if (!seen_key_ && !being_grandparent_gap_) {
+ assert(grandparent_overlapped_bytes_ == 0);
+ grandparent_overlapped_bytes_ =
+ GetCurrentKeyGrandparentOverlappedBytes(internal_key);
+ }
+
+ seen_key_ = true;
+ return curr_key_boundary_switched_num;
+}
+
+uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
+ const Slice& internal_key) const {
+ // no overlap with any grandparent file
+ if (being_grandparent_gap_) {
+ return 0;
+ }
+ uint64_t overlapped_bytes = 0;
+
+ const std::vector<FileMetaData*>& grandparents = compaction_->grandparents();
+ const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
+ InternalKey ikey;
+ ikey.DecodeFrom(internal_key);
+#ifndef NDEBUG
+ // Make sure grandparent_index_ points to the last file containing the
+ // current key.
+ int cmp_result =
+ sstableKeyCompare(ucmp, ikey, grandparents[grandparent_index_]->largest);
+ assert(
+ cmp_result < 0 ||
+ (cmp_result == 0 &&
+ (grandparent_index_ == grandparents.size() - 1 ||
+ sstableKeyCompare(
+ ucmp, ikey, grandparents[grandparent_index_ + 1]->smallest) < 0)));
+ assert(sstableKeyCompare(ucmp, ikey,
+ grandparents[grandparent_index_]->smallest) >= 0);
+#endif
+ overlapped_bytes += grandparents[grandparent_index_]->fd.GetFileSize();
+
+ // Go backwards to find all overlapped files; one key can overlap multiple
+ // files. In the following example, if the current output key is `c` and one
+ // compaction file was cut before `c`, the current `c` can overlap with 3
+ // files:
+ // [a b] [c...
+ // [b, b] [c, c] [c, c] [c, d]
+ for (int64_t i = static_cast<int64_t>(grandparent_index_) - 1;
+ i >= 0 && sstableKeyCompare(ucmp, ikey, grandparents[i]->largest) == 0;
+ i--) {
+ overlapped_bytes += grandparents[i]->fd.GetFileSize();
+ }
+
+ return overlapped_bytes;
+}
+
+bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
+ assert(c_iter.Valid());
+
+ // always update grandparent information like overlapped file number, size
+ // etc.
+ const Slice& internal_key = c_iter.key();
+ const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
+ size_t num_grandparent_boundaries_crossed =
+ UpdateGrandparentBoundaryInfo(internal_key);
+
+ if (!HasBuilder()) {
+ return false;
+ }
+
+ // If there's user defined partitioner, check that first
+ if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest(
+ last_key_for_partitioner_, c_iter.user_key(),
+ current_output_file_size_)) == kRequired) {
+ return true;
+ }
+
+ // files output to Level 0 won't be split
+ if (compaction_->output_level() == 0) {
+ return false;
+ }
+
+ // reach the max file size
+ if (current_output_file_size_ >= compaction_->max_output_file_size()) {
+ return true;
+ }
+
+ const InternalKeyComparator* icmp =
+ &compaction_->column_family_data()->internal_comparator();
+
+ // Check if it needs to split for RoundRobin
+ // Invalid local_output_split_key indicates that we do not need to split
+ if (local_output_split_key_ != nullptr && !is_split_) {
+ // Split occurs when the next key is larger than/equal to the cursor
+ if (icmp->Compare(internal_key, local_output_split_key_->Encode()) >= 0) {
+ is_split_ = true;
+ return true;
+ }
+ }
+
+ // only check if the current key is going to cross the grandparents file
+ // boundary (either the file beginning or ending).
+ if (num_grandparent_boundaries_crossed > 0) {
+    // Cut the file before the current key if the size of the current output
+    // file plus its overlapped grandparent files is bigger than
+    // max_compaction_bytes. This prevents a future compaction from the
+    // current output level from exceeding max_compaction_bytes.
+ if (grandparent_overlapped_bytes_ + current_output_file_size_ >
+ compaction_->max_compaction_bytes()) {
+ return true;
+ }
+
+ // Cut the file if including the key is going to add a skippable file on
+ // the grandparent level AND its size is reasonably big (1/8 of target file
+ // size). For example, if it's compacting the files L0 + L1:
+ // L0: [1, 21]
+ // L1: [3, 23]
+ // L2: [2, 4] [11, 15] [22, 24]
+ // Without this break, it will output as:
+ // L1: [1,3, 21,23]
+ // With this break, it will output as (assuming [11, 15] at L2 is bigger
+ // than 1/8 of target size):
+ // L1: [1,3] [21,23]
+ // Then for the future compactions, [11,15] won't be included.
+    // For random datasets (either evenly distributed or skewed), this
+    // condition rarely triggers, but if the user is adding 2 different
+    // datasets without any overlap, it is likely to happen.
+    // For more details, see PR #1963.
+ const size_t num_skippable_boundaries_crossed =
+ being_grandparent_gap_ ? 2 : 3;
+ if (compaction_->immutable_options()->compaction_style ==
+ kCompactionStyleLevel &&
+ compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+ num_grandparent_boundaries_crossed >=
+ num_skippable_boundaries_crossed &&
+ grandparent_overlapped_bytes_ - previous_overlapped_bytes >
+ compaction_->target_output_file_size() / 8) {
+ return true;
+ }
+
+    // Pre-cut the output file if it's reaching a certain size AND it's at the
+    // boundary of a grandparent file. This can reduce the future compaction
+    // size; the cost is having smaller files.
+    // The pre-cut size threshold is based on how many grandparent boundaries
+    // it has seen before. Basically, if it has seen no boundary at all, it
+    // will pre-cut at 50% of the target file size. Every boundary it has seen
+    // increases the threshold by 5%, capped at 90%, at which point it will
+    // always cut.
+    // The idea is that if it has seen more boundaries before, it is more
+    // likely to see another boundary (a file-cutting opportunity) before
+    // reaching the target file size. Tests show this generates larger files
+    // than a static threshold like 75%, with a similar write amplification
+    // improvement.
+ if (compaction_->immutable_options()->compaction_style ==
+ kCompactionStyleLevel &&
+ compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+ current_output_file_size_ >=
+ ((compaction_->target_output_file_size() + 99) / 100) *
+ (50 + std::min(grandparent_boundary_switched_num_ * 5,
+ size_t{40}))) {
+ return true;
+ }
+ }
+
+ // check ttl file boundaries if there's any
+ if (!files_to_cut_for_ttl_.empty()) {
+ if (cur_files_to_cut_for_ttl_ != -1) {
+ // Previous key is inside the range of a file
+ if (icmp->Compare(internal_key,
+ files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
+ ->largest.Encode()) > 0) {
+ next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
+ cur_files_to_cut_for_ttl_ = -1;
+ return true;
+ }
+ } else {
+ // Look for the key position
+ while (next_files_to_cut_for_ttl_ <
+ static_cast<int>(files_to_cut_for_ttl_.size())) {
+ if (icmp->Compare(internal_key,
+ files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+ ->smallest.Encode()) >= 0) {
+ if (icmp->Compare(internal_key,
+ files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+ ->largest.Encode()) <= 0) {
+            // Within the current file
+ cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
+ return true;
+ }
+ // Beyond the current file
+ next_files_to_cut_for_ttl_++;
+ } else {
+          // Still falls in the gap
+ break;
+ }
+ }
+ }
+ }
+
+ return false;
+}
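+
+// Editorial note: a worked example of the dynamic pre-cut threshold used in
+// ShouldStopBefore() (a sketch under a hypothetical target_output_file_size()
+// of 64 MB, not part of this change):
+//
+//   boundaries seen    threshold pct    cut when output size >=
+//   0                  50               ~32 MB
+//   4                  70               ~44.8 MB
+//   8 or more          90 (capped)      ~57.6 MB
+//
+// i.e. threshold = ((target + 99) / 100) *
+//                  (50 + std::min(grandparent_boundary_switched_num_ * 5,
+//                                 size_t{40}))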
+
+Status CompactionOutputs::AddToOutput(
+ const CompactionIterator& c_iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ Status s;
+ const Slice& key = c_iter.key();
+
+ if (ShouldStopBefore(c_iter) && HasBuilder()) {
+ s = close_file_func(*this, c_iter.InputStatus(), key);
+ if (!s.ok()) {
+ return s;
+ }
+ // reset grandparent information
+ grandparent_boundary_switched_num_ = 0;
+ grandparent_overlapped_bytes_ =
+ GetCurrentKeyGrandparentOverlappedBytes(key);
+ }
+
+ // Open output file if necessary
+ if (!HasBuilder()) {
+ s = open_file_func(*this);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ assert(builder_ != nullptr);
+ const Slice& value = c_iter.value();
+ s = current_output().validator.Add(key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ builder_->Add(key, value);
+
+ stats_.num_output_records++;
+ current_output_file_size_ = builder_->EstimatedFileSize();
+
+ if (blob_garbage_meter_) {
+ s = blob_garbage_meter_->ProcessOutFlow(key, value);
+ }
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ const ParsedInternalKey& ikey = c_iter.ikey();
+ s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
+ ikey.type);
+
+ if (partitioner_) {
+ last_key_for_partitioner_.assign(c_iter.user_key().data_,
+ c_iter.user_key().size_);
+ }
+
+ return s;
+}
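+
+// Editorial note: a minimal usage sketch for AddToOutput() (hypothetical
+// caller code, not part of this change; the helper names and variables below
+// are placeholders, the real wiring lives in the compaction job):
+//
+//   CompactionFileOpenFunc open_fn = [&](CompactionOutputs& co) {
+//     return OpenOutputFile(co);                       // hypothetical helper
+//   };
+//   CompactionFileCloseFunc close_fn = [&](CompactionOutputs& co,
+//                                          const Status& s,
+//                                          const Slice& next_key) {
+//     return FinishOutputFile(co, s, next_key);        // hypothetical helper
+//   };
+//   Status status;
+//   while (c_iter->Valid() && status.ok()) {
+//     status = outputs.AddToOutput(*c_iter, open_fn, close_fn);
+//     c_iter->Next();
+//   }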
+
+Status CompactionOutputs::AddRangeDels(
+ const Slice* comp_start_user_key, const Slice* comp_end_user_key,
+ CompactionIterationStats& range_del_out_stats, bool bottommost_level,
+ const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
+ const Slice& next_table_min_key, const std::string& full_history_ts_low) {
+ assert(HasRangeDel());
+ FileMetaData& meta = current_output().meta;
+ const Comparator* ucmp = icmp.user_comparator();
+
+ Slice lower_bound_guard, upper_bound_guard;
+ std::string smallest_user_key;
+ const Slice *lower_bound, *upper_bound;
+ bool lower_bound_from_sub_compact = false;
+
+ size_t output_size = outputs_.size();
+ if (output_size == 1) {
+ // For the first output table, include range tombstones before the min
+ // key but after the subcompaction boundary.
+ lower_bound = comp_start_user_key;
+ lower_bound_from_sub_compact = true;
+ } else if (meta.smallest.size() > 0) {
+ // For subsequent output tables, only include range tombstones from min
+ // key onwards since the previous file was extended to contain range
+ // tombstones falling before min key.
+ smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/);
+ lower_bound_guard = Slice(smallest_user_key);
+ lower_bound = &lower_bound_guard;
+ } else {
+ lower_bound = nullptr;
+ }
+ if (!next_table_min_key.empty()) {
+    // This may be the last file in the subcompaction in some cases, so we
+    // need to compare the end key of the subcompaction with the next file's
+    // start key. When the end key is chosen by the subcompaction, we know
+    // that it must be the biggest key in the output file. Therefore, it is
+    // safe to use the smaller of the two keys as the upper bound of the
+    // output file, ensuring that there is no overlap between different
+    // output files.
+ upper_bound_guard = ExtractUserKey(next_table_min_key);
+ if (comp_end_user_key != nullptr &&
+ ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >=
+ 0) {
+ upper_bound = comp_end_user_key;
+ } else {
+ upper_bound = &upper_bound_guard;
+ }
+ } else {
+ // This is the last file in the subcompaction, so extend until the
+ // subcompaction ends.
+ upper_bound = comp_end_user_key;
+ }
+ bool has_overlapping_endpoints;
+ if (upper_bound != nullptr && meta.largest.size() > 0) {
+ has_overlapping_endpoints = ucmp->CompareWithoutTimestamp(
+ meta.largest.user_key(), *upper_bound) == 0;
+ } else {
+ has_overlapping_endpoints = false;
+ }
+
+  // The end key of the subcompaction must be bigger than or equal to the
+  // upper bound. If the end of the subcompaction is null or the upper bound
+  // is null, it means that this file is the last file in the compaction, so
+  // there will be no overlap between this file and others.
+ assert(comp_end_user_key == nullptr || upper_bound == nullptr ||
+ ucmp->CompareWithoutTimestamp(*upper_bound, *comp_end_user_key) <= 0);
+ auto it = range_del_agg_->NewIterator(lower_bound, upper_bound,
+ has_overlapping_endpoints);
+ // Position the range tombstone output iterator. There may be tombstone
+ // fragments that are entirely out of range, so make sure that we do not
+ // include those.
+ if (lower_bound != nullptr) {
+ it->Seek(*lower_bound);
+ } else {
+ it->SeekToFirst();
+ }
+ for (; it->Valid(); it->Next()) {
+ auto tombstone = it->Tombstone();
+ if (upper_bound != nullptr) {
+ int cmp =
+ ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_);
+ if ((has_overlapping_endpoints && cmp < 0) ||
+ (!has_overlapping_endpoints && cmp <= 0)) {
+ // Tombstones starting after upper_bound only need to be included in
+ // the next table. If the current SST ends before upper_bound, i.e.,
+ // `has_overlapping_endpoints == false`, we can also skip over range
+ // tombstones that start exactly at upper_bound. Such range
+ // tombstones will be included in the next file and are not relevant
+ // to the point keys or endpoints of the current file.
+ break;
+ }
+ }
+
+ const size_t ts_sz = ucmp->timestamp_size();
+ // Garbage collection for range tombstones.
+ // If user-defined timestamp is enabled, range tombstones are dropped if
+ // they are at bottommost_level, below full_history_ts_low and not visible
+ // in any snapshot. trim_ts_ is passed to the constructor for
+ // range_del_agg_, and range_del_agg_ internally drops tombstones above
+ // trim_ts_.
+ if (bottommost_level && tombstone.seq_ <= earliest_snapshot &&
+ (ts_sz == 0 ||
+ (!full_history_ts_low.empty() &&
+ ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) {
+ // TODO(andrewkr): tombstones that span multiple output files are
+ // counted for each compaction output file, so lots of double
+ // counting.
+ range_del_out_stats.num_range_del_drop_obsolete++;
+ range_del_out_stats.num_record_drop_obsolete++;
+ continue;
+ }
+
+ auto kv = tombstone.Serialize();
+ assert(lower_bound == nullptr ||
+ ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0);
+ // Range tombstone is not supported by output validator yet.
+ builder_->Add(kv.first.Encode(), kv.second);
+ InternalKey smallest_candidate = std::move(kv.first);
+ if (lower_bound != nullptr &&
+ ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(),
+ *lower_bound) <= 0) {
+ // Pretend the smallest key has the same user key as lower_bound
+ // (the max key in the previous table or subcompaction) in order for
+ // files to appear key-space partitioned.
+ //
+ // When lower_bound is chosen by a subcompaction, we know that
+ // subcompactions over smaller keys cannot contain any keys at
+ // lower_bound. We also know that smaller subcompactions exist,
+      // because otherwise the subcompaction would be unbounded on the left.
+ // As a result, we know that no other files on the output level will
+ // contain actual keys at lower_bound (an output file may have a
+ // largest key of lower_bound@kMaxSequenceNumber, but this only
+ // indicates a large range tombstone was truncated). Therefore, it is
+ // safe to use the tombstone's sequence number, to ensure that keys at
+ // lower_bound at lower levels are covered by truncated tombstones.
+ //
+ // If lower_bound was chosen by the smallest data key in the file,
+ // choose lowest seqnum so this file's smallest internal key comes
+ // after the previous file's largest. The fake seqnum is OK because
+ // the read path's file-picking code only considers user key.
+ if (lower_bound_from_sub_compact) {
+ if (ts_sz) {
+ assert(tombstone.ts_.size() == ts_sz);
+ smallest_candidate = InternalKey(*lower_bound, tombstone.seq_,
+ kTypeRangeDeletion, tombstone.ts_);
+ } else {
+ smallest_candidate =
+ InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion);
+ }
+ } else {
+ smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
+ }
+ }
+ InternalKey largest_candidate = tombstone.SerializeEndKey();
+ if (upper_bound != nullptr &&
+ ucmp->CompareWithoutTimestamp(*upper_bound,
+ largest_candidate.user_key()) <= 0) {
+ // Pretend the largest key has the same user key as upper_bound (the
+ // min key in the following table or subcompaction) in order for files
+ // to appear key-space partitioned.
+ //
+ // Choose highest seqnum so this file's largest internal key comes
+ // before the next file's/subcompaction's smallest. The fake seqnum is
+ // OK because the read path's file-picking code only considers the
+ // user key portion.
+ //
+ // Note Seek() also creates InternalKey with (user_key,
+ // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
+ // kTypeRangeDeletion (0xF), so the range tombstone comes before the
+ // Seek() key in InternalKey's ordering. So Seek() will look in the
+ // next file for the user key
+ if (ts_sz) {
+ static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ if (ts_sz <= strlen(kTsMax)) {
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
+ Slice(kTsMax, ts_sz));
+ } else {
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
+ std::string(ts_sz, '\xff'));
+ }
+ } else {
+ largest_candidate =
+ InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
+ }
+ }
+#ifndef NDEBUG
+ SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
+ if (meta.smallest.size() > 0) {
+ smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode());
+ }
+#endif
+ meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
+ tombstone.seq_, icmp);
+ // The smallest key in a file is used for range tombstone truncation, so
+ // it cannot have a seqnum of 0 (unless the smallest data key in a file
+ // has a seqnum of 0). Otherwise, the truncated tombstone may expose
+ // deleted keys at lower levels.
+ assert(smallest_ikey_seqnum == 0 ||
+ ExtractInternalKeyFooter(meta.smallest.Encode()) !=
+ PackSequenceAndType(0, kTypeRangeDeletion));
+ }
+ return Status::OK();
+}
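+
+// Editorial note: an illustrative sketch of how AddRangeDels() truncates a
+// tombstone that spans two output files (hypothetical user keys, not part of
+// this change). Suppose tombstone [c, m)@seq spans file1 (point keys up to f)
+// and file2 (point keys from g), with upper_bound = g for file1 and
+// lower_bound = g for file2:
+//
+//   file1: largest extended to (g, kMaxSequenceNumber, kTypeRangeDeletion)
+//   file2: smallest clamped to (g, seq, kTypeRangeDeletion) when g came from
+//          the subcompaction boundary, or (g, 0, kTypeRangeDeletion) when it
+//          came from the previous file's data
+//
+// so the two files still appear key-space partitioned to the read path.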
+
+void CompactionOutputs::FillFilesToCutForTtl() {
+ if (compaction_->immutable_options()->compaction_style !=
+ kCompactionStyleLevel ||
+ compaction_->immutable_options()->compaction_pri !=
+ kMinOverlappingRatio ||
+ compaction_->mutable_cf_options()->ttl == 0 ||
+ compaction_->num_input_levels() < 2 || compaction_->bottommost_level()) {
+ return;
+ }
+
+  // We define a new file as one whose oldest ancestor time is younger than
+  // 1/4 of the TTL, and an old one as older than 1/2 of the TTL.
+ int64_t temp_current_time;
+ auto get_time_status =
+ compaction_->immutable_options()->clock->GetCurrentTime(
+ &temp_current_time);
+ if (!get_time_status.ok()) {
+ return;
+ }
+
+ auto current_time = static_cast<uint64_t>(temp_current_time);
+ if (current_time < compaction_->mutable_cf_options()->ttl) {
+ return;
+ }
+
+ uint64_t old_age_thres =
+ current_time - compaction_->mutable_cf_options()->ttl / 2;
+ const std::vector<FileMetaData*>& olevel =
+ *(compaction_->inputs(compaction_->num_input_levels() - 1));
+ for (FileMetaData* file : olevel) {
+ // Worth filtering out by start and end?
+ uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
+    // We only pick old files that are not too small, to prevent a flood
+    // of small files.
+ if (oldest_ancester_time < old_age_thres &&
+ file->fd.GetFileSize() >
+ compaction_->mutable_cf_options()->target_file_size_base / 2) {
+ files_to_cut_for_ttl_.push_back(file);
+ }
+ }
+}
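+
+// Editorial note: a small sketch of the thresholds applied in
+// FillFilesToCutForTtl() (hypothetical option values, not part of this
+// change). With ttl = 10 days and target_file_size_base = 64 MB, an output
+// level file is added to files_to_cut_for_ttl_ when
+//
+//   oldest_ancestor_time < now - 5 days   // older than ttl / 2
+//   file size            > 32 MB          // larger than target / 2
+//
+// so only reasonably large, genuinely old files introduce extra cut points.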
+
+CompactionOutputs::CompactionOutputs(const Compaction* compaction,
+ const bool is_penultimate_level)
+ : compaction_(compaction), is_penultimate_level_(is_penultimate_level) {
+ partitioner_ = compaction->output_level() == 0
+ ? nullptr
+ : compaction->CreateSstPartitioner();
+
+ if (compaction->output_level() != 0) {
+ FillFilesToCutForTtl();
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_outputs.h b/src/rocksdb/db/compaction/compaction_outputs.h
new file mode 100644
index 000000000..f40aa8215
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_outputs.h
@@ -0,0 +1,385 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CompactionOutputs;
+using CompactionFileOpenFunc = std::function<Status(CompactionOutputs&)>;
+using CompactionFileCloseFunc =
+ std::function<Status(CompactionOutputs&, const Status&, const Slice&)>;
+
+// Files produced by a subcompaction. Most of the functions are used by
+// compaction_job's Open/Close compaction file functions.
+class CompactionOutputs {
+ public:
+ // compaction output file
+ struct Output {
+ Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp,
+ bool _enable_order_check, bool _enable_hash, bool _finished,
+ uint64_t precalculated_hash)
+ : meta(std::move(_meta)),
+ validator(_icmp, _enable_order_check, _enable_hash,
+ precalculated_hash),
+ finished(_finished) {}
+ FileMetaData meta;
+ OutputValidator validator;
+ bool finished;
+ std::shared_ptr<const TableProperties> table_properties;
+ };
+
+ CompactionOutputs() = delete;
+
+ explicit CompactionOutputs(const Compaction* compaction,
+ const bool is_penultimate_level);
+
+ // Add generated output to the list
+ void AddOutput(FileMetaData&& meta, const InternalKeyComparator& icmp,
+ bool enable_order_check, bool enable_hash,
+ bool finished = false, uint64_t precalculated_hash = 0) {
+ outputs_.emplace_back(std::move(meta), icmp, enable_order_check,
+ enable_hash, finished, precalculated_hash);
+ }
+
+ // Set new table builder for the current output
+ void NewBuilder(const TableBuilderOptions& tboptions);
+
+ // Assign a new WritableFileWriter to the current output
+ void AssignFileWriter(WritableFileWriter* writer) {
+ file_writer_.reset(writer);
+ }
+
+  // TODO: Remove it when remote compaction supports tiered compaction
+ void SetTotalBytes(uint64_t bytes) { stats_.bytes_written += bytes; }
+ void SetNumOutputRecords(uint64_t num) { stats_.num_output_records = num; }
+
+ // TODO: Move the BlobDB builder into CompactionOutputs
+ const std::vector<BlobFileAddition>& GetBlobFileAdditions() const {
+ if (is_penultimate_level_) {
+ assert(blob_file_additions_.empty());
+ }
+ return blob_file_additions_;
+ }
+
+ std::vector<BlobFileAddition>* GetBlobFileAdditionsPtr() {
+ assert(!is_penultimate_level_);
+ return &blob_file_additions_;
+ }
+
+ bool HasBlobFileAdditions() const { return !blob_file_additions_.empty(); }
+
+ BlobGarbageMeter* CreateBlobGarbageMeter() {
+ assert(!is_penultimate_level_);
+ blob_garbage_meter_ = std::make_unique<BlobGarbageMeter>();
+ return blob_garbage_meter_.get();
+ }
+
+ BlobGarbageMeter* GetBlobGarbageMeter() const {
+ if (is_penultimate_level_) {
+ // blobdb doesn't support per_key_placement yet
+ assert(blob_garbage_meter_ == nullptr);
+ return nullptr;
+ }
+ return blob_garbage_meter_.get();
+ }
+
+ void UpdateBlobStats() {
+ assert(!is_penultimate_level_);
+ stats_.num_output_files_blob = blob_file_additions_.size();
+ for (const auto& blob : blob_file_additions_) {
+ stats_.bytes_written_blob += blob.GetTotalBlobBytes();
+ }
+ }
+
+ // Finish the current output file
+  Status Finish(const Status& input_status,
+ const SeqnoToTimeMapping& seqno_time_mapping);
+
+ // Update output table properties from table builder
+ void UpdateTableProperties() {
+ current_output().table_properties =
+ std::make_shared<TableProperties>(GetTableProperties());
+ }
+
+  IOStatus WriterSyncClose(const Status& input_status, SystemClock* clock,
+ Statistics* statistics, bool use_fsync);
+
+ TableProperties GetTableProperties() {
+ return builder_->GetTableProperties();
+ }
+
+ Slice SmallestUserKey() const {
+ if (!outputs_.empty() && outputs_[0].finished) {
+ return outputs_[0].meta.smallest.user_key();
+ } else {
+ return Slice{nullptr, 0};
+ }
+ }
+
+ Slice LargestUserKey() const {
+ if (!outputs_.empty() && outputs_.back().finished) {
+ return outputs_.back().meta.largest.user_key();
+ } else {
+ return Slice{nullptr, 0};
+ }
+ }
+
+  // In case the last output file is empty; an empty output file doesn't need
+  // to be kept.
+ void RemoveLastEmptyOutput() {
+ if (!outputs_.empty() && !outputs_.back().meta.fd.file_size) {
+ // An error occurred, so ignore the last output.
+ outputs_.pop_back();
+ }
+ }
+
+  // Remove the last output, e.g. when the last output has no data (no
+  // entries and no range-dels) but its file_size might not be 0, as it still
+  // has SST metadata.
+ void RemoveLastOutput() {
+ assert(!outputs_.empty());
+ outputs_.pop_back();
+ }
+
+ bool HasBuilder() const { return builder_ != nullptr; }
+
+ FileMetaData* GetMetaData() { return &current_output().meta; }
+
+ bool HasOutput() const { return !outputs_.empty(); }
+
+ uint64_t NumEntries() const { return builder_->NumEntries(); }
+
+ void ResetBuilder() {
+ builder_.reset();
+ current_output_file_size_ = 0;
+ }
+
+ // Add range-dels from the aggregator to the current output file
+ // @param comp_start_user_key and comp_end_user_key include timestamp if
+ // user-defined timestamp is enabled.
+ // @param full_history_ts_low used for range tombstone garbage collection.
+ Status AddRangeDels(const Slice* comp_start_user_key,
+ const Slice* comp_end_user_key,
+ CompactionIterationStats& range_del_out_stats,
+ bool bottommost_level, const InternalKeyComparator& icmp,
+ SequenceNumber earliest_snapshot,
+ const Slice& next_table_min_key,
+ const std::string& full_history_ts_low);
+
+  // Whether the outputs have range deletions; range deletions are also data.
+ bool HasRangeDel() const {
+ return range_del_agg_ && !range_del_agg_->IsEmpty();
+ }
+
+ private:
+ friend class SubcompactionState;
+
+ void FillFilesToCutForTtl();
+
+ void SetOutputSlitKey(const std::optional<Slice> start,
+ const std::optional<Slice> end) {
+ const InternalKeyComparator* icmp =
+ &compaction_->column_family_data()->internal_comparator();
+
+ const InternalKey* output_split_key = compaction_->GetOutputSplitKey();
+ // Invalid output_split_key indicates that we do not need to split
+ if (output_split_key != nullptr) {
+    // We may only split the output when the cursor is in the range.
+ if ((!end.has_value() ||
+ icmp->user_comparator()->Compare(
+ ExtractUserKey(output_split_key->Encode()), end.value()) < 0) &&
+ (!start.has_value() || icmp->user_comparator()->Compare(
+ ExtractUserKey(output_split_key->Encode()),
+ start.value()) > 0)) {
+ local_output_split_key_ = output_split_key;
+ }
+ }
+ }
+
+ // Returns true iff we should stop building the current output
+ // before processing the current key in compaction iterator.
+ bool ShouldStopBefore(const CompactionIterator& c_iter);
+
+ void Cleanup() {
+ if (builder_ != nullptr) {
+ // May happen if we get a shutdown call in the middle of compaction
+ builder_->Abandon();
+ builder_.reset();
+ }
+ }
+
+  // Update tracked grandparent information, such as the grandparent index,
+  // whether the key is in the gap between 2 grandparent files, the
+  // accumulated grandparent file size, etc.
+  // Returns how many boundaries are crossed by including the current key.
+ size_t UpdateGrandparentBoundaryInfo(const Slice& internal_key);
+
+  // Helper function to get the overlapped grandparent file size; it's only
+  // used for calculating the first key's overlap.
+ uint64_t GetCurrentKeyGrandparentOverlappedBytes(
+ const Slice& internal_key) const;
+
+  // Add the current key from the compaction iterator to the output file. If
+  // needed, close and open a new compaction output with the functions
+  // provided.
+ Status AddToOutput(const CompactionIterator& c_iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func);
+
+  // Close the current output. `open_file_func` is needed to create a new
+  // file for a range-dels-only output file.
+ Status CloseOutput(const Status& curr_status,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ Status status = curr_status;
+ // handle subcompaction containing only range deletions
+ if (status.ok() && !HasBuilder() && !HasOutput() && HasRangeDel()) {
+ status = open_file_func(*this);
+ }
+ if (HasBuilder()) {
+ const Slice empty_key{};
+ Status s = close_file_func(*this, status, empty_key);
+ if (!s.ok() && status.ok()) {
+ status = s;
+ }
+ }
+
+ return status;
+ }
+
+  // This subcompaction's output could be empty if compaction was aborted
+  // before this subcompaction had a chance to generate any output files.
+  // When subcompactions are executed sequentially this is more likely,
+  // particularly for the later subcompactions. Once they are run in
+  // parallel, however, it should be much rarer.
+  // It's the caller's responsibility to make sure it's not empty.
+ Output& current_output() {
+ assert(!outputs_.empty());
+ return outputs_.back();
+ }
+
+  // Assign the range_del_agg to the target output level. There's only one
+  // range-del-aggregator per compaction's outputs; for
+  // output_to_penultimate_level compaction it is only assigned to the
+  // penultimate level.
+ void AssignRangeDelAggregator(
+ std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+ assert(range_del_agg_ == nullptr);
+ range_del_agg_ = std::move(range_del_agg);
+ }
+
+ const Compaction* compaction_;
+
+ // current output builder and writer
+ std::unique_ptr<TableBuilder> builder_;
+ std::unique_ptr<WritableFileWriter> file_writer_;
+ uint64_t current_output_file_size_ = 0;
+
+ // all the compaction outputs so far
+ std::vector<Output> outputs_;
+
+ // BlobDB info
+ std::vector<BlobFileAddition> blob_file_additions_;
+ std::unique_ptr<BlobGarbageMeter> blob_garbage_meter_;
+
+ // Basic compaction output stats for this level's outputs
+ InternalStats::CompactionOutputsStats stats_;
+
+  // Indicates whether this CompactionOutputs object is for the penultimate
+  // level; should always be false if the per_key_placement feature is not
+  // enabled.
+ const bool is_penultimate_level_;
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg_ = nullptr;
+
+ // partitioner information
+ std::string last_key_for_partitioner_;
+ std::unique_ptr<SstPartitioner> partitioner_;
+
+  // A flag that indicates whether this subcompaction has been split by the
+  // cursor
+ bool is_split_ = false;
+
+ // We also maintain the output split key for each subcompaction to avoid
+ // repetitive comparison in ShouldStopBefore()
+ const InternalKey* local_output_split_key_ = nullptr;
+
+  // Some identified files with an old oldest-ancestor time; the range should
+  // be isolated out so that the output file(s) in that range can be merged
+  // down for TTL, clearing the timestamps for the range.
+ std::vector<FileMetaData*> files_to_cut_for_ttl_;
+ int cur_files_to_cut_for_ttl_ = -1;
+ int next_files_to_cut_for_ttl_ = 0;
+
+  // An index used to speed up ShouldStopBefore().
+ size_t grandparent_index_ = 0;
+
+  // True if the current output key is in the gap between grandparent files,
+  // i.e.:
+  // key > grandparents[grandparent_index_ - 1].largest &&
+  // key < grandparents[grandparent_index_].smallest
+ bool being_grandparent_gap_ = true;
+
+ // The number of bytes overlapping between the current output and
+ // grandparent files used in ShouldStopBefore().
+ uint64_t grandparent_overlapped_bytes_ = 0;
+
+  // A flag that indicates whether a key has been seen in ShouldStopBefore()
+ bool seen_key_ = false;
+
+  // For the current output file, how many grandparent file boundaries it has
+  // crossed; basically the number of overlapped files * 2
+ size_t grandparent_boundary_switched_num_ = 0;
+};
+
+// Helper struct to concatenate the last level and penultimate level outputs,
+// which could be replaced by std::ranges::join_view() in C++20.
+struct OutputIterator {
+ public:
+ explicit OutputIterator(const std::vector<CompactionOutputs::Output>& a,
+ const std::vector<CompactionOutputs::Output>& b)
+ : a_(a), b_(b) {
+ within_a = !a_.empty();
+ idx_ = 0;
+ }
+
+ OutputIterator begin() { return *this; }
+
+ OutputIterator end() { return *this; }
+
+ size_t size() { return a_.size() + b_.size(); }
+
+ const CompactionOutputs::Output& operator*() const {
+ return within_a ? a_[idx_] : b_[idx_];
+ }
+
+ OutputIterator& operator++() {
+ idx_++;
+ if (within_a && idx_ >= a_.size()) {
+ within_a = false;
+ idx_ = 0;
+ }
+ assert(within_a || idx_ <= b_.size());
+ return *this;
+ }
+
+ bool operator!=(const OutputIterator& /*rhs*/) const {
+ return within_a || idx_ < b_.size();
+ }
+
+ private:
+ const std::vector<CompactionOutputs::Output>& a_;
+ const std::vector<CompactionOutputs::Output>& b_;
+ bool within_a;
+ size_t idx_;
+};
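+
+// Editorial note: a minimal usage sketch for OutputIterator (hypothetical
+// variable names, not part of this change):
+//
+//   uint64_t total_bytes = 0;
+//   for (const auto& output :
+//        OutputIterator(last_level_outputs, penultimate_level_outputs)) {
+//     total_bytes += output.meta.fd.GetFileSize();
+//   }
+//
+// Iteration walks all of `a` first, then all of `b`; operator!= acts as an
+// end-of-sequence test rather than a true pairwise comparison.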
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.cc b/src/rocksdb/db/compaction/compaction_picker.cc
new file mode 100644
index 000000000..abdecca9f
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.cc
@@ -0,0 +1,1234 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker.h"
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files,
+ size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file,
+ uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno) {
+  // Do not pick an ingested file when there is at least one unflushed
+  // memtable whose seqno range overlaps with the sst.
+ TEST_SYNC_POINT("FindIntraL0Compaction");
+ size_t start = 0;
+ for (; start < level_files.size(); start++) {
+ if (level_files[start]->being_compacted) {
+ return false;
+ }
+    // If there is no data in the memtable, the earliest sequence number would
+    // be the largest sequence number in the last memtable.
+    // Because all files are sorted in descending order by largest_seqno, we
+    // only need to check the first one.
+ if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) {
+ break;
+ }
+ }
+ if (start >= level_files.size()) {
+ return false;
+ }
+ size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size);
+ size_t compact_bytes_per_del_file = std::numeric_limits<size_t>::max();
+ // Compaction range will be [start, limit).
+ size_t limit;
+ // Pull in files until the amount of compaction work per deleted file begins
+ // increasing or maximum total compaction size is reached.
+ size_t new_compact_bytes_per_del_file = 0;
+ for (limit = start + 1; limit < level_files.size(); ++limit) {
+ compact_bytes += static_cast<size_t>(level_files[limit]->fd.file_size);
+ new_compact_bytes_per_del_file = compact_bytes / (limit - start);
+ if (level_files[limit]->being_compacted ||
+ new_compact_bytes_per_del_file > compact_bytes_per_del_file ||
+ compact_bytes > max_compaction_bytes) {
+ break;
+ }
+ compact_bytes_per_del_file = new_compact_bytes_per_del_file;
+ }
+
+ if ((limit - start) >= min_files_to_compact &&
+ compact_bytes_per_del_file < max_compact_bytes_per_del_file) {
+ assert(comp_inputs != nullptr);
+ comp_inputs->level = 0;
+ for (size_t i = start; i < limit; ++i) {
+ comp_inputs->files.push_back(level_files[i]);
+ }
+ return true;
+ }
+ return false;
+}
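+
+// Editorial note: a worked example of the greedy criterion above (hypothetical
+// L0 file sizes, not part of this change). With sizes {10, 10, 10, 100} MB and
+// start = 0:
+//
+//   limit = 1: compact_bytes = 20 MB,  20 / 1 = 20 MB per deleted file
+//   limit = 2: compact_bytes = 30 MB,  30 / 2 = 15 MB per deleted file
+//   limit = 3: compact_bytes = 130 MB, 130 / 3 > 15 MB -> cost rose, stop
+//
+// Files [0, 3) are proposed, provided at least min_files_to_compact files
+// were gathered and 15 MB stays below max_compact_bytes_per_del_file.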
+
+// Determine compression type, based on user options, level of the output
+// file and whether compression is disabled.
+// If enable_compression is false, then compression is always disabled no
+// matter what the values of the other two parameters are.
+// Otherwise, the compression type is determined based on options and level.
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ // disable compression
+ return kNoCompression;
+ }
+
+ // If bottommost_compression is set and we are compacting to the
+ // bottommost level then we should use it.
+ if (mutable_cf_options.bottommost_compression != kDisableCompressionOption &&
+ level >= (vstorage->num_non_empty_levels() - 1)) {
+ return mutable_cf_options.bottommost_compression;
+ }
+ // If the user has specified a different compression level for each level,
+ // then pick the compression for that level.
+ if (!mutable_cf_options.compression_per_level.empty()) {
+ assert(level == 0 || level >= base_level);
+ int idx = (level == 0) ? 0 : level - base_level + 1;
+
+ const int n =
+ static_cast<int>(mutable_cf_options.compression_per_level.size()) - 1;
+    // It is possible for level to be -1; in that case, we use level
+    // 0's compression. This occurs mostly in backwards compatibility
+    // situations when the builder doesn't know what level the file
+    // belongs to. Likewise, if level is beyond the end of the
+    // specified compression levels, use the last value.
+ return mutable_cf_options
+ .compression_per_level[std::max(0, std::min(idx, n))];
+ } else {
+ return mutable_cf_options.compression;
+ }
+}
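+
+// Editorial note: a worked example of the per-level compression index above
+// (hypothetical options, not part of this change). With compression_per_level
+// = {kNoCompression, kLZ4Compression, kZSTD} and base_level = 4:
+//
+//   level 0            -> idx 0 -> kNoCompression
+//   level 4 (base)     -> idx 1 -> kLZ4Compression
+//   level 5            -> idx 2 -> kZSTD
+//   level 6 and deeper -> idx clamped to 2 -> kZSTD
+//
+// bottommost_compression, if set, still overrides the choice for the last
+// non-empty level.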
+
+CompressionOptions GetCompressionOptions(const MutableCFOptions& cf_options,
+ const VersionStorageInfo* vstorage,
+ int level,
+ const bool enable_compression) {
+ if (!enable_compression) {
+ return cf_options.compression_opts;
+ }
+ // If bottommost_compression_opts is enabled and we are compacting to the
+ // bottommost level then we should use the specified compression options.
+ if (level >= (vstorage->num_non_empty_levels() - 1) &&
+ cf_options.bottommost_compression_opts.enabled) {
+ return cf_options.bottommost_compression_opts;
+ }
+ return cf_options.compression_opts;
+}
+
+CompactionPicker::CompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : ioptions_(ioptions), icmp_(icmp) {}
+
+CompactionPicker::~CompactionPicker() {}
+
+// Delete this compaction from the list of running compactions.
+void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
+ UnregisterCompaction(c);
+ if (!status.ok()) {
+ c->ResetNextCompactionIndex();
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ const int level = inputs.level;
+ assert(!inputs.empty());
+ smallest->Clear();
+ largest->Clear();
+
+ if (level == 0) {
+ for (size_t i = 0; i < inputs.size(); i++) {
+ FileMetaData* f = inputs[i];
+ if (i == 0) {
+ *smallest = f->smallest;
+ *largest = f->largest;
+ } else {
+ if (icmp_->Compare(f->smallest, *smallest) < 0) {
+ *smallest = f->smallest;
+ }
+ if (icmp_->Compare(f->largest, *largest) > 0) {
+ *largest = f->largest;
+ }
+ }
+ }
+ } else {
+ *smallest = inputs[0]->smallest;
+ *largest = inputs[inputs.size() - 1]->largest;
+ }
+}
+
+void CompactionPicker::GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2,
+ InternalKey* smallest,
+ InternalKey* largest) const {
+ assert(!inputs1.empty() || !inputs2.empty());
+ if (inputs1.empty()) {
+ GetRange(inputs2, smallest, largest);
+ } else if (inputs2.empty()) {
+ GetRange(inputs1, smallest, largest);
+ } else {
+ InternalKey smallest1, smallest2, largest1, largest2;
+ GetRange(inputs1, &smallest1, &largest1);
+ GetRange(inputs2, &smallest2, &largest2);
+ *smallest =
+ icmp_->Compare(smallest1, smallest2) < 0 ? smallest1 : smallest2;
+ *largest = icmp_->Compare(largest1, largest2) < 0 ? largest2 : largest1;
+ }
+}
+
+void CompactionPicker::GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest, InternalKey* largest,
+ int exclude_level) const {
+ InternalKey current_smallest;
+ InternalKey current_largest;
+ bool initialized = false;
+ for (const auto& in : inputs) {
+ if (in.empty() || in.level == exclude_level) {
+ continue;
+ }
+ GetRange(in, &current_smallest, &current_largest);
+ if (!initialized) {
+ *smallest = current_smallest;
+ *largest = current_largest;
+ initialized = true;
+ } else {
+ if (icmp_->Compare(current_smallest, *smallest) < 0) {
+ *smallest = current_smallest;
+ }
+ if (icmp_->Compare(current_largest, *largest) > 0) {
+ *largest = current_largest;
+ }
+ }
+ }
+ assert(initialized);
+}
+
+bool CompactionPicker::ExpandInputsToCleanCut(const std::string& /*cf_name*/,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest) {
+ // This isn't good compaction
+ assert(!inputs->empty());
+
+ const int level = inputs->level;
+ // GetOverlappingInputs will always do the right thing for level-0.
+ // So we don't need to do any expansion if level == 0.
+ if (level == 0) {
+ return true;
+ }
+
+ InternalKey smallest, largest;
+
+ // Keep expanding inputs until we are sure that there is a "clean cut"
+ // boundary between the files in input and the surrounding files.
+ // This will ensure that no parts of a key are lost during compaction.
+ int hint_index = -1;
+ size_t old_size;
+ do {
+ old_size = inputs->size();
+ GetRange(*inputs, &smallest, &largest);
+ inputs->clear();
+ vstorage->GetOverlappingInputs(level, &smallest, &largest, &inputs->files,
+ hint_index, &hint_index, true,
+ next_smallest);
+ } while (inputs->size() > old_size);
+
+ // we started off with inputs non-empty and the previous loop only grew
+ // inputs. thus, inputs should be non-empty here
+ assert(!inputs->empty());
+
+ // If, after the expansion, there are files that are already under
+ // compaction, then we must drop/cancel this compaction.
+ if (AreFilesInCompaction(inputs->files)) {
+ return false;
+ }
+ return true;
+}
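+
+// Editorial note: an illustrative sketch of the clean-cut expansion above
+// (hypothetical user keys, not part of this change). If the level contains
+// files [a, e] [e, g] [h, k] and the initial pick is [a, e], the loop grows
+// the input set to also include [e, g], because both files contain entries
+// for user key e; [h, k] stays out since the boundary between g and h is
+// already a clean cut.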
+
+bool CompactionPicker::RangeOverlapWithCompaction(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int level) const {
+ const Comparator* ucmp = icmp_->user_comparator();
+ for (Compaction* c : compactions_in_progress_) {
+ if (c->output_level() == level &&
+ ucmp->CompareWithoutTimestamp(smallest_user_key,
+ c->GetLargestUserKey()) <= 0 &&
+ ucmp->CompareWithoutTimestamp(largest_user_key,
+ c->GetSmallestUserKey()) >= 0) {
+ // Overlap
+ return true;
+ }
+ if (c->SupportsPerKeyPlacement()) {
+ if (c->OverlapPenultimateLevelOutputRange(smallest_user_key,
+ largest_user_key)) {
+ return true;
+ }
+ }
+ }
+ // Did not overlap with any running compaction in level `level`
+ return false;
+}
+
+bool CompactionPicker::FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level,
+ int penultimate_level) const {
+ bool is_empty = true;
+ for (auto& in : inputs) {
+ if (!in.empty()) {
+ is_empty = false;
+ break;
+ }
+ }
+ if (is_empty) {
+ // No files in inputs
+ return false;
+ }
+
+ // TODO: Intra L0 compactions can have the ranges overlapped, but the input
+ // files cannot be overlapped in the order of L0 files.
+ InternalKey smallest, largest;
+ GetRange(inputs, &smallest, &largest, Compaction::kInvalidLevel);
+ if (penultimate_level != Compaction::kInvalidLevel) {
+ if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ if (RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+ penultimate_level)) {
+ return true;
+ }
+ } else {
+ InternalKey penultimate_smallest, penultimate_largest;
+ GetRange(inputs, &penultimate_smallest, &penultimate_largest, level);
+ if (RangeOverlapWithCompaction(penultimate_smallest.user_key(),
+ penultimate_largest.user_key(),
+ penultimate_level)) {
+ return true;
+ }
+ }
+ }
+
+ return RangeOverlapWithCompaction(smallest.user_key(), largest.user_key(),
+ level);
+}
+
+// Returns true if any one of specified files are being compacted
+bool CompactionPicker::AreFilesInCompaction(
+ const std::vector<FileMetaData*>& files) {
+ for (size_t i = 0; i < files.size(); i++) {
+ if (files[i]->being_compacted) {
+ return true;
+ }
+ }
+ return false;
+}
+
+Compaction* CompactionPicker::CompactFiles(
+ const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files, int output_level,
+ VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, uint32_t output_path_id) {
+#ifndef NDEBUG
+ assert(input_files.size());
+ // This compaction output should not overlap with a running compaction as
+ // `SanitizeCompactionInputFiles` should've checked earlier and db mutex
+ // shouldn't have been released since.
+ int start_level = Compaction::kInvalidLevel;
+ for (const auto& in : input_files) {
+ // input_files should already be sorted by level
+ if (!in.empty()) {
+ start_level = in.level;
+ break;
+ }
+ }
+ assert(output_level == 0 ||
+ !FilesRangeOverlapWithCompaction(
+ input_files, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
+ start_level, output_level)));
+#endif /* !NDEBUG */
+
+ CompressionType compression_type;
+ if (compact_options.compression == kDisableCompressionOption) {
+ int base_level;
+ if (ioptions_.compaction_style == kCompactionStyleLevel) {
+ base_level = vstorage->base_level();
+ } else {
+ base_level = 1;
+ }
+ compression_type = GetCompressionType(vstorage, mutable_cf_options,
+ output_level, base_level);
+ } else {
+ // TODO(ajkr): `CompactionOptions` offers configurable `CompressionType`
+ // without configurable `CompressionOptions`, which is inconsistent.
+ compression_type = compact_options.compression;
+ }
+ auto c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options, input_files,
+ output_level, compact_options.output_file_size_limit,
+ mutable_cf_options.max_compaction_bytes, output_path_id, compression_type,
+ GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+ Temperature::kUnknown, compact_options.max_subcompactions,
+ /* grandparents */ {}, true);
+ RegisterCompaction(c);
+ return c;
+}
+
+Status CompactionPicker::GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set, const VersionStorageInfo* vstorage,
+ const CompactionOptions& /*compact_options*/) const {
+ if (input_set->size() == 0U) {
+ return Status::InvalidArgument(
+ "Compaction must include at least one file.");
+ }
+ assert(input_files);
+
+ std::vector<CompactionInputFiles> matched_input_files;
+ matched_input_files.resize(vstorage->num_levels());
+ int first_non_empty_level = -1;
+ int last_non_empty_level = -1;
+ // TODO(yhchiang): use a lazy-initialized mapping from
+ // file_number to FileMetaData in Version.
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ for (auto file : vstorage->LevelFiles(level)) {
+ auto iter = input_set->find(file->fd.GetNumber());
+ if (iter != input_set->end()) {
+ matched_input_files[level].files.push_back(file);
+ input_set->erase(iter);
+ last_non_empty_level = level;
+ if (first_non_empty_level == -1) {
+ first_non_empty_level = level;
+ }
+ }
+ }
+ }
+
+ if (!input_set->empty()) {
+ std::string message(
+ "Cannot find matched SST files for the following file numbers:");
+ for (auto fn : *input_set) {
+ message += " ";
+ message += std::to_string(fn);
+ }
+ return Status::InvalidArgument(message);
+ }
+
+ for (int level = first_non_empty_level; level <= last_non_empty_level;
+ ++level) {
+ matched_input_files[level].level = level;
+ input_files->emplace_back(std::move(matched_input_files[level]));
+ }
+
+ return Status::OK();
+}
+
+// Returns true if any one of the parent files are being compacted
+bool CompactionPicker::IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest,
+ int level, int* level_index) {
+ std::vector<FileMetaData*> inputs;
+ assert(level < NumberLevels());
+
+ vstorage->GetOverlappingInputs(level, smallest, largest, &inputs,
+ level_index ? *level_index : 0, level_index);
+ return AreFilesInCompaction(inputs);
+}
+
+// Populates the set of inputs of all other levels that overlap with the
+// start level.
+// Now we assume all levels except start level and output level are empty.
+// Will also attempt to expand "start level" if that doesn't expand
+// "output level" or cause "level" to include a file for compaction that has an
+// overlapping user-key with another file.
+// REQUIRES: input_level and output_level are different
+// REQUIRES: inputs->empty() == false
+// Returns false if files on parent level are currently in compaction, which
+// means that we can't compact them
+bool CompactionPicker::SetupOtherInputs(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage, CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs, int* parent_index,
+ int base_index, bool only_expand_towards_right) {
+ assert(!inputs->empty());
+ assert(output_level_inputs->empty());
+ const int input_level = inputs->level;
+ const int output_level = output_level_inputs->level;
+ if (input_level == output_level) {
+ // no possibility of conflict
+ return true;
+ }
+
+ // For now, we only support merging two levels, start level and output level.
+ // We need to assert other levels are empty.
+ for (int l = input_level + 1; l < output_level; l++) {
+ assert(vstorage->NumLevelFiles(l) == 0);
+ }
+
+ InternalKey smallest, largest;
+
+ // Get the range one last time.
+ GetRange(*inputs, &smallest, &largest);
+
+  // Populate the set of next-level files (output_level_inputs) to include in
+  // the compaction.
+ vstorage->GetOverlappingInputs(output_level, &smallest, &largest,
+ &output_level_inputs->files, *parent_index,
+ parent_index);
+ if (AreFilesInCompaction(output_level_inputs->files)) {
+ return false;
+ }
+ if (!output_level_inputs->empty()) {
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, output_level_inputs)) {
+ return false;
+ }
+ }
+
+ // See if we can further grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up. We also choose NOT
+ // to expand if this would cause "level" to include some entries for some
+ // user key, while excluding other entries for the same user key. This
+ // can happen when one user key spans multiple files.
+ if (!output_level_inputs->empty()) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ const uint64_t output_level_inputs_size =
+ TotalFileSize(output_level_inputs->files);
+ const uint64_t inputs_size = TotalFileSize(inputs->files);
+ bool expand_inputs = false;
+
+ CompactionInputFiles expanded_inputs;
+ expanded_inputs.level = input_level;
+ // Get closed interval of output level
+ InternalKey all_start, all_limit;
+ GetRange(*inputs, *output_level_inputs, &all_start, &all_limit);
+ bool try_overlapping_inputs = true;
+ if (only_expand_towards_right) {
+ // Round-robin compaction only allows expansion towards the larger side.
+ vstorage->GetOverlappingInputs(input_level, &smallest, &all_limit,
+ &expanded_inputs.files, base_index,
+ nullptr);
+ } else {
+ vstorage->GetOverlappingInputs(input_level, &all_start, &all_limit,
+ &expanded_inputs.files, base_index,
+ nullptr);
+ }
+ uint64_t expanded_inputs_size = TotalFileSize(expanded_inputs.files);
+ if (!ExpandInputsToCleanCut(cf_name, vstorage, &expanded_inputs)) {
+ try_overlapping_inputs = false;
+ }
+ if (try_overlapping_inputs && expanded_inputs.size() > inputs->size() &&
+ (mutable_cf_options.ignore_max_compaction_bytes_for_input ||
+ output_level_inputs_size + expanded_inputs_size < limit) &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ InternalKey new_start, new_limit;
+ GetRange(expanded_inputs, &new_start, &new_limit);
+ CompactionInputFiles expanded_output_level_inputs;
+ expanded_output_level_inputs.level = output_level;
+ vstorage->GetOverlappingInputs(output_level, &new_start, &new_limit,
+ &expanded_output_level_inputs.files,
+ *parent_index, parent_index);
+ assert(!expanded_output_level_inputs.empty());
+ if (!AreFilesInCompaction(expanded_output_level_inputs.files) &&
+ ExpandInputsToCleanCut(cf_name, vstorage,
+ &expanded_output_level_inputs) &&
+ expanded_output_level_inputs.size() == output_level_inputs->size()) {
+ expand_inputs = true;
+ }
+ }
+ if (!expand_inputs) {
+ vstorage->GetCleanInputsWithinInterval(input_level, &all_start,
+ &all_limit, &expanded_inputs.files,
+ base_index, nullptr);
+ expanded_inputs_size = TotalFileSize(expanded_inputs.files);
+ if (expanded_inputs.size() > inputs->size() &&
+ (mutable_cf_options.ignore_max_compaction_bytes_for_input ||
+ output_level_inputs_size + expanded_inputs_size < limit) &&
+ !AreFilesInCompaction(expanded_inputs.files)) {
+ expand_inputs = true;
+ }
+ }
+ if (expand_inputs) {
+ ROCKS_LOG_INFO(ioptions_.logger,
+ "[%s] Expanding@%d %" ROCKSDB_PRIszt "+%" ROCKSDB_PRIszt
+ "(%" PRIu64 "+%" PRIu64 " bytes) to %" ROCKSDB_PRIszt
+ "+%" ROCKSDB_PRIszt " (%" PRIu64 "+%" PRIu64 " bytes)\n",
+ cf_name.c_str(), input_level, inputs->size(),
+ output_level_inputs->size(), inputs_size,
+ output_level_inputs_size, expanded_inputs.size(),
+ output_level_inputs->size(), expanded_inputs_size,
+ output_level_inputs_size);
+ inputs->files = expanded_inputs.files;
+ }
+ } else {
+ // Likely to be trivial move. Expand files if they are still trivial moves,
+ // but limit to mutable_cf_options.max_compaction_bytes or 8 files so that
+ // we don't create too much compaction pressure for the next level.
+ }
+ return true;
+}
+
+void CompactionPicker::GetGrandparents(
+ VersionStorageInfo* vstorage, const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents) {
+ InternalKey start, limit;
+ GetRange(inputs, output_level_inputs, &start, &limit);
+ // Compute the set of grandparent files that overlap this compaction
+ // (parent == level+1; grandparent == level+2 or the first
+ // level after that has overlapping files)
+ for (int level = output_level_inputs.level + 1; level < NumberLevels();
+ level++) {
+ vstorage->GetOverlappingInputs(level, &start, &limit, grandparents);
+ if (!grandparents->empty()) {
+ break;
+ }
+ }
+}
+
+Compaction* CompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options, const InternalKey* begin,
+ const InternalKey* end, InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+ // CompactionPickerFIFO has its own implementation of compact range
+ assert(ioptions_.compaction_style != kCompactionStyleFIFO);
+
+ if (input_level == ColumnFamilyData::kCompactAllLevels) {
+ assert(ioptions_.compaction_style == kCompactionStyleUniversal);
+
+ // Universal compaction with more than one level always compacts all the
+ // files together to the last level.
+ assert(vstorage->num_levels() > 1);
+    // DBImpl::CompactRange() sets the output level to be the last level
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level == vstorage->num_levels() - 2);
+ } else {
+ assert(output_level == vstorage->num_levels() - 1);
+ }
+    // DBImpl::RunManualCompaction will use the full range for universal
+    // compaction
+ assert(begin == nullptr);
+ assert(end == nullptr);
+ *compaction_end = nullptr;
+
+ int start_level = 0;
+ for (; start_level < vstorage->num_levels() &&
+ vstorage->NumLevelFiles(start_level) == 0;
+ start_level++) {
+ }
+ if (start_level == vstorage->num_levels()) {
+ return nullptr;
+ }
+
+ if ((start_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ *manual_conflict = true;
+ // Only one level 0 compaction allowed
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
+ start_level);
+ for (int level = start_level; level < vstorage->num_levels(); level++) {
+ inputs[level - start_level].level = level;
+ auto& files = inputs[level - start_level].files;
+ for (FileMetaData* f : vstorage->LevelFiles(level)) {
+ files.push_back(f);
+ }
+ if (AreFilesInCompaction(files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+    // 2 non-exclusive manual compactions could run at the same time producing
+    // overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage, ioptions_,
+ start_level, output_level))) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style),
+ /* max_compaction_bytes */ LLONG_MAX,
+ compact_range_options.target_path_id,
+ GetCompressionType(vstorage, mutable_cf_options, output_level, 1),
+ GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+ Temperature::kUnknown, compact_range_options.max_subcompactions,
+ /* grandparents */ {}, /* is manual */ true, trim_ts, /* score */ -1,
+ /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
+ CompactionReason::kUnknown,
+ compact_range_options.blob_garbage_collection_policy,
+ compact_range_options.blob_garbage_collection_age_cutoff);
+
+ RegisterCompaction(c);
+ vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+ return c;
+ }
+
+ CompactionInputFiles inputs;
+ inputs.level = input_level;
+ bool covering_the_whole_range = true;
+
+ // All files are 'overlapping' in universal style compaction.
+ // We have to compact the entire range in one shot.
+ if (ioptions_.compaction_style == kCompactionStyleUniversal) {
+ begin = nullptr;
+ end = nullptr;
+ }
+
+ vstorage->GetOverlappingInputs(input_level, begin, end, &inputs.files);
+ if (inputs.empty()) {
+ return nullptr;
+ }
+
+ if ((input_level == 0) && (!level0_compactions_in_progress_.empty())) {
+ // Only one level 0 compaction allowed
+ TEST_SYNC_POINT("CompactionPicker::CompactRange:Conflict");
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ // Avoid compacting too much in one shot in case the range is large.
+ // But we cannot do this for level-0 since level-0 files can overlap
+ // and we must not pick one file and drop another older file if the
+ // two files overlap.
+ if (input_level > 0) {
+ const uint64_t limit = mutable_cf_options.max_compaction_bytes;
+ uint64_t input_level_total = 0;
+ int hint_index = -1;
+ InternalKey* smallest = nullptr;
+ InternalKey* largest = nullptr;
+ for (size_t i = 0; i + 1 < inputs.size(); ++i) {
+ if (!smallest) {
+ smallest = &inputs[i]->smallest;
+ }
+ largest = &inputs[i]->largest;
+
+ uint64_t input_file_size = inputs[i]->fd.GetFileSize();
+ uint64_t output_level_total = 0;
+ if (output_level < vstorage->num_non_empty_levels()) {
+ std::vector<FileMetaData*> files;
+ vstorage->GetOverlappingInputsRangeBinarySearch(
+ output_level, smallest, largest, &files, hint_index, &hint_index);
+ for (const auto& file : files) {
+ output_level_total += file->fd.GetFileSize();
+ }
+ }
+
+ input_level_total += input_file_size;
+
+ if (input_level_total + output_level_total >= limit) {
+ covering_the_whole_range = false;
+        // Still include the current file, so the compaction could be larger
+        // than max_compaction_bytes. This also makes sure the compaction can
+        // make progress even if `max_compaction_bytes` is small (e.g. smaller
+        // than an SST file).
+ inputs.files.resize(i + 1);
+ break;
+ }
+ }
+ }
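+  // Illustrative sketch of the limiting loop above, with hypothetical sizes
+  // (not part of the upstream logic): say max_compaction_bytes = 100 MB and
+  // four overlapping input files of 40, 50, 30 and 20 MB that overlap nothing
+  // at the output level. The running total is 40, then 90 (both < 100), then
+  // 120 >= 100 at the third file, so inputs.files is truncated to the first
+  // three files (the file that crosses the limit is still included) and
+  // covering_the_whole_range becomes false, so that *compaction_end can later
+  // report the next key to compact.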
+
+ assert(compact_range_options.target_path_id <
+ static_cast<uint32_t>(ioptions_.cf_paths.size()));
+
+ // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
+ // files that are created during the current compaction.
+ if (compact_range_options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kForceOptimized &&
+ max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) {
+ assert(input_level == output_level);
+    // inputs_shrunk holds a contiguous subset of input files which were all
+    // created before the current manual compaction.
+ std::vector<FileMetaData*> inputs_shrunk;
+ size_t skip_input_index = inputs.size();
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ inputs_shrunk.push_back(inputs[i]);
+ } else if (!inputs_shrunk.empty()) {
+        // inputs[i] was created during the current manual compaction and
+        // needs to be skipped
+ skip_input_index = i;
+ break;
+ }
+ }
+ if (inputs_shrunk.empty()) {
+ return nullptr;
+ }
+ if (inputs.size() != inputs_shrunk.size()) {
+ inputs.files.swap(inputs_shrunk);
+ }
+    // set covering_the_whole_range to false if there is any file that needs to
+    // be compacted in the range of inputs[skip_input_index+1, inputs.size())
+ for (size_t i = skip_input_index + 1; i < inputs.size(); ++i) {
+ if (inputs[i]->fd.GetNumber() < max_file_num_to_ignore) {
+ covering_the_whole_range = false;
+ }
+ }
+ }
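+  // Illustrative sketch of the shrinking logic above, with hypothetical file
+  // numbers (not part of the upstream logic): if the overlapping inputs are
+  // files #10, #12, #20, #14 and max_file_num_to_ignore is 15, then #10 and
+  // #12 (created before this manual compaction) form inputs_shrunk, and the
+  // scan stops at #20, whose number is >= max_file_num_to_ignore because it
+  // was created by the current compaction itself.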
+
+ InternalKey key_storage;
+ InternalKey* next_smallest = &key_storage;
+ if (ExpandInputsToCleanCut(cf_name, vstorage, &inputs, &next_smallest) ==
+ false) {
+    // Manual compaction is now multi-threaded, so it can happen that
+    // ExpandInputsToCleanCut fails; we handle this case higher up in
+    // RunManualCompaction.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ if (covering_the_whole_range || !next_smallest) {
+ *compaction_end = nullptr;
+ } else {
+ **compaction_end = *next_smallest;
+ }
+
+ CompactionInputFiles output_level_inputs;
+ if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+ assert(input_level == 0);
+ output_level = vstorage->base_level();
+ assert(output_level > 0);
+ }
+ output_level_inputs.level = output_level;
+ if (input_level != output_level) {
+ int parent_index = -1;
+ if (!SetupOtherInputs(cf_name, mutable_cf_options, vstorage, &inputs,
+ &output_level_inputs, &parent_index, -1)) {
+      // Manual compaction is now multi-threaded, so it can happen that
+      // SetupOtherInputs fails; we handle this case higher up in
+      // RunManualCompaction.
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+ std::vector<CompactionInputFiles> compaction_inputs({inputs});
+ if (!output_level_inputs.empty()) {
+ compaction_inputs.push_back(output_level_inputs);
+ }
+ for (size_t i = 0; i < compaction_inputs.size(); i++) {
+ if (AreFilesInCompaction(compaction_inputs[i].files)) {
+ *manual_conflict = true;
+ return nullptr;
+ }
+ }
+
+  // Two non-exclusive manual compactions could run at the same time,
+  // producing overlapping outputs in the same level.
+ if (FilesRangeOverlapWithCompaction(
+ compaction_inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage, ioptions_, input_level,
+ output_level))) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ *manual_conflict = true;
+ return nullptr;
+ }
+
+ std::vector<FileMetaData*> grandparents;
+ GetGrandparents(vstorage, inputs, output_level_inputs, &grandparents);
+ Compaction* compaction = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(compaction_inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options, output_level,
+ ioptions_.compaction_style, vstorage->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options.max_compaction_bytes,
+ compact_range_options.target_path_id,
+ GetCompressionType(vstorage, mutable_cf_options, output_level,
+ vstorage->base_level()),
+ GetCompressionOptions(mutable_cf_options, vstorage, output_level),
+ Temperature::kUnknown, compact_range_options.max_subcompactions,
+ std::move(grandparents), /* is manual */ true, trim_ts, /* score */ -1,
+ /* deletion_compaction */ false, /* l0_files_might_overlap */ true,
+ CompactionReason::kUnknown,
+ compact_range_options.blob_garbage_collection_policy,
+ compact_range_options.blob_garbage_collection_age_cutoff);
+
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::CompactRange:Return", compaction);
+ RegisterCompaction(compaction);
+
+  // Creating a compaction influences the compaction score because the score
+  // takes running compactions into account (by skipping files that are already
+  // being compacted). Since we just changed the compaction score, we
+  // recalculate it here.
+ vstorage->ComputeCompactionScore(ioptions_, mutable_cf_options);
+
+ return compaction;
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+// Test whether two files have overlapping key-ranges.
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+ const SstFileMetaData& b) {
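+  // For illustration with hypothetical user keys: the ranges [b, f] and
+  // [d, k] overlap (d <= f <= k), while [a, c] and [d, k] do not (c < d).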
+ if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) {
+ if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+ // b.smallestkey <= a.smallestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+ // a.smallestkey < b.smallestkey <= a.largestkey
+ return true;
+ }
+ if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) {
+ if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+ // b.smallestkey <= a.largestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+ // a.smallestkey <= b.largestkey < a.largestkey
+ return true;
+ }
+ return false;
+}
+} // namespace
+
+Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ auto& levels = cf_meta.levels;
+ auto comparator = icmp_->user_comparator();
+
+ // TODO(yhchiang): add is_adjustable to CompactionOptions
+
+ // the smallest and largest key of the current compaction input
+ std::string smallestkey;
+ std::string largestkey;
+ // a flag for initializing smallest and largest key
+ bool is_first = false;
+ const int kNotFound = -1;
+
+ // For each level, it does the following things:
+ // 1. Find the first and the last compaction input files
+ // in the current level.
+ // 2. Include all files between the first and the last
+ // compaction input files.
+ // 3. Update the compaction key-range.
+ // 4. For all remaining levels, include files that have
+ // overlapping key-range with the compaction key-range.
+ for (int l = 0; l <= output_level; ++l) {
+ auto& current_files = levels[l].files;
+ int first_included = static_cast<int>(current_files.size());
+ int last_included = kNotFound;
+
+ // identify the first and the last compaction input files
+ // in the current level.
+ for (size_t f = 0; f < current_files.size(); ++f) {
+ const uint64_t file_number = TableFileNameToNumber(current_files[f].name);
+ if (input_files->find(file_number) == input_files->end()) {
+ continue;
+ }
+ first_included = std::min(first_included, static_cast<int>(f));
+ last_included = std::max(last_included, static_cast<int>(f));
+ if (is_first == false) {
+ smallestkey = current_files[f].smallestkey;
+ largestkey = current_files[f].largestkey;
+ is_first = true;
+ }
+ }
+ if (last_included == kNotFound) {
+ continue;
+ }
+
+ if (l != 0) {
+ // expand the compaction input of the current level if it
+ // has overlapping key-range with other non-compaction input
+ // files in the same level.
+ while (first_included > 0) {
+ if (comparator->CompareWithoutTimestamp(
+ current_files[first_included - 1].largestkey,
+ current_files[first_included].smallestkey) < 0) {
+ break;
+ }
+ first_included--;
+ }
+
+ while (last_included < static_cast<int>(current_files.size()) - 1) {
+ if (comparator->CompareWithoutTimestamp(
+ current_files[last_included + 1].smallestkey,
+ current_files[last_included].largestkey) > 0) {
+ break;
+ }
+ last_included++;
+ }
+ } else if (output_level > 0) {
+ last_included = static_cast<int>(current_files.size() - 1);
+ }
+
+ // include all files between the first and the last compaction input files.
+ for (int f = first_included; f <= last_included; ++f) {
+ if (current_files[f].being_compacted) {
+ return Status::Aborted("Necessary compaction input file " +
+ current_files[f].name +
+ " is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(current_files[f].name));
+ }
+
+ // update smallest and largest key
+ if (l == 0) {
+ for (int f = first_included; f <= last_included; ++f) {
+ if (comparator->CompareWithoutTimestamp(
+ smallestkey, current_files[f].smallestkey) > 0) {
+ smallestkey = current_files[f].smallestkey;
+ }
+ if (comparator->CompareWithoutTimestamp(
+ largestkey, current_files[f].largestkey) < 0) {
+ largestkey = current_files[f].largestkey;
+ }
+ }
+ } else {
+ if (comparator->CompareWithoutTimestamp(
+ smallestkey, current_files[first_included].smallestkey) > 0) {
+ smallestkey = current_files[first_included].smallestkey;
+ }
+ if (comparator->CompareWithoutTimestamp(
+ largestkey, current_files[last_included].largestkey) < 0) {
+ largestkey = current_files[last_included].largestkey;
+ }
+ }
+
+ SstFileMetaData aggregated_file_meta;
+ aggregated_file_meta.smallestkey = smallestkey;
+ aggregated_file_meta.largestkey = largestkey;
+
+ // For all lower levels, include all overlapping files.
+    // We need to add overlapping files from the current level too, because
+    // even if there are no input_files in level l, we would still need to add
+    // files which overlap with the range containing the input_files in levels
+    // 0 to l. Level 0 doesn't need to be handled this way because its files
+    // are sorted by time and not by key.
+ for (int m = std::max(l, 1); m <= output_level; ++m) {
+ for (auto& next_lv_file : levels[m].files) {
+ if (HaveOverlappingKeyRanges(comparator, aggregated_file_meta,
+ next_lv_file)) {
+ if (next_lv_file.being_compacted) {
+            return Status::Aborted(
+                "File " + next_lv_file.name +
+                " that has an overlapping key range with one of the "
+                "compaction input files is currently being compacted.");
+ }
+ input_files->insert(TableFileNameToNumber(next_lv_file.name));
+ }
+ }
+ }
+ }
+ if (RangeOverlapWithCompaction(smallestkey, largestkey, output_level)) {
+ return Status::Aborted(
+ "A running compaction is writing to the same output level in an "
+ "overlapping key range");
+ }
+ return Status::OK();
+}
+
+Status CompactionPicker::SanitizeCompactionInputFiles(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const {
+ assert(static_cast<int>(cf_meta.levels.size()) - 1 ==
+ cf_meta.levels[cf_meta.levels.size() - 1].level);
+ if (output_level >= static_cast<int>(cf_meta.levels.size())) {
+ return Status::InvalidArgument(
+ "Output level for column family " + cf_meta.name +
+        " must be between [0, " +
+ std::to_string(cf_meta.levels[cf_meta.levels.size() - 1].level) + "].");
+ }
+
+ if (output_level > MaxOutputLevel()) {
+ return Status::InvalidArgument(
+        "Exceeds the maximum output level defined by "
+ "the current compaction algorithm --- " +
+ std::to_string(MaxOutputLevel()));
+ }
+
+ if (output_level < 0) {
+ return Status::InvalidArgument("Output level cannot be negative.");
+ }
+
+ if (input_files->size() == 0) {
+ return Status::InvalidArgument(
+ "A compaction must contain at least one file.");
+ }
+
+ Status s = SanitizeCompactionInputFilesForAllLevels(input_files, cf_meta,
+ output_level);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ // for all input files, check whether the file number matches
+ // any currently-existing files.
+ for (auto file_num : *input_files) {
+ bool found = false;
+ int input_file_level = -1;
+ for (const auto& level_meta : cf_meta.levels) {
+ for (const auto& file_meta : level_meta.files) {
+ if (file_num == TableFileNameToNumber(file_meta.name)) {
+ if (file_meta.being_compacted) {
+ return Status::Aborted("Specified compaction input file " +
+ MakeTableFileName("", file_num) +
+ " is already being compacted.");
+ }
+ found = true;
+ input_file_level = level_meta.level;
+ break;
+ }
+ }
+ if (found) {
+ break;
+ }
+ }
+ if (!found) {
+ return Status::InvalidArgument(
+ "Specified compaction input file " + MakeTableFileName("", file_num) +
+ " does not exist in column family " + cf_meta.name + ".");
+ }
+ if (input_file_level > output_level) {
+ return Status::InvalidArgument(
+          "Cannot compact a file to a lower-numbered level, input file: " +
+ MakeTableFileName("", file_num) + " level " +
+ std::to_string(input_file_level) + " > output level " +
+ std::to_string(output_level));
+ }
+ }
+
+ return Status::OK();
+}
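+
+// Caller-side sketch (hypothetical code, for illustration only; the variable
+// names cf_meta, vstorage and compact_options are assumptions): a
+// DB::CompactFiles()-style caller would typically convert the user-supplied
+// file names into file numbers, sanitize them, and only then build the
+// compaction, e.g.
+//
+//   std::unordered_set<uint64_t> input_set = /* numbers from the user */;
+//   Status s = picker->SanitizeCompactionInputFiles(&input_set, cf_meta,
+//                                                   output_level);
+//   if (s.ok()) {
+//     std::vector<CompactionInputFiles> input_files;
+//     s = picker->GetCompactionInputsFromFileNumbers(
+//         &input_files, &input_set, vstorage, compact_options);
+//   }
+//   // ... then CompactFiles(compact_options, input_files, ...) without
+//   // releasing the DB mutex in between.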
+#endif // !ROCKSDB_LITE
+
+void CompactionPicker::RegisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
+ assert(ioptions_.compaction_style != kCompactionStyleLevel ||
+ c->output_level() == 0 ||
+ !FilesRangeOverlapWithCompaction(*c->inputs(), c->output_level(),
+ c->GetPenultimateLevel()));
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.insert(c);
+ }
+ compactions_in_progress_.insert(c);
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::RegisterCompaction:Registered",
+ c);
+}
+
+void CompactionPicker::UnregisterCompaction(Compaction* c) {
+ if (c == nullptr) {
+ return;
+ }
+ if (c->start_level() == 0 ||
+ ioptions_.compaction_style == kCompactionStyleUniversal) {
+ level0_compactions_in_progress_.erase(c);
+ }
+ compactions_in_progress_.erase(c);
+}
+
+void CompactionPicker::PickFilesMarkedForCompaction(
+ const std::string& cf_name, VersionStorageInfo* vstorage, int* start_level,
+ int* output_level, CompactionInputFiles* start_level_inputs) {
+ if (vstorage->FilesMarkedForCompaction().empty()) {
+ return;
+ }
+
+ auto continuation = [&, cf_name](std::pair<int, FileMetaData*> level_file) {
+    // If it's being compacted, there is nothing to do here.
+    // If this assert() fails, it means that some function marked some
+    // files as being_compacted but didn't call ComputeCompactionScore().
+ assert(!level_file.second->being_compacted);
+ *start_level = level_file.first;
+ *output_level =
+ (*start_level == 0) ? vstorage->base_level() : *start_level + 1;
+
+ if (*start_level == 0 && !level0_compactions_in_progress()->empty()) {
+ return false;
+ }
+
+ start_level_inputs->files = {level_file.second};
+ start_level_inputs->level = *start_level;
+ return ExpandInputsToCleanCut(cf_name, vstorage, start_level_inputs);
+ };
+
+ // take a chance on a random file first
+ Random64 rnd(/* seed */ reinterpret_cast<uint64_t>(vstorage));
+ size_t random_file_index = static_cast<size_t>(rnd.Uniform(
+ static_cast<uint64_t>(vstorage->FilesMarkedForCompaction().size())));
+ TEST_SYNC_POINT_CALLBACK("CompactionPicker::PickFilesMarkedForCompaction",
+ &random_file_index);
+
+ if (continuation(vstorage->FilesMarkedForCompaction()[random_file_index])) {
+ // found the compaction!
+ return;
+ }
+
+ for (auto& level_file : vstorage->FilesMarkedForCompaction()) {
+ if (continuation(level_file)) {
+ // found the compaction!
+ return;
+ }
+ }
+ start_level_inputs->files.clear();
+}
+
+bool CompactionPicker::GetOverlappingL0Files(
+ VersionStorageInfo* vstorage, CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index) {
+  // Two level 0 compactions won't run at the same time, so we don't need to
+  // worry about files on level 0 being compacted.
+ assert(level0_compactions_in_progress()->empty());
+ InternalKey smallest, largest;
+ GetRange(*start_level_inputs, &smallest, &largest);
+ // Note that the next call will discard the file we placed in
+ // c->inputs_[0] earlier and replace it with an overlapping set
+ // which will include the picked file.
+ start_level_inputs->files.clear();
+ vstorage->GetOverlappingInputs(0, &smallest, &largest,
+ &(start_level_inputs->files));
+
+ // If we include more L0 files in the same compaction run it can
+ // cause the 'smallest' and 'largest' key to get extended to a
+ // larger range. So, re-invoke GetRange to get the new key range
+ GetRange(*start_level_inputs, &smallest, &largest);
+ if (IsRangeInCompaction(vstorage, &smallest, &largest, output_level,
+ parent_index)) {
+ return false;
+ }
+ assert(!start_level_inputs->files.empty());
+
+ return true;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker.h b/src/rocksdb/db/compaction/compaction_picker.h
new file mode 100644
index 000000000..7739dd96b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker.h
@@ -0,0 +1,323 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/compaction/compaction.h"
+#include "db/version_set.h"
+#include "options/cf_options.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file contains an abstract class CompactionPicker, and its two
+// subclasses LevelCompactionPicker and NullCompactionPicker, as
+// well as some helper functions used by them.
+
+class LogBuffer;
+class Compaction;
+class VersionStorageInfo;
+struct CompactionInputFiles;
+
+// An abstract class to pick compactions from an existing LSM-tree.
+//
+// Each compaction style inherits the class and implements the
+// interface to form automatic compactions. If NeedsCompaction() is true,
+// then call PickCompaction() to find what files need to be compacted
+// and where to put the output files.
+//
+// Non-virtual functions CompactRange() and CompactFiles() are used to
+// pick files to compact based on users' DB::CompactRange() and
+// DB::CompactFiles() requests, respectively. There is little
+// compaction style specific logic for them.
+class CompactionPicker {
+ public:
+ CompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp);
+ virtual ~CompactionPicker();
+
+ // Pick level and inputs for a new compaction.
+ // Returns nullptr if there is no compaction to be done.
+ // Otherwise returns a pointer to a heap-allocated object that
+ // describes the compaction. Caller should delete the result.
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0;
+
+ // Return a compaction object for compacting the range [begin,end] in
+ // the specified level. Returns nullptr if there is nothing in that
+ // level that overlaps the specified range. Caller should delete
+ // the result.
+ //
+ // The returned Compaction might not include the whole requested range.
+ // In that case, compaction_end will be set to the next key that needs
+ // compacting. In case the compaction will compact the whole range,
+ // compaction_end will be set to nullptr.
+ // Client is responsible for compaction_end storage -- when called,
+ // *compaction_end should point to valid InternalKey!
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts);
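+  //
+  // A caller-side sketch of the contract above (hypothetical code, for
+  // illustration only; variable names are assumptions):
+  //
+  //   InternalKey end_storage;
+  //   InternalKey* compaction_end = &end_storage;  // caller-owned storage
+  //   bool manual_conflict = false;
+  //   Compaction* c = picker->CompactRange(
+  //       cf_name, mutable_cf_options, mutable_db_options, vstorage,
+  //       input_level, output_level, CompactRangeOptions(), begin, end,
+  //       &compaction_end, &manual_conflict,
+  //       std::numeric_limits<uint64_t>::max(), /* trim_ts */ "");
+  //   // If c != nullptr and compaction_end != nullptr, re-issue CompactRange
+  //   // starting from *compaction_end to cover the rest of the range.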
+
+ // The maximum allowed output level. Default value is NumberLevels() - 1.
+ virtual int MaxOutputLevel() const { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0;
+
+// Sanitize the input set of compaction input files.
+// When the input parameters do not describe a valid compaction, the
+// function will try to fix the input_files by adding necessary
+// files. If it's not possible to convert an invalid input_files
+// into a valid one by adding more files, the function will return a
+// non-ok status with a specific reason.
+#ifndef ROCKSDB_LITE
+ Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta,
+ const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Free up the files that participated in a compaction
+ //
+ // Requirement: DB mutex held
+ void ReleaseCompactionFiles(Compaction* c, Status status);
+
+  // Returns true if any one of the specified files is being compacted
+ bool AreFilesInCompaction(const std::vector<FileMetaData*>& files);
+
+ // Takes a list of CompactionInputFiles and returns a (manual) Compaction
+ // object.
+ //
+ // Caller must provide a set of input files that has been passed through
+ // `SanitizeCompactionInputFiles` earlier. The lock should not be released
+ // between that call and this one.
+ Compaction* CompactFiles(const CompactionOptions& compact_options,
+ const std::vector<CompactionInputFiles>& input_files,
+ int output_level, VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ uint32_t output_path_id);
+
+ // Converts a set of compaction input file numbers into
+ // a list of CompactionInputFiles.
+ Status GetCompactionInputsFromFileNumbers(
+ std::vector<CompactionInputFiles>* input_files,
+ std::unordered_set<uint64_t>* input_set,
+ const VersionStorageInfo* vstorage,
+ const CompactionOptions& compact_options) const;
+
+  // Returns true if a compaction involving level 0 is currently taking place
+ bool IsLevel0CompactionInProgress() const {
+ return !level0_compactions_in_progress_.empty();
+ }
+
+  // Return true if the passed key range overlaps with a compaction output
+  // that is currently running.
+ bool RangeOverlapWithCompaction(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level) const;
+
+ // Stores the minimal range that covers all entries in inputs in
+ // *smallest, *largest.
+ // REQUIRES: inputs is not empty
+ void GetRange(const CompactionInputFiles& inputs, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs1 and inputs2
+ // in *smallest, *largest.
+ // REQUIRES: inputs is not empty
+ void GetRange(const CompactionInputFiles& inputs1,
+ const CompactionInputFiles& inputs2, InternalKey* smallest,
+ InternalKey* largest) const;
+
+ // Stores the minimal range that covers all entries in inputs
+ // in *smallest, *largest.
+  // REQUIRES: inputs is not empty (at least one entry has one file)
+ void GetRange(const std::vector<CompactionInputFiles>& inputs,
+ InternalKey* smallest, InternalKey* largest,
+ int exclude_level) const;
+
+ int NumberLevels() const { return ioptions_.num_levels; }
+
+  // Add more files to the inputs on "level" to make sure that
+  // no newer version of a key is compacted to "level+1" while leaving an older
+  // version in "level". Otherwise, any Get() will search "level" first,
+  // and will likely return an old/stale value for the key, since it always
+  // searches in increasing order of level to find the value. This could
+  // also scramble the order of merge operands. This function should be
+  // called any time a new Compaction is created, and its inputs_[0] is
+  // populated.
+ //
+ // Will return false if it is impossible to apply this compaction.
+ bool ExpandInputsToCleanCut(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ InternalKey** next_smallest = nullptr);
+
+  // Returns true if any one of the parent files is being compacted
+ bool IsRangeInCompaction(VersionStorageInfo* vstorage,
+ const InternalKey* smallest,
+ const InternalKey* largest, int level, int* index);
+
+  // Returns true if the key range that `inputs` files cover overlaps with the
+  // key range of a currently running compaction.
+ bool FilesRangeOverlapWithCompaction(
+ const std::vector<CompactionInputFiles>& inputs, int level,
+ int penultimate_level) const;
+
+ bool SetupOtherInputs(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ VersionStorageInfo* vstorage,
+ CompactionInputFiles* inputs,
+ CompactionInputFiles* output_level_inputs,
+ int* parent_index, int base_index,
+ bool only_expand_towards_right = false);
+
+ void GetGrandparents(VersionStorageInfo* vstorage,
+ const CompactionInputFiles& inputs,
+ const CompactionInputFiles& output_level_inputs,
+ std::vector<FileMetaData*>* grandparents);
+
+ void PickFilesMarkedForCompaction(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ int* start_level, int* output_level,
+ CompactionInputFiles* start_level_inputs);
+
+ bool GetOverlappingL0Files(VersionStorageInfo* vstorage,
+ CompactionInputFiles* start_level_inputs,
+ int output_level, int* parent_index);
+
+ // Register this compaction in the set of running compactions
+ void RegisterCompaction(Compaction* c);
+
+ // Remove this compaction from the set of running compactions
+ void UnregisterCompaction(Compaction* c);
+
+ std::set<Compaction*>* level0_compactions_in_progress() {
+ return &level0_compactions_in_progress_;
+ }
+ std::unordered_set<Compaction*>* compactions_in_progress() {
+ return &compactions_in_progress_;
+ }
+
+ const InternalKeyComparator* icmp() const { return icmp_; }
+
+ protected:
+ const ImmutableOptions& ioptions_;
+
+// A helper function to SanitizeCompactionInputFiles() that
+// sanitizes "input_files" by adding necessary files.
+#ifndef ROCKSDB_LITE
+ virtual Status SanitizeCompactionInputFilesForAllLevels(
+ std::unordered_set<uint64_t>* input_files,
+ const ColumnFamilyMetaData& cf_meta, const int output_level) const;
+#endif // ROCKSDB_LITE
+
+ // Keeps track of all compactions that are running on Level0.
+ // Protected by DB mutex
+ std::set<Compaction*> level0_compactions_in_progress_;
+
+ // Keeps track of all compactions that are running.
+ // Protected by DB mutex
+ std::unordered_set<Compaction*> compactions_in_progress_;
+
+ const InternalKeyComparator* const icmp_;
+};
+
+#ifndef ROCKSDB_LITE
+// A dummy compaction picker that never triggers any automatic
+// compaction.
+class NullCompactionPicker : public CompactionPicker {
+ public:
+ NullCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual ~NullCompactionPicker() {}
+
+ // Always return "nullptr"
+ Compaction* PickCompaction(
+ const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ const MutableDBOptions& /*mutable_db_options*/,
+ VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */,
+ SequenceNumber /* earliest_memtable_seqno */) override {
+ return nullptr;
+ }
+
+ // Always return "nullptr"
+ Compaction* CompactRange(const std::string& /*cf_name*/,
+ const MutableCFOptions& /*mutable_cf_options*/,
+ const MutableDBOptions& /*mutable_db_options*/,
+ VersionStorageInfo* /*vstorage*/,
+ int /*input_level*/, int /*output_level*/,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/,
+ const InternalKey* /*end*/,
+ InternalKey** /*compaction_end*/,
+ bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/,
+ const std::string& /*trim_ts*/) override {
+ return nullptr;
+ }
+
+ // Always returns false.
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* /*vstorage*/) const override {
+ return false;
+ }
+};
+#endif // !ROCKSDB_LITE
+
+// Attempts to find an intra L0 compaction conforming to the given parameters.
+//
+// @param level_files Metadata for L0 files.
+// @param min_files_to_compact Minimum number of files required to
+// do the compaction.
+// @param max_compact_bytes_per_del_file Maximum average size in bytes per
+// file that is going to get deleted by
+// the compaction.
+// @param max_compaction_bytes Maximum total size in bytes (in terms
+// of compensated file size) for files
+// to be compacted.
+// @param [out] comp_inputs If a compaction was found, will be
+// initialized with corresponding input
+// files. Cannot be nullptr.
+//
+// @return true iff compaction was found.
+bool FindIntraL0Compaction(
+ const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact,
+ uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes,
+ CompactionInputFiles* comp_inputs,
+ SequenceNumber earliest_mem_seqno = kMaxSequenceNumber);
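+
+// A minimal usage sketch for FindIntraL0Compaction (hypothetical thresholds,
+// for illustration only): try to find at least 4 L0 files whose average size
+// per deleted file stays under 64 MB, capped at 256 MB of total input:
+//
+//   CompactionInputFiles comp_inputs;
+//   if (FindIntraL0Compaction(vstorage->LevelFiles(0),
+//                             /* min_files_to_compact */ 4,
+//                             /* max_compact_bytes_per_del_file */ 64 << 20,
+//                             /* max_compaction_bytes */ 256 << 20,
+//                             &comp_inputs)) {
+//     // comp_inputs.files now holds the span of L0 files to compact together.
+//   }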
+
+CompressionType GetCompressionType(const VersionStorageInfo* vstorage,
+ const MutableCFOptions& mutable_cf_options,
+ int level, int base_level,
+ const bool enable_compression = true);
+
+CompressionOptions GetCompressionOptions(
+ const MutableCFOptions& mutable_cf_options,
+ const VersionStorageInfo* vstorage, int level,
+ const bool enable_compression = true);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.cc b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
new file mode 100644
index 000000000..1f875e3e1
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.cc
@@ -0,0 +1,433 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_fifo.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
+ uint64_t total_size = 0;
+ for (const auto& f : files) {
+ total_size += f->fd.file_size;
+ }
+ return total_size;
+}
+} // anonymous namespace
+
+bool FIFOCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ return vstorage->CompactionScore(kLevel0) >= 1;
+}
+
+Compaction* FIFOCompactionPicker::PickTTLCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) {
+ assert(mutable_cf_options.ttl > 0);
+
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+ uint64_t total_size = GetTotalFilesSize(level_files);
+
+ int64_t _current_time;
+ auto status = ioptions_.clock->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: Couldn't get current time: %s. "
+ "Not doing compactions based on TTL. ",
+ cf_name.c_str(), status.ToString().c_str());
+ return nullptr;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
+ // avoid underflow
+ if (current_time > mutable_cf_options.ttl) {
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ FileMetaData* f = *ritr;
+ assert(f);
+ if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ uint64_t creation_time =
+ f->fd.table_reader->GetTableProperties()->creation_time;
+ if (creation_time == 0 ||
+ creation_time >= (current_time - mutable_cf_options.ttl)) {
+ break;
+ }
+ }
+ total_size -= f->fd.file_size;
+ inputs[0].files.push_back(f);
+ }
+ }
+
+  // Return a nullptr and proceed to size-based FIFO compaction if:
+  // 1. there are no files older than ttl, OR
+  // 2. there are some files older than ttl, but deleting them will not bring
+  //    the total size below the max_table_files_size threshold.
+ if (inputs[0].files.empty() ||
+ total_size >
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ return nullptr;
+ }
+
+ for (const auto& f : inputs[0].files) {
+ uint64_t creation_time = 0;
+ assert(f);
+ if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
+ creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
+ }
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with creation time %" PRIu64 " for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), creation_time);
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), 0, 0, 0, 0, kNoCompression,
+ mutable_cf_options.compression_opts, Temperature::kUnknown,
+ /* max_subcompactions */ 0, {}, /* is manual */ false,
+ /* trim_ts */ "", vstorage->CompactionScore(0),
+ /* is deletion compaction */ true, /* l0_files_might_overlap */ true,
+ CompactionReason::kFIFOTtl);
+ return c;
+}
+
+// The size-based compaction picker for FIFO.
+//
+// When the entire column family size exceeds max_table_files_size, FIFO will
+// try to delete the oldest sst file(s) until the resulting column family size
+// is smaller than max_table_files_size.
+//
+// This function also takes care of the case where a DB is migrating from
+// level / universal compaction to FIFO compaction. During the migration, the
+// column family will also have non-L0 files while FIFO can only create L0
+// files. In this case, this function will purge the sst files in the
+// bottommost non-empty level first, and the DB will eventually converge to
+// the regular FIFO case where there are only L0 files. Note that during the
+// migration case, the purge order will only be an approximation of "FIFO"
+// as entries inside lower-level files might sometimes be newer than some
+// entries inside upper-level files.
+Compaction* FIFOCompactionPicker::PickSizeCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) {
+ // compute the total size and identify the last non-empty level
+ int last_level = 0;
+ uint64_t total_size = 0;
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ auto level_size = GetTotalFilesSize(vstorage->LevelFiles(level));
+ total_size += level_size;
+ if (level_size > 0) {
+ last_level = level;
+ }
+ }
+ const std::vector<FileMetaData*>& last_level_files =
+ vstorage->LevelFiles(last_level);
+
+ if (last_level == 0 &&
+ total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ // total size not exceeded, try to find intra level 0 compaction if enabled
+ const std::vector<FileMetaData*>& level0_files = vstorage->LevelFiles(0);
+ if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
+ level0_files.size() > 0) {
+ CompactionInputFiles comp_inputs;
+      // try to prevent the same files from being compacted multiple times,
+      // which could produce large files that may never TTL-expire. Achieve
+      // this by disallowing compactions with files larger than the memtable
+      // (inflate its size by 10% to account for uncompressed L0 files whose
+      // size may be slightly greater than the memtable size limit).
+ size_t max_compact_bytes_per_del_file =
+ static_cast<size_t>(MultiplyCheckOverflow(
+ static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
+ 1.1));
+ if (FindIntraL0Compaction(
+ level0_files,
+ mutable_cf_options
+ .level0_file_num_compaction_trigger /* min_files_to_compact */
+ ,
+ max_compact_bytes_per_del_file,
+ mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */,
+ 0 /* max compaction bytes, not applicable */,
+ 0 /* output path ID */, mutable_cf_options.compression,
+ mutable_cf_options.compression_opts, Temperature::kUnknown,
+ 0 /* max_subcompactions */, {}, /* is manual */ false,
+ /* trim_ts */ "", vstorage->CompactionScore(0),
+ /* is deletion compaction */ false,
+ /* l0_files_might_overlap */ true,
+ CompactionReason::kFIFOReduceNumFiles);
+ return c;
+ }
+ }
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
+ ", max size %" PRIu64 "\n",
+ cf_name.c_str(), total_size,
+ mutable_cf_options.compaction_options_fifo.max_table_files_size);
+ return nullptr;
+ }
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. No need "
+ "to run parallel compactions since compactions are very fast",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = last_level;
+
+ if (last_level == 0) {
+ // In L0, right-most files are the oldest files.
+ for (auto ritr = last_level_files.rbegin(); ritr != last_level_files.rend();
+ ++ritr) {
+ auto f = *ritr;
+ total_size -= f->fd.file_size;
+ inputs[0].files.push_back(f);
+ char tmp_fsize[16];
+ AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with size %s for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ break;
+ }
+ }
+ } else {
+    // If the last level is non-L0, we actually don't know which file is
+    // logically the oldest since the file creation time only represents
+    // when this file was compacted to this level, which is independent
+    // of when the entries in this file were first inserted.
+    //
+    // As a result, we delete files from the left instead. This means the sst
+    // file with the smallest key will be deleted first. This design decision
+    // better serves a major type of FIFO use case where smaller keys are
+    // associated with older data.
+ for (const auto& f : last_level_files) {
+ total_size -= f->fd.file_size;
+ inputs[0].files.push_back(f);
+ char tmp_fsize[16];
+ AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with size %s for deletion",
+ cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
+ if (total_size <=
+ mutable_cf_options.compaction_options_fifo.max_table_files_size) {
+ break;
+ }
+ }
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), last_level,
+ /* target_file_size */ 0,
+ /* max_compaction_bytes */ 0,
+ /* output_path_id */ 0, kNoCompression,
+ mutable_cf_options.compression_opts, Temperature::kUnknown,
+ /* max_subcompactions */ 0, {}, /* is manual */ false,
+ /* trim_ts */ "", vstorage->CompactionScore(0),
+ /* is deletion compaction */ true,
+ /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize);
+ return c;
+}
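+
+// Worked example for the size-based picker above (hypothetical numbers, not
+// taken from the code): with max_table_files_size = 100 MB and three L0 files
+// of 30, 40 and 50 MB (newest to oldest), the total of 120 MB exceeds the
+// limit, so the picker walks from the oldest side; dropping the 50 MB file
+// already brings the total down to 70 MB <= 100 MB, so only that single file
+// is picked for deletion.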
+
+Compaction* FIFOCompactionPicker::PickCompactionToWarm(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer) {
+ if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) {
+ return nullptr;
+ }
+
+  // PickCompactionToWarm is only triggered if there are no non-L0 files.
+ for (int level = 1; level < vstorage->num_levels(); ++level) {
+ if (GetTotalFilesSize(vstorage->LevelFiles(level)) > 0) {
+ return nullptr;
+ }
+ }
+
+ const int kLevel0 = 0;
+ const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+
+ int64_t _current_time;
+ auto status = ioptions_.clock->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: Couldn't get current time: %s. "
+ "Not doing compactions based on warm threshold. ",
+ cf_name.c_str(), status.ToString().c_str());
+ return nullptr;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ if (!level0_compactions_in_progress_.empty()) {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] FIFO compaction: Already executing compaction. Parallel "
+ "compactions are not supported",
+ cf_name.c_str());
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ inputs.emplace_back();
+ inputs[0].level = 0;
+
+ // avoid underflow
+ if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) {
+ uint64_t create_time_threshold =
+ current_time - mutable_cf_options.compaction_options_fifo.age_for_warm;
+ uint64_t compaction_size = 0;
+    // Ideally we would identify a file qualifying for the warm tier by the
+    // timestamp of its youngest entry. However, we don't have that information
+    // right now, so we infer it from the timestamp of the oldest entry of the
+    // next (slightly younger) file.
+ FileMetaData* prev_file = nullptr;
+ for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
+ FileMetaData* f = *ritr;
+ assert(f);
+        // Right now this probably won't happen as we never try to schedule
+        // two compactions in parallel, so here we simply don't schedule
+        // anything.
+ return nullptr;
+ }
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time == kUnknownOldestAncesterTime) {
+ // Older files might not have enough information. It is possible to
+ // handle these files by looking at newer files, but maintaining the
+ // logic isn't worth it.
+ break;
+ }
+ if (oldest_ancester_time > create_time_threshold) {
+ // The previous file (which has slightly older data) doesn't qualify
+ // for warm tier.
+ break;
+ }
+ if (prev_file != nullptr) {
+ compaction_size += prev_file->fd.GetFileSize();
+ if (compaction_size > mutable_cf_options.max_compaction_bytes) {
+ break;
+ }
+ inputs[0].files.push_back(prev_file);
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] FIFO compaction: picking file %" PRIu64
+ " with next file's oldest time %" PRIu64 " for warm",
+ cf_name.c_str(), prev_file->fd.GetNumber(),
+ oldest_ancester_time);
+ }
+ if (f->temperature == Temperature::kUnknown ||
+ f->temperature == Temperature::kHot) {
+ prev_file = f;
+ } else if (!inputs[0].files.empty()) {
+        // A warm file that is newer than the files already picked; stop here.
+ break;
+ } else {
+ assert(prev_file == nullptr);
+ }
+ }
+ }
+
+ if (inputs[0].files.empty()) {
+ return nullptr;
+ }
+
+ Compaction* c = new Compaction(
+ vstorage, ioptions_, mutable_cf_options, mutable_db_options,
+ std::move(inputs), 0, 0 /* output file size limit */,
+ 0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
+ mutable_cf_options.compression, mutable_cf_options.compression_opts,
+ Temperature::kWarm,
+ /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "",
+ vstorage->CompactionScore(0),
+ /* is deletion compaction */ false, /* l0_files_might_overlap */ true,
+ CompactionReason::kChangeTemperature);
+ return c;
+}
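+
+// Illustrative walk-through of the warm-tier inference above (hypothetical
+// files, not from the code): with L0 files F1 (oldest), F2, F3 (newest) and
+// oldest-ancestor times t1 < t2 < t3, F1 is only moved to the warm tier once
+// t2 (the oldest entry of the next-younger file F2) is older than
+// now - age_for_warm, because that is what proves every entry in F1 is old
+// enough; F2 itself stays until t3 crosses the threshold.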
+
+Compaction* FIFOCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer, SequenceNumber /*earliest_memtable_seqno*/) {
+ Compaction* c = nullptr;
+ if (mutable_cf_options.ttl > 0) {
+ c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options,
+ vstorage, log_buffer);
+ }
+ if (c == nullptr) {
+ c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options,
+ vstorage, log_buffer);
+ }
+ if (c == nullptr) {
+ c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options,
+ vstorage, log_buffer);
+ }
+ RegisterCompaction(c);
+ return c;
+}
+
+Compaction* FIFOCompactionPicker::CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& /*compact_range_options*/,
+ const InternalKey* /*begin*/, const InternalKey* /*end*/,
+ InternalKey** compaction_end, bool* /*manual_conflict*/,
+ uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/) {
+#ifdef NDEBUG
+ (void)input_level;
+ (void)output_level;
+#endif
+ assert(input_level == 0);
+ assert(output_level == 0);
+ *compaction_end = nullptr;
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.logger);
+ Compaction* c = PickCompaction(cf_name, mutable_cf_options,
+ mutable_db_options, vstorage, &log_buffer);
+ log_buffer.FlushBufferToLog();
+ return c;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_fifo.h b/src/rocksdb/db/compaction/compaction_picker_fifo.h
new file mode 100644
index 000000000..544259f38
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_fifo.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class FIFOCompactionPicker : public CompactionPicker {
+ public:
+ FIFOCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* version,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual Compaction* CompactRange(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const InternalKey* begin, const InternalKey* end,
+ InternalKey** compaction_end, bool* manual_conflict,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts) override;
+
+ // The maximum allowed output level. Always returns 0.
+ virtual int MaxOutputLevel() const override { return 0; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+
+ private:
+ Compaction* PickTTLCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+
+ Compaction* PickSizeCompaction(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+
+ Compaction* PickCompactionToWarm(const std::string& cf_name,
+ const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options,
+ VersionStorageInfo* version,
+ LogBuffer* log_buffer);
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.cc b/src/rocksdb/db/compaction/compaction_picker_level.cc
new file mode 100644
index 000000000..b689b6add
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.cc
@@ -0,0 +1,841 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_level.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "logging/log_buffer.h"
+#include "test_util/sync_point.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool LevelCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ if (!vstorage->ExpiredTtlFiles().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->BottommostFilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForForcedBlobGC().empty()) {
+ return true;
+ }
+ for (int i = 0; i <= vstorage->MaxInputLevel(); i++) {
+ if (vstorage->CompactionScore(i) >= 1) {
+ return true;
+ }
+ }
+ return false;
+}
+
+namespace {
+// A class to build a leveled compaction step-by-step.
+class LevelCompactionBuilder {
+ public:
+ LevelCompactionBuilder(const std::string& cf_name,
+ VersionStorageInfo* vstorage,
+ SequenceNumber earliest_mem_seqno,
+ CompactionPicker* compaction_picker,
+ LogBuffer* log_buffer,
+ const MutableCFOptions& mutable_cf_options,
+ const ImmutableOptions& ioptions,
+ const MutableDBOptions& mutable_db_options)
+ : cf_name_(cf_name),
+ vstorage_(vstorage),
+ earliest_mem_seqno_(earliest_mem_seqno),
+ compaction_picker_(compaction_picker),
+ log_buffer_(log_buffer),
+ mutable_cf_options_(mutable_cf_options),
+ ioptions_(ioptions),
+ mutable_db_options_(mutable_db_options) {}
+
+ // Pick and return a compaction.
+ Compaction* PickCompaction();
+
+  // Pick the initial files to compact to the next level (or together
+  // in an intra-L0 compaction).
+ void SetupInitialFiles();
+
+ // If the initial files are from L0 level, pick other L0
+ // files if needed.
+ bool SetupOtherL0FilesIfNeeded();
+
+ // Compaction with round-robin compaction priority allows more files to be
+ // picked to form a large compaction
+ void SetupOtherFilesWithRoundRobinExpansion();
+  // Based on the initial files, set up the other files that need to be
+  // compacted in this compaction accordingly.
+ bool SetupOtherInputsIfNeeded();
+
+ Compaction* GetCompaction();
+
+  // For the specified level, pick a file that we want to compact.
+ // Returns false if there is no file to compact.
+ // If it returns true, inputs->files.size() will be exactly one for
+ // all compaction priorities except round-robin. For round-robin,
+ // multiple consecutive files may be put into inputs->files.
+ // If level is 0 and there is already a compaction on that level, this
+ // function will return false.
+ bool PickFileToCompact();
+
+ // Return true if a L0 trivial move is picked up.
+ bool TryPickL0TrivialMove();
+
+ // For L0->L0, picks the longest span of files that aren't currently
+ // undergoing compaction for which work-per-deleted-file decreases. The span
+ // always starts from the newest L0 file.
+ //
+ // Intra-L0 compaction is independent of all other files, so it can be
+ // performed even when L0->base_level compactions are blocked.
+ //
+ // Returns true if `inputs` is populated with a span of files to be compacted;
+ // otherwise, returns false.
+ bool PickIntraL0Compaction();
+
+  // Return true if TrivialMove is extended. `start_index` is the index of
+  // the initial file picked, which should already be in `start_level_inputs_`.
+ bool TryExtendNonL0TrivialMove(int start_index);
+
+ // Picks a file from level_files to compact.
+ // level_files is a vector of (level, file metadata) in ascending order of
+ // level. If compact_to_next_level is true, compact the file to the next
+ // level, otherwise, compact to the same level as the input file.
+ void PickFileToCompact(
+ const autovector<std::pair<int, FileMetaData*>>& level_files,
+ bool compact_to_next_level);
+
+ const std::string& cf_name_;
+ VersionStorageInfo* vstorage_;
+ SequenceNumber earliest_mem_seqno_;
+ CompactionPicker* compaction_picker_;
+ LogBuffer* log_buffer_;
+ int start_level_ = -1;
+ int output_level_ = -1;
+ int parent_index_ = -1;
+ int base_index_ = -1;
+ double start_level_score_ = 0;
+ bool is_manual_ = false;
+ bool is_l0_trivial_move_ = false;
+ CompactionInputFiles start_level_inputs_;
+ std::vector<CompactionInputFiles> compaction_inputs_;
+ CompactionInputFiles output_level_inputs_;
+ std::vector<FileMetaData*> grandparents_;
+ CompactionReason compaction_reason_ = CompactionReason::kUnknown;
+
+ const MutableCFOptions& mutable_cf_options_;
+ const ImmutableOptions& ioptions_;
+ const MutableDBOptions& mutable_db_options_;
+  // Pick a path ID to place a newly generated file, given its level
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ int level);
+
+ static const int kMinFilesForIntraL0Compaction = 4;
+};
+
+void LevelCompactionBuilder::PickFileToCompact(
+ const autovector<std::pair<int, FileMetaData*>>& level_files,
+ bool compact_to_next_level) {
+ for (auto& level_file : level_files) {
+    // If it's being compacted, there is nothing to do here.
+    // If this assert() fails, it means that some function marked some
+    // files as being_compacted but didn't call ComputeCompactionScore().
+ assert(!level_file.second->being_compacted);
+ start_level_ = level_file.first;
+ if ((compact_to_next_level &&
+ start_level_ == vstorage_->num_non_empty_levels() - 1) ||
+ (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty())) {
+ continue;
+ }
+ if (compact_to_next_level) {
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+ } else {
+ output_level_ = start_level_;
+ }
+ start_level_inputs_.files = {level_file.second};
+ start_level_inputs_.level = start_level_;
+ if (compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_)) {
+ return;
+ }
+ }
+ start_level_inputs_.files.clear();
+}
+
+void LevelCompactionBuilder::SetupInitialFiles() {
+ // Find the compactions by size on all levels.
+ bool skipped_l0_to_base = false;
+ for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) {
+ start_level_score_ = vstorage_->CompactionScore(i);
+ start_level_ = vstorage_->CompactionScoreLevel(i);
+ assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1));
+ if (start_level_score_ >= 1) {
+ if (skipped_l0_to_base && start_level_ == vstorage_->base_level()) {
+ // If L0->base_level compaction is pending, don't schedule further
+ // compaction from base level. Otherwise L0->base_level compaction
+ // may starve.
+ continue;
+ }
+ output_level_ =
+ (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1;
+ if (PickFileToCompact()) {
+ // found the compaction!
+ if (start_level_ == 0) {
+ // L0 score = `num L0 files` / `level0_file_num_compaction_trigger`
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ } else {
+ // L1+ score = `Level files size` / `MaxBytesForLevel`
+ compaction_reason_ = CompactionReason::kLevelMaxLevelSize;
+ }
+ break;
+ } else {
+ // didn't find the compaction, clear the inputs
+ start_level_inputs_.clear();
+ if (start_level_ == 0) {
+ skipped_l0_to_base = true;
+ // L0->base_level may be blocked due to ongoing L0->base_level
+ // compactions. It may also be blocked by an ongoing compaction from
+ // base_level downwards.
+ //
+ // In these cases, to reduce L0 file count and thus reduce likelihood
+ // of write stalls, we can attempt compacting a span of files within
+ // L0.
+ if (PickIntraL0Compaction()) {
+ output_level_ = 0;
+ compaction_reason_ = CompactionReason::kLevelL0FilesNum;
+ break;
+ }
+ }
+ }
+ } else {
+ // Compaction scores are sorted in descending order, no further scores
+ // will be >= 1.
+ break;
+ }
+ }
+ if (!start_level_inputs_.empty()) {
+ return;
+ }
+
+ // if we didn't find a compaction, check if there are any files marked for
+ // compaction
+ parent_index_ = base_index_ = -1;
+
+ compaction_picker_->PickFilesMarkedForCompaction(
+ cf_name_, vstorage_, &start_level_, &output_level_, &start_level_inputs_);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kFilesMarkedForCompaction;
+ return;
+ }
+
+ // Bottommost Files Compaction on deleting tombstones
+ PickFileToCompact(vstorage_->BottommostFilesMarkedForCompaction(), false);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kBottommostFiles;
+ return;
+ }
+
+ // TTL Compaction
+ if (ioptions_.compaction_pri == kRoundRobin &&
+ !vstorage_->ExpiredTtlFiles().empty()) {
+ auto expired_files = vstorage_->ExpiredTtlFiles();
+ // the expired files list should already be sorted by level
+ start_level_ = expired_files.front().first;
+#ifndef NDEBUG
+ for (const auto& file : expired_files) {
+ assert(start_level_ <= file.first);
+ }
+#endif
+ if (start_level_ > 0) {
+ output_level_ = start_level_ + 1;
+ if (PickFileToCompact()) {
+ compaction_reason_ = CompactionReason::kRoundRobinTtl;
+ return;
+ }
+ }
+ }
+
+ PickFileToCompact(vstorage_->ExpiredTtlFiles(), true);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kTtl;
+ return;
+ }
+
+ // Periodic Compaction
+ PickFileToCompact(vstorage_->FilesMarkedForPeriodicCompaction(), false);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kPeriodicCompaction;
+ return;
+ }
+
+ // Forced blob garbage collection
+ PickFileToCompact(vstorage_->FilesMarkedForForcedBlobGC(), false);
+ if (!start_level_inputs_.empty()) {
+ compaction_reason_ = CompactionReason::kForcedBlobGC;
+ return;
+ }
+}
+
+bool LevelCompactionBuilder::SetupOtherL0FilesIfNeeded() {
+ if (start_level_ == 0 && output_level_ != 0 && !is_l0_trivial_move_) {
+ return compaction_picker_->GetOverlappingL0Files(
+ vstorage_, &start_level_inputs_, output_level_, &parent_index_);
+ }
+ return true;
+}
+
+void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
+ // We only expand when the start level is not L0 under round robin
+ assert(start_level_ >= 1);
+
+ // For round-robin compaction priority, we have 3 constraints when picking
+ // multiple files.
+ // Constraint 1: We can only pick consecutive files
+ // -> Constraint 1a: When a file is being compacted (or some input files
+ // are being compacted after expanding), we cannot
+ // choose it and have to stop choosing more files
+ // -> Constraint 1b: When we reach the last file (with largest keys), we
+ // cannot choose more files (the next file will be the
+ // first one)
+ // Constraint 2: We should ensure the total compaction bytes (including the
+ // overlapped files from the next level) is no more than
+ // mutable_cf_options_.max_compaction_bytes
+ // Constraint 3: We try our best to pick as many files as possible so that
+ // the post-compaction level size is less than
+ // MaxBytesForLevel(start_level_)
+ // Constraint 4: We do not expand if it is possible to apply a trivial move
+ // Constraint 5 (TODO): Try to pick minimal files to split into the target
+ // number of subcompactions
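+ // Illustrative example (not from the original comment): suppose the start
+ // level holds files F1..F5 in key order, expansion starts at F1, and F3 is
+ // being compacted. Expansion may grow the input to {F1, F2} but must stop
+ // before F3 (Constraint 1a); it also stops as soon as the picked files plus
+ // their overlapping output-level files exceed max_compaction_bytes
+ // (Constraint 2), or once enough start-level bytes have been picked to
+ // bring the level back under MaxBytesForLevel(start_level_) (Constraint 3).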
+ TEST_SYNC_POINT("LevelCompactionPicker::RoundRobin");
+
+ // Only expand the inputs when we have selected a file in start_level_inputs_
+ if (start_level_inputs_.size() == 0) return;
+
+ uint64_t start_lvl_bytes_no_compacting = 0;
+ uint64_t curr_bytes_to_compact = 0;
+ uint64_t start_lvl_max_bytes_to_compact = 0;
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+ // Constraint 3 (pre-calculate the ideal max bytes to compact)
+ for (auto f : level_files) {
+ if (!f->being_compacted) {
+ start_lvl_bytes_no_compacting += f->fd.GetFileSize();
+ }
+ }
+ if (start_lvl_bytes_no_compacting >
+ vstorage_->MaxBytesForLevel(start_level_)) {
+ start_lvl_max_bytes_to_compact = start_lvl_bytes_no_compacting -
+ vstorage_->MaxBytesForLevel(start_level_);
+ }
+
+ size_t start_index = vstorage_->FilesByCompactionPri(start_level_)[0];
+ InternalKey smallest, largest;
+ // Constraint 4 (No need to check again later)
+ compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (output_level_inputs.empty()) {
+ if (TryExtendNonL0TrivialMove((int)start_index)) {
+ return;
+ }
+ }
+ // Constraint 3
+ if (start_level_inputs_[0]->fd.GetFileSize() >=
+ start_lvl_max_bytes_to_compact) {
+ return;
+ }
+ CompactionInputFiles tmp_start_level_inputs;
+ tmp_start_level_inputs = start_level_inputs_;
+ // TODO (zichen): Future parallel round-robin may also need to update this
+ // Constraint 1b (only expand till the end)
+ for (size_t i = start_index + 1; i < level_files.size(); i++) {
+ auto* f = level_files[i];
+ if (f->being_compacted) {
+ // Constraint 1a
+ return;
+ }
+
+ tmp_start_level_inputs.files.push_back(f);
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &tmp_start_level_inputs) ||
+ compaction_picker_->FilesRangeOverlapWithCompaction(
+ {tmp_start_level_inputs}, output_level_,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level_, output_level_))) {
+ // Constraint 1a
+ tmp_start_level_inputs.clear();
+ return;
+ }
+
+ curr_bytes_to_compact = 0;
+ for (auto start_lvl_f : tmp_start_level_inputs.files) {
+ curr_bytes_to_compact += start_lvl_f->fd.GetFileSize();
+ }
+
+ // Check whether any output level files are locked
+ compaction_picker_->GetRange(tmp_start_level_inputs, &smallest, &largest);
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (!output_level_inputs.empty() &&
+ !compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &output_level_inputs)) {
+ // Constraint 1a
+ tmp_start_level_inputs.clear();
+ return;
+ }
+
+ uint64_t start_lvl_curr_bytes_to_compact = curr_bytes_to_compact;
+ for (auto output_lvl_f : output_level_inputs.files) {
+ curr_bytes_to_compact += output_lvl_f->fd.GetFileSize();
+ }
+ if (curr_bytes_to_compact > mutable_cf_options_.max_compaction_bytes) {
+ // Constraint 2
+ tmp_start_level_inputs.clear();
+ return;
+ }
+
+ start_level_inputs_.files = tmp_start_level_inputs.files;
+ // Constraint 3
+ if (start_lvl_curr_bytes_to_compact > start_lvl_max_bytes_to_compact) {
+ return;
+ }
+ }
+}
+
+bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() {
+ // Set up input files from the output level. For output to L0, we only
+ // compact spans of files that do not interact with any pending compactions,
+ // so we don't need to consider other levels.
+ if (output_level_ != 0) {
+ output_level_inputs_.level = output_level_;
+ bool round_robin_expanding =
+ ioptions_.compaction_pri == kRoundRobin &&
+ compaction_reason_ == CompactionReason::kLevelMaxLevelSize;
+ if (round_robin_expanding) {
+ SetupOtherFilesWithRoundRobinExpansion();
+ }
+ if (!is_l0_trivial_move_ &&
+ !compaction_picker_->SetupOtherInputs(
+ cf_name_, mutable_cf_options_, vstorage_, &start_level_inputs_,
+ &output_level_inputs_, &parent_index_, base_index_,
+ round_robin_expanding)) {
+ return false;
+ }
+
+ compaction_inputs_.push_back(start_level_inputs_);
+ if (!output_level_inputs_.empty()) {
+ compaction_inputs_.push_back(output_level_inputs_);
+ }
+
+ if (!is_l0_trivial_move_) {
+ // In some edge cases we could pick a compaction whose key range overlaps
+ // with another running compaction, with both of them having the same
+ // output level. This could happen if
+ // (1) we are running a non-exclusive manual compaction, or
+ // (2) AddFile ingests a new file into the LSM tree.
+ // We need to disallow this from happening.
+ if (compaction_picker_->FilesRangeOverlapWithCompaction(
+ compaction_inputs_, output_level_,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level_, output_level_))) {
+ // This compaction output could potentially conflict with the output
+ // of a currently running compaction, we cannot run it.
+ return false;
+ }
+ compaction_picker_->GetGrandparents(vstorage_, start_level_inputs_,
+ output_level_inputs_, &grandparents_);
+ }
+ } else {
+ compaction_inputs_.push_back(start_level_inputs_);
+ }
+ return true;
+}
+
+Compaction* LevelCompactionBuilder::PickCompaction() {
+ // Pick up the first file to start compaction. It may have been extended
+ // to a clean cut.
+ SetupInitialFiles();
+ if (start_level_inputs_.empty()) {
+ return nullptr;
+ }
+ assert(start_level_ >= 0 && output_level_ >= 0);
+
+ // If it is an L0 -> base level compaction, set up the other overlapping
+ // L0 files if needed.
+ if (!SetupOtherL0FilesIfNeeded()) {
+ return nullptr;
+ }
+
+ // Pick files in the output level and expand more files in the start level
+ // if needed.
+ if (!SetupOtherInputsIfNeeded()) {
+ return nullptr;
+ }
+
+ // Form a compaction object containing the files we picked.
+ Compaction* c = GetCompaction();
+
+ TEST_SYNC_POINT_CALLBACK("LevelCompactionPicker::PickCompaction:Return", c);
+
+ return c;
+}
+
+Compaction* LevelCompactionBuilder::GetCompaction() {
+ auto c = new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(compaction_inputs_), output_level_,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level_,
+ ioptions_.compaction_style, vstorage_->base_level(),
+ ioptions_.level_compaction_dynamic_level_bytes),
+ mutable_cf_options_.max_compaction_bytes,
+ GetPathId(ioptions_, mutable_cf_options_, output_level_),
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level_,
+ vstorage_->base_level()),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level_),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
+ /* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ start_level_ == 0 && !is_l0_trivial_move_,
+ compaction_reason_);
+
+ // If it's level 0 compaction, make sure we don't execute any other level 0
+ // compactions in parallel
+ compaction_picker_->RegisterCompaction(c);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+ // being compacted). Since we just changed compaction score, we recalculate it
+ // here
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ return c;
+}
+
+/*
+ * Find the optimal path to place a file.
+ * Given a level, finds the path such that the data of all levels up to and
+ * including that level fits within the target sizes of the paths up to and
+ * including the returned path.
+ */
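+// Illustrative example (assumed values, not from the original source): with
+// cf_paths target sizes {300MB, unlimited}, max_bytes_for_level_base = 100MB
+// and max_bytes_for_level_multiplier = 10, L0 (estimated as 100MB) and L1
+// (100MB) fit in path 0, while L2 (~1GB) does not and is assigned path 1.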
+uint32_t LevelCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, int level) {
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+
+ // size remaining in the most recent path
+ uint64_t current_path_size = ioptions.cf_paths[0].target_size;
+
+ uint64_t level_size;
+ int cur_level = 0;
+
+ // max_bytes_for_level_base denotes L1 size.
+ // We estimate L0 size to be the same as L1.
+ level_size = mutable_cf_options.max_bytes_for_level_base;
+
+ // Last path is the fallback
+ while (p < ioptions.cf_paths.size() - 1) {
+ if (level_size <= current_path_size) {
+ if (cur_level == level) {
+ // Does desired level fit in this path?
+ return p;
+ } else {
+ current_path_size -= level_size;
+ if (cur_level > 0) {
+ if (ioptions.level_compaction_dynamic_level_bytes) {
+ // Currently, level_compaction_dynamic_level_bytes is ignored when
+ // multiple db paths are specified.
+ // https://github.com/facebook/rocksdb/blob/main/db/column_family.cc.
+ // Still, adding this check to avoid accidentally using
+ // max_bytes_for_level_multiplier_additional
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier);
+ } else {
+ level_size = static_cast<uint64_t>(
+ level_size * mutable_cf_options.max_bytes_for_level_multiplier *
+ mutable_cf_options.MaxBytesMultiplerAdditional(cur_level));
+ }
+ }
+ cur_level++;
+ continue;
+ }
+ }
+ p++;
+ current_path_size = ioptions.cf_paths[p].target_size;
+ }
+ return p;
+}
+
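+// Tries to pick a span of the oldest L0 files whose key ranges neither
+// overlap each other nor any file in the output level, so that they can be
+// trivially moved down without being rewritten.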
+bool LevelCompactionBuilder::TryPickL0TrivialMove() {
+ if (vstorage_->base_level() <= 0) {
+ return false;
+ }
+ if (start_level_ == 0 && mutable_cf_options_.compression_per_level.empty() &&
+ !vstorage_->LevelFiles(output_level_).empty() &&
+ ioptions_.db_paths.size() <= 1) {
+ // Try to pick a trivial move from L0 to L1. We start from the oldest
+ // file and keep expanding to newer files as long as they would still
+ // form a trivial move.
+ // For now we don't support this when
+ // mutable_cf_options_.compression_per_level is set, to avoid the extra
+ // logic of determining whether L0 can be trivially moved to the next level.
+ // We also skip the case where the output level is empty, since in that
+ // case at least the oldest file would qualify for a trivial move, and
+ // that would be surprising behavior with few benefits.
+
+ // We search from the oldest file to the newest. In theory, files in the
+ // middle could form a trivial move too, but that is probably uncommon
+ // and we ignore those cases for simplicity.
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+
+ InternalKey my_smallest, my_largest;
+ for (auto it = level_files.rbegin(); it != level_files.rend(); ++it) {
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ FileMetaData* file = *it;
+ if (it == level_files.rbegin()) {
+ my_smallest = file->smallest;
+ my_largest = file->largest;
+ } else {
+ if (compaction_picker_->icmp()->Compare(file->largest, my_smallest) <
+ 0) {
+ my_smallest = file->smallest;
+ } else if (compaction_picker_->icmp()->Compare(file->smallest,
+ my_largest) > 0) {
+ my_largest = file->largest;
+ } else {
+ break;
+ }
+ }
+ vstorage_->GetOverlappingInputs(output_level_, &my_smallest, &my_largest,
+ &output_level_inputs.files);
+ if (output_level_inputs.empty()) {
+ assert(!file->being_compacted);
+ start_level_inputs_.files.push_back(file);
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (!start_level_inputs_.empty()) {
+ // Sort files by key range. Not sure it's 100% necessary, but it's cleaner
+ // to always keep files sorted by key when the key ranges don't overlap.
+ std::sort(start_level_inputs_.files.begin(),
+ start_level_inputs_.files.end(),
+ [icmp = compaction_picker_->icmp()](FileMetaData* f1,
+ FileMetaData* f2) -> bool {
+ return (icmp->Compare(f1->smallest, f2->smallest) < 0);
+ });
+
+ is_l0_trivial_move_ = true;
+ return true;
+ }
+ return false;
+}
+
+bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
+ if (start_level_inputs_.size() == 1 &&
+ (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
+ (mutable_cf_options_.compression_per_level.empty())) {
+ // Only the file at `start_index` has been picked, and it is likely a
+ // trivial move. Try to expand as long as it remains a trivial move, but
+ // not beyond max_compaction_bytes or 4 files, so that we don't create too
+ // much compaction pressure for the next level.
+ // Skip this if there is more than one DB path, as it would be hard to
+ // predict whether it would still be a trivial move.
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+ const size_t kMaxMultiTrivialMove = 4;
+ FileMetaData* initial_file = start_level_inputs_.files[0];
+ size_t total_size = initial_file->fd.GetFileSize();
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ for (int i = start_index + 1;
+ i < static_cast<int>(level_files.size()) &&
+ start_level_inputs_.size() < kMaxMultiTrivialMove;
+ i++) {
+ FileMetaData* next_file = level_files[i];
+ if (next_file->being_compacted) {
+ break;
+ }
+ vstorage_->GetOverlappingInputs(output_level_, &(initial_file->smallest),
+ &(next_file->largest),
+ &output_level_inputs.files);
+ if (!output_level_inputs.empty()) {
+ break;
+ }
+ if (i < static_cast<int>(level_files.size()) - 1 &&
+ compaction_picker_->icmp()
+ ->user_comparator()
+ ->CompareWithoutTimestamp(
+ next_file->largest.user_key(),
+ level_files[i + 1]->smallest.user_key()) == 0) {
+ TEST_SYNC_POINT_CALLBACK(
+ "LevelCompactionBuilder::TryExtendNonL0TrivialMove:NoCleanCut",
+ nullptr);
+ // Not a clean cut after adding the next file. Skip.
+ break;
+ }
+ total_size += next_file->fd.GetFileSize();
+ if (total_size > mutable_cf_options_.max_compaction_bytes) {
+ break;
+ }
+ start_level_inputs_.files.push_back(next_file);
+ }
+ return start_level_inputs_.size() > 1;
+ }
+ return false;
+}
+
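+// Picks the next file on start_level_ in compaction-priority order
+// (FilesByCompactionPri) that is not being compacted, expands it to a clean
+// cut, and verifies that the overlapping output-level files are not locked by
+// a pending compaction. For L0 it first tries a trivial move to the base
+// level; for other levels with no output overlap, it may try to extend the
+// pick into a multi-file trivial move.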
+bool LevelCompactionBuilder::PickFileToCompact() {
+ // Level 0 files are overlapping, so we cannot pick more than one
+ // concurrent compaction at this level. This could be made better by
+ // looking at the key ranges that are being compacted at level 0.
+ if (start_level_ == 0 &&
+ !compaction_picker_->level0_compactions_in_progress()->empty()) {
+ TEST_SYNC_POINT("LevelCompactionPicker::PickCompactionBySize:0");
+ return false;
+ }
+
+ start_level_inputs_.clear();
+ start_level_inputs_.level = start_level_;
+
+ assert(start_level_ >= 0);
+
+ if (TryPickL0TrivialMove()) {
+ return true;
+ }
+
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(start_level_);
+
+ // Pick the file with the highest score in this level that is not already
+ // being compacted.
+ const std::vector<int>& file_scores =
+ vstorage_->FilesByCompactionPri(start_level_);
+
+ unsigned int cmp_idx;
+ for (cmp_idx = vstorage_->NextCompactionIndex(start_level_);
+ cmp_idx < file_scores.size(); cmp_idx++) {
+ int index = file_scores[cmp_idx];
+ auto* f = level_files[index];
+
+ // do not pick a file to compact if it is being compacted
+ // from n-1 level.
+ if (f->being_compacted) {
+ if (ioptions_.compaction_pri == kRoundRobin) {
+ // TODO(zichen): this file may be involved in a compaction from
+ // an upper level, so we cannot advance the cursor for the round-robin
+ // policy. Currently, we do not pick any file to compact in this case.
+ // We should fix this later so that a compaction is still picked while
+ // the cursor is not advanced.
+ return false;
+ }
+ continue;
+ }
+
+ start_level_inputs_.files.push_back(f);
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &start_level_inputs_) ||
+ compaction_picker_->FilesRangeOverlapWithCompaction(
+ {start_level_inputs_}, output_level_,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level_, output_level_))) {
+ // A locked (pending compaction) input-level file was pulled in due to
+ // user-key overlap.
+ start_level_inputs_.clear();
+
+ if (ioptions_.compaction_pri == kRoundRobin) {
+ return false;
+ }
+ continue;
+ }
+
+ // Now that input level is fully expanded, we check whether any output
+ // files are locked due to pending compaction.
+ //
+ // Note we rely on ExpandInputsToCleanCut() to tell us whether any output-
+ // level files are locked, not just the extra ones pulled in for user-key
+ // overlap.
+ InternalKey smallest, largest;
+ compaction_picker_->GetRange(start_level_inputs_, &smallest, &largest);
+ CompactionInputFiles output_level_inputs;
+ output_level_inputs.level = output_level_;
+ vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
+ &output_level_inputs.files);
+ if (output_level_inputs.empty()) {
+ if (TryExtendNonL0TrivialMove(index)) {
+ break;
+ }
+ } else {
+ if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &output_level_inputs)) {
+ start_level_inputs_.clear();
+ if (ioptions_.compaction_pri == kRoundRobin) {
+ return false;
+ }
+ continue;
+ }
+ }
+
+ base_index_ = index;
+ break;
+ }
+
+ // store where to start the iteration in the next call to PickCompaction
+ if (ioptions_.compaction_pri != kRoundRobin) {
+ vstorage_->SetNextCompactionIndex(start_level_, cmp_idx);
+ }
+ return start_level_inputs_.size() > 0;
+}
+
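+// Picks a span of L0 files, starting from the newest, for an intra-L0
+// (L0 -> L0) compaction; see the comment on the declaration above.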
+bool LevelCompactionBuilder::PickIntraL0Compaction() {
+ start_level_inputs_.clear();
+ const std::vector<FileMetaData*>& level_files =
+ vstorage_->LevelFiles(0 /* level */);
+ if (level_files.size() <
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger + 2) ||
+ level_files[0]->being_compacted) {
+ // If L0 isn't accumulating many files beyond the regular trigger, don't
+ // resort to L0->L0 compaction yet.
+ return false;
+ }
+ return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction,
+ std::numeric_limits<uint64_t>::max(),
+ mutable_cf_options_.max_compaction_bytes,
+ &start_level_inputs_, earliest_mem_seqno_);
+}
+} // namespace
+
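+// Delegates leveled compaction picking to a LevelCompactionBuilder created
+// for this column family, version, and set of options.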
+Compaction* LevelCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) {
+ LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this,
+ log_buffer, mutable_cf_options, ioptions_,
+ mutable_db_options);
+ return builder.PickCompaction();
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_level.h b/src/rocksdb/db/compaction/compaction_picker_level.h
new file mode 100644
index 000000000..42a9b60a6
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_level.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Picking compactions for leveled compaction. See wiki page
+// https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
+// for description of Leveled compaction.
+class LevelCompactionPicker : public CompactionPicker {
+ public:
+ LevelCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_picker_test.cc b/src/rocksdb/db/compaction/compaction_picker_test.cc
new file mode 100644
index 000000000..2e2e566c0
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_test.cc
@@ -0,0 +1,3964 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker_fifo.h"
+#include "db/compaction/compaction_picker_level.h"
+#include "db/compaction/compaction_picker_universal.h"
+#include "db/compaction/file_pri.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CountingLogger : public Logger {
+ public:
+ using Logger::Logv;
+ void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+ size_t log_count;
+};
+
+class CompactionPickerTestBase : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ Options options_;
+ ImmutableOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ MutableDBOptions mutable_db_options_;
+ LevelCompactionPicker level_compaction_picker;
+ std::string cf_name_;
+ CountingLogger logger_;
+ LogBuffer log_buffer_;
+ uint32_t file_num_;
+ CompactionOptionsFIFO fifo_options_;
+ std::unique_ptr<VersionStorageInfo> vstorage_;
+ std::vector<std::unique_ptr<FileMetaData>> files_;
+ // does not own FileMetaData
+ std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
+ // input files to the compaction process.
+ std::vector<CompactionInputFiles> input_files_;
+ int compaction_level_start_;
+
+ explicit CompactionPickerTestBase(const Comparator* _ucmp)
+ : ucmp_(_ucmp),
+ icmp_(ucmp_),
+ options_(CreateOptions(ucmp_)),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ mutable_db_options_(),
+ level_compaction_picker(ioptions_, &icmp_),
+ cf_name_("dummy"),
+ log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_),
+ file_num_(1),
+ vstorage_(nullptr) {
+ mutable_cf_options_.ttl = 0;
+ mutable_cf_options_.periodic_compaction_seconds = 0;
+ // ioptions_.compaction_pri = kMinOverlappingRatio has its own set of
+ // tests to cover.
+ ioptions_.compaction_pri = kByCompensatedSize;
+ fifo_options_.max_table_files_size = 1;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ ioptions_.cf_paths.emplace_back("dummy",
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ ~CompactionPickerTestBase() override {}
+
+ void NewVersionStorage(int num_levels, CompactionStyle style) {
+ DeleteVersionStorage();
+ options_.num_levels = num_levels;
+ vstorage_.reset(new VersionStorageInfo(&icmp_, ucmp_, options_.num_levels,
+ style, nullptr, false));
+ vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+ }
+
+ // Create a new VersionStorageInfo object so we can add more files and then
+ // merge it with the existing VersionStorageInfo.
+ void AddVersionStorage() {
+ temp_vstorage_.reset(new VersionStorageInfo(
+ &icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style,
+ vstorage_.get(), false));
+ }
+
+ void DeleteVersionStorage() {
+ vstorage_.reset();
+ temp_vstorage_.reset();
+ files_.clear();
+ file_map_.clear();
+ input_files_.clear();
+ }
+
+ // REQUIRES: smallest and largest are c-style strings ending with '\0'
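+ // Example usage (as in the tests below): Add(0, 1U, "150", "200") registers
+ // file #1 covering the user-key range ["150", "200"] in level 0 with the
+ // default file size of 1.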
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 1, uint32_t path_id = 0,
+ SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+ size_t compensated_file_size = 0, bool marked_for_compact = false,
+ Temperature temperature = Temperature::kUnknown,
+ uint64_t oldest_ancestor_time = kUnknownOldestAncesterTime,
+ Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice()) {
+ assert(ts_of_smallest.size() == ucmp_->timestamp_size());
+ assert(ts_of_largest.size() == ucmp_->timestamp_size());
+
+ VersionStorageInfo* vstorage;
+ if (temp_vstorage_) {
+ vstorage = temp_vstorage_.get();
+ } else {
+ vstorage = vstorage_.get();
+ }
+ assert(level < vstorage->num_levels());
+ char* smallest_key_buf = nullptr;
+ char* largest_key_buf = nullptr;
+
+ if (!ts_of_smallest.empty()) {
+ smallest_key_buf = new char[strlen(smallest) + ucmp_->timestamp_size()];
+ memcpy(smallest_key_buf, smallest, strlen(smallest));
+ memcpy(smallest_key_buf + strlen(smallest), ts_of_smallest.data(),
+ ucmp_->timestamp_size());
+ largest_key_buf = new char[strlen(largest) + ucmp_->timestamp_size()];
+ memcpy(largest_key_buf, largest, strlen(largest));
+ memcpy(largest_key_buf + strlen(largest), ts_of_largest.data(),
+ ucmp_->timestamp_size());
+ }
+
+ InternalKey smallest_ikey = InternalKey(
+ smallest_key_buf ? Slice(smallest_key_buf,
+ ucmp_->timestamp_size() + strlen(smallest))
+ : smallest,
+ smallest_seq, kTypeValue);
+ InternalKey largest_ikey = InternalKey(
+ largest_key_buf
+ ? Slice(largest_key_buf, ucmp_->timestamp_size() + strlen(largest))
+ : largest,
+ largest_seq, kTypeValue);
+
+ FileMetaData* f = new FileMetaData(
+ file_number, path_id, file_size, smallest_ikey, largest_ikey,
+ smallest_seq, largest_seq, marked_for_compact, temperature,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ f->compensated_file_size =
+ (compensated_file_size != 0) ? compensated_file_size : file_size;
+ f->oldest_ancester_time = oldest_ancestor_time;
+ vstorage->AddFile(level, f);
+ files_.emplace_back(f);
+ file_map_.insert({file_number, {f, level}});
+
+ delete[] smallest_key_buf;
+ delete[] largest_key_buf;
+ }
+
+ void SetCompactionInputFilesLevels(int level_count, int start_level) {
+ input_files_.resize(level_count);
+ for (int i = 0; i < level_count; ++i) {
+ input_files_[i].level = start_level + i;
+ }
+ compaction_level_start_ = start_level;
+ }
+
+ void AddToCompactionFiles(uint32_t file_number) {
+ auto iter = file_map_.find(file_number);
+ assert(iter != file_map_.end());
+ int level = iter->second.second;
+ assert(level < vstorage_->num_levels());
+ input_files_[level - compaction_level_start_].files.emplace_back(
+ iter->second.first);
+ }
+
+ void UpdateVersionStorageInfo() {
+ if (temp_vstorage_) {
+ VersionBuilder builder(FileOptions(), &ioptions_, nullptr,
+ vstorage_.get(), nullptr);
+ ASSERT_OK(builder.SaveTo(temp_vstorage_.get()));
+ vstorage_ = std::move(temp_vstorage_);
+ }
+ vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ vstorage_->SetFinalized();
+ }
+
+ private:
+ Options CreateOptions(const Comparator* ucmp) const {
+ Options opts;
+ opts.comparator = ucmp;
+ return opts;
+ }
+
+ std::unique_ptr<VersionStorageInfo> temp_vstorage_;
+};
+
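+// CompactionPickerTest exercises the pickers with the plain bytewise
+// comparator, while CompactionPickerU64TsTest uses a bytewise comparator
+// wrapped with a 64-bit timestamp suffix.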
+class CompactionPickerTest : public CompactionPickerTestBase {
+ public:
+ explicit CompactionPickerTest()
+ : CompactionPickerTestBase(BytewiseComparator()) {}
+
+ ~CompactionPickerTest() override {}
+};
+
+class CompactionPickerU64TsTest : public CompactionPickerTestBase {
+ public:
+ explicit CompactionPickerU64TsTest()
+ : CompactionPickerTestBase(test::BytewiseComparatorWithU64TsWrapper()) {}
+
+ ~CompactionPickerU64TsTest() override {}
+};
+
+TEST_F(CompactionPickerTest, Empty) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ UpdateVersionStorageInfo();
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Single) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "p", "q");
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, Level0Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, Level1Trigger2) {
+ mutable_cf_options_.target_file_size_base = 10000000000;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(1, 66U, "150", "200", 1000000001U);
+ Add(1, 88U, "201", "300", 1000000000U);
+ Add(2, 6U, "150", "179", 1000000000U);
+ Add(2, 7U, "180", "220", 1000000000U);
+ Add(2, 8U, "221", "300", 1000000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(uint64_t{1073741824}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, LevelMaxScore) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ Add(0, 1U, "150", "200", 1000000U);
+ // Level 1 score 1.2
+ Add(1, 66U, "150", "200", 6000000U);
+ Add(1, 88U, "201", "300", 6000000U);
+ // Level 2 score 1.8. File 7 is the largest. Should be picked
+ Add(2, 6U, "150", "179", 60000000U);
+ Add(2, 7U, "180", "220", 60000001U);
+ Add(2, 8U, "221", "300", 60000000U);
+ // Level 3 score slightly larger than 1
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(mutable_cf_options_.target_file_size_base +
+ mutable_cf_options_.target_file_size_base / 10,
+ compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionLevel) {
+ const int kLevels = 6;
+ const int kFileCount = 20;
+
+ for (int level = 0; level < kLevels - 1; ++level) {
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ uint64_t file_size = vstorage_->MaxBytesForLevel(level) * 2 / kFileCount;
+ for (int file_count = 1; file_count <= kFileCount; ++file_count) {
+ // start a brand new version in each test.
+ NewVersionStorage(kLevels, kCompactionStyleLevel);
+ for (int i = 0; i < file_count; ++i) {
+ Add(level, i, std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), file_size, 0,
+ i * 100, i * 100 + 99);
+ }
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level);
+ ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ // release the version storage
+ DeleteVersionStorage();
+ }
+ }
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 2);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 2, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(1, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, Level0TriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(0, 2U, "200", "250");
+ Add(num_levels - 1, 3U, "200", "250", 300U);
+ Add(num_levels - 1, 4U, "300", "350", 3000U);
+ Add(num_levels - 3, 5U, "150", "180", 3U);
+ Add(num_levels - 3, 6U, "181", "300", 3U);
+ Add(num_levels - 3, 7U, "400", "450", 3U);
+
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(vstorage_->base_level(), num_levels - 3);
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(num_levels - 3, compaction->level(1));
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(2, static_cast<int>(compaction->num_input_levels()));
+ ASSERT_EQ(num_levels - 3, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200");
+ Add(num_levels - 1, 2U, "200", "250", 300U);
+ Add(num_levels - 1, 3U, "300", "350", 3000U);
+ Add(num_levels - 1, 4U, "400", "450", 3U);
+ Add(num_levels - 2, 5U, "150", "180", 300U);
+ Add(num_levels - 2, 6U, "181", "350", 500U);
+ Add(num_levels - 2, 7U, "400", "450", 200U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(0, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(num_levels - 1, compaction->output_level());
+}
+
+// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ // verify the trigger given different number of L0 files.
+ for (int i = 1;
+ i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) {
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ Add(0, i, std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100,
+ i * 100 + 99);
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+
+TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
+ const uint64_t kFileSize = 100000;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ ioptions_.allow_ingest_behind = true;
+ ioptions_.num_levels = 3;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // output level should be the one above the bottom-most
+ ASSERT_EQ(1, compaction->output_level());
+}
+
+// Tests whether files can be trivially moved in multi-level
+// universal compaction when the allow_trivial_move option is set.
+// In this test the input files overlap, so they cannot
+// be trivially moved.
+TEST_F(CompactionPickerTest, CannotTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // must return false when there are no files.
+ ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()),
+ false);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "100", "151", kFileSize, 0, 200, 251);
+ Add(1, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(2, 6U, "120", "200", kFileSize, 0, 20, 100);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(!compaction->is_trivial_move());
+}
+// Tests whether files can be trivially moved in multi-level
+// universal compaction when the allow_trivial_move option is set.
+// In this test the input files don't overlap, so they should
+// be trivially moved.
+TEST_F(CompactionPickerTest, AllowsTrivialMoveUniversal) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(1, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(2, 3U, "301", "350", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction->is_trivial_move());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction1) {
+ // The case where universal periodic compaction can be picked
+ // with some newer files being compacted.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", kFileSize, 0, 260, 300);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[3].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction2) {
+ // The case where universal periodic compaction does not pick a
+ // compaction of only one level if that level doesn't cover
+ // any file marked for periodic compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction3) {
+ // The case where universal periodic compaction does not
+ // pick up only the last sorted run (an L0 file) if it isn't
+ // marked for periodic compaction.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(0, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[5].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[1].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_FALSE(compaction);
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction4) {
+ // The case where universal periodic compaction couldn't form
+ // a compaction that includes any file marked for periodic compaction.
+ // Right now we form the compaction anyway if there is more than one
+ // sorted run. The case is included here just to validate that it
+ // doesn't crash.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "010", "080", kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", kFileSize, 0, 101, 150);
+
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[2].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(!compaction ||
+ compaction->start_level() != compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction5) {
+ // Test single L0 file periodic compaction triggering.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 6U, "150", "200", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(0, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalPeriodicCompaction6) {
+ // Test single sorted run non-L0 periodic compaction
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.periodic_compaction_seconds = 1000;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(4, 5U, "150", "200", kFileSize, 0, 500, 550);
+ Add(4, 6U, "350", "400", kFileSize, 0, 500, 550);
+ UpdateVersionStorageInfo();
+ vstorage_->TEST_AddFileMarkedForPeriodicCompaction(4, file_map_[6].first);
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace1) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 555555;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "310", "380", kFileSize, 0, 200, 251);
+ Add(3, 6U, "410", "880", kFileSize, 0, 200, 251);
+ Add(3, 7U, "910", "980", 1, 0, 200, 251);
+ Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+ Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+ Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+ Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+ Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+ // Add(4, 15U, "960", "970", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(3, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(5U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(0, 1)->fd.GetNumber());
+ // ASSERT_EQ(4U, compaction->num_input_files(1));
+ ASSERT_EQ(11U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(12U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(13U, compaction->input(1, 2)->fd.GetNumber());
+ ASSERT_EQ(14U, compaction->input(1, 3)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace2) {
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 400000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(1, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(2, 5U, "310", "380", kFileSize, 0, 200, 251);
+ Add(2, 6U, "410", "880", kFileSize, 0, 200, 251);
+ Add(2, 7U, "910", "980", kFileSize, 0, 200, 251);
+ Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+ Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+ Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+ Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+ Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(2, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(15U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace3) {
+ // Test bottom-level files that fall in the gaps between two upper-level
+ // files.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 300000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+ Add(3, 5U, "000", "180", kFileSize, 0, 200, 251);
+ Add(3, 6U, "181", "190", kFileSize, 0, 200, 251);
+ Add(3, 7U, "710", "810", kFileSize, 0, 200, 251);
+ Add(3, 8U, "820", "830", kFileSize, 0, 200, 251);
+ Add(3, 9U, "900", "991", kFileSize, 0, 200, 251);
+ Add(4, 10U, "201", "250", kFileSize, 0, 101, 150);
+ Add(4, 11U, "301", "350", kFileSize, 0, 101, 150);
+ Add(4, 12U, "401", "450", kFileSize, 0, 101, 150);
+ Add(4, 13U, "501", "750", kFileSize, 0, 101, 150);
+ Add(4, 14U, "801", "850", kFileSize, 0, 101, 150);
+ Add(4, 15U, "901", "950", kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(2, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 1)->fd.GetNumber());
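+ // No bottom-level (L4) file overlaps the compacted key range, so the last
+ // input level is empty.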
+ ASSERT_EQ(0, compaction->num_input_files(2));
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace4) {
+ // Test compaction candidates always cover many files.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 3200000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+
+ // Generate files like the following:
+ // L3: (1101, 1180) (1201, 1280) ... (7901, 7908)
+ // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010)
+ for (int i = 11; i < 79; i++) {
+ Add(3, 100 + i * 3, std::to_string(i * 100).c_str(),
+ std::to_string(i * 100 + 80).c_str(), kFileSize, 0, 200, 251);
+ // Add a tie breaker
+ if (i == 66) {
+ Add(3, 10000U, "6690", "6699", kFileSize, 0, 200, 251);
+ }
+
+ Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(),
+ std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251);
+ Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(),
+ std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251);
+ }
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(3, compaction->start_level());
+ ASSERT_EQ(6U, compaction->num_input_files(0));
+ ASSERT_EQ(100 + 62U * 3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(10000U, compaction->input(0, 5)->fd.GetNumber());
+ ASSERT_EQ(11, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, UniversalIncrementalSpace5) {
+ // Test that compaction candidates always cover many files when some
+ // individual files are larger than the size threshold.
+ const uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.max_compaction_bytes = 3200000;
+ mutable_cf_options_.compaction_options_universal.incremental = true;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 30;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(2, 2U, "010", "080", kFileSize, 0, 200, 251);
+
+ // Generate files like the following:
+ // L3: (1101, 1180) (1201, 1280) ... (7901, 7908)
+ // L4: (1130, 1150) (1160, 1210) (1230, 1250) (1260 1310) ... (7960, 8010)
+ for (int i = 11; i < 70; i++) {
+ Add(3, 100 + i * 3, std::to_string(i * 100).c_str(),
+ std::to_string(i * 100 + 80).c_str(),
+ i % 10 == 9 ? kFileSize * 100 : kFileSize, 0, 200, 251);
+
+ Add(4, 100 + i * 3 + 1, std::to_string(i * 100 + 30).c_str(),
+ std::to_string(i * 100 + 50).c_str(), kFileSize, 0, 200, 251);
+ Add(4, 100 + i * 3 + 2, std::to_string(i * 100 + 60).c_str(),
+ std::to_string(i * 100 + 110).c_str(), kFileSize, 0, 200, 251);
+ }
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction);
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(3, compaction->start_level());
+ ASSERT_EQ(6U, compaction->num_input_files(0));
+ ASSERT_EQ(100 + 14 * 3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(100 + 19 * 3, compaction->input(0, 5)->fd.GetNumber());
+ ASSERT_EQ(13, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const int kFileCount =
+ mutable_cf_options_.level0_file_num_compaction_trigger * 3;
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * kFileCount / 2;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // Must return false when there are no files.
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false);
+
+ // verify whether compaction is needed based on the current
+ // size of L0 files.
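+ // With max_table_files_size set to half of the total file size, the score is
+ // expected to reach 1 roughly halfway through the loop.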
+ for (int i = 1; i <= kFileCount; ++i) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ Add(0, i, std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), kFileSize, 0, i * 100,
+ i * 100 + 99);
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()),
+ vstorage_->CompactionScore(0) >= 1);
+ }
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarm1) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarm2) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 9;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kUnknown, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
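+ // max_compaction_bytes is 9 * kFileSize: files 1U and 2U (4 * kFileSize each)
+ // fit, but adding 3U (another 4 * kFileSize) would exceed the limit.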
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kUnknown, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ file_map_[2].first->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ // Stop if a file is being compacted
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) {
+ NewVersionStorage(1, kCompactionStyleFIFO);
+ const uint64_t kFileSize = 100000;
+ const uint64_t kMaxSize = kFileSize * 100000;
+ uint64_t kWarmThreshold = 2000;
+
+ fifo_options_.max_table_files_size = kMaxSize;
+ fifo_options_.age_for_warm = kWarmThreshold;
+ mutable_cf_options_.compaction_options_fifo = fifo_options_;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
+ FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
+
+ int64_t current_time = 0;
+ ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
+ uint64_t threshold_time =
+ static_cast<uint64_t>(current_time) - kWarmThreshold;
+ Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
+ Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
+ Temperature::kUnknown, threshold_time + 100);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
+ Temperature::kUnknown, threshold_time - 2000);
+ Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
+ Temperature::kWarm, threshold_time - 3000);
+ Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
+ Temperature::kUnknown, threshold_time - 4000);
+ Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
+ Temperature::kWarm, threshold_time - 5000);
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
+ std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ // Only the non-warm file between the two already-warm files is picked.
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 100000000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+
+ Add(2, 6U, "150", "179", 50000000U);
+ Add(2, 7U, "180", "220", 50000000U);
+ Add(2, 8U, "321", "400", 50000000U); // File not overlapping
+ Add(2, 9U, "721", "800", 50000000U);
+
+ Add(3, 26U, "150", "170", 260000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 260000000U);
+ Add(3, 30U, "750", "900", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Pick file 8 because it overlaps with 0 files on level 3.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+ // Compaction input size * 1.1
+ ASSERT_GE(uint64_t{55000000}, compaction->OutputFilePreallocationSize());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.target_file_size_base = 10000000;
+ mutable_cf_options_.target_file_size_multiplier = 10;
+ mutable_cf_options_.max_bytes_for_level_base = 10 * 1024 * 1024;
+
+ Add(2, 6U, "150", "175",
+ 60000000U); // Overlaps with file 26, 27, total size 521M
+ Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size
+ // 520M, the smallest overlapping
+ Add(2, 8U, "201", "300",
+ 60000000U); // Overlaps with file 28, 29, total size 521M
+
+ Add(3, 25U, "100", "110", 261000000U);
+ Add(3, 26U, "150", "170", 261000000U);
+ Add(3, 27U, "171", "179", 260000000U);
+ Add(3, 28U, "191", "220", 260000000U);
+ Add(3, 29U, "221", "300", 261000000U);
+ Add(3, 30U, "321", "400", 261000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 7 because its overlapping ratio is the smallest.
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ // Files 7 and 8 each overlap with a single 260MB file, but file 8 itself is
+ // larger, so its overlapping ratio is smaller and it will be picked.
+ Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file
+ // itself is larger. Should be picked.
+
+ Add(3, 26U, "160", "165", 260000000U);
+ Add(3, 27U, "166", "170", 260000000U);
+ Add(3, 28U, "180", "400", 260000000U);
+ Add(3, 29U, "401", "500", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 8 because its overlapping ratio is the smallest.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlapping4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 10000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+
+ // File 6 overlaps with files 26 and 27, but its compensated file size is
+ // boosted, so its overlapping ratio is the smallest and it will be picked.
+ Add(2, 6U, "150", "167", 60000000U, 0, 100, 100, 180000000U);
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28
+
+ Add(3, 26U, "160", "165", 60000000U);
+ // Boosted file size in output level is not considered.
+ Add(3, 27U, "166", "170", 60000000U, 0, 100, 100, 260000000U);
+ Add(3, 28U, "180", "400", 60000000U);
+ Add(3, 29U, "401", "500", 60000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 6 because its overlapping ratio, based on the compensated
+ // file size, is the smallest.
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CompactionPriRoundRobin) {
+ std::vector<InternalKey> test_cursors = {InternalKey("249", 100, kTypeValue),
+ InternalKey("600", 100, kTypeValue),
+ InternalKey()};
+ std::vector<uint32_t> selected_files = {8U, 6U, 6U};
+
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_bytes_for_level_base = 12000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ for (size_t i = 0; i < test_cursors.size(); i++) {
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor
+ vstorage_->AddCursorForOneLevel(2, test_cursors[i]);
+ Add(2, 6U, "150", "199", 50000000U); // Overlap with 26U, 27U
+ Add(2, 7U, "200", "249", 50000000U); // File not overlapping
+ Add(2, 8U, "300", "600", 50000000U); // Overlap with 28U, 29U
+
+ Add(3, 26U, "130", "165", 60000000U);
+ Add(3, 27U, "166", "170", 60000000U);
+ Add(3, 28U, "270", "340", 60000000U);
+ Add(3, 29U, "401", "500", 60000000U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ // Since the max bytes for level 2 is 120M, picking one file to compact
+ // makes the post-compaction level size less than 120M, so exactly one
+ // file is picked for round-robin compaction.
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(selected_files[i], compaction->input(0, 0)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+ }
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin1) {
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_compaction_bytes = 100000000u;
+ mutable_cf_options_.max_bytes_for_level_base = 120;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor (file picking should start with 7U)
+ vstorage_->AddCursorForOneLevel(2, InternalKey("199", 100, kTypeValue));
+ Add(2, 6U, "150", "199", 500U);
+ Add(2, 7U, "200", "249", 500U);
+ Add(2, 8U, "300", "600", 500U);
+ Add(2, 9U, "700", "800", 500U);
+ Add(2, 10U, "850", "950", 500U);
+
+ Add(3, 26U, "130", "165", 600U);
+ Add(3, 27U, "166", "170", 600U);
+ Add(3, 28U, "270", "340", 600U);
+ Add(3, 29U, "401", "500", 600U);
+ Add(3, 30U, "601", "800", 600U);
+ Add(3, 31U, "830", "890", 600U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+
+ // The maximum compaction bytes is very large in this case, so we can ignore
+ // its constraint in this test. The maximum bytes for level 2 is 1200 bytes,
+ // and thus at least 3 files should be picked so that the bytes remaining in
+ // level 2 are less than the maximum.
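+ // e.g. level 2 holds 5 * 500 = 2500 bytes; compacting 3 files (1500 bytes)
+ // leaves 1000 bytes (< 1200), while compacting only 2 would leave 1500 bytes.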
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(8U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(9U, compaction->input(0, 2)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin2) {
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_compaction_bytes = 2500u;
+ mutable_cf_options_.max_bytes_for_level_base = 120;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor (file picking should start with 6U)
+ vstorage_->AddCursorForOneLevel(2, InternalKey("1000", 100, kTypeValue));
+ Add(2, 6U, "150", "199", 500U); // Overlap with 26U, 27U
+ Add(2, 7U, "200", "249", 500U); // Overlap with 27U
+ Add(2, 8U, "300", "600", 500U); // Overlap with 28U, 29U
+ Add(2, 9U, "700", "800", 500U);
+ Add(2, 10U, "850", "950", 500U);
+
+ Add(3, 26U, "130", "165", 600U);
+ Add(3, 27U, "166", "230", 600U);
+ Add(3, 28U, "270", "340", 600U);
+ Add(3, 29U, "401", "500", 600U);
+ Add(3, 30U, "601", "800", 600U);
+ Add(3, 31U, "830", "890", 600U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+
+ // The maximum compaction bytes is only 2500 bytes now. Even though we would
+ // need to choose 3 files so that the post-compaction level size is less than
+ // 1200 bytes, we cannot pick 3 files to compact since the maximum compaction
+ // size is 2500. After picking files 6U and 7U, the number of compaction
+ // bytes has reached 2200, leaving no room to add another input file.
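+ // e.g. 6U and 7U plus their overlapping L3 files 26U and 27U give
+ // 500 + 500 + 600 + 600 = 2200 bytes; pulling in 8U and its overlaps
+ // (28U, 29U) would push the total past 2500.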
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(6U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(0, 1)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMultipleFilesRoundRobin3) {
+ ioptions_.compaction_pri = kRoundRobin;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ mutable_cf_options_.max_bytes_for_level_base = 120;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ // start a brand new version in each test.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ vstorage_->ResizeCompactCursors(6);
+ // Set the cursor (file picking should start with 9U)
+ vstorage_->AddCursorForOneLevel(2, InternalKey("700", 100, kTypeValue));
+ Add(2, 6U, "150", "199", 500U);
+ Add(2, 7U, "200", "249", 500U);
+ Add(2, 8U, "300", "600", 500U);
+ Add(2, 9U, "700", "800", 500U);
+ Add(2, 10U, "850", "950", 500U);
+
+ Add(3, 26U, "130", "165", 600U);
+ Add(3, 27U, "166", "170", 600U);
+ Add(3, 28U, "270", "340", 600U);
+ Add(3, 29U, "401", "500", 600U);
+ Add(3, 30U, "601", "800", 600U);
+ Add(3, 31U, "830", "890", 600U);
+ UpdateVersionStorageInfo();
+ LevelCompactionPicker local_level_compaction_picker =
+ LevelCompactionPicker(ioptions_, &icmp_);
+ std::unique_ptr<Compaction> compaction(
+ local_level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+
+ // Cannot pick more files since we have reached the last file in level 2.
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(9U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(10U, compaction->input(0, 1)->fd.GetNumber());
+ // release the version storage
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, CompactionPriMinOverlappingManyFiles) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ mutable_cf_options_.max_bytes_for_level_base = 15000000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ // Files 7 and 8 each overlap with a single large file, but file 8 itself is
+ // larger, so its overlapping ratio is smaller and it will be picked.
+ Add(2, 13U, "010", "011",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 14U, "020", "021",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 15U, "030", "031",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 16U, "040", "041",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 17U, "050", "051",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 18U, "060", "061",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 19U, "070", "071",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 20U, "080", "081",
+ 6100U); // Overlaps with a large file. Not picked
+
+ Add(2, 6U, "150", "167", 60000000U); // Overlaps with file 26, 27
+ Add(2, 7U, "168", "169", 60000000U); // Overlaps with file 27
+ Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 28, but the file
+ // itself is larger. Should be picked.
+ Add(2, 9U, "610", "611",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 10U, "620", "621",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 11U, "630", "631",
+ 6100U); // Overlaps with a large file. Not picked
+ Add(2, 12U, "640", "641",
+ 6100U); // Overlaps with a large file. Not picked
+
+ Add(3, 31U, "001", "100", 260000000U);
+ Add(3, 26U, "160", "165", 260000000U);
+ Add(3, 27U, "166", "170", 260000000U);
+ Add(3, 28U, "180", "400", 260000000U);
+ Add(3, 29U, "401", "500", 260000000U);
+ Add(3, 30U, "601", "700", 260000000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ // Picking file 8 because its overlapping ratio is the smallest.
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber());
+}
+
+// This test exhibits the bug where we don't properly reset parent_index in
+// PickCompaction()
+TEST_F(CompactionPickerTest, ParentIndexResetBug) {
+ int num_levels = ioptions_.num_levels;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 200;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200"); // <- marked for compaction
+ Add(1, 3U, "400", "500", 600); // <- this one needs compacting
+ Add(2, 4U, "150", "200");
+ Add(2, 5U, "201", "210");
+ Add(2, 6U, "300", "310");
+ Add(2, 7U, "400", "500"); // <- being compacted
+
+ vstorage_->LevelFiles(2)[3]->being_compacted = true;
+ vstorage_->LevelFiles(0)[0]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+}
+
+// This test checks ExpandWhileOverlapping() by having overlapping user keys
+// ranges (with different sequence numbers) in the input files.
+TEST_F(CompactionPickerTest, OverlappingUserKeys) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ ioptions_.compaction_pri = kByCompensatedSize;
+
+ Add(1, 1U, "100", "150", 1U);
+ // Overlapping user keys
+ Add(1, 2U, "200", "400", 1U);
+ Add(1, 3U, "400", "500", 1000000000U, 0, 0);
+ Add(2, 4U, "600", "700", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(1, 1)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 2)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Chain of overlapping user key ranges (forces ExpandWhileOverlapping() to
+ // expand multiple times)
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "200", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1000000000U, 0, 0);
+ Add(1, 4U, "250", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(0, 4)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys4) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_bytes_for_level_base = 1000000;
+
+ Add(1, 1U, "100", "150", 1U);
+ Add(1, 2U, "150", "199", 1U, 0, 0);
+ Add(1, 3U, "200", "250", 1100000U, 0, 0);
+ Add(1, 4U, "251", "300", 1U, 0, 0);
+ Add(1, 5U, "300", "350", 1U, 0, 0);
+
+ Add(2, 6U, "100", "115", 1U);
+ Add(2, 7U, "125", "325", 1U);
+ Add(2, 8U, "350", "400", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys5) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1000000000U);
+ Add(1, 2U, "400", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(2)[2]->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys6) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1U, 0, 0);
+ Add(2, 3U, "000", "100", 1U);
+ Add(2, 4U, "100", "300", 1U, 0, 0);
+ Add(2, 5U, "305", "450", 1U, 0, 0);
+ Add(2, 6U, "460", "600", 1U, 0, 0);
+ Add(2, 7U, "600", "700", 1U, 0, 0);
+
+ vstorage_->LevelFiles(1)[0]->marked_for_compaction = true;
+ vstorage_->LevelFiles(1)[1]->marked_for_compaction = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys7) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // Overlapping user keys on same level and output level
+ Add(1, 1U, "200", "400", 1U, 0, 0);
+ Add(1, 2U, "401", "500", 1000000000U, 0, 0);
+ Add(2, 3U, "100", "250", 1U);
+ Add(2, 4U, "300", "600", 1U, 0, 0);
+ Add(2, 5U, "600", "800", 1U, 0, 0);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_GE(1U, compaction->num_input_files(0));
+ ASSERT_GE(2U, compaction->num_input_files(1));
+ // File 5 has to be included in the compaction
+ ASSERT_EQ(5U, compaction->inputs(1)->back()->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys8) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // no overlapping case
+ Add(1, 1U, "101", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "150", "200", 1U);
+ Add(2, 7U, "200", "450", 1U, 0, 0);
+ Add(2, 8U, "500", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys9) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+ // grow the number of inputs in "level" without
+ // changing the number of "level+1" files we pick up
+ // Expand input level as much as possible
+ // overlapping case
+ Add(1, 1U, "121", "150", 1U);
+ Add(1, 2U, "151", "200", 1U);
+ Add(1, 3U, "201", "300", 1000000000U);
+ Add(1, 4U, "301", "400", 1U);
+ Add(1, 5U, "401", "500", 1U);
+ Add(2, 6U, "100", "120", 1U);
+ Add(2, 7U, "150", "200", 1U);
+ Add(2, 8U, "200", "450", 1U, 0, 0);
+ Add(2, 9U, "501", "600", 1U);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(2U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(4U, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+ ASSERT_EQ(8U, compaction->input(1, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys10) {
+ // Locked file encountered when pulling in extra input-level files with same
+ // user keys. Verify we pick the next-best file from the same input level.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // file_number 2U is largest and thus first choice. But it overlaps with
+ // file_number 1U which is being compacted. So instead we pick the next-
+ // biggest file, 3U, which is eligible for compaction.
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 900000000U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "150" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, OverlappingUserKeys11) {
+ // Locked file encountered when pulling in extra output-level files with same
+ // user keys. Expected to skip that compaction and pick the next-best choice.
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ // score(L1) = 3.7
+ // score(L2) = 1.85
+ // There is no eligible file in L1 to compact since both candidates pull in
+ // file_number 5U, which overlaps with a file pending compaction (6U). The
+ // first eligible compaction is from L2->L3.
+ Add(1 /* level */, 2U /* file_number */, "151" /* smallest */,
+ "200" /* largest */, 1000000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "201" /* smallest */,
+ "250" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 4U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 5000000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "201" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "201" /* smallest */,
+ "249" /* largest */, 1U /* file_size */, 0 /* smallest_seq */,
+ 0 /* largest_seq */);
+ file_map_[6U].first->being_compacted = true;
+ Add(3 /* level */, 7U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1U /* file_size */);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(7U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, FileTtlBooster) {
+ // Set TTL to 2048
+ // TTL boosting for all levels starts at 1024,
+ // Whole TTL range is 2048 * 31 / 32 - 1024 = 1984 - 1024 = 960.
+ // From the second-to-last level (L5) downwards, the boosting range starts at
+ // 1024 + 480 (L5), 1024 + 240 (L4), 1024 + 120 (L3).
+ // Boosting step for L3: 120 / 16 = 7.5 -> 7
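+ // e.g. for L3, a file aged 1024 + 120 + 30 is 30 past the boost start:
+ // 30 / 7 = 4 extra steps, giving a boost score of 1 + 4 = 5.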
+ //
+ const uint64_t kCurrentTime = 1000000;
+ FileMetaData meta;
+
+ {
+ FileTtlBooster booster(kCurrentTime, 2048, 7, 3);
+
+ // Not triggering if the file is younger than ttl/2
+ meta.oldest_ancester_time = kCurrentTime - 1023;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - 1024;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime + 10;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+
+ // Within one boosting step
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 6);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+
+ // One boosting step
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 7);
+ ASSERT_EQ(2, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 8);
+ ASSERT_EQ(2, booster.GetBoostScore(&meta));
+
+ // Multiple boosting steps
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 30);
+ ASSERT_EQ(5, booster.GetBoostScore(&meta));
+
+ // Very high boosting steps
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 120 + 700);
+ ASSERT_EQ(101, booster.GetBoostScore(&meta));
+ }
+ {
+ // Test second last level
+ FileTtlBooster booster(kCurrentTime, 2048, 7, 5);
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60);
+ ASSERT_EQ(3, booster.GetBoostScore(&meta));
+ }
+ {
+ // Test last level
+ FileTtlBooster booster(kCurrentTime, 2048, 7, 6);
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - (1024 + 480 + 60);
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ meta.oldest_ancester_time = kCurrentTime - 3000;
+ ASSERT_EQ(1, booster.GetBoostScore(&meta));
+ }
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri1) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // No compaction should be scheduled if L0 has higher priority than L1
+ // but the L0->L1 compaction is blocked by a file in L1 being compacted.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri2) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 total size 2GB, score 2.2. If one file being compacted, score 1.1.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // If no file in L1 being compacted, L0->L1 compaction will be scheduled.
+ UpdateVersionStorageInfo(); // being_compacted flag is cleared here.
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, NotScheduleL1IfL0WithHigherPri3) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.max_bytes_for_level_base = 900000000U;
+
+ // 6 L0 files, score 3.
+ Add(0, 1U, "000", "400", 1U);
+ Add(0, 2U, "001", "400", 1U, 0, 0);
+ Add(0, 3U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 31U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 32U, "001", "400", 1000000000U, 0, 0);
+ Add(0, 33U, "001", "400", 1000000000U, 0, 0);
+
+ // L1 score more than 6.
+ Add(1, 4U, "050", "300", 1000000000U, 0, 0);
+ file_map_[4u].first->being_compacted = true;
+ Add(1, 5U, "301", "350", 1000000000U, 0, 0);
+ Add(1, 51U, "351", "400", 6000000000U, 0, 0);
+
+ // Output level overlaps with the beginning and the end of the chain
+ Add(2, 6U, "050", "100", 1U);
+ Add(2, 7U, "300", "400", 1U);
+
+ // If the score of L1 is larger than that of L0, the L1 compaction goes
+ // through even though there is a pending L0 compaction.
+ UpdateVersionStorageInfo();
+ ASSERT_EQ(1, vstorage_->CompactionScoreLevel(0));
+ ASSERT_EQ(0, vstorage_->CompactionScoreLevel(1));
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded1) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 3U, "150", "200", 200);
+ // Level 1 is over target by 200
+ Add(1, 4U, "400", "500", 600);
+ Add(1, 5U, "600", "700", 600);
+ // Level 2 is less than its target of 10000 even after adding the size of
+ // level 1.
+ // Size ratio of L2/L1 is 9600 / 1200 = 8
+ Add(2, 6U, "150", "200", 2500);
+ Add(2, 7U, "201", "210", 2000);
+ Add(2, 8U, "300", "310", 2600);
+ Add(2, 9U, "400", "500", 2500);
+ // Level 3 exceeds its target of 100,000 by 1,000.
+ Add(3, 10U, "400", "500", 101000);
+ // Level 4 exceeds target 1,000,000 by 900 after adding size from level 3
+ // Size ratio L4/L3 is 9.9
+ // After merge from L3, L4 size is 1000900
+ Add(4, 11U, "400", "500", 999900);
+ Add(5, 12U, "400", "500", 8007200);
+
+ UpdateVersionStorageInfo();
+
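+ // Expected bytes: L1->L2: 200 * (8 + 1) = 1800, L3->L4: 1000 * (9.9 + 1) =
+ // 10900, L4->L5: 900 * (8 + 1) = 8100.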
+ ASSERT_EQ(200u * 9u + 10900u + 900u * 9,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded2) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // Level 1 size will be 1400 after merging with L0
+ Add(1, 7U, "400", "500", 200);
+ Add(1, 8U, "600", "700", 200);
+ // Level 2 is less than its target of 10000 even after adding the size of
+ // level 1.
+ Add(2, 9U, "150", "200", 9100);
+ // Level 3 over the target, but since level 4 is empty, we assume it will be
+ // a trivial move.
+ Add(3, 10U, "400", "500", 101000);
+
+ UpdateVersionStorageInfo();
+
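+ // L0->L1 merge: 1000 (L0) + 400 (L1) = 1400 bytes; L1 then exceeds its
+ // target of 1000 by 400.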
+ // estimated L1->L2 merge: 400 * (9100.0 / 1400.0 + 1.0)
+ ASSERT_EQ(1400u + 3000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeeded3) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+ Add(0, 1U, "150", "200", 2000);
+ Add(0, 2U, "150", "200", 2000);
+ Add(0, 4U, "150", "200", 2000);
+ Add(0, 5U, "150", "200", 2000);
+ Add(0, 6U, "150", "200", 1000);
+ // Level 1 size will be 10000 after merging with L0
+ Add(1, 7U, "400", "500", 500);
+ Add(1, 8U, "600", "700", 500);
+
+ Add(2, 9U, "150", "200", 10000);
+
+ UpdateVersionStorageInfo();
+
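+ // Expected bytes: L0->L1 merge: 9000 + 1000 = 10000; L1 (now 10000) exceeds
+ // its target of 1000 by 9000, and with L2/L1 ratio 1 the L1->L2 estimate is
+ // 9000 * (1 + 1) = 18000.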
+ ASSERT_EQ(10000u + 18000u, vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) {
+ int num_levels = ioptions_.num_levels;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ // Set the last level size to 50000.
+ // num_levels - 1 has a target of 5000.
+ // num_levels - 2 is the base level with target 1000 (rounded up to
+ // max_bytes_for_level_base).
+ Add(num_levels - 1, 10U, "400", "500", 50000);
+
+ Add(0, 1U, "150", "200", 200);
+ Add(0, 2U, "150", "200", 200);
+ Add(0, 4U, "150", "200", 200);
+ Add(0, 5U, "150", "200", 200);
+ Add(0, 6U, "150", "200", 200);
+ // num_levels - 3 is over target by 100 + 1000
+ Add(num_levels - 3, 7U, "400", "500", 550);
+ Add(num_levels - 3, 8U, "600", "700", 550);
+ // num_levels - 2 is over its target by 200, plus 1100 incoming from num_levels - 3
+ Add(num_levels - 2, 9U, "150", "200", 5200);
+
+ UpdateVersionStorageInfo();
+
+ // Merging to the second last level: (5200 / 2100 + 1) * 1100
+ // Merging to the last level: (50000 / 6300 + 1) * 1300
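+ // 2100 is the base level size after merging with L0 (1000 + 1100);
+ // the two products above evaluate to roughly 3823 and 11617.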
+ ASSERT_EQ(2100u + 3823u + 11617u,
+ vstorage_->estimated_compaction_needed_bytes());
+}
+
+TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
+ // case 1: Higher levels are empty
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ bool result =
+ Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 2: Higher levels have no overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "k", "p");
+ Add(3, 8U, "t", "w");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // case 3.1: Higher levels (level 3) have overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "e", "g");
+ Add(3, 8U, "h", "k");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.2: Higher levels (level 5) have overlap
+ DeleteVersionStorage();
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "h", "k");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping
+ // one key ("d")
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "a", "m");
+ Add(0, 2U, "c", "z");
+ Add(1, 3U, "d", "e");
+ Add(1, 4U, "l", "p");
+ Add(2, 5U, "g", "i");
+ Add(2, 6U, "x", "z");
+ Add(3, 7U, "j", "k");
+ Add(3, 8U, "l", "m");
+ Add(4, 9U, "a", "b");
+ Add(5, 10U, "c", "cc");
+ Add(5, 11U, "ccc", "d");
+ Add(5, 12U, "y", "yy");
+ Add(5, 13U, "z", "zz");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 1);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(5U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "z");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ // Level 0 files don't overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(1, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_TRUE(result);
+
+ // Level 1 files overlap
+ NewVersionStorage(6, kCompactionStyleLevel);
+ Add(0, 1U, "s", "t");
+ Add(0, 2U, "a", "m");
+ Add(0, 3U, "b", "k");
+ Add(0, 4U, "e", "f");
+ Add(1, 5U, "a", "m");
+ Add(1, 6U, "n", "o");
+ Add(1, 7U, "w", "y");
+ Add(5, 10U, "y", "z");
+ UpdateVersionStorageInfo();
+ SetCompactionInputFilesLevels(2, 0);
+ AddToCompactionFiles(1U);
+ AddToCompactionFiles(2U);
+ AddToCompactionFiles(3U);
+ AddToCompactionFiles(4U);
+ AddToCompactionFiles(5U);
+ AddToCompactionFiles(6U);
+ AddToCompactionFiles(7U);
+ result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+ ASSERT_FALSE(result);
+
+ DeleteVersionStorage();
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 800000u;
+ mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick files 2 and 5.
+ // It cannot expand to files 1 and 3, because doing so would push the
+ // compaction size over mutable_cf_options_.max_compaction_bytes.
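+ // Files 1, 2 and 3 add up to 900,001 bytes, which is over the 800,000
+ // max_compaction_bytes limit but under max_bytes_for_level_base.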
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "100", "256", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, MaxCompactionBytesNotHit) {
+ mutable_cf_options_.max_bytes_for_level_base = 800000u;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ mutable_cf_options_.ignore_max_compaction_bytes_for_input = false;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick files 2 and 5,
+ // and it expands to include files 1 and 3 as well.
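+ // Files 1, 2 and 3 add up to 900,001 bytes, which stays within the
+ // 1,000,000 max_compaction_bytes limit.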
+ Add(1, 1U, "100", "150", 300000U);
+ Add(1, 2U, "151", "200", 300001U, 0, 0);
+ Add(1, 3U, "201", "250", 300000U, 0, 0);
+ Add(1, 4U, "251", "300", 300000U, 0, 0);
+ Add(2, 5U, "000", "251", 1U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3U, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(5U, compaction->input(1, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOn) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000u;
+ mutable_cf_options_.max_compaction_bytes = 10001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "100", "150", 3000U);
+ Add(1, 2U, "151", "200", 3001U);
+ Add(1, 3U, "201", "250", 3000U);
+ Add(1, 4U, "251", "300", 3000U);
+
+ Add(3, 5U, "120", "130", 7000U);
+ Add(3, 6U, "170", "180", 7000U);
+ Add(3, 7U, "220", "230", 7000U);
+ Add(3, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMove1) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_compaction_bytes = 10000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(0, 1U, "100", "150", 3000U, 0, 710, 800);
+ Add(0, 2U, "151", "200", 3001U, 0, 610, 700);
+ Add(0, 3U, "301", "350", 3000U, 0, 510, 600);
+ Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+ Add(1, 5U, "120", "130", 7000U);
+ Add(1, 6U, "170", "180", 7000U);
+ Add(1, 7U, "220", "230", 7000U);
+ Add(1, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMoveOneFile) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_compaction_bytes = 10000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(0, 1U, "100", "150", 3000U, 0, 710, 800);
+ Add(0, 2U, "551", "600", 3001U, 0, 610, 700);
+ Add(0, 3U, "101", "150", 3000U, 0, 510, 600);
+ Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+ Add(1, 5U, "120", "130", 7000U);
+ Add(1, 6U, "170", "180", 7000U);
+ Add(1, 7U, "220", "230", 7000U);
+ Add(1, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(1, compaction->num_input_files(0));
+ ASSERT_EQ(4, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000000u;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+ mutable_cf_options_.max_compaction_bytes = 10000000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(0, 1U, "300", "350", 3000U, 0, 710, 800);
+ Add(0, 2U, "651", "600", 3001U, 0, 610, 700);
+ Add(0, 3U, "501", "550", 3000U, 0, 510, 600);
+ Add(0, 4U, "451", "400", 3000U, 0, 410, 500);
+
+ Add(1, 5U, "120", "130", 7000U);
+ Add(1, 6U, "970", "980", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(4, compaction->num_input_files(0));
+ ASSERT_EQ(1, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(2, compaction->input(0, 3)->fd.GetNumber());
+ ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) {
+ mutable_cf_options_.max_bytes_for_level_base = 10000u;
+ mutable_cf_options_.max_compaction_bytes = 10001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.sst_partitioner_factory = NewSstPartitionerFixedPrefixFactory(1);
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "100", "150", 3000U);
+ Add(1, 2U, "151", "200", 3001U);
+ Add(1, 3U, "201", "250", 3000U);
+ Add(1, 4U, "251", "300", 3000U);
+
+ Add(3, 5U, "120", "130", 7000U);
+ Add(3, 6U, "170", "180", 7000U);
+ Add(3, 7U, "220", "230", 7000U);
+ Add(3, 8U, "270", "280", 7000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ // No trivial move, because partitioning is applied
+ ASSERT_TRUE(!compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, IsTrivialMoveOff) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000000u;
+ mutable_cf_options_.max_compaction_bytes = 10000u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ NewVersionStorage(6, kCompactionStyleLevel);
+ // A compaction should be triggered and pick all files from level 1
+ Add(1, 1U, "100", "150", 300000U, 0, 0);
+ Add(1, 2U, "150", "200", 300000U, 0, 0);
+ Add(1, 3U, "200", "250", 300000U, 0, 0);
+ Add(1, 4U, "250", "300", 300000U, 0, 0);
+
+ Add(3, 5U, "120", "130", 6000U);
+ Add(3, 6U, "140", "150", 6000U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_FALSE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles1) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "200", 3001U);
+ Add(2, 3U, "301", "350", 3000U);
+ Add(2, 4U, "451", "400", 3000U);
+ Add(2, 5U, "551", "500", 3000U);
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 16U, "170", "180", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(4, compaction->num_input_files(0));
+ ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+ ASSERT_EQ(5, compaction->input(0, 2)->fd.GetNumber());
+ ASSERT_EQ(6, compaction->input(0, 3)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles2) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "160", 3001U);
+ Add(2, 3U, "161", "179", 3000U);
+ Add(2, 4U, "220", "400", 3000U);
+ Add(2, 5U, "551", "500", 3000U);
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles3) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // Even if consecutive files could be trivially moved, we don't pick them,
+ // because if the trivial move cannot be issued for some reason, we cannot
+ // fall back to a normal compaction.
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "160", 3001U);
+ Add(2, 5U, "551", "500", 3000U);
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(1, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles4) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 4000U);
+ Add(2, 2U, "151", "160", 4001U);
+ Add(2, 3U, "161", "179", 4000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles5) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // File 4 and 5 aren't clean cut, so only 2 and 3 are picked.
+ Add(2, 1U, "100", "150", 4000U);
+ Add(2, 2U, "151", "160", 4001U);
+ Add(2, 3U, "161", "179", 4000U);
+ Add(2, 4U, "180", "185", 4000U);
+ Add(2, 5U, "185", "190", 4000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, TrivialMoveMultipleFiles6) {
+ mutable_cf_options_.max_bytes_for_level_base = 1000u;
+ mutable_cf_options_.max_compaction_bytes = 10000001u;
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ ioptions_.compaction_pri = kMinOverlappingRatio;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ Add(2, 1U, "100", "150", 3000U);
+ Add(2, 2U, "151", "200", 3001U);
+ Add(2, 3U, "301", "350", 3000U);
+ Add(2, 4U, "451", "400", 3000U);
+ Add(2, 5U, "551", "500", 3000U);
+ file_map_[5U].first->being_compacted = true;
+ Add(2, 6U, "651", "600", 3000U);
+ Add(2, 7U, "751", "700", 3000U);
+ Add(2, 8U, "851", "900", 3000U);
+
+ Add(3, 15U, "120", "130", 700U);
+ Add(3, 16U, "170", "180", 700U);
+ Add(3, 17U, "220", "230", 700U);
+ Add(3, 18U, "870", "880", 700U);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_TRUE(compaction->IsTrivialMove());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ // Since the next file (5) is being compacted, the pick stops at files 3 and 4.
+ ASSERT_EQ(2, compaction->num_input_files(0));
+ ASSERT_EQ(3, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(4, compaction->input(0, 1)->fd.GetNumber());
+}
+
+TEST_F(CompactionPickerTest, CacheNextCompactionIndex) {
+ NewVersionStorage(6, kCompactionStyleLevel);
+ mutable_cf_options_.max_compaction_bytes = 100000000000u;
+
+ Add(1 /* level */, 1U /* file_number */, "100" /* smallest */,
+ "149" /* largest */, 1000000000U /* file_size */);
+ file_map_[1U].first->being_compacted = true;
+ Add(1 /* level */, 2U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 900000000U /* file_size */);
+ Add(1 /* level */, 3U /* file_number */, "200" /* smallest */,
+ "249" /* largest */, 800000000U /* file_size */);
+ Add(1 /* level */, 4U /* file_number */, "250" /* smallest */,
+ "299" /* largest */, 700000000U /* file_size */);
+ Add(2 /* level */, 5U /* file_number */, "150" /* smallest */,
+ "199" /* largest */, 100U /* file_size */);
+ Add(2 /* level */, 6U /* file_number */, "200" /* smallest */,
+ "240" /* largest */, 1U /* file_size */);
+ Add(2 /* level */, 7U /* file_number */, "260" /* smallest */,
+ "270" /* largest */, 1U /* file_size */);
+ file_map_[5U].first->being_compacted = true;
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(2, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(2U, compaction->num_input_levels());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ ASSERT_EQ(4U, compaction->input(0, 0)->fd.GetNumber());
+ ASSERT_EQ(3, vstorage_->NextCompactionIndex(1 /* level */));
+
+ compaction.reset(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() == nullptr);
+ ASSERT_EQ(4, vstorage_->NextCompactionIndex(1 /* level */));
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesNotHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 1000000u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // All 5 L0 files will be picked for intra-L0 compaction. The one L1 file
+ // spans the entire L0 key range and is marked as being compacted to avoid
+ // an L0->L1 compaction.
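+ // 5 * 200,000 = 1,000,000 bytes, which does not exceed max_compaction_bytes.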
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 110, 111);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(5U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // 4 out of 5 L0 files will be picked for intra-L0 compaction due to the
+ // max_compaction_bytes limit (the minimum number of files for triggering
+ // an intra-L0 compaction is 4). The one L1 file spans the entire L0 key
+ // range and is marked as being compacted to avoid an L0->L1 compaction.
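+ // Picking all 5 files (1,000,000 bytes) would exceed max_compaction_bytes
+ // (999,999), so only 4 files (800,000 bytes) are picked.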
+ Add(0, 1U, "100", "150", 200000U, 0, 100, 101);
+ Add(0, 2U, "151", "200", 200000U, 0, 102, 103);
+ Add(0, 3U, "201", "250", 200000U, 0, 104, 105);
+ Add(0, 4U, "251", "300", 200000U, 0, 106, 107);
+ Add(0, 5U, "301", "350", 200000U, 0, 108, 109);
+ Add(1, 6U, "100", "350", 200000U, 0, 109, 110);
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) {
+ // Intra L0 compaction triggers only if there are at least
+ // level0_file_num_compaction_trigger + 2 L0 files.
+ mutable_cf_options_.level0_file_num_compaction_trigger = 3;
+ mutable_cf_options_.max_compaction_bytes = 999999u;
+ NewVersionStorage(6, kCompactionStyleLevel);
+
+ // 4 out of 6 L0 files will be picked for intra-L0 compaction: one L0 file
+ // is skipped because it is already being compacted, and the newest L0 file
+ // is skipped because of the earliest-seqno limit. The one L1 file spans the
+ // entire L0 key range and is marked as being compacted to avoid an L0->L1
+ // compaction.
+ Add(1, 1U, "100", "350", 200000U, 0, 110, 111);
+ Add(0, 2U, "301", "350", 1U, 0, 108, 109);
+ Add(0, 3U, "251", "300", 1U, 0, 106, 107);
+ Add(0, 4U, "201", "250", 1U, 0, 104, 105);
+ Add(0, 5U, "151", "200", 1U, 0, 102, 103);
+ Add(0, 6U, "100", "150", 1U, 0, 100, 101);
+ Add(0, 7U, "100", "100", 1U, 0, 99, 100);
+ vstorage_->LevelFiles(0)[5]->being_compacted = true;
+ vstorage_->LevelFiles(1)[0]->being_compacted = true;
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_, 107));
+ ASSERT_TRUE(compaction.get() != nullptr);
+ ASSERT_EQ(1U, compaction->num_input_levels());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_EQ(CompactionReason::kLevelL0FilesNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a "regular" universal compaction is
+ // scheduled first, followed by a delete triggered compaction. The latter
+ // should fail
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450);
+ Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300);
+ Add(3, 5U, "010", "080", 8 * kFileSize, 0, 200, 251);
+ Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150);
+
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a compaction to reduce sorted runs
+ ASSERT_EQ(CompactionReason::kUniversalSortedRunNum,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+
+ AddVersionStorage();
+ // Simulate a flush and mark the file for compaction
+ Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_FALSE(compaction2);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a delete triggered compaction is
+ // scheduled first, followed by a "regular" compaction. The latter
+ // should fail
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true);
+ Add(3, 5U, "240", "290", 8 * kFileSize, 0, 201, 250);
+ Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150);
+ Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(3, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+
+ AddVersionStorage();
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_FALSE(compaction2);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedCompactionStartOutputOverlap) {
+ // The case where universal periodic compaction can be picked
+ // with some newer files being compacted.
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+
+ bool input_level_overlap = false;
+ bool output_level_overlap = false;
+ // Let's mark 2 files in 2 different levels for compaction. The
+ // compaction picker will randomly pick one, so use the sync point to
+ // ensure a deterministic order. Loop until both cases are covered
+ size_t random_index = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionPicker::PickFilesMarkedForCompaction", [&](void* arg) {
+ size_t* index = static_cast<size_t*>(arg);
+ *index = random_index;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ while (!input_level_overlap || !output_level_overlap) {
+ // Ensure that the L0 file gets picked first
+ random_index = !input_level_overlap ? 0 : 1;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(5, kCompactionStyleUniversal);
+
+ Add(0, 1U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true);
+ Add(3, 2U, "010", "020", 2 * kFileSize, 0, 201, 248);
+ Add(3, 3U, "250", "270", 2 * kFileSize, 0, 202, 249);
+ Add(3, 4U, "290", "310", 2 * kFileSize, 0, 203, 250);
+ Add(3, 5U, "310", "320", 2 * kFileSize, 0, 204, 251, 0, true);
+ Add(4, 6U, "301", "350", 8 * kFileSize, 0, 101, 150);
+ Add(4, 7U, "501", "750", 8 * kFileSize, 0, 101, 150);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_TRUE(compaction->start_level() == 0 ||
+ compaction->start_level() == 3);
+ if (compaction->start_level() == 0) {
+ // The L0 file was picked. The next compaction will detect an
+ // overlap on its input level
+ input_level_overlap = true;
+ ASSERT_EQ(3, compaction->output_level());
+ ASSERT_EQ(1U, compaction->num_input_files(0));
+ ASSERT_EQ(3U, compaction->num_input_files(1));
+ } else {
+ // The level 3 file was picked. The next compaction will pick
+ // the L0 file and will detect overlap when adding output
+ // level inputs
+ output_level_overlap = true;
+ ASSERT_EQ(4, compaction->output_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_EQ(1U, compaction->num_input_files(1));
+ }
+
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ // After recomputing the compaction score, only one marked file will remain
+ random_index = 0;
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_FALSE(compaction2);
+ DeleteVersionStorage();
+ }
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0NoOverlap) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a delete triggered compaction is
+ // scheduled and should result in a full compaction
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(4U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+ ASSERT_TRUE(file_map_[3].first->being_compacted);
+ ASSERT_TRUE(file_map_[6].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0WithOverlap) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a file is being compacted, and a
+ // delete triggered compaction is then scheduled. The latter should stop
+ // at the first file being compacted
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 4 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+ file_map_[3].first->being_compacted = true;
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(2U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) {
+ const uint64_t kFileSize = 100000;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ // This test covers the case where a delete triggered compaction is
+ // scheduled first, followed by a "regular" compaction. The latter
+ // should fail
+ NewVersionStorage(1, kCompactionStyleUniversal);
+
+ // Mark file number 5 for compaction
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true);
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150);
+ Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ ASSERT_TRUE(compaction);
+ // Validate that it's a delete triggered compaction
+ ASSERT_EQ(CompactionReason::kFilesMarkedForCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(0, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+ ASSERT_TRUE(file_map_[3].first->being_compacted);
+ ASSERT_TRUE(file_map_[6].first->being_compacted);
+
+ AddVersionStorage();
+ Add(0, 1U, "150", "200", kFileSize, 0, 500, 550);
+ Add(0, 2U, "201", "250", kFileSize, 0, 401, 450);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction2(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ ASSERT_TRUE(compaction2);
+ ASSERT_EQ(3U, compaction2->num_input_files(0));
+ ASSERT_TRUE(file_map_[1].first->being_compacted);
+ ASSERT_TRUE(file_map_[2].first->being_compacted);
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+}
+
+TEST_F(CompactionPickerTest, UniversalMarkedManualCompaction) {
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+
+ // This test makes sure the `files_marked_for_compaction_` is updated after
+ // creating manual compaction.
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+
+ // Add 3 files marked for compaction
+ Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150, 0, true);
+ Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, 0, true);
+ Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true);
+ UpdateVersionStorageInfo();
+
+ // All 3 files are marked for compaction
+ ASSERT_EQ(3U, vstorage_->FilesMarkedForCompaction().size());
+
+ bool manual_conflict = false;
+ InternalKey* manual_end = nullptr;
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.CompactRange(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ ColumnFamilyData::kCompactAllLevels, 6, CompactRangeOptions(),
+ nullptr, nullptr, &manual_end, &manual_conflict,
+ std::numeric_limits<uint64_t>::max(), ""));
+
+ ASSERT_TRUE(compaction);
+
+ ASSERT_EQ(CompactionReason::kManualCompaction,
+ compaction->compaction_reason());
+ ASSERT_EQ(kNumLevels - 1, compaction->output_level());
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(3U, compaction->num_input_files(0));
+ ASSERT_TRUE(file_map_[3].first->being_compacted);
+ ASSERT_TRUE(file_map_[4].first->being_compacted);
+ ASSERT_TRUE(file_map_[5].first->being_compacted);
+
+ // After creating the manual compaction, all files should be cleared from
+ // `FilesMarkedForCompaction`. So they won't be picked by others.
+ ASSERT_EQ(0U, vstorage_->FilesMarkedForCompaction().size());
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNonLastLevel) {
+ // This test makes sure size amplification compaction can still be triggered
+ // if the last sorted run is not the last level.
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+ const int kLastLevel = kNumLevels - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 1 * kFileSize);
+ Add(0, 101U, "200", "400", 1 * kFileSize);
+ Add(4, 90U, "100", "600", 4 * kFileSize);
+ Add(5, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
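+ // Rough size amp estimate, assuming the usual "all sorted runs except the
+ // last vs. the last sorted run" check: (1 + 1 + 4) / 2 = 300% >= 200%,
+ // so a size amp compaction is expected.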
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // Make sure it's a size amp compaction and includes all files
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeAmplification);
+ ASSERT_EQ(compaction->output_level(), kLastLevel);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+ ASSERT_EQ(compaction->input_levels(4)->num_files, 1);
+ ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeRatioTierCompactionLastLevel) {
+ // This test makes sure the size amp calculation skips the last level (L6),
+ // so size amp compaction is not triggered; instead, a size ratio compaction
+ // is triggered.
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+ const int kLastLevel = kNumLevels - 1;
+ const int kPenultimateLevel = kLastLevel - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 1 * kFileSize);
+ Add(0, 101U, "200", "400", 1 * kFileSize);
+ Add(5, 90U, "100", "600", 4 * kFileSize);
+ Add(6, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
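+ // With the last level (L6) skipped, the last sorted run for the size amp
+ // check is the 4 * kFileSize file on L5: (1 + 1) / 4 = 50% < 200%, so size
+ // amp does not trigger (assuming the usual size amp estimate).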
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // Internally, size amp compaction is evaluated before size ratio compaction.
+ // Here we make sure it's a size ratio compaction instead of size amp.
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeRatio);
+ ASSERT_EQ(compaction->output_level(), kPenultimateLevel - 1);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+ ASSERT_EQ(compaction->input_levels(5)->num_files, 0);
+ ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionNotSuport) {
+ // Tiered compaction only supports more than 2 levels (otherwise the
+ // penultimate level would be level 0, which may make things more
+ // complicated), so when there are only 2 levels, level 1 is still treated
+ // as the last level for size amp compaction.
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 2;
+ const int kLastLevel = kNumLevels - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 1 * kFileSize);
+ Add(0, 101U, "200", "400", 1 * kFileSize);
+ Add(0, 90U, "100", "600", 4 * kFileSize);
+ Add(1, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // Size amp compaction is still triggered even when preclude_last_level is set
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeAmplification);
+ ASSERT_EQ(compaction->output_level(), kLastLevel);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 3);
+ ASSERT_EQ(compaction->input_levels(1)->num_files, 1);
+}
+
+TEST_F(CompactionPickerTest, UniversalSizeAmpTierCompactionLastLevel) {
+ // This test makes sure the size amp compaction for tiered storage can still
+ // be triggered, but only for non-last-level files.
+ const uint64_t kFileSize = 100000;
+ const int kNumLevels = 7;
+ const int kLastLevel = kNumLevels - 1;
+ const int kPenultimateLevel = kLastLevel - 1;
+
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ ioptions_.preclude_last_level_data_seconds = 1000;
+ mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent = 200;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+
+ NewVersionStorage(kNumLevels, kCompactionStyleUniversal);
+ Add(0, 100U, "100", "300", 3 * kFileSize);
+ Add(0, 101U, "200", "400", 2 * kFileSize);
+ Add(5, 90U, "100", "600", 2 * kFileSize);
+ Add(6, 80U, "200", "300", 2 * kFileSize);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+
+ // It's a size amp compaction, but it doesn't include the last level file,
+ // and it outputs to the penultimate level.
+ ASSERT_EQ(compaction->compaction_reason(),
+ CompactionReason::kUniversalSizeAmplification);
+ ASSERT_EQ(compaction->output_level(), kPenultimateLevel);
+ ASSERT_EQ(compaction->input_levels(0)->num_files, 2);
+ ASSERT_EQ(compaction->input_levels(5)->num_files, 1);
+ ASSERT_EQ(compaction->input_levels(6)->num_files, 0);
+}
+
+TEST_F(CompactionPickerU64TsTest, Overlap) {
+ int num_levels = ioptions_.num_levels;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ constexpr int level = 0;
+ constexpr uint64_t file_number = 20ULL;
+ constexpr char smallest[4] = "500";
+ constexpr char largest[4] = "600";
+ constexpr uint64_t ts_of_smallest = 12345ULL;
+ constexpr uint64_t ts_of_largest = 56789ULL;
+
+ {
+ std::string ts1;
+ PutFixed64(&ts1, ts_of_smallest);
+ std::string ts2;
+ PutFixed64(&ts2, ts_of_largest);
+ Add(level, file_number, smallest, largest,
+ /*file_size=*/1U, /*path_id=*/0,
+ /*smallest_seq=*/100, /*largest_seq=*/100, /*compensated_file_size=*/0,
+ /*marked_for_compact=*/false, /*temperature=*/Temperature::kUnknown,
+ /*oldest_ancestor_time=*/kUnknownOldestAncesterTime, ts1, ts2);
+ UpdateVersionStorageInfo();
+ }
+
+ std::unordered_set<uint64_t> input{file_number};
+
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input, vstorage_.get(), CompactionOptions()));
+ std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+ CompactionOptions(), input_files, level, vstorage_.get(),
+ mutable_cf_options_, mutable_db_options_, /*output_path_id=*/0));
+
+ {
+ // [600, ts=ts_of_largest - 1] to [600, ts=ts_of_largest - 1] is the range to check.
+ // ucmp->Compare(smallest_user_key, c->GetLargestUserKey()) > 0, but
+ // ucmp->CompareWithoutTimestamp(smallest_user_key,
+ // c->GetLargestUserKey()) == 0.
+ // Should still be considered overlapping.
+ std::string user_key_with_ts1(largest);
+ PutFixed64(&user_key_with_ts1, ts_of_largest - 1);
+ std::string user_key_with_ts2(largest);
+ PutFixed64(&user_key_with_ts2, ts_of_largest - 1);
+ ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction(
+ user_key_with_ts1, user_key_with_ts2, level));
+ }
+ {
+ // [500, ts=ts_of_smallest + 1] to [500, ts=ts_of_smallest + 1] is the range to check.
+ // ucmp->Compare(largest_user_key, c->GetSmallestUserKey()) < 0, but
+ // ucmp->CompareWithoutTimestamp(largest_user_key,
+ // c->GetSmallestUserKey()) == 0.
+ // Should still be considered overlapping.
+ std::string user_key_with_ts1(smallest);
+ PutFixed64(&user_key_with_ts1, ts_of_smallest + 1);
+ std::string user_key_with_ts2(smallest);
+ PutFixed64(&user_key_with_ts2, ts_of_smallest + 1);
+ ASSERT_TRUE(level_compaction_picker.RangeOverlapWithCompaction(
+ user_key_with_ts1, user_key_with_ts2, level));
+ }
+}
+
+TEST_F(CompactionPickerU64TsTest, CannotTrivialMoveUniversal) {
+ constexpr uint64_t kFileSize = 100000;
+
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+ mutable_cf_options_.compaction_options_universal.allow_trivial_move = true;
+ NewVersionStorage(1, kCompactionStyleUniversal);
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ UpdateVersionStorageInfo();
+ // Must return false when there are no files.
+ ASSERT_FALSE(universal_compaction_picker.NeedsCompaction(vstorage_.get()));
+
+ std::string ts1;
+ PutFixed64(&ts1, 9000);
+ std::string ts2;
+ PutFixed64(&ts2, 8000);
+ std::string ts3;
+ PutFixed64(&ts3, 7000);
+ std::string ts4;
+ PutFixed64(&ts4, 6000);
+
+ NewVersionStorage(3, kCompactionStyleUniversal);
+ // A compaction should be triggered and pick file 2
+ Add(1, 1U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100,
+ /*largest_seq=*/100, /*compensated_file_size=*/kFileSize,
+ /*marked_for_compact=*/false, Temperature::kUnknown,
+ kUnknownOldestAncesterTime, ts1, ts2);
+ Add(2, 2U, "150", "150", kFileSize, /*path_id=*/0, /*smallest_seq=*/100,
+ /*largest_seq=*/100, /*compensated_file_size=*/kFileSize,
+ /*marked_for_compact=*/false, Temperature::kUnknown,
+ kUnknownOldestAncesterTime, ts3, ts4);
+ UpdateVersionStorageInfo();
+
+ std::unique_ptr<Compaction> compaction(
+ universal_compaction_picker.PickCompaction(
+ cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+ &log_buffer_));
+ assert(compaction);
+ ASSERT_TRUE(!compaction->is_trivial_move());
+}
+
+class PerKeyPlacementCompactionPickerTest
+ : public CompactionPickerTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ PerKeyPlacementCompactionPickerTest() : CompactionPickerTest() {}
+
+ void SetUp() override { enable_per_key_placement_ = GetParam(); }
+
+ protected:
+ bool enable_per_key_placement_ = false;
+};
+
+TEST_P(PerKeyPlacementCompactionPickerTest, OverlapWithNormalCompaction) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(5, 40U, "200", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ level_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+ 0, 6)));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlap) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ NewVersionStorage(num_levels, kCompactionStyleLevel);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(level_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(40);
+ input_set.insert(41);
+ ASSERT_OK(level_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ level_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ OverlapWithNormalCompactionUniveral) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(5, 40U, "200", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+ 0, 6)));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, NormalCompactionOverlapUniversal) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int num_levels = ioptions_.num_levels;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ Add(0, 21U, "100", "150", 60000000U);
+ Add(0, 22U, "300", "350", 60000000U);
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(6, 50U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(21);
+ input_set.insert(22);
+ input_set.insert(50);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(40);
+ input_set.insert(41);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, PenultimateOverlapUniversal) {
+  // This test makes sure a tiered compaction locks the whole range of both
+  // the output level and the penultimate level.
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+  // The existing compaction is the 1st L4 file + the L6 file; a compaction of
+  // the 2nd L4 file to L5 (the penultimate level) then overlaps with it when
+  // the tiered compaction feature is on.
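+  // With per-key placement enabled, comp1 (files 40 + 60) may also output to
+  // L5, and its key range [101, 351] covers file 41's range [230, 250];
+  // without the feature, comp1 writes only to L6, so compacting file 41 into
+  // L5 does not conflict.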
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ input_set.insert(60);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(41);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+
+ // compacting the 3rd L4 file is always safe:
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(42);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest, LastLevelOnlyOverlapUniversal) {
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(60);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+  // File 41 cannot be compacted if the preclude_last_level feature is on;
+  // otherwise, compacting file 41 is okay.
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(41);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+
+ // compacting the 3rd L4 file is always safe:
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(42);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ LastLevelOnlyFailPenultimateUniversal) {
+  // This tests that a last-level-only compaction is still unable to output to
+  // the penultimate level if there's already a file in the penultimate level.
+  // This should rarely happen in universal compaction, as a non-empty L5
+  // should be included in the compaction.
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220]
+ // L5: [230, 250]
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(5, 50U, "230", "250", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(60);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ ASSERT_TRUE(comp1);
+ ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+ // As comp1 cannot be output to the penultimate level, compacting file 40 to
+ // L5 is always safe.
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(40);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 5, Compaction::kInvalidLevel));
+
+ std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+ ASSERT_TRUE(comp2);
+ ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ LastLevelOnlyConflictWithOngoingUniversal) {
+  // This tests that a last-level-only compaction is still unable to output to
+  // the penultimate level if there's already an ongoing compaction to the
+  // penultimate level.
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ // create an ongoing compaction to L5 (penultimate level)
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(40);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ ASSERT_TRUE(comp1);
+ ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(60);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ ASSERT_EQ(enable_per_key_placement_,
+ universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_,
+ 6, 6)));
+
+ if (!enable_per_key_placement_) {
+ std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+ ASSERT_TRUE(comp2);
+ ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+ }
+}
+
+TEST_P(PerKeyPlacementCompactionPickerTest,
+ LastLevelOnlyNoConflictWithOngoingUniversal) {
+  // This is similar to `LastLevelOnlyConflictWithOngoingUniversal`; the only
+  // change is that the ongoing compaction to L5 has no overlap with the last
+  // level compaction, so it's safe to move data from the last level to the
+  // penultimate level.
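+  // The ongoing L5 compaction only covers [360, 380], which does not
+  // intersect the last-level file's range [101, 351], so moving data up to
+  // the penultimate level remains safe.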
+ if (enable_per_key_placement_) {
+ ioptions_.preclude_last_level_data_seconds = 10000;
+ }
+
+ int num_levels = ioptions_.num_levels;
+ ioptions_.compaction_style = kCompactionStyleUniversal;
+ UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
+ NewVersionStorage(num_levels, kCompactionStyleUniversal);
+
+ // L4: [200, 220] [230, 250] [360, 380]
+ // L5:
+ // L6: [101, 351]
+ Add(4, 40U, "200", "220", 60000000U);
+ Add(4, 41U, "230", "250", 60000000U);
+ Add(4, 42U, "360", "380", 60000000U);
+ Add(6, 60U, "101", "351", 60000000U);
+ UpdateVersionStorageInfo();
+
+ // create an ongoing compaction to L5 (penultimate level)
+ CompactionOptions comp_options;
+ std::unordered_set<uint64_t> input_set;
+ input_set.insert(42);
+ std::vector<CompactionInputFiles> input_files;
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ std::unique_ptr<Compaction> comp1(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 5, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+
+ ASSERT_TRUE(comp1);
+ ASSERT_EQ(comp1->GetPenultimateLevel(), Compaction::kInvalidLevel);
+
+ input_set.clear();
+ input_files.clear();
+ input_set.insert(60);
+ ASSERT_OK(universal_compaction_picker.GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage_.get(), comp_options));
+
+ // always safe to move data up
+ ASSERT_FALSE(universal_compaction_picker.FilesRangeOverlapWithCompaction(
+ input_files, 6,
+ Compaction::EvaluatePenultimateLevel(vstorage_.get(), ioptions_, 6, 6)));
+
+ // 2 compactions can be run in parallel
+ std::unique_ptr<Compaction> comp2(universal_compaction_picker.CompactFiles(
+ comp_options, input_files, 6, vstorage_.get(), mutable_cf_options_,
+ mutable_db_options_, 0));
+ ASSERT_TRUE(comp2);
+ if (enable_per_key_placement_) {
+ ASSERT_NE(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+ } else {
+ ASSERT_EQ(Compaction::kInvalidLevel, comp2->GetPenultimateLevel());
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest,
+ PerKeyPlacementCompactionPickerTest, ::testing::Bool());
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.cc b/src/rocksdb/db/compaction/compaction_picker_universal.cc
new file mode 100644
index 000000000..376e4f60f
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.cc
@@ -0,0 +1,1450 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_picker_universal.h"
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/statistics.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+// A helper class that forms universal compactions. The class is used by
+// UniversalCompactionPicker::PickCompaction().
+// Usage: construct the class, then obtain the compaction object by calling
+// PickCompaction().
+class UniversalCompactionBuilder {
+ public:
+ UniversalCompactionBuilder(
+ const ImmutableOptions& ioptions, const InternalKeyComparator* icmp,
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ UniversalCompactionPicker* picker, LogBuffer* log_buffer)
+ : ioptions_(ioptions),
+ icmp_(icmp),
+ cf_name_(cf_name),
+ mutable_cf_options_(mutable_cf_options),
+ mutable_db_options_(mutable_db_options),
+ vstorage_(vstorage),
+ picker_(picker),
+ log_buffer_(log_buffer) {}
+
+ // Form and return the compaction object. The caller owns return object.
+ Compaction* PickCompaction();
+
+ private:
+ struct SortedRun {
+ SortedRun(int _level, FileMetaData* _file, uint64_t _size,
+ uint64_t _compensated_file_size, bool _being_compacted)
+ : level(_level),
+ file(_file),
+ size(_size),
+ compensated_file_size(_compensated_file_size),
+ being_compacted(_being_compacted) {
+ assert(compensated_file_size > 0);
+ assert(level != 0 || file != nullptr);
+ }
+
+ void Dump(char* out_buf, size_t out_buf_size,
+ bool print_path = false) const;
+
+ // sorted_run_count is added into the string to print
+ void DumpSizeInfo(char* out_buf, size_t out_buf_size,
+ size_t sorted_run_count) const;
+
+ int level;
+    // `file` will be null for level > 0. For level = 0, the sorted run is
+    // this single file.
+    FileMetaData* file;
+    // For level > 0, `size` and `compensated_file_size` are the sums of the
+    // sizes of all files in the level. `being_compacted` should be the same
+    // for all files in a non-zero level; use the value here.
+ uint64_t size;
+ uint64_t compensated_file_size;
+ bool being_compacted;
+ };
+
+ // Pick Universal compaction to limit read amplification
+ Compaction* PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact);
+
+ // Pick Universal compaction to limit space amplification.
+ Compaction* PickCompactionToReduceSizeAmp();
+
+ // Try to pick incremental compaction to reduce space amplification.
+ // It will return null if it cannot find a fanout within the threshold.
+ // Fanout is defined as
+ // total size of files to compact at output level
+ // --------------------------------------------------
+ // total size of files to compact at other levels
+ Compaction* PickIncrementalForReduceSizeAmp(double fanout_threshold);
+
+ Compaction* PickDeleteTriggeredCompaction();
+
+ // Form a compaction from the sorted run indicated by start_index to the
+ // oldest sorted run.
+ // The caller is responsible for making sure that those files are not in
+ // compaction.
+ Compaction* PickCompactionToOldest(size_t start_index,
+ CompactionReason compaction_reason);
+
+ Compaction* PickCompactionWithSortedRunRange(
+ size_t start_index, size_t end_index, CompactionReason compaction_reason);
+
+ // Try to pick periodic compaction. The caller should only call it
+ // if there is at least one file marked for periodic compaction.
+ // null will be returned if no such a compaction can be formed
+ // because some files are being compacted.
+ Compaction* PickPeriodicCompaction();
+
+  // Used in universal compaction when the allow_trivial_move option is set.
+  // Checks whether there are any overlapping files in the input. Returns
+  // true if the input files are non-overlapping.
+ bool IsInputFilesNonOverlapping(Compaction* c);
+
+ uint64_t GetMaxOverlappingBytes() const;
+
+ const ImmutableOptions& ioptions_;
+ const InternalKeyComparator* icmp_;
+ double score_;
+ std::vector<SortedRun> sorted_runs_;
+ const std::string& cf_name_;
+ const MutableCFOptions& mutable_cf_options_;
+ const MutableDBOptions& mutable_db_options_;
+ VersionStorageInfo* vstorage_;
+ UniversalCompactionPicker* picker_;
+ LogBuffer* log_buffer_;
+
+ static std::vector<SortedRun> CalculateSortedRuns(
+ const VersionStorageInfo& vstorage);
+
+ // Pick a path ID to place a newly generated file, with its estimated file
+ // size.
+ static uint32_t GetPathId(const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ uint64_t file_size);
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This structure is used for the construction of the min heap
+// that contains the file metadata, the level of the file,
+// and the index of the file in that level.
+
+struct InputFileInfo {
+ InputFileInfo() : f(nullptr), level(0), index(0) {}
+
+ FileMetaData* f;
+ size_t level;
+ size_t index;
+};
+
+// Used in universal compaction when trivial move is enabled.
+// This comparator is used for the construction of min heap
+// based on the smallest key of the file.
+struct SmallestKeyHeapComparator {
+ explicit SmallestKeyHeapComparator(const Comparator* ucmp) { ucmp_ = ucmp; }
+
+ bool operator()(InputFileInfo i1, InputFileInfo i2) const {
+ return (ucmp_->CompareWithoutTimestamp(i1.f->smallest.user_key(),
+ i2.f->smallest.user_key()) > 0);
+ }
+
+ private:
+ const Comparator* ucmp_;
+};
+
+using SmallestKeyHeap =
+ std::priority_queue<InputFileInfo, std::vector<InputFileInfo>,
+ SmallestKeyHeapComparator>;
+
+// This function creates the heap that is used to determine whether the files
+// are overlapping during universal compaction when the allow_trivial_move
+// option is set.
+SmallestKeyHeap create_level_heap(Compaction* c, const Comparator* ucmp) {
+ SmallestKeyHeap smallest_key_priority_q =
+ SmallestKeyHeap(SmallestKeyHeapComparator(ucmp));
+
+ InputFileInfo input_file;
+
+ for (size_t l = 0; l < c->num_input_levels(); l++) {
+ if (c->num_input_files(l) != 0) {
+ if (l == 0 && c->start_level() == 0) {
+ for (size_t i = 0; i < c->num_input_files(0); i++) {
+ input_file.f = c->input(0, i);
+ input_file.level = 0;
+ input_file.index = i;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ } else {
+ input_file.f = c->input(l, 0);
+ input_file.level = l;
+ input_file.index = 0;
+ smallest_key_priority_q.push(std::move(input_file));
+ }
+ }
+ }
+ return smallest_key_priority_q;
+}
+
+#ifndef NDEBUG
+// smallest_seqno and largest_seqno are set iff `files` is not empty.
+void GetSmallestLargestSeqno(const std::vector<FileMetaData*>& files,
+ SequenceNumber* smallest_seqno,
+ SequenceNumber* largest_seqno) {
+ bool is_first = true;
+ for (FileMetaData* f : files) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ *smallest_seqno = f->fd.smallest_seqno;
+ *largest_seqno = f->fd.largest_seqno;
+ } else {
+ if (f->fd.smallest_seqno < *smallest_seqno) {
+ *smallest_seqno = f->fd.smallest_seqno;
+ }
+ if (f->fd.largest_seqno > *largest_seqno) {
+ *largest_seqno = f->fd.largest_seqno;
+ }
+ }
+ }
+}
+#endif
+} // namespace
+
+// Algorithm that checks whether there are any overlapping files in the input.
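+// Files are popped from a min-heap ordered by smallest user key; each popped
+// file's smallest key is compared against the previously popped file's
+// largest key, and any case where prev.largest >= curr.smallest is reported
+// as an overlap.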
+bool UniversalCompactionBuilder::IsInputFilesNonOverlapping(Compaction* c) {
+ auto comparator = icmp_->user_comparator();
+ int first_iter = 1;
+
+ InputFileInfo prev, curr, next;
+
+ SmallestKeyHeap smallest_key_priority_q =
+ create_level_heap(c, icmp_->user_comparator());
+
+ while (!smallest_key_priority_q.empty()) {
+ curr = smallest_key_priority_q.top();
+ smallest_key_priority_q.pop();
+
+ if (first_iter) {
+ prev = curr;
+ first_iter = 0;
+ } else {
+ if (comparator->CompareWithoutTimestamp(
+ prev.f->largest.user_key(), curr.f->smallest.user_key()) >= 0) {
+ // found overlapping files, return false
+ return false;
+ }
+ assert(comparator->CompareWithoutTimestamp(
+ curr.f->largest.user_key(), prev.f->largest.user_key()) > 0);
+ prev = curr;
+ }
+
+ next.f = nullptr;
+
+ if (c->level(curr.level) != 0 &&
+ curr.index < c->num_input_files(curr.level) - 1) {
+ next.f = c->input(curr.level, curr.index + 1);
+ next.level = curr.level;
+ next.index = curr.index + 1;
+ }
+
+ if (next.f) {
+ smallest_key_priority_q.push(std::move(next));
+ }
+ }
+ return true;
+}
+
+bool UniversalCompactionPicker::NeedsCompaction(
+ const VersionStorageInfo* vstorage) const {
+ const int kLevel0 = 0;
+ if (vstorage->CompactionScore(kLevel0) >= 1) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForPeriodicCompaction().empty()) {
+ return true;
+ }
+ if (!vstorage->FilesMarkedForCompaction().empty()) {
+ return true;
+ }
+ return false;
+}
+
+Compaction* UniversalCompactionPicker::PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer, SequenceNumber /* earliest_memtable_seqno */) {
+ UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name,
+ mutable_cf_options, mutable_db_options,
+ vstorage, this, log_buffer);
+ return builder.PickCompaction();
+}
+
+void UniversalCompactionBuilder::SortedRun::Dump(char* out_buf,
+ size_t out_buf_size,
+ bool print_path) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ if (file->fd.GetPathId() == 0 || !print_path) {
+ snprintf(out_buf, out_buf_size, "file %" PRIu64, file->fd.GetNumber());
+ } else {
+ snprintf(out_buf, out_buf_size,
+ "file %" PRIu64
+ "(path "
+ "%" PRIu32 ")",
+ file->fd.GetNumber(), file->fd.GetPathId());
+ }
+ } else {
+ snprintf(out_buf, out_buf_size, "level %d", level);
+ }
+}
+
+void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
+ char* out_buf, size_t out_buf_size, size_t sorted_run_count) const {
+ if (level == 0) {
+ assert(file != nullptr);
+ snprintf(out_buf, out_buf_size,
+ "file %" PRIu64 "[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ file->fd.GetNumber(), sorted_run_count, file->fd.GetFileSize(),
+ file->compensated_file_size);
+ } else {
+ snprintf(out_buf, out_buf_size,
+ "level %d[%" ROCKSDB_PRIszt
+ "] "
+ "with size %" PRIu64 " (compensated size %" PRIu64 ")",
+ level, sorted_run_count, size, compensated_file_size);
+ }
+}
+
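+// Each L0 file forms its own sorted run; each non-empty level > 0 forms a
+// single sorted run whose `size` is the sum of its file sizes. For
+// illustration, two L0 files plus non-empty L4 and L6 produce four sorted
+// runs.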
+std::vector<UniversalCompactionBuilder::SortedRun>
+UniversalCompactionBuilder::CalculateSortedRuns(
+ const VersionStorageInfo& vstorage) {
+ std::vector<UniversalCompactionBuilder::SortedRun> ret;
+ for (FileMetaData* f : vstorage.LevelFiles(0)) {
+ ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
+ f->being_compacted);
+ }
+ for (int level = 1; level < vstorage.num_levels(); level++) {
+ uint64_t total_compensated_size = 0U;
+ uint64_t total_size = 0U;
+ bool being_compacted = false;
+ for (FileMetaData* f : vstorage.LevelFiles(level)) {
+ total_compensated_size += f->compensated_file_size;
+ total_size += f->fd.GetFileSize();
+      // Size amp, read amp and periodic compactions always include all files
+      // for a non-zero level. However, a delete-triggered compaction and
+      // a trivial move might pick a subset of files in a sorted run. So
+      // always check all files in a sorted run and mark the entire run as
+      // being compacted if one or more files are being compacted.
+ if (f->being_compacted) {
+ being_compacted = f->being_compacted;
+ }
+ }
+ if (total_compensated_size > 0) {
+ ret.emplace_back(level, nullptr, total_size, total_compensated_size,
+ being_compacted);
+ }
+ }
+ return ret;
+}
+
+// Universal style of compaction. Pick files that are contiguous in
+// time-range to compact.
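+// Candidates are considered in the following order, as implemented below:
+// periodic compaction, size amplification, size ratio, sorted-run count, and
+// finally delete-triggered compaction.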
+Compaction* UniversalCompactionBuilder::PickCompaction() {
+ const int kLevel0 = 0;
+ score_ = vstorage_->CompactionScore(kLevel0);
+ sorted_runs_ = CalculateSortedRuns(*vstorage_);
+
+ if (sorted_runs_.size() == 0 ||
+ (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
+ vstorage_->FilesMarkedForCompaction().empty() &&
+ sorted_runs_.size() < (unsigned int)mutable_cf_options_
+ .level0_file_num_compaction_trigger)) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: nothing to do\n",
+ cf_name_.c_str());
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER_MAX_SZ(
+ log_buffer_, 3072,
+ "[%s] Universal: sorted runs: %" ROCKSDB_PRIszt " files: %s\n",
+ cf_name_.c_str(), sorted_runs_.size(), vstorage_->LevelSummary(&tmp));
+
+ Compaction* c = nullptr;
+  // Periodic compaction has higher priority than other types of compaction
+ // because it's a hard requirement.
+ if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) {
+ // Always need to do a full compaction for periodic compaction.
+ c = PickPeriodicCompaction();
+ }
+
+ // Check for size amplification.
+ if (c == nullptr &&
+ sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger)) {
+ if ((c = PickCompactionToReduceSizeAmp()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n",
+ cf_name_.c_str());
+ } else {
+ // Size amplification is within limits. Try reducing read
+ // amplification while maintaining file size ratios.
+ unsigned int ratio =
+ mutable_cf_options_.compaction_options_universal.size_ratio;
+
+ if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for size ratio\n",
+ cf_name_.c_str());
+ } else {
+        // Size amplification and file size ratios are within configured limits.
+        // If max read amplification exceeds the configured limits, then force
+        // compaction without looking at file size ratios and try to reduce
+        // the number of files to fewer than level0_file_num_compaction_trigger.
+        // This is guaranteed by NeedsCompaction().
+ assert(sorted_runs_.size() >=
+ static_cast<size_t>(
+ mutable_cf_options_.level0_file_num_compaction_trigger));
+ // Get the total number of sorted runs that are not being compacted
+ int num_sr_not_compacted = 0;
+ for (size_t i = 0; i < sorted_runs_.size(); i++) {
+ if (sorted_runs_[i].being_compacted == false) {
+ num_sr_not_compacted++;
+ }
+ }
+
+ // The number of sorted runs that are not being compacted is greater
+ // than the maximum allowed number of sorted runs
+ if (num_sr_not_compacted >
+ mutable_cf_options_.level0_file_num_compaction_trigger) {
+ unsigned int num_files =
+ num_sr_not_compacted -
+ mutable_cf_options_.level0_file_num_compaction_trigger + 1;
+ if ((c = PickCompactionToReduceSortedRuns(UINT_MAX, num_files)) !=
+ nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: compacting for file num -- %u\n",
+ cf_name_.c_str(), num_files);
+ }
+ }
+ }
+ }
+ }
+
+ if (c == nullptr) {
+ if ((c = PickDeleteTriggeredCompaction()) != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: delete triggered compaction\n",
+ cf_name_.c_str());
+ }
+ }
+
+ if (c == nullptr) {
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
+ return nullptr;
+ }
+
+ if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
+ true &&
+ c->compaction_reason() != CompactionReason::kPeriodicCompaction) {
+ c->set_is_trivial_move(IsInputFilesNonOverlapping(c));
+ }
+
+// validate that all the chosen files of L0 are non-overlapping in time
+#ifndef NDEBUG
+ bool is_first = true;
+
+ size_t level_index = 0U;
+ if (c->start_level() == 0) {
+ for (auto f : *c->inputs(0)) {
+ assert(f->fd.smallest_seqno <= f->fd.largest_seqno);
+ if (is_first) {
+ is_first = false;
+ }
+ }
+ level_index = 1U;
+ }
+ for (; level_index < c->num_input_levels(); level_index++) {
+ if (c->num_input_files(level_index) != 0) {
+ SequenceNumber smallest_seqno = 0U;
+ SequenceNumber largest_seqno = 0U;
+ GetSmallestLargestSeqno(*(c->inputs(level_index)), &smallest_seqno,
+ &largest_seqno);
+ if (is_first) {
+ is_first = false;
+ }
+ }
+ }
+#endif
+ // update statistics
+ size_t num_files = 0;
+ for (auto& each_level : *c->inputs()) {
+ num_files += each_level.files.size();
+ }
+ RecordInHistogram(ioptions_.stats, NUM_FILES_IN_SINGLE_COMPACTION, num_files);
+
+ picker_->RegisterCompaction(c);
+ vstorage_->ComputeCompactionScore(ioptions_, mutable_cf_options_);
+
+ TEST_SYNC_POINT_CALLBACK("UniversalCompactionBuilder::PickCompaction:Return",
+ c);
+ return c;
+}
+
+uint32_t UniversalCompactionBuilder::GetPathId(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, uint64_t file_size) {
+ // Two conditions need to be satisfied:
+ // (1) the target path needs to be able to hold the file's size
+  // (2) the total size left in this and previous paths needs to be no
+  //     smaller than the expected future file size before this new file is
+  //     compacted, which is estimated based on size_ratio.
+ // For example, if now we are compacting files of size (1, 1, 2, 4, 8),
+ // we will make sure the target file, probably with size of 16, will be
+ // placed in a path so that eventually when new files are generated and
+ // compacted to (1, 1, 2, 4, 8, 16), all those files can be stored in or
+ // before the path we chose.
+ //
+ // TODO(sdong): now the case of multiple column families is not
+ // considered in this algorithm. So the target size can be violated in
+ // that case. We need to improve it.
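+  // For illustration (hypothetical sizes): with size_ratio = 0, a 16 GB
+  // output has future_size = 16 GB. A first path with target_size 40 GB is
+  // chosen (24 GB of slack > 16 GB of expected future data), while a 20 GB
+  // first path is skipped (only 4 GB of slack) and the next path is tried.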
+ uint64_t accumulated_size = 0;
+ uint64_t future_size =
+ file_size *
+ (100 - mutable_cf_options.compaction_options_universal.size_ratio) / 100;
+ uint32_t p = 0;
+ assert(!ioptions.cf_paths.empty());
+ for (; p < ioptions.cf_paths.size() - 1; p++) {
+ uint64_t target_size = ioptions.cf_paths[p].target_size;
+ if (target_size > file_size &&
+ accumulated_size + (target_size - file_size) > future_size) {
+ return p;
+ }
+ accumulated_size += target_size;
+ }
+ return p;
+}
+
+//
+// Consider compaction files based on their size differences with
+// the next file in time order.
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
+ unsigned int ratio, unsigned int max_number_of_files_to_compact) {
+ unsigned int min_merge_width =
+ mutable_cf_options_.compaction_options_universal.min_merge_width;
+ unsigned int max_merge_width =
+ mutable_cf_options_.compaction_options_universal.max_merge_width;
+
+ const SortedRun* sr = nullptr;
+ bool done = false;
+ size_t start_index = 0;
+ unsigned int candidate_count = 0;
+
+ unsigned int max_files_to_compact =
+ std::min(max_merge_width, max_number_of_files_to_compact);
+ min_merge_width = std::max(min_merge_width, 2U);
+
+  // The caller checks the size before executing this function. This invariant
+  // is important because otherwise we may hit an integer underflow when
+  // dealing with unsigned types.
+ assert(sorted_runs_.size() > 0);
+
+ // Considers a candidate file only if it is smaller than the
+ // total size accumulated so far.
+ for (size_t loop = 0; loop < sorted_runs_.size(); loop++) {
+ candidate_count = 0;
+
+ // Skip files that are already being compacted
+ for (sr = nullptr; loop < sorted_runs_.size(); loop++) {
+ sr = &sorted_runs_[loop];
+
+ if (!sr->being_compacted) {
+ candidate_count = 1;
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf));
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: %s"
+ "[%d] being compacted, skipping",
+ cf_name_.c_str(), file_num_buf, loop);
+
+ sr = nullptr;
+ }
+
+ // This file is not being compacted. Consider it as the
+ // first candidate to be compacted.
+ uint64_t candidate_size = sr != nullptr ? sr->compensated_file_size : 0;
+ if (sr != nullptr) {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Possible candidate %s[%d].",
+ cf_name_.c_str(), file_num_buf, loop);
+ }
+
+ // Check if the succeeding files need compaction.
+ for (size_t i = loop + 1;
+ candidate_count < max_files_to_compact && i < sorted_runs_.size();
+ i++) {
+ const SortedRun* succeeding_sr = &sorted_runs_[i];
+ if (succeeding_sr->being_compacted) {
+ break;
+ }
+ // Pick files if the total/last candidate file size (increased by the
+ // specified ratio) is still larger than the next candidate file.
+ // candidate_size is the total size of files picked so far with the
+ // default kCompactionStopStyleTotalSize; with
+ // kCompactionStopStyleSimilarSize, it's simply the size of the last
+ // picked file.
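+      // For illustration, with the total-size stop style and ratio = 1 (1%):
+      // runs of size 8, 8, 8 are all picked (8 * 1.01 >= 8 and 16 * 1.01 >= 8),
+      // while a succeeding run of size 100 stops the scan, since
+      // 24 * 1.01 < 100.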
+ double sz = candidate_size * (100.0 + ratio) / 100.0;
+ if (sz < static_cast<double>(succeeding_sr->size)) {
+ break;
+ }
+ if (mutable_cf_options_.compaction_options_universal.stop_style ==
+ kCompactionStopStyleSimilarSize) {
+ // Similar-size stopping rule: also check the last picked file isn't
+ // far larger than the next candidate file.
+ sz = (succeeding_sr->size * (100.0 + ratio)) / 100.0;
+ if (sz < static_cast<double>(candidate_size)) {
+ // If the small file we've encountered begins a run of similar-size
+ // files, we'll pick them up on a future iteration of the outer
+ // loop. If it's some lonely straggler, it'll eventually get picked
+ // by the last-resort read amp strategy which disregards size ratios.
+ break;
+ }
+ candidate_size = succeeding_sr->compensated_file_size;
+ } else { // default kCompactionStopStyleTotalSize
+ candidate_size += succeeding_sr->compensated_file_size;
+ }
+ candidate_count++;
+ }
+
+ // Found a series of consecutive files that need compaction.
+ if (candidate_count >= (unsigned int)min_merge_width) {
+ start_index = loop;
+ done = true;
+ break;
+ } else {
+ for (size_t i = loop;
+ i < loop + candidate_count && i < sorted_runs_.size(); i++) {
+ const SortedRun* skipping_sr = &sorted_runs_[i];
+ char file_num_buf[256];
+ skipping_sr->DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Skipping %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+ }
+ }
+ if (!done || candidate_count <= 1) {
+ return nullptr;
+ }
+ size_t first_index_after = start_index + candidate_count;
+  // Compression for the output is disabled if the older files (those not
+  // being compacted) already make up compression_size_percent of the total
+  // data size.
+ bool enable_compression = true;
+ int ratio_to_compress =
+ mutable_cf_options_.compaction_options_universal.compression_size_percent;
+ if (ratio_to_compress >= 0) {
+ uint64_t total_size = 0;
+ for (auto& sorted_run : sorted_runs_) {
+ total_size += sorted_run.compensated_file_size;
+ }
+
+ uint64_t older_file_size = 0;
+ for (size_t i = sorted_runs_.size() - 1; i >= first_index_after; i--) {
+ older_file_size += sorted_runs_[i].size;
+ if (older_file_size * 100L >= total_size * (long)ratio_to_compress) {
+ enable_compression = false;
+ break;
+ }
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ for (unsigned int i = 0; i < first_index_after; i++) {
+ estimated_total_size += sorted_runs_[i].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
+ int output_level;
+ if (first_index_after == sorted_runs_.size()) {
+ output_level = vstorage_->num_levels() - 1;
+ } else if (sorted_runs_[first_index_after].level == 0) {
+ output_level = 0;
+ } else {
+ output_level = sorted_runs_[first_index_after].level - 1;
+ }
+
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind &&
+ (output_level == vstorage_->num_levels() - 1)) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t i = start_index; i < first_index_after; i++) {
+ auto& picking_sr = sorted_runs_[i];
+ if (picking_sr.level == 0) {
+ FileMetaData* picking_file = picking_sr.file;
+ inputs[0].files.push_back(picking_file);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), i);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Picking %s",
+ cf_name_.c_str(), file_num_buf);
+ }
+
+ std::vector<FileMetaData*> grandparents;
+ // Include grandparents for potential file cutting in incremental
+ // mode. It is for aligning file cutting boundaries across levels,
+ // so that subsequent compactions can pick files with aligned
+ // buffer.
+ // Single files are only picked up in incremental mode, so that
+ // there is no need for full range.
+ if (mutable_cf_options_.compaction_options_universal.incremental &&
+ first_index_after < sorted_runs_.size() &&
+ sorted_runs_[first_index_after].level > 1) {
+ grandparents = vstorage_->LevelFiles(sorted_runs_[first_index_after].level);
+ }
+
+ if (output_level != 0 &&
+ picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+ start_level, output_level))) {
+ return nullptr;
+ }
+ CompactionReason compaction_reason;
+ if (max_number_of_files_to_compact == UINT_MAX) {
+ compaction_reason = CompactionReason::kUniversalSizeRatio;
+ } else {
+ compaction_reason = CompactionReason::kUniversalSortedRunNum;
+ }
+ return new Compaction(vstorage_, ioptions_, mutable_cf_options_,
+ mutable_db_options_, std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_,
+ output_level, 1, enable_compression),
+ GetCompressionOptions(mutable_cf_options_, vstorage_,
+ output_level, enable_compression),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, grandparents,
+ /* is manual */ false, /* trim_ts */ "", score_,
+ false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true, compaction_reason);
+}
+
+// Look at overall size amplification. If size amplification
+// exceeds the configured value, then do a compaction
+// of the candidate files all the way up to the earliest
+// base file (overrides configured values of file-size ratios,
+// min_merge_width and max_merge_width).
+//
+Compaction* UniversalCompactionBuilder::PickCompactionToReduceSizeAmp() {
+ // percentage flexibility while reducing size amplification
+ uint64_t ratio = mutable_cf_options_.compaction_options_universal
+ .max_size_amplification_percent;
+
+ unsigned int candidate_count = 0;
+ uint64_t candidate_size = 0;
+ size_t start_index = 0;
+ const SortedRun* sr = nullptr;
+
+ assert(!sorted_runs_.empty());
+ if (sorted_runs_.back().being_compacted) {
+ return nullptr;
+ }
+
+ // Skip files that are already being compacted
+ for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) {
+ sr = &sorted_runs_[loop];
+ if (!sr->being_compacted) {
+ start_index = loop; // Consider this as the first candidate.
+ break;
+ }
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: skipping %s[%d] compacted %s",
+ cf_name_.c_str(), file_num_buf, loop,
+ " cannot be a candidate to reduce size amp.\n");
+ sr = nullptr;
+ }
+
+ if (sr == nullptr) {
+ return nullptr; // no candidate files
+ }
+ {
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: First candidate %s[%" ROCKSDB_PRIszt "] %s",
+ cf_name_.c_str(), file_num_buf, start_index, " to reduce size amp.\n");
+ }
+
+ // size of the base sorted run for size amp calculation
+ uint64_t base_sr_size = sorted_runs_.back().size;
+ size_t sr_end_idx = sorted_runs_.size() - 1;
+  // If tiered compaction is enabled and the last sorted run is the last
+  // level, use the second-to-last sorted run as the base for the size-amp
+  // calculation.
+ if (ioptions_.preclude_last_level_data_seconds > 0 &&
+ ioptions_.num_levels > 2 &&
+ sorted_runs_.back().level == ioptions_.num_levels - 1 &&
+ sorted_runs_.size() > 1) {
+ sr_end_idx = sorted_runs_.size() - 2;
+ base_sr_size = sorted_runs_[sr_end_idx].size;
+ }
+
+ // keep adding up all the remaining files
+ for (size_t loop = start_index; loop < sr_end_idx; loop++) {
+ sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+      // TODO: once incremental compaction is supported, we might want to
+      // schedule some incremental compactions in parallel if needed.
+ char file_num_buf[kFormatFileNumberBufSize];
+ sr->Dump(file_num_buf, sizeof(file_num_buf), true);
+ ROCKS_LOG_BUFFER(
+ log_buffer_, "[%s] Universal: Possible candidate %s[%d] %s",
+ cf_name_.c_str(), file_num_buf, start_index,
+ " is already being compacted. No size amp reduction possible.\n");
+ return nullptr;
+ }
+ candidate_size += sr->compensated_file_size;
+ candidate_count++;
+ }
+ if (candidate_count == 0) {
+ return nullptr;
+ }
+
+ // size amplification = percentage of additional size
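+  // For illustration: with max_size_amplification_percent = 200 and
+  // base_sr_size = 100, the compaction is skipped while candidate_size < 200,
+  // i.e. it kicks in once the newer sorted runs total at least twice the base
+  // sorted run.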
+ if (candidate_size * 100 < ratio * base_sr_size) {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, base_sr_size);
+ return nullptr;
+ } else {
+ ROCKS_LOG_BUFFER(
+ log_buffer_,
+ "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64
+ " earliest-file-size %" PRIu64,
+ cf_name_.c_str(), candidate_size, base_sr_size);
+ }
+  // Since incremental compaction can't include more than the second-to-last
+  // level, it can introduce a penalty compared to full compaction. We
+  // hard-code the penalty to be 80%: if the incremental compaction would end
+  // up with a fanout more than 80% higher than that of a full-level
+  // compaction, we fall back to full-level compaction.
+  // The 80% threshold is arbitrary and can be adjusted or made
+  // configurable in the future.
+  // This also prevents the case where compaction falls behind and we
+  // need to compact more levels for compactions to catch up.
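+  // For illustration: with base_sr_size = 100 and candidate_size = 50, a full
+  // compaction has a fanout of 2.0, so the incremental picker below only
+  // accepts a window whose fanout (bottom_size / non_bottom_size) stays below
+  // 3.6.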
+ if (mutable_cf_options_.compaction_options_universal.incremental) {
+ double fanout_threshold = static_cast<double>(base_sr_size) /
+ static_cast<double>(candidate_size) * 1.8;
+ Compaction* picked = PickIncrementalForReduceSizeAmp(fanout_threshold);
+ if (picked != nullptr) {
+      // As the feature is still incremental, picking an incremental
+      // compaction might fail and we will fall back to compacting the full
+      // level.
+ return picked;
+ }
+ }
+ return PickCompactionWithSortedRunRange(
+ start_index, sr_end_idx, CompactionReason::kUniversalSizeAmplification);
+}
+
+Compaction* UniversalCompactionBuilder::PickIncrementalForReduceSizeAmp(
+ double fanout_threshold) {
+  // Try to find all potential compactions with total size just over
+  // options.max_compaction_size / 2, and take the one with the lowest
+  // fanout (defined in the declaration of the function).
+  // This is done by keeping a sliding window of the files at the second
+  // lowest level and expanding it while collecting the overlapping files in
+  // the last level. Once the total size exceeds the size threshold, the
+  // fanout value is calculated, and the window is then shrunk from its small
+  // side. Keep doing this until the end.
+  // Finally, we try to include upper-level files if they fall into
+  // the range.
+  //
+  // Note that this is a similar problem to leveled compaction's
+  // kMinOverlappingRatio priority, but instead of picking single files
+  // we expand to a target compaction size. The reason is that in
+  // leveled compaction, the actual fanout value tends to be high, e.g. 10,
+  // so even with a single file in the down-merging level, the extra size
+  // compacted in boundary files is at a lower ratio. But here users often
+  // size the second-to-last level to be 1/4, 1/3 or even 1/2 of the
+  // bottommost level, so picking a single file in the second-to-last level
+  // would cause significant waste, which is not desirable.
+  //
+  // This algorithm has lots of room for improvement to pick more efficient
+  // compactions.
+ assert(sorted_runs_.size() >= 2);
+ int second_last_level = sorted_runs_[sorted_runs_.size() - 2].level;
+ if (second_last_level == 0) {
+ // Can't split Level 0.
+ return nullptr;
+ }
+ int output_level = sorted_runs_.back().level;
+ const std::vector<FileMetaData*>& bottom_files =
+ vstorage_->LevelFiles(output_level);
+ const std::vector<FileMetaData*>& files =
+ vstorage_->LevelFiles(second_last_level);
+ assert(!bottom_files.empty());
+ assert(!files.empty());
+
+ // std::unordered_map<uint64_t, uint64_t> file_to_order;
+
+ int picked_start_idx = 0;
+ int picked_end_idx = 0;
+ double picked_fanout = fanout_threshold;
+
+  // Use half of the target compaction bytes as the anchor to stop growing the
+  // second-to-last-level files, and reserve growing space for more overlapping
+  // bottom-level files, clean cuts, files from other levels, etc.
+ uint64_t comp_thres_size = mutable_cf_options_.max_compaction_bytes / 2;
+ int start_idx = 0;
+ int bottom_end_idx = 0;
+ int bottom_start_idx = 0;
+ uint64_t non_bottom_size = 0;
+ uint64_t bottom_size = 0;
+ bool end_bottom_size_counted = false;
+ for (int end_idx = 0; end_idx < static_cast<int>(files.size()); end_idx++) {
+ FileMetaData* end_file = files[end_idx];
+
+    // Include bottommost-level files smaller than the current
+    // second-to-last-level file.
+ int num_skipped = 0;
+ while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+ icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+ end_file->smallest) < 0) {
+ if (!end_bottom_size_counted) {
+ bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+ }
+ bottom_end_idx++;
+ end_bottom_size_counted = false;
+ num_skipped++;
+ }
+
+ if (num_skipped > 1) {
+      // At least one file in the bottommost level falls into the file gap. No
+      // reason to include the file. We cut the range and start a new sliding
+      // window.
+ start_idx = end_idx;
+ }
+
+ if (start_idx == end_idx) {
+ // new sliding window.
+ non_bottom_size = 0;
+ bottom_size = 0;
+ bottom_start_idx = bottom_end_idx;
+ end_bottom_size_counted = false;
+ }
+
+ non_bottom_size += end_file->fd.file_size;
+
+ // Include all overlapping files in bottom level.
+ while (bottom_end_idx < static_cast<int>(bottom_files.size()) &&
+ icmp_->Compare(bottom_files[bottom_end_idx]->smallest,
+ end_file->largest) < 0) {
+ if (!end_bottom_size_counted) {
+ bottom_size += bottom_files[bottom_end_idx]->fd.file_size;
+ end_bottom_size_counted = true;
+ }
+ if (icmp_->Compare(bottom_files[bottom_end_idx]->largest,
+ end_file->largest) > 0) {
+        // The next-level file crosses the largest-key boundary of the current file.
+ break;
+ }
+ bottom_end_idx++;
+ end_bottom_size_counted = false;
+ }
+
+ if ((non_bottom_size + bottom_size > comp_thres_size ||
+ end_idx == static_cast<int>(files.size()) - 1) &&
+        non_bottom_size > 0) {  // Do we allow 0-size files at all?
+ // If it is a better compaction, remember it in picked* variables.
+ double fanout = static_cast<double>(bottom_size) /
+ static_cast<double>(non_bottom_size);
+ if (fanout < picked_fanout) {
+ picked_start_idx = start_idx;
+ picked_end_idx = end_idx;
+ picked_fanout = fanout;
+ }
+      // Shrink the window from its start side until the total is under
+      // comp_thres_size.
+ while (non_bottom_size + bottom_size > comp_thres_size &&
+ start_idx <= end_idx) {
+ non_bottom_size -= files[start_idx]->fd.file_size;
+ start_idx++;
+ if (start_idx < static_cast<int>(files.size())) {
+ while (bottom_start_idx <= bottom_end_idx &&
+ icmp_->Compare(bottom_files[bottom_start_idx]->largest,
+ files[start_idx]->smallest) < 0) {
+ bottom_size -= bottom_files[bottom_start_idx]->fd.file_size;
+ bottom_start_idx++;
+ }
+ }
+ }
+ }
+ }
+
+ if (picked_fanout >= fanout_threshold) {
+ assert(picked_fanout == fanout_threshold);
+ return nullptr;
+ }
+
+ std::vector<CompactionInputFiles> inputs;
+ CompactionInputFiles bottom_level_inputs;
+ CompactionInputFiles second_last_level_inputs;
+ second_last_level_inputs.level = second_last_level;
+ bottom_level_inputs.level = output_level;
+ for (int i = picked_start_idx; i <= picked_end_idx; i++) {
+ if (files[i]->being_compacted) {
+ return nullptr;
+ }
+ second_last_level_inputs.files.push_back(files[i]);
+ }
+ assert(!second_last_level_inputs.empty());
+ if (!picker_->ExpandInputsToCleanCut(cf_name_, vstorage_,
+ &second_last_level_inputs,
+ /*next_smallest=*/nullptr)) {
+ return nullptr;
+ }
+ // We might be able to avoid this binary search if we save and expand
+ // from bottom_start_idx and bottom_end_idx, but for now, we use
+ // SetupOtherInputs() for simplicity.
+ int parent_index = -1; // Create and use bottom_start_idx?
+ if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+ &second_last_level_inputs,
+ &bottom_level_inputs, &parent_index,
+ /*base_index=*/-1)) {
+ return nullptr;
+ }
+
+  // Try to include files in upper levels if they fall into the range.
+  // Since we need to go from the lower level up, which is the reverse of the
+  // level order, we first write to a reversed data structure and finally copy
+  // it to the compaction inputs.
+ InternalKey smallest, largest;
+ picker_->GetRange(second_last_level_inputs, &smallest, &largest);
+ std::vector<CompactionInputFiles> inputs_reverse;
+ for (auto it = ++(++sorted_runs_.rbegin()); it != sorted_runs_.rend(); it++) {
+ SortedRun& sr = *it;
+ if (sr.level == 0) {
+ break;
+ }
+ std::vector<FileMetaData*> level_inputs;
+ vstorage_->GetCleanInputsWithinInterval(sr.level, &smallest, &largest,
+ &level_inputs);
+ if (!level_inputs.empty()) {
+ inputs_reverse.push_back({});
+ inputs_reverse.back().level = sr.level;
+ inputs_reverse.back().files = level_inputs;
+ picker_->GetRange(inputs_reverse.back(), &smallest, &largest);
+ }
+ }
+ for (auto it = inputs_reverse.rbegin(); it != inputs_reverse.rend(); it++) {
+ inputs.push_back(*it);
+ }
+
+ inputs.push_back(second_last_level_inputs);
+ inputs.push_back(bottom_level_inputs);
+
+ int start_level = Compaction::kInvalidLevel;
+ for (const auto& in : inputs) {
+ if (!in.empty()) {
+ // inputs should already be sorted by level
+ start_level = in.level;
+ break;
+ }
+ }
+
+ // intra L0 compactions outputs could have overlap
+ if (output_level != 0 &&
+ picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+ start_level, output_level))) {
+ return nullptr;
+ }
+
+ // TODO support multi paths?
+ uint32_t path_id = 0;
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+ true /* enable_compression */),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+ true /* enable_compression */),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ /* trim_ts */ "", score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true,
+ CompactionReason::kUniversalSizeAmplification);
+}
+
+// Pick files marked for compaction. Typically, files are marked by
+// CompactOnDeleteCollector due to the presence of tombstones.
+Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
+ CompactionInputFiles start_level_inputs;
+ int output_level;
+ std::vector<CompactionInputFiles> inputs;
+ std::vector<FileMetaData*> grandparents;
+
+ if (vstorage_->num_levels() == 1) {
+ // This is single level universal. Since we're basically trying to reclaim
+ // space by processing files marked for compaction due to high tombstone
+ // density, let's do the same thing as compaction to reduce size amp which
+ // has the same goals.
+ int start_index = -1;
+
+ start_level_inputs.level = 0;
+ start_level_inputs.files.clear();
+ output_level = 0;
+ // Find the first file marked for compaction. Ignore the last file
+ for (size_t loop = 0; loop + 1 < sorted_runs_.size(); loop++) {
+ SortedRun* sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+ continue;
+ }
+ FileMetaData* f = vstorage_->LevelFiles(0)[loop];
+ if (f->marked_for_compaction) {
+ start_level_inputs.files.push_back(f);
+ start_index =
+ static_cast<int>(loop); // Consider this as the first candidate.
+ break;
+ }
+ }
+ if (start_index < 0) {
+ // Either no file marked, or they're already being compacted
+ return nullptr;
+ }
+
+ for (size_t loop = start_index + 1; loop < sorted_runs_.size(); loop++) {
+ SortedRun* sr = &sorted_runs_[loop];
+ if (sr->being_compacted) {
+ break;
+ }
+
+ FileMetaData* f = vstorage_->LevelFiles(0)[loop];
+ start_level_inputs.files.push_back(f);
+ }
+ if (start_level_inputs.size() <= 1) {
+ // If only the last file in L0 is marked for compaction, ignore it
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ } else {
+ int start_level;
+
+ // For multi-level universal, the strategy is to make this look more like
+ // leveled. We pick one of the files marked for compaction and compact with
+ // overlapping files in the adjacent level.
+ picker_->PickFilesMarkedForCompaction(cf_name_, vstorage_, &start_level,
+ &output_level, &start_level_inputs);
+ if (start_level_inputs.empty()) {
+ return nullptr;
+ }
+
+ // Pick the first non-empty level after the start_level
+ for (output_level = start_level + 1; output_level < vstorage_->num_levels();
+ output_level++) {
+ if (vstorage_->NumLevelFiles(output_level) != 0) {
+ break;
+ }
+ }
+
+ // If all higher levels are empty, pick the highest level as output level
+ if (output_level == vstorage_->num_levels()) {
+ if (start_level == 0) {
+ output_level = vstorage_->num_levels() - 1;
+ } else {
+ // If start level is non-zero and all higher levels are empty, this
+ // compaction will translate into a trivial move. Since the idea is
+ // to reclaim space and trivial move doesn't help with that, we
+ // skip compaction in this case and return nullptr
+ return nullptr;
+ }
+ }
+ if (ioptions_.allow_ingest_behind &&
+ output_level == vstorage_->num_levels() - 1) {
+ assert(output_level > 1);
+ output_level--;
+ }
+
+ if (output_level != 0) {
+ if (start_level == 0) {
+ if (!picker_->GetOverlappingL0Files(vstorage_, &start_level_inputs,
+ output_level, nullptr)) {
+ return nullptr;
+ }
+ }
+
+ CompactionInputFiles output_level_inputs;
+ int parent_index = -1;
+
+ output_level_inputs.level = output_level;
+ if (!picker_->SetupOtherInputs(cf_name_, mutable_cf_options_, vstorage_,
+ &start_level_inputs, &output_level_inputs,
+ &parent_index, -1)) {
+ return nullptr;
+ }
+ inputs.push_back(start_level_inputs);
+ if (!output_level_inputs.empty()) {
+ inputs.push_back(output_level_inputs);
+ }
+ if (picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(
+ vstorage_, ioptions_, start_level, output_level))) {
+ return nullptr;
+ }
+
+ picker_->GetGrandparents(vstorage_, start_level_inputs,
+ output_level_inputs, &grandparents);
+ } else {
+ inputs.push_back(start_level_inputs);
+ }
+ }
+
+ uint64_t estimated_total_size = 0;
+ // Use size of the output level as estimated file size
+ for (FileMetaData* f : vstorage_->LevelFiles(output_level)) {
+ estimated_total_size += f->fd.GetFileSize();
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ /* max_grandparent_overlap_bytes */ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, grandparents, /* is manual */ false,
+ /* trim_ts */ "", score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true,
+ CompactionReason::kFilesMarkedForCompaction);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionToOldest(
+ size_t start_index, CompactionReason compaction_reason) {
+ return PickCompactionWithSortedRunRange(start_index, sorted_runs_.size() - 1,
+ compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
+ size_t start_index, size_t end_index, CompactionReason compaction_reason) {
+ assert(start_index < sorted_runs_.size());
+
+ // Estimate total file size
+ uint64_t estimated_total_size = 0;
+ for (size_t loop = start_index; loop <= end_index; loop++) {
+ estimated_total_size += sorted_runs_[loop].size;
+ }
+ uint32_t path_id =
+ GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
+ int start_level = sorted_runs_[start_index].level;
+
+ std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ inputs[i].level = start_level + static_cast<int>(i);
+ }
+ for (size_t loop = start_index; loop <= end_index; loop++) {
+ auto& picking_sr = sorted_runs_[loop];
+ if (picking_sr.level == 0) {
+ FileMetaData* f = picking_sr.file;
+ inputs[0].files.push_back(f);
+ } else {
+ auto& files = inputs[picking_sr.level - start_level].files;
+ for (auto* f : vstorage_->LevelFiles(picking_sr.level)) {
+ files.push_back(f);
+ }
+ }
+ std::string comp_reason_print_string;
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ comp_reason_print_string = "periodic compaction";
+ } else if (compaction_reason ==
+ CompactionReason::kUniversalSizeAmplification) {
+ comp_reason_print_string = "size amp";
+ } else {
+ assert(false);
+ comp_reason_print_string = "unknown: ";
+ comp_reason_print_string.append(
+ std::to_string(static_cast<int>(compaction_reason)));
+ }
+
+ char file_num_buf[256];
+ picking_sr.DumpSizeInfo(file_num_buf, sizeof(file_num_buf), loop);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: %s picking %s",
+ cf_name_.c_str(), comp_reason_print_string.c_str(),
+ file_num_buf);
+ }
+
+ int output_level;
+ if (end_index == sorted_runs_.size() - 1) {
+ // output files at the last level, unless it's reserved
+ output_level = vstorage_->num_levels() - 1;
+ // last level is reserved for the files ingested behind
+ if (ioptions_.allow_ingest_behind) {
+ assert(output_level > 1);
+ output_level--;
+ }
+ } else {
+    // If not all sorted runs are included, the compaction can only output to
+    // the level above the `end_index + 1` sorted run.
+ output_level = sorted_runs_[end_index + 1].level - 1;
+ }
+
+  // Intra-L0 compaction outputs could have overlap.
+ if (output_level != 0 &&
+ picker_->FilesRangeOverlapWithCompaction(
+ inputs, output_level,
+ Compaction::EvaluatePenultimateLevel(vstorage_, ioptions_,
+ start_level, output_level))) {
+ return nullptr;
+ }
+
+  // We never check the size against
+  // compaction_options_universal.compression_size_percent here,
+  // because we always compact all the files, so we always compress.
+ return new Compaction(
+ vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
+ std::move(inputs), output_level,
+ MaxFileSizeForLevel(mutable_cf_options_, output_level,
+ kCompactionStyleUniversal),
+ GetMaxOverlappingBytes(), path_id,
+ GetCompressionType(vstorage_, mutable_cf_options_, output_level, 1,
+ true /* enable_compression */),
+ GetCompressionOptions(mutable_cf_options_, vstorage_, output_level,
+ true /* enable_compression */),
+ Temperature::kUnknown,
+ /* max_subcompactions */ 0, /* grandparents */ {}, /* is manual */ false,
+ /* trim_ts */ "", score_, false /* deletion_compaction */,
+ /* l0_files_might_overlap */ true, compaction_reason);
+}
+
+Compaction* UniversalCompactionBuilder::PickPeriodicCompaction() {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: Periodic Compaction",
+ cf_name_.c_str());
+
+  // In universal compaction, sorted runs containing older data are almost
+  // always generated earlier too. To simplify the problem, we just try to
+  // trigger a full compaction: we start from the oldest sorted run and
+  // include all newer sorted runs, until we hit a sorted run that is already
+  // being compacted. Since the largest (and usually oldest) sorted run is
+  // typically included anyway, doing a full compaction won't increase write
+  // amplification much.
+
+ // Get some information from marked files to check whether a file is
+ // included in the compaction.
+
+ size_t start_index = sorted_runs_.size();
+ while (start_index > 0 && !sorted_runs_[start_index - 1].being_compacted) {
+ start_index--;
+ }
+ if (start_index == sorted_runs_.size()) {
+ return nullptr;
+ }
+
+  // There is a rare corner case where we can't pick up all the files
+  // because some files are being compacted, and we end up picking files
+  // none of which actually need periodic compaction. Unless we would simply
+  // be recompacting the last sorted run (either the last level or the last
+  // L0 file), we just execute the compaction anyway, in order to simplify
+  // the logic.
+ if (start_index == sorted_runs_.size() - 1) {
+ bool included_file_marked = false;
+ int start_level = sorted_runs_[start_index].level;
+ FileMetaData* start_file = sorted_runs_[start_index].file;
+ for (const std::pair<int, FileMetaData*>& level_file_pair :
+ vstorage_->FilesMarkedForPeriodicCompaction()) {
+ if (start_level != 0) {
+ // Last sorted run is a level
+ if (start_level == level_file_pair.first) {
+ included_file_marked = true;
+ break;
+ }
+ } else {
+ // Last sorted run is a L0 file.
+ if (start_file == level_file_pair.second) {
+ included_file_marked = true;
+ break;
+ }
+ }
+ }
+ if (!included_file_marked) {
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] Universal: Cannot form a compaction covering file "
+ "marked for periodic compaction",
+ cf_name_.c_str());
+ return nullptr;
+ }
+ }
+
+ Compaction* c = PickCompactionToOldest(start_index,
+ CompactionReason::kPeriodicCompaction);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "UniversalCompactionPicker::PickPeriodicCompaction:Return", c);
+
+ return c;
+}
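Editorial aside, not part of the patch: the backward scan in PickPeriodicCompaction() picks the smallest start index whose suffix of sorted runs is entirely free to compact, so the oldest (and usually largest) run is included whenever possible. A minimal standalone sketch of that scan, using a plain vector of being_compacted flags as a stand-in for sorted_runs_:

// Sketch only; `being_compacted[i]` stands in for sorted_runs_[i].being_compacted.
#include <cstddef>
#include <vector>

size_t PickStartIndexSketch(const std::vector<bool>& being_compacted) {
  size_t start_index = being_compacted.size();
  while (start_index > 0 && !being_compacted[start_index - 1]) {
    --start_index;
  }
  return start_index;  // == size() means even the oldest run is busy
}

// Example: {false, false, true, false, false} (oldest run last) yields 3,
// so only runs 3..4 would be handed to PickCompactionToOldest().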
+
+uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const {
+ if (!mutable_cf_options_.compaction_options_universal.incremental) {
+ return std::numeric_limits<uint64_t>::max();
+ } else {
+    // Try to align the cutting boundary with files at the next level: allow
+    // up to 1.5x the target size of overlap, so a file isn't cut down to
+    // less than 1/2 of the target size yet can never overlap two full-size
+    // files at the next level.
+ return mutable_cf_options_.target_file_size_base / 2 * 3;
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
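Editorial note on GetMaxOverlappingBytes() above: when incremental universal compaction is enabled, the cap is 3/2 of target_file_size_base, i.e. an output file is allowed to overlap roughly one and a half target-size files at the next level. A hedged worked example, assuming (for illustration only) a target_file_size_base of 64 MiB:

// Illustrative arithmetic only; the 64 MiB figure is an assumption, not a
// value taken from this patch.
#include <cstdint>

constexpr uint64_t kAssumedTargetFileSizeBase = 64ull << 20;  // 64 MiB
constexpr uint64_t kMaxOverlappingBytes =
    kAssumedTargetFileSizeBase / 2 * 3;                       // 96 MiB
static_assert(kMaxOverlappingBytes == 96ull << 20,
              "the cap is 1.5x the target file size");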
diff --git a/src/rocksdb/db/compaction/compaction_picker_universal.h b/src/rocksdb/db/compaction/compaction_picker_universal.h
new file mode 100644
index 000000000..5f897cc9b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_picker_universal.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#ifndef ROCKSDB_LITE
+
+#include "db/compaction/compaction_picker.h"
+
+namespace ROCKSDB_NAMESPACE {
+class UniversalCompactionPicker : public CompactionPicker {
+ public:
+ UniversalCompactionPicker(const ImmutableOptions& ioptions,
+ const InternalKeyComparator* icmp)
+ : CompactionPicker(ioptions, icmp) {}
+ virtual Compaction* PickCompaction(
+ const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+ const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+ LogBuffer* log_buffer,
+ SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override;
+ virtual int MaxOutputLevel() const override { return NumberLevels() - 1; }
+
+ virtual bool NeedsCompaction(
+ const VersionStorageInfo* vstorage) const override;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_service_job.cc b/src/rocksdb/db/compaction/compaction_service_job.cc
new file mode 100644
index 000000000..1d2e99d99
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_service_job.cc
@@ -0,0 +1,829 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_job.h"
+#include "db/compaction/compaction_state.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "options/options_helper.h"
+#include "rocksdb/utilities/options_type.h"
+
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+class SubcompactionState;
+
+CompactionServiceJobStatus
+CompactionJob::ProcessKeyValueCompactionWithCompactionService(
+ SubcompactionState* sub_compact) {
+ assert(sub_compact);
+ assert(sub_compact->compaction);
+ assert(db_options_.compaction_service);
+
+ const Compaction* compaction = sub_compact->compaction;
+ CompactionServiceInput compaction_input;
+ compaction_input.output_level = compaction->output_level();
+ compaction_input.db_id = db_id_;
+
+ const std::vector<CompactionInputFiles>& inputs =
+ *(compact_->compaction->inputs());
+ for (const auto& files_per_level : inputs) {
+ for (const auto& file : files_per_level.files) {
+ compaction_input.input_files.emplace_back(
+ MakeTableFileName(file->fd.GetNumber()));
+ }
+ }
+ compaction_input.column_family.name =
+ compaction->column_family_data()->GetName();
+ compaction_input.column_family.options =
+ compaction->column_family_data()->GetLatestCFOptions();
+ compaction_input.db_options =
+ BuildDBOptions(db_options_, mutable_db_options_copy_);
+ compaction_input.snapshots = existing_snapshots_;
+ compaction_input.has_begin = sub_compact->start.has_value();
+ compaction_input.begin =
+ compaction_input.has_begin ? sub_compact->start->ToString() : "";
+ compaction_input.has_end = sub_compact->end.has_value();
+ compaction_input.end =
+ compaction_input.has_end ? sub_compact->end->ToString() : "";
+
+ std::string compaction_input_binary;
+ Status s = compaction_input.Write(&compaction_input_binary);
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+
+ std::ostringstream input_files_oss;
+ bool is_first_one = true;
+ for (const auto& file : compaction_input.input_files) {
+ input_files_oss << (is_first_one ? "" : ", ") << file;
+ is_first_one = false;
+ }
+
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[%s] [JOB %d] Starting remote compaction (output level: %d): %s",
+ compaction_input.column_family.name.c_str(), job_id_,
+ compaction_input.output_level, input_files_oss.str().c_str());
+ CompactionServiceJobInfo info(dbname_, db_id_, db_session_id_,
+ GetCompactionId(sub_compact), thread_pri_);
+ CompactionServiceJobStatus compaction_status =
+ db_options_.compaction_service->StartV2(info, compaction_input_binary);
+ switch (compaction_status) {
+ case CompactionServiceJobStatus::kSuccess:
+ break;
+ case CompactionServiceJobStatus::kFailure:
+ sub_compact->status = Status::Incomplete(
+ "CompactionService failed to start compaction job.");
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction failed to start.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ case CompactionServiceJobStatus::kUseLocal:
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction fallback to local by API Start.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ default:
+ assert(false); // unknown status
+ break;
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Waiting for remote compaction...",
+ compaction_input.column_family.name.c_str(), job_id_);
+ std::string compaction_result_binary;
+ compaction_status = db_options_.compaction_service->WaitForCompleteV2(
+ info, &compaction_result_binary);
+
+ if (compaction_status == CompactionServiceJobStatus::kUseLocal) {
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction fallback to local by API "
+ "WaitForComplete.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ }
+
+ CompactionServiceResult compaction_result;
+ s = CompactionServiceResult::Read(compaction_result_binary,
+ &compaction_result);
+
+ if (compaction_status == CompactionServiceJobStatus::kFailure) {
+ if (s.ok()) {
+ if (compaction_result.status.ok()) {
+ sub_compact->status = Status::Incomplete(
+ "CompactionService failed to run the compaction job (even though "
+ "the internal status is okay).");
+ } else {
+ // set the current sub compaction status with the status returned from
+ // remote
+ sub_compact->status = compaction_result.status;
+ }
+ } else {
+ sub_compact->status = Status::Incomplete(
+ "CompactionService failed to run the compaction job (and no valid "
+ "result is returned).");
+ compaction_result.status.PermitUncheckedError();
+ }
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "[%s] [JOB %d] Remote compaction failed.",
+ compaction_input.column_family.name.c_str(), job_id_);
+ return compaction_status;
+ }
+
+ if (!s.ok()) {
+ sub_compact->status = s;
+ compaction_result.status.PermitUncheckedError();
+ return CompactionServiceJobStatus::kFailure;
+ }
+ sub_compact->status = compaction_result.status;
+
+ std::ostringstream output_files_oss;
+ is_first_one = true;
+ for (const auto& file : compaction_result.output_files) {
+ output_files_oss << (is_first_one ? "" : ", ") << file.file_name;
+ is_first_one = false;
+ }
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Receive remote compaction result, output path: "
+ "%s, files: %s",
+ compaction_input.column_family.name.c_str(), job_id_,
+ compaction_result.output_path.c_str(),
+ output_files_oss.str().c_str());
+
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+
+ for (const auto& file : compaction_result.output_files) {
+ uint64_t file_num = versions_->NewFileNumber();
+ auto src_file = compaction_result.output_path + "/" + file.file_name;
+ auto tgt_file = TableFileName(compaction->immutable_options()->cf_paths,
+ file_num, compaction->output_path_id());
+ s = fs_->RenameFile(src_file, tgt_file, IOOptions(), nullptr);
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+
+ FileMetaData meta;
+ uint64_t file_size;
+ s = fs_->GetFileSize(tgt_file, IOOptions(), &file_size, nullptr);
+ if (!s.ok()) {
+ sub_compact->status = s;
+ return CompactionServiceJobStatus::kFailure;
+ }
+ meta.fd = FileDescriptor(file_num, compaction->output_path_id(), file_size,
+ file.smallest_seqno, file.largest_seqno);
+ meta.smallest.DecodeFrom(file.smallest_internal_key);
+ meta.largest.DecodeFrom(file.largest_internal_key);
+ meta.oldest_ancester_time = file.oldest_ancester_time;
+ meta.file_creation_time = file.file_creation_time;
+ meta.marked_for_compaction = file.marked_for_compaction;
+ meta.unique_id = file.unique_id;
+
+ auto cfd = compaction->column_family_data();
+ sub_compact->Current().AddOutput(std::move(meta),
+ cfd->internal_comparator(), false, false,
+ true, file.paranoid_hash);
+ }
+ sub_compact->compaction_job_stats = compaction_result.stats;
+ sub_compact->Current().SetNumOutputRecords(
+ compaction_result.num_output_records);
+ sub_compact->Current().SetTotalBytes(compaction_result.total_bytes);
+ RecordTick(stats_, REMOTE_COMPACT_READ_BYTES, compaction_result.bytes_read);
+ RecordTick(stats_, REMOTE_COMPACT_WRITE_BYTES,
+ compaction_result.bytes_written);
+ return CompactionServiceJobStatus::kSuccess;
+}
+
+std::string CompactionServiceCompactionJob::GetTableFileName(
+ uint64_t file_number) {
+ return MakeTableFileName(output_path_, file_number);
+}
+
+void CompactionServiceCompactionJob::RecordCompactionIOStats() {
+ compaction_result_->bytes_read += IOSTATS(bytes_read);
+ compaction_result_->bytes_written += IOSTATS(bytes_written);
+ CompactionJob::RecordCompactionIOStats();
+}
+
+CompactionServiceCompactionJob::CompactionServiceCompactionJob(
+ int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
+ const MutableDBOptions& mutable_db_options, const FileOptions& file_options,
+ VersionSet* versions, const std::atomic<bool>* shutting_down,
+ LogBuffer* log_buffer, FSDirectory* output_directory, Statistics* stats,
+ InstrumentedMutex* db_mutex, ErrorHandler* db_error_handler,
+ std::vector<SequenceNumber> existing_snapshots,
+ std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
+ const std::string& dbname, const std::shared_ptr<IOTracer>& io_tracer,
+ const std::atomic<bool>& manual_compaction_canceled,
+ const std::string& db_id, const std::string& db_session_id,
+ std::string output_path,
+ const CompactionServiceInput& compaction_service_input,
+ CompactionServiceResult* compaction_service_result)
+ : CompactionJob(
+ job_id, compaction, db_options, mutable_db_options, file_options,
+ versions, shutting_down, log_buffer, nullptr, output_directory,
+ nullptr, stats, db_mutex, db_error_handler,
+ std::move(existing_snapshots), kMaxSequenceNumber, nullptr, nullptr,
+ std::move(table_cache), event_logger,
+ compaction->mutable_cf_options()->paranoid_file_checks,
+ compaction->mutable_cf_options()->report_bg_io_stats, dbname,
+ &(compaction_service_result->stats), Env::Priority::USER, io_tracer,
+ manual_compaction_canceled, db_id, db_session_id,
+ compaction->column_family_data()->GetFullHistoryTsLow()),
+ output_path_(std::move(output_path)),
+ compaction_input_(compaction_service_input),
+ compaction_result_(compaction_service_result) {}
+
+Status CompactionServiceCompactionJob::Run() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_RUN);
+
+ auto* c = compact_->compaction;
+ assert(c->column_family_data() != nullptr);
+ assert(c->column_family_data()->current()->storage_info()->NumLevelFiles(
+ compact_->compaction->level()) > 0);
+
+ write_hint_ =
+ c->column_family_data()->CalculateSSTWriteHint(c->output_level());
+ bottommost_level_ = c->bottommost_level();
+
+ Slice begin = compaction_input_.begin;
+ Slice end = compaction_input_.end;
+ compact_->sub_compact_states.emplace_back(
+ c,
+ compaction_input_.has_begin ? std::optional<Slice>(begin)
+ : std::optional<Slice>(),
+ compaction_input_.has_end ? std::optional<Slice>(end)
+ : std::optional<Slice>(),
+ /*sub_job_id*/ 0);
+
+ log_buffer_->FlushBufferToLog();
+ LogCompaction();
+ const uint64_t start_micros = db_options_.clock->NowMicros();
+ // Pick the only sub-compaction we should have
+ assert(compact_->sub_compact_states.size() == 1);
+ SubcompactionState* sub_compact = compact_->sub_compact_states.data();
+
+ ProcessKeyValueCompaction(sub_compact);
+
+ compaction_stats_.stats.micros =
+ db_options_.clock->NowMicros() - start_micros;
+ compaction_stats_.stats.cpu_micros =
+ sub_compact->compaction_job_stats.cpu_micros;
+
+ RecordTimeToHistogram(stats_, COMPACTION_TIME,
+ compaction_stats_.stats.micros);
+ RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
+ compaction_stats_.stats.cpu_micros);
+
+ Status status = sub_compact->status;
+ IOStatus io_s = sub_compact->io_status;
+
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+
+ if (status.ok()) {
+ constexpr IODebugContext* dbg = nullptr;
+
+ if (output_directory_) {
+ io_s = output_directory_->FsyncWithDirOptions(IOOptions(), dbg,
+ DirFsyncOptions());
+ }
+ }
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+ if (status.ok()) {
+ status = io_s;
+ }
+ if (status.ok()) {
+ // TODO: Add verify_table()
+ }
+
+ // Finish up all book-keeping to unify the subcompaction results
+ compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
+ UpdateCompactionStats();
+ RecordCompactionIOStats();
+
+ LogFlush(db_options_.info_log);
+ compact_->status = status;
+ compact_->status.PermitUncheckedError();
+
+ // Build compaction result
+ compaction_result_->output_level = compact_->compaction->output_level();
+ compaction_result_->output_path = output_path_;
+ for (const auto& output_file : sub_compact->GetOutputs()) {
+ auto& meta = output_file.meta;
+ compaction_result_->output_files.emplace_back(
+ MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno,
+ meta.fd.largest_seqno, meta.smallest.Encode().ToString(),
+ meta.largest.Encode().ToString(), meta.oldest_ancester_time,
+ meta.file_creation_time, output_file.validator.GetHash(),
+ meta.marked_for_compaction, meta.unique_id);
+ }
+ InternalStats::CompactionStatsFull compaction_stats;
+ sub_compact->AggregateCompactionStats(compaction_stats);
+ compaction_result_->num_output_records =
+ compaction_stats.stats.num_output_records;
+ compaction_result_->total_bytes = compaction_stats.TotalBytesWritten();
+
+ return status;
+}
+
+void CompactionServiceCompactionJob::CleanupCompaction() {
+ CompactionJob::CleanupCompaction();
+}
+
+// Internal binary format for the input and result data
+enum BinaryFormatVersion : uint32_t {
+ kOptionsString = 1, // Use string format similar to Option string format
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cfd_type_info = {
+ {"name",
+ {offsetof(struct ColumnFamilyDescriptor, name), OptionType::kEncodedString,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"options",
+ {offsetof(struct ColumnFamilyDescriptor, options),
+ OptionType::kConfigurable, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone,
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const std::string& value, void* addr) {
+ auto cf_options = static_cast<ColumnFamilyOptions*>(addr);
+ return GetColumnFamilyOptionsFromString(opts, ColumnFamilyOptions(),
+ value, cf_options);
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr, std::string* value) {
+ const auto cf_options = static_cast<const ColumnFamilyOptions*>(addr);
+ std::string result;
+ auto status =
+ GetStringFromColumnFamilyOptions(opts, *cf_options, &result);
+ *value = "{" + result + "}";
+ return status;
+ },
+ [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+ const void* addr2, std::string* mismatch) {
+ const auto this_one = static_cast<const ColumnFamilyOptions*>(addr1);
+ const auto that_one = static_cast<const ColumnFamilyOptions*>(addr2);
+ auto this_conf = CFOptionsAsConfigurable(*this_one);
+ auto that_conf = CFOptionsAsConfigurable(*that_one);
+ std::string mismatch_opt;
+ bool result =
+ this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+ if (!result) {
+ *mismatch = name + "." + mismatch_opt;
+ }
+ return result;
+ }}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_input_type_info = {
+ {"column_family",
+ OptionTypeInfo::Struct(
+ "column_family", &cfd_type_info,
+ offsetof(struct CompactionServiceInput, column_family),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+ {"db_options",
+ {offsetof(struct CompactionServiceInput, db_options),
+ OptionType::kConfigurable, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone,
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const std::string& value, void* addr) {
+ auto options = static_cast<DBOptions*>(addr);
+ return GetDBOptionsFromString(opts, DBOptions(), value, options);
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr, std::string* value) {
+ const auto options = static_cast<const DBOptions*>(addr);
+ std::string result;
+ auto status = GetStringFromDBOptions(opts, *options, &result);
+ *value = "{" + result + "}";
+ return status;
+ },
+ [](const ConfigOptions& opts, const std::string& name, const void* addr1,
+ const void* addr2, std::string* mismatch) {
+ const auto this_one = static_cast<const DBOptions*>(addr1);
+ const auto that_one = static_cast<const DBOptions*>(addr2);
+ auto this_conf = DBOptionsAsConfigurable(*this_one);
+ auto that_conf = DBOptionsAsConfigurable(*that_one);
+ std::string mismatch_opt;
+ bool result =
+ this_conf->AreEquivalent(opts, that_conf.get(), &mismatch_opt);
+ if (!result) {
+ *mismatch = name + "." + mismatch_opt;
+ }
+ return result;
+ }}},
+ {"snapshots", OptionTypeInfo::Vector<uint64_t>(
+ offsetof(struct CompactionServiceInput, snapshots),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ {0, OptionType::kUInt64T})},
+ {"input_files", OptionTypeInfo::Vector<std::string>(
+ offsetof(struct CompactionServiceInput, input_files),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ {0, OptionType::kEncodedString})},
+ {"output_level",
+ {offsetof(struct CompactionServiceInput, output_level), OptionType::kInt,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"db_id",
+ {offsetof(struct CompactionServiceInput, db_id),
+ OptionType::kEncodedString}},
+ {"has_begin",
+ {offsetof(struct CompactionServiceInput, has_begin), OptionType::kBoolean,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"begin",
+ {offsetof(struct CompactionServiceInput, begin),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"has_end",
+ {offsetof(struct CompactionServiceInput, has_end), OptionType::kBoolean,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"end",
+ {offsetof(struct CompactionServiceInput, end), OptionType::kEncodedString,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+ cs_output_file_type_info = {
+ {"file_name",
+ {offsetof(struct CompactionServiceOutputFile, file_name),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"smallest_seqno",
+ {offsetof(struct CompactionServiceOutputFile, smallest_seqno),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"largest_seqno",
+ {offsetof(struct CompactionServiceOutputFile, largest_seqno),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"smallest_internal_key",
+ {offsetof(struct CompactionServiceOutputFile, smallest_internal_key),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"largest_internal_key",
+ {offsetof(struct CompactionServiceOutputFile, largest_internal_key),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"oldest_ancester_time",
+ {offsetof(struct CompactionServiceOutputFile, oldest_ancester_time),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_creation_time",
+ {offsetof(struct CompactionServiceOutputFile, file_creation_time),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"paranoid_hash",
+ {offsetof(struct CompactionServiceOutputFile, paranoid_hash),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"marked_for_compaction",
+ {offsetof(struct CompactionServiceOutputFile, marked_for_compaction),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"unique_id",
+ OptionTypeInfo::Array<uint64_t, 2>(
+ offsetof(struct CompactionServiceOutputFile, unique_id),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ {0, OptionType::kUInt64T})},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo>
+ compaction_job_stats_type_info = {
+ {"elapsed_micros",
+ {offsetof(struct CompactionJobStats, elapsed_micros),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"cpu_micros",
+ {offsetof(struct CompactionJobStats, cpu_micros), OptionType::kUInt64T,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"num_input_records",
+ {offsetof(struct CompactionJobStats, num_input_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_blobs_read",
+ {offsetof(struct CompactionJobStats, num_blobs_read),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_input_files",
+ {offsetof(struct CompactionJobStats, num_input_files),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_input_files_at_output_level",
+ {offsetof(struct CompactionJobStats, num_input_files_at_output_level),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_records",
+ {offsetof(struct CompactionJobStats, num_output_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_files",
+ {offsetof(struct CompactionJobStats, num_output_files),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_files_blob",
+ {offsetof(struct CompactionJobStats, num_output_files_blob),
+ OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"is_full_compaction",
+ {offsetof(struct CompactionJobStats, is_full_compaction),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"is_manual_compaction",
+ {offsetof(struct CompactionJobStats, is_manual_compaction),
+ OptionType::kBoolean, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_input_bytes",
+ {offsetof(struct CompactionJobStats, total_input_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_blob_bytes_read",
+ {offsetof(struct CompactionJobStats, total_blob_bytes_read),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_output_bytes",
+ {offsetof(struct CompactionJobStats, total_output_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_output_bytes_blob",
+ {offsetof(struct CompactionJobStats, total_output_bytes_blob),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_records_replaced",
+ {offsetof(struct CompactionJobStats, num_records_replaced),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_input_raw_key_bytes",
+ {offsetof(struct CompactionJobStats, total_input_raw_key_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_input_raw_value_bytes",
+ {offsetof(struct CompactionJobStats, total_input_raw_value_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_input_deletion_records",
+ {offsetof(struct CompactionJobStats, num_input_deletion_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_expired_deletion_records",
+ {offsetof(struct CompactionJobStats, num_expired_deletion_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_corrupt_keys",
+ {offsetof(struct CompactionJobStats, num_corrupt_keys),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_write_nanos",
+ {offsetof(struct CompactionJobStats, file_write_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_range_sync_nanos",
+ {offsetof(struct CompactionJobStats, file_range_sync_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_fsync_nanos",
+ {offsetof(struct CompactionJobStats, file_fsync_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"file_prepare_write_nanos",
+ {offsetof(struct CompactionJobStats, file_prepare_write_nanos),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"smallest_output_key_prefix",
+ {offsetof(struct CompactionJobStats, smallest_output_key_prefix),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"largest_output_key_prefix",
+ {offsetof(struct CompactionJobStats, largest_output_key_prefix),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_single_del_fallthru",
+ {offsetof(struct CompactionJobStats, num_single_del_fallthru),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_single_del_mismatch",
+ {offsetof(struct CompactionJobStats, num_single_del_mismatch),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+};
+
+namespace {
+// This is a helper struct used to serialize and deserialize the Status class,
+// because Status's members are not public.
+struct StatusSerializationAdapter {
+ uint8_t code;
+ uint8_t subcode;
+ uint8_t severity;
+ std::string message;
+
+ StatusSerializationAdapter() = default;
+ explicit StatusSerializationAdapter(const Status& s) {
+ code = s.code();
+ subcode = s.subcode();
+ severity = s.severity();
+ auto msg = s.getState();
+ message = msg ? msg : "";
+ }
+
+ Status GetStatus() const {
+ return Status{static_cast<Status::Code>(code),
+ static_cast<Status::SubCode>(subcode),
+ static_cast<Status::Severity>(severity), message};
+ }
+};
+} // namespace
+
+static std::unordered_map<std::string, OptionTypeInfo>
+ status_adapter_type_info = {
+ {"code",
+ {offsetof(struct StatusSerializationAdapter, code),
+ OptionType::kUInt8T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"subcode",
+ {offsetof(struct StatusSerializationAdapter, subcode),
+ OptionType::kUInt8T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"severity",
+ {offsetof(struct StatusSerializationAdapter, severity),
+ OptionType::kUInt8T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"message",
+ {offsetof(struct StatusSerializationAdapter, message),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+};
+
+static std::unordered_map<std::string, OptionTypeInfo> cs_result_type_info = {
+ {"status",
+ {offsetof(struct CompactionServiceResult, status),
+ OptionType::kCustomizable, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone,
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const std::string& value, void* addr) {
+ auto status_obj = static_cast<Status*>(addr);
+ StatusSerializationAdapter adapter;
+ Status s = OptionTypeInfo::ParseType(
+ opts, value, status_adapter_type_info, &adapter);
+ *status_obj = adapter.GetStatus();
+ return s;
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr, std::string* value) {
+ const auto status_obj = static_cast<const Status*>(addr);
+ StatusSerializationAdapter adapter(*status_obj);
+ std::string result;
+ Status s = OptionTypeInfo::SerializeType(opts, status_adapter_type_info,
+ &adapter, &result);
+ *value = "{" + result + "}";
+ return s;
+ },
+ [](const ConfigOptions& opts, const std::string& /*name*/,
+ const void* addr1, const void* addr2, std::string* mismatch) {
+ const auto status1 = static_cast<const Status*>(addr1);
+ const auto status2 = static_cast<const Status*>(addr2);
+
+      StatusSerializationAdapter adapter1(*status1);
+      StatusSerializationAdapter adapter2(*status2);
+      return OptionTypeInfo::TypesAreEqual(opts, status_adapter_type_info,
+                                           &adapter1, &adapter2, mismatch);
+ }}},
+ {"output_files",
+ OptionTypeInfo::Vector<CompactionServiceOutputFile>(
+ offsetof(struct CompactionServiceResult, output_files),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone,
+ OptionTypeInfo::Struct("output_files", &cs_output_file_type_info, 0,
+ OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone))},
+ {"output_level",
+ {offsetof(struct CompactionServiceResult, output_level), OptionType::kInt,
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone}},
+ {"output_path",
+ {offsetof(struct CompactionServiceResult, output_path),
+ OptionType::kEncodedString, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"num_output_records",
+ {offsetof(struct CompactionServiceResult, num_output_records),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"total_bytes",
+ {offsetof(struct CompactionServiceResult, total_bytes),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"bytes_read",
+ {offsetof(struct CompactionServiceResult, bytes_read),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"bytes_written",
+ {offsetof(struct CompactionServiceResult, bytes_written),
+ OptionType::kUInt64T, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+ {"stats", OptionTypeInfo::Struct(
+ "stats", &compaction_job_stats_type_info,
+ offsetof(struct CompactionServiceResult, stats),
+ OptionVerificationType::kNormal, OptionTypeFlags::kNone)},
+};
+
+Status CompactionServiceInput::Read(const std::string& data_str,
+ CompactionServiceInput* obj) {
+ if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+ return Status::InvalidArgument("Invalid CompactionServiceInput string");
+ }
+ auto format_version = DecodeFixed32(data_str.data());
+ if (format_version == kOptionsString) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ cf.ignore_unknown_options = true;
+ return OptionTypeInfo::ParseType(
+ cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_input_type_info,
+ obj);
+ } else {
+ return Status::NotSupported(
+ "Compaction Service Input data version not supported: " +
+ std::to_string(format_version));
+ }
+}
+
+Status CompactionServiceInput::Write(std::string* output) {
+ char buf[sizeof(BinaryFormatVersion)];
+ EncodeFixed32(buf, kOptionsString);
+ output->append(buf, sizeof(BinaryFormatVersion));
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::SerializeType(cf, cs_input_type_info, this, output);
+}
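A hedged usage sketch, not part of the patch, of the serialization format implemented above: a 4-byte version tag (kOptionsString) followed by an options-style string, round-tripped through Write() and Read(). The field values below are illustrative only, and the sketch assumes the CompactionServiceInput declaration pulled in via the headers included at the top of this file:

// Sketch only; error handling is reduced to asserts for brevity.
#include <cassert>
#include <string>

void CompactionServiceInputRoundTripSketch() {
  ROCKSDB_NAMESPACE::CompactionServiceInput in;
  in.db_id = "example-db-id";                        // illustrative values
  in.output_level = 3;
  in.input_files = {"000123.sst", "000124.sst"};

  std::string payload;
  ROCKSDB_NAMESPACE::Status s = in.Write(&payload);  // version tag + options string
  assert(s.ok());

  ROCKSDB_NAMESPACE::CompactionServiceInput parsed;
  s = ROCKSDB_NAMESPACE::CompactionServiceInput::Read(payload, &parsed);
  assert(s.ok() && parsed.output_level == 3);
}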
+
+Status CompactionServiceResult::Read(const std::string& data_str,
+ CompactionServiceResult* obj) {
+ if (data_str.size() <= sizeof(BinaryFormatVersion)) {
+ return Status::InvalidArgument("Invalid CompactionServiceResult string");
+ }
+ auto format_version = DecodeFixed32(data_str.data());
+ if (format_version == kOptionsString) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ cf.ignore_unknown_options = true;
+ return OptionTypeInfo::ParseType(
+ cf, data_str.substr(sizeof(BinaryFormatVersion)), cs_result_type_info,
+ obj);
+ } else {
+ return Status::NotSupported(
+ "Compaction Service Result data version not supported: " +
+ std::to_string(format_version));
+ }
+}
+
+Status CompactionServiceResult::Write(std::string* output) {
+ char buf[sizeof(BinaryFormatVersion)];
+ EncodeFixed32(buf, kOptionsString);
+ output->append(buf, sizeof(BinaryFormatVersion));
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::SerializeType(cf, cs_result_type_info, this, output);
+}
+
+#ifndef NDEBUG
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other) {
+ std::string mismatch;
+ return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceResult::TEST_Equals(CompactionServiceResult* other,
+ std::string* mismatch) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::TypesAreEqual(cf, cs_result_type_info, this, other,
+ mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other) {
+ std::string mismatch;
+ return TEST_Equals(other, &mismatch);
+}
+
+bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other,
+ std::string* mismatch) {
+ ConfigOptions cf;
+ cf.invoke_prepare_options = false;
+ return OptionTypeInfo::TypesAreEqual(cf, cs_input_type_info, this, other,
+ mismatch);
+}
+#endif // NDEBUG
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_service_test.cc b/src/rocksdb/db/compaction/compaction_service_test.cc
new file mode 100644
index 000000000..c475c4e3b
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_service_test.cc
@@ -0,0 +1,966 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "table/unique_id_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MyTestCompactionService : public CompactionService {
+ public:
+ MyTestCompactionService(
+ std::string db_path, Options& options,
+ std::shared_ptr<Statistics>& statistics,
+ std::vector<std::shared_ptr<EventListener>>& listeners,
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ table_properties_collector_factories)
+ : db_path_(std::move(db_path)),
+ options_(options),
+ statistics_(statistics),
+ start_info_("na", "na", "na", 0, Env::TOTAL),
+ wait_info_("na", "na", "na", 0, Env::TOTAL),
+ listeners_(listeners),
+ table_properties_collector_factories_(
+ std::move(table_properties_collector_factories)) {}
+
+ static const char* kClassName() { return "MyTestCompactionService"; }
+
+ const char* Name() const override { return kClassName(); }
+
+ CompactionServiceJobStatus StartV2(
+ const CompactionServiceJobInfo& info,
+ const std::string& compaction_service_input) override {
+ InstrumentedMutexLock l(&mutex_);
+ start_info_ = info;
+ assert(info.db_name == db_path_);
+ jobs_.emplace(info.job_id, compaction_service_input);
+ CompactionServiceJobStatus s = CompactionServiceJobStatus::kSuccess;
+ if (is_override_start_status_) {
+ return override_start_status_;
+ }
+ return s;
+ }
+
+ CompactionServiceJobStatus WaitForCompleteV2(
+ const CompactionServiceJobInfo& info,
+ std::string* compaction_service_result) override {
+ std::string compaction_input;
+ assert(info.db_name == db_path_);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ wait_info_ = info;
+ auto i = jobs_.find(info.job_id);
+ if (i == jobs_.end()) {
+ return CompactionServiceJobStatus::kFailure;
+ }
+ compaction_input = std::move(i->second);
+ jobs_.erase(i);
+ }
+
+ if (is_override_wait_status_) {
+ return override_wait_status_;
+ }
+
+ CompactionServiceOptionsOverride options_override;
+ options_override.env = options_.env;
+ options_override.file_checksum_gen_factory =
+ options_.file_checksum_gen_factory;
+ options_override.comparator = options_.comparator;
+ options_override.merge_operator = options_.merge_operator;
+ options_override.compaction_filter = options_.compaction_filter;
+ options_override.compaction_filter_factory =
+ options_.compaction_filter_factory;
+ options_override.prefix_extractor = options_.prefix_extractor;
+ options_override.table_factory = options_.table_factory;
+ options_override.sst_partitioner_factory = options_.sst_partitioner_factory;
+ options_override.statistics = statistics_;
+ if (!listeners_.empty()) {
+ options_override.listeners = listeners_;
+ }
+
+ if (!table_properties_collector_factories_.empty()) {
+ options_override.table_properties_collector_factories =
+ table_properties_collector_factories_;
+ }
+
+ OpenAndCompactOptions options;
+ options.canceled = &canceled_;
+
+ Status s = DB::OpenAndCompact(
+ options, db_path_, db_path_ + "/" + std::to_string(info.job_id),
+ compaction_input, compaction_service_result, options_override);
+ if (is_override_wait_result_) {
+ *compaction_service_result = override_wait_result_;
+ }
+ compaction_num_.fetch_add(1);
+ if (s.ok()) {
+ return CompactionServiceJobStatus::kSuccess;
+ } else {
+ return CompactionServiceJobStatus::kFailure;
+ }
+ }
+
+ int GetCompactionNum() { return compaction_num_.load(); }
+
+ CompactionServiceJobInfo GetCompactionInfoForStart() { return start_info_; }
+ CompactionServiceJobInfo GetCompactionInfoForWait() { return wait_info_; }
+
+ void OverrideStartStatus(CompactionServiceJobStatus s) {
+ is_override_start_status_ = true;
+ override_start_status_ = s;
+ }
+
+ void OverrideWaitStatus(CompactionServiceJobStatus s) {
+ is_override_wait_status_ = true;
+ override_wait_status_ = s;
+ }
+
+ void OverrideWaitResult(std::string str) {
+ is_override_wait_result_ = true;
+ override_wait_result_ = std::move(str);
+ }
+
+ void ResetOverride() {
+ is_override_wait_result_ = false;
+ is_override_start_status_ = false;
+ is_override_wait_status_ = false;
+ }
+
+ void SetCanceled(bool canceled) { canceled_ = canceled; }
+
+ private:
+ InstrumentedMutex mutex_;
+ std::atomic_int compaction_num_{0};
+ std::map<uint64_t, std::string> jobs_;
+ const std::string db_path_;
+ Options options_;
+ std::shared_ptr<Statistics> statistics_;
+ CompactionServiceJobInfo start_info_;
+ CompactionServiceJobInfo wait_info_;
+ bool is_override_start_status_ = false;
+ CompactionServiceJobStatus override_start_status_ =
+ CompactionServiceJobStatus::kFailure;
+ bool is_override_wait_status_ = false;
+ CompactionServiceJobStatus override_wait_status_ =
+ CompactionServiceJobStatus::kFailure;
+ bool is_override_wait_result_ = false;
+ std::string override_wait_result_;
+ std::vector<std::shared_ptr<EventListener>> listeners_;
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ table_properties_collector_factories_;
+ std::atomic_bool canceled_{false};
+};
+
+class CompactionServiceTest : public DBTestBase {
+ public:
+ explicit CompactionServiceTest()
+ : DBTestBase("compaction_service_test", true) {}
+
+ protected:
+ void ReopenWithCompactionService(Options* options) {
+ options->env = env_;
+ primary_statistics_ = CreateDBStatistics();
+ options->statistics = primary_statistics_;
+ compactor_statistics_ = CreateDBStatistics();
+
+ compaction_service_ = std::make_shared<MyTestCompactionService>(
+ dbname_, *options, compactor_statistics_, remote_listeners,
+ remote_table_properties_collector_factories);
+ options->compaction_service = compaction_service_;
+ DestroyAndReopen(*options);
+ }
+
+ Statistics* GetCompactorStatistics() { return compactor_statistics_.get(); }
+
+ Statistics* GetPrimaryStatistics() { return primary_statistics_.get(); }
+
+ MyTestCompactionService* GetCompactionService() {
+ CompactionService* cs = compaction_service_.get();
+ return static_cast_with_check<MyTestCompactionService>(cs);
+ }
+
+ void GenerateTestData() {
+ // Generate 20 files @ L2
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+
+    // Generate 10 files @ L1 overlapping with all 20 files @ L2
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+ ASSERT_EQ(FilesPerLevel(), "0,10,20");
+ }
+
+ void VerifyTestData() {
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ }
+
+ std::vector<std::shared_ptr<EventListener>> remote_listeners;
+ std::vector<std::shared_ptr<TablePropertiesCollectorFactory>>
+ remote_table_properties_collector_factories;
+
+ private:
+ std::shared_ptr<Statistics> compactor_statistics_;
+ std::shared_ptr<Statistics> primary_statistics_;
+ std::shared_ptr<CompactionService> compaction_service_;
+};
+
+TEST_F(CompactionServiceTest, BasicCompactions) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ Statistics* primary_statistics = GetPrimaryStatistics();
+ Statistics* compactor_statistics = GetCompactorStatistics();
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ auto my_cs = GetCompactionService();
+ ASSERT_GE(my_cs->GetCompactionNum(), 1);
+
+  // make sure the compaction statistics are only recorded on the remote side
+ ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES), 1);
+ ASSERT_GE(compactor_statistics->getTickerCount(COMPACT_READ_BYTES), 1);
+ ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES), 0);
+  // Even with remote compaction, the primary host still needs to read SST
+  // files for `verify_table()`.
+ ASSERT_GE(primary_statistics->getTickerCount(COMPACT_READ_BYTES), 1);
+ // all the compaction write happens on the remote side
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES));
+ ASSERT_GE(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 1);
+ ASSERT_GT(primary_statistics->getTickerCount(COMPACT_READ_BYTES),
+ primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES));
+  // The compactor is already the remote side, which doesn't have a remote of its own.
+ ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0);
+ ASSERT_EQ(compactor_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ 0);
+
+ // Test failed compaction
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImplSecondary::CompactWithoutInstallation::End", [&](void* status) {
+ // override job status
+ auto s = static_cast<Status*>(status);
+ *s = Status::Aborted("MyTestCompactionService failed to compact!");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s;
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ s = Put(Key(key_id), "value_new" + std::to_string(key_id));
+ if (s.IsAborted()) {
+ break;
+ }
+ }
+ if (s.IsAborted()) {
+ break;
+ }
+ s = Flush();
+ if (s.IsAborted()) {
+ break;
+ }
+ s = dbfull()->TEST_WaitForCompact();
+ if (s.IsAborted()) {
+ break;
+ }
+ }
+ ASSERT_TRUE(s.IsAborted());
+
+ // Test re-open and successful unique id verification
+ std::atomic_int verify_passed{0};
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::PassedVerifyUniqueId", [&](void* arg) {
+        // count successful unique id verifications
+ auto id = static_cast<UniqueId64x2*>(arg);
+ assert(*id != kNullUniqueId64x2);
+ verify_passed++;
+ });
+ Reopen(options);
+ ASSERT_GT(verify_passed, 0);
+ Close();
+}
+
+TEST_F(CompactionServiceTest, ManualCompaction) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ uint64_t comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+
+ start_str = Key(120);
+ start = start_str;
+ comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+
+ end_str = Key(92);
+ end = end_str;
+ comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, &end));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+
+ comp_num = my_cs->GetCompactionNum();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+ VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, CancelCompactionOnRemoteSide) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ uint64_t comp_num = my_cs->GetCompactionNum();
+
+ // Test cancel compaction at the beginning
+ my_cs->SetCanceled(true);
+ auto s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_TRUE(s.IsIncomplete());
+ // compaction number is not increased
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num);
+ VerifyTestData();
+
+ // Test cancel compaction in progress
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+ my_cs = GetCompactionService();
+ my_cs->SetCanceled(false);
+
+ std::atomic_bool cancel_issued{false};
+ SyncPoint::GetInstance()->SetCallBack("CompactionJob::Run():Inprogress",
+ [&](void* /*arg*/) {
+ cancel_issued = true;
+ my_cs->SetCanceled(true);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_TRUE(s.IsIncomplete());
+ ASSERT_TRUE(cancel_issued);
+ // compaction number is not increased
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num);
+ VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, FailedToStart) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+ my_cs->OverrideStartStatus(CompactionServiceJobStatus::kFailure);
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_TRUE(s.IsIncomplete());
+}
+
+TEST_F(CompactionServiceTest, InvalidResult) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+
+ auto my_cs = GetCompactionService();
+ my_cs->OverrideWaitResult("Invalid Str");
+
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ Status s = db_->CompactRange(CompactRangeOptions(), &start, &end);
+ ASSERT_FALSE(s.ok());
+}
+
+TEST_F(CompactionServiceTest, SubCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = 10;
+ options.target_file_size_base = 1 << 10; // 1KB
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+ VerifyTestData();
+
+ auto my_cs = GetCompactionService();
+ int compaction_num_before = my_cs->GetCompactionNum();
+
+ auto cro = CompactRangeOptions();
+ cro.max_subcompactions = 10;
+ Status s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_OK(s);
+ VerifyTestData();
+ int compaction_num = my_cs->GetCompactionNum() - compaction_num_before;
+  // make sure sub-compactions happened by checking the compaction number
+ ASSERT_GE(compaction_num, 2);
+}
+
+class PartialDeleteCompactionFilter : public CompactionFilter {
+ public:
+ CompactionFilter::Decision FilterV2(
+ int /*level*/, const Slice& key, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ int i = std::stoi(key.ToString().substr(3));
+ if (i > 5 && i <= 105) {
+ return CompactionFilter::Decision::kRemove;
+ }
+ return CompactionFilter::Decision::kKeep;
+ }
+
+ const char* Name() const override { return "PartialDeleteCompactionFilter"; }
+};
+
+TEST_F(CompactionServiceTest, CompactionFilter) {
+ Options options = CurrentOptions();
+ std::unique_ptr<CompactionFilter> delete_comp_filter(
+ new PartialDeleteCompactionFilter());
+ options.compaction_filter = delete_comp_filter.get();
+ ReopenWithCompactionService(&options);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i > 5 && i <= 105) {
+ ASSERT_EQ(result, "NOT_FOUND");
+ } else if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ auto my_cs = GetCompactionService();
+ ASSERT_GE(my_cs->GetCompactionNum(), 1);
+}
+
+TEST_F(CompactionServiceTest, Snapshot) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value1"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(1), "value2"));
+ ASSERT_OK(Put(Key(3), "value2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ auto my_cs = GetCompactionService();
+ ASSERT_GE(my_cs->GetCompactionNum(), 1);
+ ASSERT_EQ("value1", Get(Key(1), s1));
+ ASSERT_EQ("value2", Get(Key(1)));
+ db_->ReleaseSnapshot(s1);
+}
+
+TEST_F(CompactionServiceTest, ConcurrentCompaction) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 100;
+ options.max_background_jobs = 20;
+ ReopenWithCompactionService(&options);
+ GenerateTestData();
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+
+ std::vector<std::thread> threads;
+ for (const auto& file : meta.levels[1].files) {
+ threads.emplace_back(std::thread([&]() {
+ std::string fname = file.db_path + "/" + file.name;
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), {fname}, 2));
+ }));
+ }
+
+ for (auto& thread : threads) {
+ thread.join();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+ auto my_cs = GetCompactionService();
+ ASSERT_EQ(my_cs->GetCompactionNum(), 10);
+ ASSERT_EQ(FilesPerLevel(), "0,0,10");
+}
+
+TEST_F(CompactionServiceTest, CompactionInfo) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ auto my_cs =
+ static_cast_with_check<MyTestCompactionService>(GetCompactionService());
+ uint64_t comp_num = my_cs->GetCompactionNum();
+ ASSERT_GE(comp_num, 1);
+
+ CompactionServiceJobInfo info = my_cs->GetCompactionInfoForStart();
+ ASSERT_EQ(dbname_, info.db_name);
+ std::string db_id, db_session_id;
+ ASSERT_OK(db_->GetDbIdentity(db_id));
+ ASSERT_EQ(db_id, info.db_id);
+ ASSERT_OK(db_->GetDbSessionId(db_session_id));
+ ASSERT_EQ(db_session_id, info.db_session_id);
+ ASSERT_EQ(Env::LOW, info.priority);
+ info = my_cs->GetCompactionInfoForWait();
+ ASSERT_EQ(dbname_, info.db_name);
+ ASSERT_EQ(db_id, info.db_id);
+ ASSERT_EQ(db_session_id, info.db_session_id);
+ ASSERT_EQ(Env::LOW, info.priority);
+
+ // Test priority USER
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ SstFileMetaData file = meta.levels[1].files[0];
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(),
+ {file.db_path + "/" + file.name}, 2));
+ info = my_cs->GetCompactionInfoForStart();
+ ASSERT_EQ(Env::USER, info.priority);
+ info = my_cs->GetCompactionInfoForWait();
+ ASSERT_EQ(Env::USER, info.priority);
+
+ // Test priority BOTTOM
+ env_->SetBackgroundThreads(1, Env::BOTTOM);
+ options.num_levels = 2;
+ ReopenWithCompactionService(&options);
+ my_cs =
+ static_cast_with_check<MyTestCompactionService>(GetCompactionService());
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ info = my_cs->GetCompactionInfoForStart();
+ ASSERT_EQ(Env::BOTTOM, info.priority);
+ info = my_cs->GetCompactionInfoForWait();
+ ASSERT_EQ(Env::BOTTOM, info.priority);
+}
+
+TEST_F(CompactionServiceTest, FallbackLocalAuto) {
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ auto my_cs = GetCompactionService();
+ Statistics* compactor_statistics = GetCompactorStatistics();
+ Statistics* primary_statistics = GetPrimaryStatistics();
+ uint64_t compactor_write_bytes =
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+ uint64_t primary_write_bytes =
+ primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+ my_cs->OverrideStartStatus(CompactionServiceJobStatus::kUseLocal);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+
+ ASSERT_EQ(my_cs->GetCompactionNum(), 0);
+
+  // make sure the compaction statistics are only recorded on the local side
+ ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+ ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ primary_write_bytes);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_READ_BYTES), 0);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES), 0);
+}
+
+TEST_F(CompactionServiceTest, FallbackLocalManual) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ ReopenWithCompactionService(&options);
+
+ GenerateTestData();
+ VerifyTestData();
+
+ auto my_cs = GetCompactionService();
+ Statistics* compactor_statistics = GetCompactorStatistics();
+ Statistics* primary_statistics = GetPrimaryStatistics();
+ uint64_t compactor_write_bytes =
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+ uint64_t primary_write_bytes =
+ primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+ // re-enable remote compaction
+ my_cs->ResetOverride();
+ std::string start_str = Key(15);
+ std::string end_str = Key(45);
+ Slice start(start_str);
+ Slice end(end_str);
+ uint64_t comp_num = my_cs->GetCompactionNum();
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_GE(my_cs->GetCompactionNum(), comp_num + 1);
+  // make sure the compaction statistics are only recorded on the remote side
+ ASSERT_GT(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES));
+ ASSERT_EQ(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ primary_write_bytes);
+
+  // fall back to running locally again, this time via the WaitForComplete API
+ my_cs->OverrideWaitStatus(CompactionServiceJobStatus::kUseLocal);
+ start_str = Key(120);
+ start = start_str;
+ comp_num = my_cs->GetCompactionNum();
+ compactor_write_bytes =
+ compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+ primary_write_bytes = primary_statistics->getTickerCount(COMPACT_WRITE_BYTES);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, nullptr));
+ ASSERT_EQ(my_cs->GetCompactionNum(),
+ comp_num); // no remote compaction is run
+  // make sure the compaction statistics are only recorded on the local side
+ ASSERT_EQ(compactor_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+ ASSERT_GT(primary_statistics->getTickerCount(COMPACT_WRITE_BYTES),
+ primary_write_bytes);
+ ASSERT_EQ(primary_statistics->getTickerCount(REMOTE_COMPACT_WRITE_BYTES),
+ compactor_write_bytes);
+
+ // verify result after 2 manual compactions
+ VerifyTestData();
+}
+
+TEST_F(CompactionServiceTest, RemoteEventListener) {
+ class RemoteEventListenerTest : public EventListener {
+ public:
+ const char* Name() const override { return "RemoteEventListenerTest"; }
+
+ void OnSubcompactionBegin(const SubcompactionJobInfo& info) override {
+ auto result = on_going_compactions.emplace(info.job_id);
+ ASSERT_TRUE(result.second); // make sure there's no duplication
+ compaction_num++;
+ EventListener::OnSubcompactionBegin(info);
+ }
+ void OnSubcompactionCompleted(const SubcompactionJobInfo& info) override {
+ auto num = on_going_compactions.erase(info.job_id);
+ ASSERT_TRUE(num == 1); // make sure the compaction id exists
+ EventListener::OnSubcompactionCompleted(info);
+ }
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ ASSERT_EQ(on_going_compactions.count(info.job_id), 1);
+ file_created++;
+ EventListener::OnTableFileCreated(info);
+ }
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& info) override {
+ ASSERT_EQ(on_going_compactions.count(info.job_id), 1);
+ file_creation_started++;
+ EventListener::OnTableFileCreationStarted(info);
+ }
+
+ bool ShouldBeNotifiedOnFileIO() override {
+ file_io_notified++;
+ return EventListener::ShouldBeNotifiedOnFileIO();
+ }
+
+ std::atomic_uint64_t file_io_notified{0};
+ std::atomic_uint64_t file_creation_started{0};
+ std::atomic_uint64_t file_created{0};
+
+ std::set<int> on_going_compactions; // store the job_id
+ std::atomic_uint64_t compaction_num{0};
+ };
+
+ auto listener = new RemoteEventListenerTest();
+ remote_listeners.emplace_back(listener);
+
+ Options options = CurrentOptions();
+ ReopenWithCompactionService(&options);
+
+ for (int i = 0; i < 20; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value_new" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // check the events are triggered
+ ASSERT_TRUE(listener->file_io_notified > 0);
+ ASSERT_TRUE(listener->file_creation_started > 0);
+ ASSERT_TRUE(listener->file_created > 0);
+ ASSERT_TRUE(listener->compaction_num > 0);
+ ASSERT_TRUE(listener->on_going_compactions.empty());
+
+ // verify result
+ for (int i = 0; i < 200; i++) {
+ auto result = Get(Key(i));
+ if (i % 2) {
+ ASSERT_EQ(result, "value" + std::to_string(i));
+ } else {
+ ASSERT_EQ(result, "value_new" + std::to_string(i));
+ }
+ }
+}
+
+TEST_F(CompactionServiceTest, TablePropertiesCollector) {
+ const static std::string kUserPropertyName = "TestCount";
+
+ class TablePropertiesCollectorTest : public TablePropertiesCollector {
+ public:
+ Status Finish(UserCollectedProperties* properties) override {
+ *properties = UserCollectedProperties{
+ {kUserPropertyName, std::to_string(count_)},
+ };
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties();
+ }
+
+ const char* Name() const override { return "TablePropertiesCollectorTest"; }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ count_++;
+ return Status::OK();
+ }
+
+ private:
+ uint32_t count_ = 0;
+ };
+
+ class TablePropertiesCollectorFactoryTest
+ : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new TablePropertiesCollectorTest();
+ }
+
+ const char* Name() const override {
+ return "TablePropertiesCollectorFactoryTest";
+ }
+ };
+
+ auto factory = new TablePropertiesCollectorFactoryTest();
+ remote_table_properties_collector_factories.emplace_back(factory);
+
+ const int kNumSst = 3;
+ const int kLevel0Trigger = 4;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ ReopenWithCompactionService(&options);
+
+  // generate a few SSTs locally, which should not have the user property
+ for (int i = 0; i < kNumSst; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
+ for (const auto& file_props : fname_to_props) {
+ auto properties = file_props.second->user_collected_properties;
+ auto it = properties.find(kUserPropertyName);
+ ASSERT_EQ(it, properties.end());
+ }
+
+ // trigger compaction
+ for (int i = kNumSst; i < kLevel0Trigger; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
+
+ bool has_user_property = false;
+ for (const auto& file_props : fname_to_props) {
+ auto properties = file_props.second->user_collected_properties;
+ auto it = properties.find(kUserPropertyName);
+ if (it != properties.end()) {
+ has_user_property = true;
+ ASSERT_GT(std::stoi(it->second), 0);
+ }
+ }
+ ASSERT_TRUE(has_user_property);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/compaction/compaction_state.cc b/src/rocksdb/db/compaction/compaction_state.cc
new file mode 100644
index 000000000..ee4b0c189
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_state.cc
@@ -0,0 +1,46 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction_state.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Slice CompactionState::SmallestUserKey() {
+ for (const auto& sub_compact_state : sub_compact_states) {
+ Slice smallest = sub_compact_state.SmallestUserKey();
+ if (!smallest.empty()) {
+ return smallest;
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice{nullptr, 0};
+}
+
+Slice CompactionState::LargestUserKey() {
+ for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
+ ++it) {
+ Slice largest = it->LargestUserKey();
+ if (!largest.empty()) {
+ return largest;
+ }
+ }
+ // If there is no finished output, return an empty slice.
+ return Slice{nullptr, 0};
+}
+
+void CompactionState::AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats,
+ CompactionJobStats& compaction_job_stats) {
+ for (const auto& sc : sub_compact_states) {
+ sc.AggregateCompactionStats(compaction_stats);
+ compaction_job_stats.Add(sc.compaction_job_stats);
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/compaction_state.h b/src/rocksdb/db/compaction/compaction_state.h
new file mode 100644
index 000000000..cc5b66c68
--- /dev/null
+++ b/src/rocksdb/db/compaction/compaction_state.h
@@ -0,0 +1,42 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/compaction/compaction.h"
+#include "db/compaction/subcompaction_state.h"
+#include "db/internal_stats.h"
+
+// Data structures used by compaction_job and compaction_service_job, holding
+// the list of sub_compact_states and the aggregated information for the
+// compaction.
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state for the entire compaction
+class CompactionState {
+ public:
+ Compaction* const compaction;
+
+ // REQUIRED: subcompaction states are stored in order of increasing key-range
+ std::vector<SubcompactionState> sub_compact_states;
+ Status status;
+
+ void AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats,
+ CompactionJobStats& compaction_job_stats);
+
+ explicit CompactionState(Compaction* c) : compaction(c) {}
+
+ Slice SmallestUserKey();
+
+ Slice LargestUserKey();
+};
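+
+// Illustrative sketch of the intended usage (variable names are hypothetical,
+// not taken from this file): a compaction job owns one CompactionState per
+// compaction, fills sub_compact_states, and aggregates the results:
+//
+//   CompactionState state(compaction);
+//   state.sub_compact_states.emplace_back(compaction, start, end,
+//                                         /*sub_job_id=*/0);
+//   // ... run the sub-compactions ...
+//   InternalStats::CompactionStatsFull aggregated_stats;
+//   CompactionJobStats aggregated_job_stats;
+//   state.AggregateCompactionStats(aggregated_stats, aggregated_job_stats);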
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/file_pri.h b/src/rocksdb/db/compaction/file_pri.h
new file mode 100644
index 000000000..82dddcf93
--- /dev/null
+++ b/src/rocksdb/db/compaction/file_pri.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#pragma once
+#include <algorithm>
+
+#include "db/version_edit.h"
+
+namespace ROCKSDB_NAMESPACE {
+// We boost files that are closer to the TTL limit. This boosting could be
+// done through FileMetaData.compensated_file_size, but that compensated size
+// is widely used as something similar to file size, so dramatically boosting
+// the value might cause unintended consequences.
+//
+// This boosting algorithm could get very fancy, but here we use a simple
+// formula which satisfies:
+// (1) Different levels are triggered slightly differently to avoid
+//     too many cascading cases
+// (2) Files in the same level get boosted more as TTL gets closer.
+//
+// Don't do any boosting before half of the TTL has passed. This keeps
+// write amplification lower in most cases. All levels should be fully
+// boosted by the time the total TTL compaction threshold triggers.
+// Differentiate the boosting ranges of adjacent levels by a factor of 1/2,
+// which makes the range for each level grow exponentially. We could instead
+// make the ranges equal, or go even fancier. We can adjust it after we
+// observe the behavior in production.
+// The thresholds where boosting starts:
+// +------------------------------------------------------------------ +
+//  ^             ^                ^               ^  ^      ^
+//  Age 0    ...  |                |               second last level threshold
+//                |                |
+//                |                third last level
+//                |
+//                fourth last level
+//
+// The boost starts at 0 when a file's age reaches boost_age_start and grows
+// linearly. The ratio is arbitrarily chosen so that when the next level
+// starts to boost, the previous level's boosting amount is 16.
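+//
+// A rough worked example with hypothetical numbers (seconds as the unit):
+// with ttl = 2,592,000 (30 days), num_non_empty_levels = 5 and level = 2,
+//   all_boost_start_age = ttl / 2                    = 1,296,000
+//   all_boost_age_range = (ttl / 32) * 31 - ttl / 2  = 1,215,000
+//   boost_age_range     = 1,215,000 >> (5 - 2 - 1)   =   303,750
+//   boost_age_start_    = 1,296,000 + 303,750        = 1,599,750
+//   boost_step_         = max(303,750 / 16, 1)       =    18,984
+// A file whose age is 1,700,000 then gets a boost score of
+// (1,700,000 - 1,599,750) / 18,984 + 1 = 6, while any file younger than
+// boost_age_start_ keeps the default score of 1.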
+class FileTtlBooster {
+ public:
+ FileTtlBooster(uint64_t current_time, uint64_t ttl, int num_non_empty_levels,
+ int level)
+ : current_time_(current_time) {
+ if (ttl == 0 || level == 0 || level >= num_non_empty_levels - 1) {
+ enabled_ = false;
+ boost_age_start_ = 0;
+ boost_step_ = 1;
+ } else {
+ enabled_ = true;
+ uint64_t all_boost_start_age = ttl / 2;
+ uint64_t all_boost_age_range = (ttl / 32) * 31 - all_boost_start_age;
+ uint64_t boost_age_range =
+ all_boost_age_range >> (num_non_empty_levels - level - 1);
+ boost_age_start_ = all_boost_start_age + boost_age_range;
+ const uint64_t kBoostRatio = 16;
+      // Prevent a zero value to avoid a divide-by-zero error.
+ boost_step_ = std::max(boost_age_range / kBoostRatio, uint64_t{1});
+ }
+ }
+
+ uint64_t GetBoostScore(FileMetaData* f) {
+ if (!enabled_) {
+ return 1;
+ }
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time >= current_time_) {
+ return 1;
+ }
+ uint64_t age = current_time_ - oldest_ancester_time;
+ if (age > boost_age_start_) {
+      // Use an integer just for convenience; we could make all file_to_order
+      // values doubles if we wanted to.
+      // Technically this can overflow if users override timing and
+      // give a very high current time. Ignore that case for simplicity.
+      // Boosting is added to the current value, so +1. This effectively
+      // makes boosting kick in after the first boost_step_ is reached.
+ return (age - boost_age_start_) / boost_step_ + 1;
+ }
+ return 1;
+ }
+
+ private:
+ bool enabled_;
+ uint64_t current_time_;
+ uint64_t boost_age_start_;
+ uint64_t boost_step_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/sst_partitioner.cc b/src/rocksdb/db/compaction/sst_partitioner.cc
new file mode 100644
index 000000000..9e7f9fa89
--- /dev/null
+++ b/src/rocksdb/db/compaction/sst_partitioner.cc
@@ -0,0 +1,90 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#include "rocksdb/sst_partitioner.h"
+
+#include <algorithm>
+
+#include "rocksdb/utilities/customizable_util.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "rocksdb/utilities/options_type.h"
+
+namespace ROCKSDB_NAMESPACE {
+static std::unordered_map<std::string, OptionTypeInfo>
+ sst_fixed_prefix_type_info = {
+#ifndef ROCKSDB_LITE
+ {"length",
+ {0, OptionType::kSizeT, OptionVerificationType::kNormal,
+ OptionTypeFlags::kNone}},
+#endif // ROCKSDB_LITE
+};
+
+SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len)
+ : len_(len) {
+ RegisterOptions("Length", &len_, &sst_fixed_prefix_type_info);
+}
+
+PartitionerResult SstPartitionerFixedPrefix::ShouldPartition(
+ const PartitionerRequest& request) {
+ Slice last_key_fixed(*request.prev_user_key);
+ if (last_key_fixed.size() > len_) {
+ last_key_fixed.size_ = len_;
+ }
+ Slice current_key_fixed(*request.current_user_key);
+ if (current_key_fixed.size() > len_) {
+ current_key_fixed.size_ = len_;
+ }
+ return last_key_fixed.compare(current_key_fixed) != 0 ? kRequired
+ : kNotRequired;
+}
+
+bool SstPartitionerFixedPrefix::CanDoTrivialMove(
+ const Slice& smallest_user_key, const Slice& largest_user_key) {
+ return ShouldPartition(PartitionerRequest(smallest_user_key, largest_user_key,
+ 0)) == kNotRequired;
+}
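+
+// Illustrative behavior (example keys are hypothetical): with len_ == 3,
+// consecutive keys "abc1" and "abc2" share the fixed prefix "abc", so
+// ShouldPartition() returns kNotRequired; for "abc9" followed by "abd0" the
+// prefixes "abc" and "abd" differ, so it returns kRequired and the output
+// file is cut between them. CanDoTrivialMove() applies the same check to a
+// file's smallest and largest user keys.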
+
+std::unique_ptr<SstPartitioner>
+SstPartitionerFixedPrefixFactory::CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const {
+ return std::unique_ptr<SstPartitioner>(new SstPartitionerFixedPrefix(len_));
+}
+
+std::shared_ptr<SstPartitionerFactory> NewSstPartitionerFixedPrefixFactory(
+ size_t prefix_len) {
+ return std::make_shared<SstPartitionerFixedPrefixFactory>(prefix_len);
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+static int RegisterSstPartitionerFactories(ObjectLibrary& library,
+ const std::string& /*arg*/) {
+ library.AddFactory<SstPartitionerFactory>(
+ SstPartitionerFixedPrefixFactory::kClassName(),
+ [](const std::string& /*uri*/,
+ std::unique_ptr<SstPartitionerFactory>* guard,
+ std::string* /* errmsg */) {
+ guard->reset(new SstPartitionerFixedPrefixFactory(0));
+ return guard->get();
+ });
+ return 1;
+}
+} // namespace
+#endif // ROCKSDB_LITE
+
+Status SstPartitionerFactory::CreateFromString(
+ const ConfigOptions& options, const std::string& value,
+ std::shared_ptr<SstPartitionerFactory>* result) {
+#ifndef ROCKSDB_LITE
+ static std::once_flag once;
+ std::call_once(once, [&]() {
+ RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), "");
+ });
+#endif // ROCKSDB_LITE
+ return LoadSharedObject<SstPartitionerFactory>(options, value, nullptr,
+ result);
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/subcompaction_state.cc b/src/rocksdb/db/compaction/subcompaction_state.cc
new file mode 100644
index 000000000..0c56471e9
--- /dev/null
+++ b/src/rocksdb/db/compaction/subcompaction_state.cc
@@ -0,0 +1,106 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/subcompaction_state.h"
+
+#include "rocksdb/sst_partitioner.h"
+
+namespace ROCKSDB_NAMESPACE {
+void SubcompactionState::AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats) const {
+ compaction_stats.stats.Add(compaction_outputs_.stats_);
+ if (HasPenultimateLevelOutputs()) {
+ compaction_stats.has_penultimate_level_output = true;
+ compaction_stats.penultimate_level_stats.Add(
+ penultimate_level_outputs_.stats_);
+ }
+}
+
+OutputIterator SubcompactionState::GetOutputs() const {
+ return OutputIterator(penultimate_level_outputs_.outputs_,
+ compaction_outputs_.outputs_);
+}
+
+void SubcompactionState::Cleanup(Cache* cache) {
+ penultimate_level_outputs_.Cleanup();
+ compaction_outputs_.Cleanup();
+
+ if (!status.ok()) {
+ for (const auto& out : GetOutputs()) {
+ // If this file was inserted into the table cache then remove
+ // them here because this compaction was not committed.
+ TableCache::Evict(cache, out.meta.fd.GetNumber());
+ }
+ }
+  // TODO: sub_compact.io_status is not checked like status. Not sure if that's
+  // intentional. So we ignore the io_status for now.
+ io_status.PermitUncheckedError();
+}
+
+Slice SubcompactionState::SmallestUserKey() const {
+ if (has_penultimate_level_outputs_) {
+ Slice a = compaction_outputs_.SmallestUserKey();
+ Slice b = penultimate_level_outputs_.SmallestUserKey();
+ if (a.empty()) {
+ return b;
+ }
+ if (b.empty()) {
+ return a;
+ }
+ const Comparator* user_cmp =
+ compaction->column_family_data()->user_comparator();
+ if (user_cmp->Compare(a, b) > 0) {
+ return b;
+ } else {
+ return a;
+ }
+ } else {
+ return compaction_outputs_.SmallestUserKey();
+ }
+}
+
+Slice SubcompactionState::LargestUserKey() const {
+ if (has_penultimate_level_outputs_) {
+ Slice a = compaction_outputs_.LargestUserKey();
+ Slice b = penultimate_level_outputs_.LargestUserKey();
+ if (a.empty()) {
+ return b;
+ }
+ if (b.empty()) {
+ return a;
+ }
+ const Comparator* user_cmp =
+ compaction->column_family_data()->user_comparator();
+ if (user_cmp->Compare(a, b) < 0) {
+ return b;
+ } else {
+ return a;
+ }
+ } else {
+ return compaction_outputs_.LargestUserKey();
+ }
+}
+
+Status SubcompactionState::AddToOutput(
+ const CompactionIterator& iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ // update target output first
+ is_current_penultimate_level_ = iter.output_to_penultimate_level();
+ current_outputs_ = is_current_penultimate_level_ ? &penultimate_level_outputs_
+ : &compaction_outputs_;
+ if (is_current_penultimate_level_) {
+ has_penultimate_level_outputs_ = true;
+ }
+
+ return Current().AddToOutput(iter, open_file_func, close_file_func);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/subcompaction_state.h b/src/rocksdb/db/compaction/subcompaction_state.h
new file mode 100644
index 000000000..13e63120f
--- /dev/null
+++ b/src/rocksdb/db/compaction/subcompaction_state.h
@@ -0,0 +1,214 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <optional>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_garbage_meter.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_outputs.h"
+#include "db/internal_stats.h"
+#include "db/output_validator.h"
+#include "db/range_del_aggregator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maintains state and outputs for each sub-compaction.
+// It contains 2 `CompactionOutputs`:
+//  1. one for the normal output files
+//  2. another for the penultimate level outputs
+// A `current` pointer tracks the current output group. When `AddToOutput()` is
+// called, it checks the output of the current compaction_iterator key and
+// points `current` to the target output group. By default, it points to the
+// normal compaction_outputs; if the compaction_iterator key should be placed
+// on the penultimate level, `current` is changed to point to
+// `penultimate_level_outputs`.
+// Later operations use `Current()` to get the target group.
+//
+// +----------+ +-----------------------------+ +---------+
+// | *current |--------> | compaction_outputs |----->| output |
+// +----------+ +-----------------------------+ +---------+
+// | | output |
+// | +---------+
+// | | ... |
+// |
+// | +-----------------------------+ +---------+
+// +-------------> | penultimate_level_outputs |----->| output |
+// +-----------------------------+ +---------+
+// | ... |
+
+class SubcompactionState {
+ public:
+ const Compaction* compaction;
+
+ // The boundaries of the key-range this compaction is interested in. No two
+ // sub-compactions may have overlapping key-ranges.
+ // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded
+ const std::optional<Slice> start, end;
+
+ // The return status of this sub-compaction
+ Status status;
+
+ // The return IO Status of this sub-compaction
+ IOStatus io_status;
+
+ // Notify on sub-compaction completion only if listener was notified on
+ // sub-compaction begin.
+ bool notify_on_subcompaction_completion = false;
+
+ // compaction job stats for this sub-compaction
+ CompactionJobStats compaction_job_stats;
+
+  // sub-compaction job id, which is used to identify different sub-compactions
+  // within the same compaction job.
+ const uint32_t sub_job_id;
+
+ Slice SmallestUserKey() const;
+
+ Slice LargestUserKey() const;
+
+ // Get all outputs from the subcompaction. For per_key_placement compaction,
+ // it returns both the last level outputs and penultimate level outputs.
+ OutputIterator GetOutputs() const;
+
+  // Assign the range-del aggregator. Each range_del can only be assigned to
+  // one output level; for per_key_placement, that is going to be the
+  // penultimate level.
+ void AssignRangeDelAggregator(
+ std::unique_ptr<CompactionRangeDelAggregator>&& range_del_agg) {
+ if (compaction->SupportsPerKeyPlacement()) {
+ penultimate_level_outputs_.AssignRangeDelAggregator(
+ std::move(range_del_agg));
+ } else {
+ compaction_outputs_.AssignRangeDelAggregator(std::move(range_del_agg));
+ }
+ }
+
+ void RemoveLastEmptyOutput() {
+ compaction_outputs_.RemoveLastEmptyOutput();
+ penultimate_level_outputs_.RemoveLastEmptyOutput();
+ }
+
+#ifndef ROCKSDB_LITE
+ void BuildSubcompactionJobInfo(
+ SubcompactionJobInfo& subcompaction_job_info) const {
+ const Compaction* c = compaction;
+ const ColumnFamilyData* cfd = c->column_family_data();
+
+ subcompaction_job_info.cf_id = cfd->GetID();
+ subcompaction_job_info.cf_name = cfd->GetName();
+ subcompaction_job_info.status = status;
+ subcompaction_job_info.subcompaction_job_id = static_cast<int>(sub_job_id);
+ subcompaction_job_info.base_input_level = c->start_level();
+ subcompaction_job_info.output_level = c->output_level();
+ subcompaction_job_info.stats = compaction_job_stats;
+ }
+#endif // !ROCKSDB_LITE
+
+ SubcompactionState() = delete;
+ SubcompactionState(const SubcompactionState&) = delete;
+ SubcompactionState& operator=(const SubcompactionState&) = delete;
+
+ SubcompactionState(Compaction* c, const std::optional<Slice> _start,
+ const std::optional<Slice> _end, uint32_t _sub_job_id)
+ : compaction(c),
+ start(_start),
+ end(_end),
+ sub_job_id(_sub_job_id),
+ compaction_outputs_(c, /*is_penultimate_level=*/false),
+ penultimate_level_outputs_(c, /*is_penultimate_level=*/true) {
+ assert(compaction != nullptr);
+    // Set the output split key (used for the RoundRobin feature) only for the
+    // normal compaction_outputs; the output to penultimate_level feature
+    // doesn't support RoundRobin (and may never be supported, because for
+    // RoundRobin the data is mostly naturally sorted by time, so there is no
+    // need for per-key placement with output_to_penultimate_level).
+ compaction_outputs_.SetOutputSlitKey(start, end);
+ }
+
+ SubcompactionState(SubcompactionState&& state) noexcept
+ : compaction(state.compaction),
+ start(state.start),
+ end(state.end),
+ status(std::move(state.status)),
+ io_status(std::move(state.io_status)),
+ notify_on_subcompaction_completion(
+ state.notify_on_subcompaction_completion),
+ compaction_job_stats(std::move(state.compaction_job_stats)),
+ sub_job_id(state.sub_job_id),
+ compaction_outputs_(std::move(state.compaction_outputs_)),
+ penultimate_level_outputs_(std::move(state.penultimate_level_outputs_)),
+ is_current_penultimate_level_(state.is_current_penultimate_level_),
+ has_penultimate_level_outputs_(state.has_penultimate_level_outputs_) {
+ current_outputs_ = is_current_penultimate_level_
+ ? &penultimate_level_outputs_
+ : &compaction_outputs_;
+ }
+
+ bool HasPenultimateLevelOutputs() const {
+ return has_penultimate_level_outputs_ ||
+ penultimate_level_outputs_.HasRangeDel();
+ }
+
+ bool IsCurrentPenultimateLevel() const {
+ return is_current_penultimate_level_;
+ }
+
+ // Add all the new files from this compaction to version_edit
+ void AddOutputsEdit(VersionEdit* out_edit) const {
+ for (const auto& file : penultimate_level_outputs_.outputs_) {
+ out_edit->AddFile(compaction->GetPenultimateLevel(), file.meta);
+ }
+ for (const auto& file : compaction_outputs_.outputs_) {
+ out_edit->AddFile(compaction->output_level(), file.meta);
+ }
+ }
+
+ void Cleanup(Cache* cache);
+
+ void AggregateCompactionStats(
+ InternalStats::CompactionStatsFull& compaction_stats) const;
+
+ CompactionOutputs& Current() const {
+ assert(current_outputs_);
+ return *current_outputs_;
+ }
+
+ // Add compaction_iterator key/value to the `Current` output group.
+ Status AddToOutput(const CompactionIterator& iter,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func);
+
+ // Close all compaction output files, both output_to_penultimate_level outputs
+ // and normal outputs.
+ Status CloseCompactionFiles(const Status& curr_status,
+ const CompactionFileOpenFunc& open_file_func,
+ const CompactionFileCloseFunc& close_file_func) {
+ // Call FinishCompactionOutputFile() even if status is not ok: it needs to
+ // close the output file.
+ Status s = penultimate_level_outputs_.CloseOutput(
+ curr_status, open_file_func, close_file_func);
+ s = compaction_outputs_.CloseOutput(s, open_file_func, close_file_func);
+ return s;
+ }
+
+ private:
+ // State kept for output being generated
+ CompactionOutputs compaction_outputs_;
+ CompactionOutputs penultimate_level_outputs_;
+ CompactionOutputs* current_outputs_ = &compaction_outputs_;
+ bool is_current_penultimate_level_ = false;
+ bool has_penultimate_level_outputs_ = false;
+};
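+
+// Illustrative call sequence (a sketch with hypothetical variable names; the
+// open/close functors are supplied by the compaction job):
+//
+//   SubcompactionState sub(compaction, start, end, /*sub_job_id=*/0);
+//   Status s;
+//   while (s.ok() && c_iter->Valid()) {
+//     s = sub.AddToOutput(*c_iter, open_file_func, close_file_func);
+//     c_iter->Next();
+//   }
+//   s = sub.CloseCompactionFiles(s, open_file_func, close_file_func);
+//   sub.RemoveLastEmptyOutput();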
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/compaction/tiered_compaction_test.cc b/src/rocksdb/db/compaction/tiered_compaction_test.cc
new file mode 100644
index 000000000..aaebcfd94
--- /dev/null
+++ b/src/rocksdb/db/compaction/tiered_compaction_test.cc
@@ -0,0 +1,2028 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/utilities/debug.h"
+#include "test_util/mock_time_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#if !defined(ROCKSDB_LITE)
+
+class TieredCompactionTest : public DBTestBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ TieredCompactionTest()
+ : DBTestBase("tiered_compaction_test", /*env_do_fsync=*/true),
+ kBasicCompStats(CompactionReason::kUniversalSizeAmplification, 1),
+ kBasicPerKeyPlacementCompStats(
+ CompactionReason::kUniversalSizeAmplification, 1),
+ kBasicFlushStats(CompactionReason::kFlush, 1) {
+ kBasicCompStats.micros = kHasValue;
+ kBasicCompStats.cpu_micros = kHasValue;
+ kBasicCompStats.bytes_read_non_output_levels = kHasValue;
+ kBasicCompStats.num_input_files_in_non_output_levels = kHasValue;
+ kBasicCompStats.num_input_records = kHasValue;
+ kBasicCompStats.num_dropped_records = kHasValue;
+
+ kBasicPerLevelStats.num_output_records = kHasValue;
+ kBasicPerLevelStats.bytes_written = kHasValue;
+ kBasicPerLevelStats.num_output_files = kHasValue;
+
+ kBasicPerKeyPlacementCompStats.micros = kHasValue;
+ kBasicPerKeyPlacementCompStats.cpu_micros = kHasValue;
+ kBasicPerKeyPlacementCompStats.Add(kBasicPerLevelStats);
+
+ kBasicFlushStats.micros = kHasValue;
+ kBasicFlushStats.cpu_micros = kHasValue;
+ kBasicFlushStats.bytes_written = kHasValue;
+ kBasicFlushStats.num_output_files = kHasValue;
+ }
+
+ protected:
+ static constexpr uint8_t kHasValue = 1;
+
+ InternalStats::CompactionStats kBasicCompStats;
+ InternalStats::CompactionStats kBasicPerKeyPlacementCompStats;
+ InternalStats::CompactionOutputsStats kBasicPerLevelStats;
+ InternalStats::CompactionStats kBasicFlushStats;
+
+ std::atomic_bool enable_per_key_placement = true;
+
+ void SetUp() override {
+ SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::SupportsPerKeyPlacement:Enabled", [&](void* arg) {
+ auto supports_per_key_placement = static_cast<bool*>(arg);
+ *supports_per_key_placement = enable_per_key_placement;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ }
+
+ const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ return internal_stats->TEST_GetCompactionStats();
+ }
+
+ const InternalStats::CompactionStats& GetPerKeyPlacementCompactionStats() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ return internal_stats->TEST_GetPerKeyPlacementCompactionStats();
+ }
+
+  // Verify the compaction stats; the stats are only roughly compared
+ void VerifyCompactionStats(
+ const std::vector<InternalStats::CompactionStats>& expect_stats,
+ const InternalStats::CompactionStats& expect_pl_stats) {
+ const std::vector<InternalStats::CompactionStats>& stats =
+ GetCompactionStats();
+ const size_t kLevels = expect_stats.size();
+ ASSERT_EQ(kLevels, stats.size());
+
+ for (auto it = stats.begin(), expect = expect_stats.begin();
+ it != stats.end(); it++, expect++) {
+ VerifyCompactionStats(*it, *expect);
+ }
+
+ const InternalStats::CompactionStats& pl_stats =
+ GetPerKeyPlacementCompactionStats();
+ VerifyCompactionStats(pl_stats, expect_pl_stats);
+ }
+
+ void ResetAllStats(std::vector<InternalStats::CompactionStats>& stats,
+ InternalStats::CompactionStats& pl_stats) {
+ ASSERT_OK(dbfull()->ResetStats());
+ for (auto& level_stats : stats) {
+ level_stats.Clear();
+ }
+ pl_stats.Clear();
+ }
+
+  // bottommost_temperature is being renamed to last_level_temperature; setting
+  // either of them should have the same effect.
+ void SetColdTemperature(Options& options) {
+ if (GetParam()) {
+ options.bottommost_temperature = Temperature::kCold;
+ } else {
+ options.last_level_temperature = Temperature::kCold;
+ }
+ }
+
+ private:
+ void CompareStats(uint64_t val, uint64_t expect) {
+ if (expect > 0) {
+ ASSERT_TRUE(val > 0);
+ } else {
+ ASSERT_EQ(val, 0);
+ }
+ }
+
+ void VerifyCompactionStats(
+ const InternalStats::CompactionStats& stats,
+ const InternalStats::CompactionStats& expect_stats) {
+ CompareStats(stats.micros, expect_stats.micros);
+ CompareStats(stats.cpu_micros, expect_stats.cpu_micros);
+ CompareStats(stats.bytes_read_non_output_levels,
+ expect_stats.bytes_read_non_output_levels);
+ CompareStats(stats.bytes_read_output_level,
+ expect_stats.bytes_read_output_level);
+ CompareStats(stats.bytes_read_blob, expect_stats.bytes_read_blob);
+ CompareStats(stats.bytes_written, expect_stats.bytes_written);
+ CompareStats(stats.bytes_moved, expect_stats.bytes_moved);
+ CompareStats(stats.num_input_files_in_non_output_levels,
+ expect_stats.num_input_files_in_non_output_levels);
+ CompareStats(stats.num_input_files_in_output_level,
+ expect_stats.num_input_files_in_output_level);
+ CompareStats(stats.num_output_files, expect_stats.num_output_files);
+ CompareStats(stats.num_output_files_blob,
+ expect_stats.num_output_files_blob);
+ CompareStats(stats.num_input_records, expect_stats.num_input_records);
+ CompareStats(stats.num_dropped_records, expect_stats.num_dropped_records);
+ CompareStats(stats.num_output_records, expect_stats.num_output_records);
+ ASSERT_EQ(stats.count, expect_stats.count);
+ for (int i = 0; i < static_cast<int>(CompactionReason::kNumOfReasons);
+ i++) {
+ ASSERT_EQ(stats.counts[i], expect_stats.counts[i]);
+ }
+ }
+};
+
+TEST_P(TieredCompactionTest, SequenceBasedTieredStorageUniversal) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+ std::vector<SequenceNumber> seq_history;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+ InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+ InternalStats::CompactionStats expect_pl_stats;
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+ expect_stats[0].Add(kBasicFlushStats);
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // the penultimate level file temperature is not cold, so all data are output
+  // to the penultimate level.
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // basic compaction stats are still counted to the last level
+ expect_stats[kLastLevel].Add(kBasicCompStats);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+  // Move the cold_seq forward to split the file into 2 levels, so we should
+  // have both the last level stats and the output_to_penultimate_level stats
+ latest_cold_seq = seq_history[0];
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.num_dropped_records = 0;
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ // delete all cold data, so all data will be on penultimate level
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Move the cold_seq forward again, with a range delete; take a snapshot to
+  // keep the range dels in both cold and hot SSTs
+ auto snap = db_->GetSnapshot();
+ latest_cold_seq = seq_history[2];
+ std::string start = Key(25), end = Key(35);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ // verify data
+ std::string value;
+ for (int i = 0; i < kNumKeys; i++) {
+ if (i < 10 || (i >= 25 && i < 35)) {
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+ }
+ }
+
+ // range delete all hot data
+ start = Key(30);
+ end = Key(130);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped because of snapshot
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+  // releasing the snapshot and compacting again should remove all hot data
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // 2 range dels are dropped
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 3);
+
+  // Move the cold_seq backward; for example, the user may change the hot/cold
+  // setting. It won't impact the existing cold data, as the sequence number is
+  // zeroed out.
+ latest_cold_seq = seq_history[1];
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, RangeBasedTieredStorageUniversal) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+ auto cmp = options.comparator;
+
+ port::Mutex mutex;
+ std::string hot_start = Key(10);
+ std::string hot_end = Key(50);
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ MutexLock l(&mutex);
+ context->output_to_penultimate_level =
+ cmp->Compare(context->key, hot_start) >= 0 &&
+ cmp->Compare(context->key, hot_end) < 0;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+ InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+ InternalStats::CompactionStats expect_pl_stats;
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+ }
+ ASSERT_OK(Flush());
+ expect_stats[0].Add(kBasicFlushStats);
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.Add(kBasicPerLevelStats);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ // change to all cold, no output_to_penultimate_level output
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(100);
+ hot_end = Key(200);
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.num_dropped_records = 0;
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // change to all hot; universal compaction supports moving data to an upper
+  // level if it's within the compaction level range.
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(0);
+ hot_end = Key(100);
+ }
+
+  // No data is moved from the cold tier to the hot tier because there are no
+  // input files from L5 or higher; it's not safe to move data to the
+  // output_to_penultimate_level level.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+
+  // Add 2 keys in a higher level, but in separate files; all keys can be moved
+  // up if they are hot
+ ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(50), "value" + std::to_string(0)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // change to only 1 cold key, to test that compaction can stop even if it
+  // matches the size amp compaction threshold
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(1);
+ hot_end = Key(1000);
+ }
+
+  // generate just enough files to trigger compaction
+ for (int i = 0; i < kNumTrigger - 1; i++) {
+ for (int j = 0; j < 1000; j++) {
+ ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(
+ true)); // make sure the compaction is able to finish
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ auto opts = db_->GetOptions();
+ auto max_size_amp =
+ opts.compaction_options_universal.max_size_amplification_percent / 100;
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown),
+ GetSstSizeHelper(Temperature::kCold) * max_size_amp);
+
+ // delete all cold data
+ ASSERT_OK(Delete(Key(0)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // range delete overlapping both hot and cold data, with a snapshot to make
+  // sure the range del is preserved
+ auto snap = db_->GetSnapshot();
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(50);
+ hot_end = Key(100);
+ }
+ std::string start = Key(1), end = Key(70);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped until snapshot is released
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+ // verify data
+ std::string value;
+ for (int i = 0; i < kNumKeys; i++) {
+ if (i < 70) {
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+ }
+ }
+
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // range del is dropped
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 1);
+}
+
+TEST_P(TieredCompactionTest, LevelColdRangeDelete) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,1",
+ FilesPerLevel()); // bottommost but not last level file is hot
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // explicitly move the data to the last level
+ MoveFilesToLevel(kLastLevel);
+
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ auto snap = db_->GetSnapshot();
+
+ std::string start = Key(10);
+ std::string end = Key(50);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+
+  // Keys 20->30 will be marked as cold data, but they cannot be placed in the
+  // cold tier (bottommost level), otherwise they would be "deleted" by the
+  // range del in the penultimate level. Verify that these keys can still be
+  // queried.
+ for (int i = 20; i < 30; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+  // make the range tombstone and the data written after it cold
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+
+  // add some hot data, just for the test
+ for (int i = 30; i < 40; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ std::string value;
+ for (int i = 0; i < kNumKeys; i++) {
+ auto s = db_->Get(ReadOptions(), Key(i), &value);
+ if ((i >= 10 && i < 20) || (i >= 40 && i < 50)) {
+ ASSERT_TRUE(s.IsNotFound());
+ } else {
+ ASSERT_OK(s);
+ }
+ }
+
+ db_->ReleaseSnapshot(snap);
+}
+
+// Test SST partitioner cut after every single key
+class SingleKeySstPartitioner : public SstPartitioner {
+ public:
+ const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+ PartitionerResult ShouldPartition(
+ const PartitionerRequest& /*request*/) override {
+ return kRequired;
+ }
+
+ bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+ const Slice& /*largest_user_key*/) override {
+ return false;
+ }
+};
+
+class SingleKeySstPartitionerFactory : public SstPartitionerFactory {
+ public:
+ static const char* kClassName() { return "SingleKeySstPartitionerFactory"; }
+ const char* Name() const override { return kClassName(); }
+
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const override {
+ return std::unique_ptr<SstPartitioner>(new SingleKeySstPartitioner());
+ }
+};
+
+TEST_P(TieredCompactionTest, LevelOutofBoundaryRangeDelete) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 3;
+ const int kNumKeys = 10;
+
+ auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.sst_partitioner_factory = factory;
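+  // The single-key partitioner cuts an SST boundary after every key, so each
+  // key ends up in its own file and the per-level file counts below match the
+  // key counts.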
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+
+ MoveFilesToLevel(kNumLevels - 1);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ("0,0,10", FilesPerLevel());
+
+ auto snap = db_->GetSnapshot();
+
+ // only range delete
+ std::string start = Key(3);
+ std::string end = Key(5);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown),
+            0);  // tombstone has no size, even though it's in the hot tier
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ("0,1,10",
+ FilesPerLevel()); // one file is at the penultimate level which
+ // only contains a range delete
+
+  // Add 2 hot keys, each in its own new SST. They will be placed in the same
+  // level as the range del but don't overlap with it; make sure the range del
+  // still stays at that level.
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Put(Key(0), "new value" + std::to_string(0)));
+ auto snap2 = db_->GetSnapshot();
+ ASSERT_OK(Put(Key(6), "new value" + std::to_string(6)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,2,10",
+ FilesPerLevel()); // one file is at the penultimate level
+ // which only contains a range delete
+ std::vector<LiveFileMetaData> live_file_meta;
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ bool found_sst_with_del = false;
+ uint64_t sst_with_del_num = 0;
+ for (const auto& meta : live_file_meta) {
+ if (meta.num_deletions > 0) {
+      // found an SST with del: it has 2 entries, one for data, one for range del
+ ASSERT_EQ(meta.level,
+ kNumLevels - 2); // output to penultimate level
+ ASSERT_EQ(meta.num_entries, 2);
+ ASSERT_EQ(meta.num_deletions, 1);
+ found_sst_with_del = true;
+ sst_with_del_num = meta.file_number;
+ }
+ }
+ ASSERT_TRUE(found_sst_with_del);
+
+  // release the first snapshot and compact, which should drop the range del;
+  // but the newly inserted keys `0` and `6` are still hot data and will be
+  // placed on the penultimate level
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,2,7", FilesPerLevel());
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ found_sst_with_del = false;
+ for (const auto& meta : live_file_meta) {
+ // check new SST with del (the old one may not yet be deleted after
+ // compaction)
+ if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) {
+ found_sst_with_del = true;
+ }
+ }
+ ASSERT_FALSE(found_sst_with_del);
+
+  // Now make all data cold. Key 0 will be moved to the last level, but key 6 is
+  // still protected by snap2, so it will be kept at the penultimate level.
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,1,8", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ db_->ReleaseSnapshot(snap2);
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,8", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, UniversalRangeDelete) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 10;
+
+ auto factory = std::make_shared<SingleKeySstPartitionerFactory>();
+
+ auto options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.statistics = CreateDBStatistics();
+ options.sst_partitioner_factory = factory;
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+
+ // compact to the penultimate level with 10 files
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_EQ("0,0,0,0,0,10", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // make all data cold
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,10", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // the range del is considered hot data, but it will be merged with and
+  // deleted along with the last-level data
+ std::string start = Key(3);
+ std::string end = Key(5);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_EQ("0,0,0,0,0,0,8", FilesPerLevel());
+
+ // range del with snapshot should be preserved in the penultimate level
+ auto snap = db_->GetSnapshot();
+
+ start = Key(6);
+ end = Key(8);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,8", FilesPerLevel());
+
+  // Add 2 hot keys, each in its own new SST. They will be placed in the same
+  // level as the range del but don't overlap with it.
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Put(Key(4), "new value" + std::to_string(0)));
+ auto snap2 = db_->GetSnapshot();
+ ASSERT_OK(Put(Key(9), "new value" + std::to_string(6)));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,2,8", FilesPerLevel());
+ // find the SST with range del
+ std::vector<LiveFileMetaData> live_file_meta;
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ bool found_sst_with_del = false;
+ uint64_t sst_with_del_num = 0;
+ for (const auto& meta : live_file_meta) {
+ if (meta.num_deletions > 0) {
+      // found an SST with del: it has 2 entries, one for data, one for range del
+ ASSERT_EQ(meta.level,
+ kNumLevels - 2); // output_to_penultimate_level level
+ ASSERT_EQ(meta.num_entries, 2);
+ ASSERT_EQ(meta.num_deletions, 1);
+ found_sst_with_del = true;
+ sst_with_del_num = meta.file_number;
+ }
+ }
+ ASSERT_TRUE(found_sst_with_del);
+
+  // releasing the first snapshot should compact away the range del, but data on
+  // the same level is still hot
+ db_->ReleaseSnapshot(snap);
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,2,6", FilesPerLevel());
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ // no range del should be found in SST
+ found_sst_with_del = false;
+ for (const auto& meta : live_file_meta) {
+ // check new SST with del (the old one may not yet be deleted after
+ // compaction)
+ if (meta.num_deletions > 0 && meta.file_number != sst_with_del_num) {
+ found_sst_with_del = true;
+ }
+ }
+ ASSERT_FALSE(found_sst_with_del);
+
+  // make all data cold, but key 6 is still protected by snap2
+ latest_cold_seq = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,7", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ db_->ReleaseSnapshot(snap2);
+
+  // release the snapshot; everything goes to the bottommost level
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,7", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, SequenceBasedTieredStorageLevel) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t latest_cold_seq = 0;
+ std::vector<SequenceNumber> seq_history;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ context->output_to_penultimate_level =
+ context->seq_num > latest_cold_seq;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<InternalStats::CompactionStats> expect_stats(kNumLevels);
+ InternalStats::CompactionStats& last_stats = expect_stats[kLastLevel];
+ InternalStats::CompactionStats expect_pl_stats;
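+  // expect_stats accumulates the expected per-level compaction stats as the
+  // test proceeds; kBasicFlushStats/kBasicCompStats/kBasicPerLevelStats are
+  // baseline stat templates defined elsewhere in this test.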
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ expect_stats[0].Add(kBasicFlushStats);
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+  // the non-last-level file is hot
+ ASSERT_EQ("0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ expect_stats[1].Add(kBasicCompStats);
+ expect_stats[1].Add(kBasicPerLevelStats);
+ expect_stats[1].ResetCompactionReason(CompactionReason::kLevelL0FilesNum);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+ // move all data to the last level
+ MoveFilesToLevel(kLastLevel);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+ // The compaction won't move the data up
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.Add(kBasicPerLevelStats);
+ last_stats.num_dropped_records = 0;
+ last_stats.bytes_read_non_output_levels = 0;
+ last_stats.num_input_files_in_non_output_levels = 0;
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Add new data, which is all hot and overwrites all existing data
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ ResetAllStats(expect_stats, expect_pl_stats);
+
+  // after compaction, all data is hot
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ for (int level = 2; level < kNumLevels - 1; level++) {
+ expect_stats[level].bytes_moved = kHasValue;
+ }
+
+ last_stats.Add(kBasicCompStats);
+ last_stats.bytes_read_output_level = kHasValue;
+ last_stats.num_input_files_in_output_level = kHasValue;
+ last_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ expect_pl_stats.Add(kBasicPerKeyPlacementCompStats);
+ expect_pl_stats.ResetCompactionReason(CompactionReason::kManualCompaction);
+ VerifyCompactionStats(expect_stats, expect_pl_stats);
+
+  // Move the cold_seq forward and try to split the data into cold and hot. In
+  // this case it's unsafe to split the data: the file is a bottommost file on a
+  // non-last level, so its sequence numbers have been zeroed out and the time
+  // information is lost (with `level_compaction_dynamic_level_bytes` or
+  // Universal Compaction, this should be rare).
+  // TODO(zjay): ideally we should avoid zeroing out non-last-level bottommost
+  // files
+ latest_cold_seq = seq_history[1];
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ seq_history.clear();
+
+ // manually move all data (cold) to last level
+ MoveFilesToLevel(kLastLevel);
+ seq_history.clear();
+ // Add new data once again
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ seq_history.emplace_back(dbfull()->GetLatestSequenceNumber());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ latest_cold_seq = seq_history[0];
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // delete all cold data
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ latest_cold_seq = seq_history[2];
+
+ MoveFilesToLevel(kLastLevel);
+
+  // move the cold_seq forward again together with a range delete; take a
+  // snapshot to keep the range dels in the bottommost level
+ auto snap = db_->GetSnapshot();
+
+ std::string start = Key(25), end = Key(35);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+  // add one small key and one large key in the input level, to make sure hot
+  // data within that range can be moved up to the input level
+ ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+ ASSERT_OK(Put(Key(100), "value" + std::to_string(0)));
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // verify data
+ std::string value;
+ for (int i = 1; i < 130; i++) {
+ if (i < 10 || (i >= 25 && i < 35)) {
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ReadOptions(), Key(i), &value));
+ }
+ }
+
+ // delete all hot data
+ ASSERT_OK(Delete(Key(0)));
+ start = Key(30);
+  end = Key(101);  // range [101, 130] is cold, because it was not in the input
+                   // range of the previous compaction
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped because of snapshot
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+ db_->ReleaseSnapshot(snap);
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // 3 range dels are dropped; the first one is double counted as expected,
+  // because it is spread across 2 SST files
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 3);
+
+  // Move the cold_seq backward, which might happen when the user changes the
+  // setting. The hot data won't move up; just make sure it still runs fine.
+  // This is because:
+  // 1. sequence numbers are zeroed out, so there is no time information
+  // 2. leveled compaction only supports moving data up within the higher-level
+  //    input range
+ latest_cold_seq = seq_history[1];
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+}
+
+TEST_P(TieredCompactionTest, RangeBasedTieredStorageLevel) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+
+ auto options = CurrentOptions();
+ SetColdTemperature(options);
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.max_subcompactions = 10;
+ DestroyAndReopen(options);
+ auto cmp = options.comparator;
+
+ port::Mutex mutex;
+ std::string hot_start = Key(10);
+ std::string hot_end = Key(50);
+
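+  // Unlike the sequence-number-based tests above, placement here is decided by
+  // key range: keys in [hot_start, hot_end) stay on the penultimate (hot)
+  // level and everything else goes to the cold tier.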
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput.context", [&](void* arg) {
+ auto context = static_cast<PerKeyPlacementContext*>(arg);
+ MutexLock l(&mutex);
+ context->output_to_penultimate_level =
+ cmp->Compare(context->key, hot_start) >= 0 &&
+ cmp->Compare(context->key, hot_end) < 0;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumTrigger; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ ASSERT_OK(Put(Key(j), "value" + std::to_string(j)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // change to all cold
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(100);
+ hot_end = Key(200);
+ }
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+  // change to all hot, but level compaction only supports moving cold data to
+  // hot within its higher-level input range.
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(0);
+ hot_end = Key(100);
+ }
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // with mixed hot/cold data
+ {
+ MutexLock l(&mutex);
+ hot_start = Key(50);
+ hot_end = Key(100);
+ }
+ ASSERT_OK(Put(Key(0), "value" + std::to_string(0)));
+ ASSERT_OK(Put(Key(100), "value" + std::to_string(100)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // delete all hot data, but with snapshot to keep the range del
+ auto snap = db_->GetSnapshot();
+ std::string start = Key(50);
+ std::string end = Key(100);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), start, end));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // no range del is dropped because of snapshot
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 0);
+
+ // release the snapshot and do compaction again should remove all hot data
+ db_->ReleaseSnapshot(snap);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ ASSERT_EQ(
+ options.statistics->getTickerCount(COMPACTION_RANGE_DEL_DROP_OBSOLETE),
+ 1);
+}
+
+INSTANTIATE_TEST_CASE_P(TieredCompactionTest, TieredCompactionTest,
+ testing::Bool());
+
+class PrecludeLastLevelTest : public DBTestBase {
+ public:
+ PrecludeLastLevelTest()
+ : DBTestBase("preclude_last_level_test", /*env_do_fsync=*/false) {
+ mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+ mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);
+ }
+
+ protected:
+ std::unique_ptr<Env> mock_env_;
+ std::shared_ptr<MockSystemClock> mock_clock_;
+
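+  // SetUp wires the mock clock into the periodic task scheduler so the tests
+  // can advance time deterministically via MockSleepForSeconds().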
+ void SetUp() override {
+ mock_clock_->InstallTimedWaitFixCallback();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+ auto periodic_task_scheduler_ptr =
+ reinterpret_cast<PeriodicTaskScheduler*>(arg);
+ periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
+ });
+ mock_clock_->SetCurrentTime(0);
+ }
+};
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeManualCompaction) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
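+  // preserve_internal_time_seconds makes the DB record a seqno-to-time
+  // mapping, so that when preclude_last_level_data_seconds is enabled on
+  // reopen below, compaction can tell which existing data is recent enough to
+  // stay in the hot tier.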
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 10000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write files that overlap and are enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+  // all data is hot, even though it's in the last level
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+ // Generate a sstable and trigger manual compaction
+ ASSERT_OK(Put(Key(10), "value"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // all data is moved up to the penultimate level
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // close explicitly, because the env is a local variable which will be
+  // released first.
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimeAutoCompaction) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 10000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write files that overlap and are enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+  // Make sure it won't trigger a size-amp compaction. Unlike normal size-amp
+  // compaction, which is typically a last-level compaction, when tiered storage
+  // ("preclude_last_level") is enabled, size amp won't include the last level:
+  // the last level would be in the cold tier and its size would not be a
+  // problem, which also avoids frequent hot-to-cold storage compactions.
+ options.compaction_options_universal.max_size_amplification_percent = 400;
+ Reopen(options);
+
+  // all data is hot, even though it's in the last level
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // Write more data, but still all hot until the 10th SST: a key is written
+  // every 10 seconds and there are 100 keys per SST, so each SST spans 1000
+  // seconds, while preclude_last_level_data_seconds is 10k.
+ Random rnd(301);
+ for (; sst_num < kNumTrigger * 2 - 1; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ // the value needs to be big enough to trigger full compaction
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ }
+
+ // all data is moved up to the penultimate level
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+  // close explicitly, because the env is a local variable which will be
+  // released first.
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, MigrationFromPreserveTimePartial) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 2000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write files that overlap and are enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+
+  // make sure there are more than 300 keys, the first 100 keys have their
+  // seqnos zeroed out, and the last 100 keys do not
+ ASSERT_GT(key_versions.size(), 300);
+ for (int i = 0; i < 100; i++) {
+ ASSERT_EQ(key_versions[i].sequence, 0);
+ }
+ auto rit = key_versions.rbegin();
+ for (int i = 0; i < 100; i++) {
+ ASSERT_GT(rit->sequence, 0);
+ rit++;
+ }
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 2000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ // Generate a sstable and trigger manual compaction
+ ASSERT_OK(Put(Key(10), "value"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // some data are moved up, some are not
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, SmallPrecludeTime) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preclude_last_level_data_seconds = 60;
+ options.preserve_internal_time_seconds = 0;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ options.last_level_temperature = Temperature::kCold;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(rnd.Uniform(10) + 1));
+ });
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(rnd.Uniform(2)));
+ });
+ }
+ ASSERT_OK(Flush());
+
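+  // Even with a small preclude window, the flushed SST should carry a
+  // seqno-to-time mapping table property; verify it can be parsed back.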
+ TablePropertiesCollection tables_props;
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ ASSERT_FALSE(tables_props.begin()->second->seqno_to_time_mapping.empty());
+ SeqnoToTimeMapping tp_mapping;
+ ASSERT_OK(
+ tp_mapping.Add(tables_props.begin()->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ ASSERT_FALSE(tp_mapping.Empty());
+ auto seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_FALSE(seqs.empty());
+
+  // Wait more than the preclude_last_level time, then make sure all the data is
+  // compacted to the last level even though there's no write (no seqno -> time
+  // information was flushed to any SST).
+ mock_clock_->MockSleepForSeconds(100);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ Close();
+}
+
+TEST_F(PrecludeLastLevelTest, LastLevelOnlyCompactionPartial) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 2000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write files that overlap and are enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // enable preclude feature
+ options.preclude_last_level_data_seconds = 2000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // some data are moved up, some are not
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+
+  // make sure there are more than 300 keys, the first 100 keys have their
+  // seqnos zeroed out, and the last 100 keys do not
+ ASSERT_GT(key_versions.size(), 300);
+ for (int i = 0; i < 100; i++) {
+ ASSERT_EQ(key_versions[i].sequence, 0);
+ }
+ auto rit = key_versions.rbegin();
+ for (int i = 0; i < 100; i++) {
+ ASSERT_GT(rit->sequence, 0);
+ rit++;
+ }
+
+ Close();
+}
+
+class PrecludeLastLevelTestWithParms
+ : public PrecludeLastLevelTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ PrecludeLastLevelTestWithParms() : PrecludeLastLevelTest() {}
+};
+
+TEST_P(PrecludeLastLevelTestWithParms, LastLevelOnlyCompactionNoPreclude) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ bool enable_preclude_last_level = GetParam();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preserve_internal_time_seconds = 2000;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ Random rnd(301);
+ int sst_num = 0;
+  // Write files that overlap and are enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // all data is pushed to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ std::atomic_bool is_manual_compaction_running = false;
+ std::atomic_bool verified_compaction_order = false;
+
+  // Make sure the manual compaction is in progress, then try to trigger a
+  // SizeRatio compaction by flushing 4 files to L0. That compaction will try to
+  // compact the 4 files at L0 to L5 (the last empty level).
+  // If the preclude_last_level feature is enabled, the auto-triggered
+  // compaction cannot be picked. Otherwise, the auto-triggered compaction can
+  // run in parallel with the last-level compaction.
+ // L0: [a] [b] [c] [d]
+ // L5: (locked if preclude_last_level is enabled)
+ // L6: [z] (locked: manual compaction in progress)
+  // TODO: in this case, the L0 files should just be compacted to L4, so the 2
+  // compactions won't overlap.
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::ProcessKeyValueCompaction()::Processing", [&](void* arg) {
+ auto compaction = static_cast<Compaction*>(arg);
+ if (compaction->is_manual_compaction()) {
+ is_manual_compaction_running = true;
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction1");
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction2");
+ is_manual_compaction_running = false;
+ }
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ auto compaction = static_cast<Compaction*>(arg);
+ if (enable_preclude_last_level && is_manual_compaction_running) {
+ ASSERT_TRUE(compaction == nullptr);
+ verified_compaction_order = true;
+ } else {
+ ASSERT_TRUE(compaction != nullptr);
+ verified_compaction_order = true;
+ }
+ if (!compaction || !compaction->is_manual_compaction()) {
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "AutoCompactionPicked");
+ }
+ });
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction1",
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite"},
+ {"PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "AutoCompactionPicked",
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:"
+ "ManualCompaction2"},
+ });
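+  // The dependencies above enforce this ordering: the manual compaction
+  // starts, then the test flushes new L0 files, an auto compaction is picked
+  // (or rejected), and only then is the manual compaction allowed to finish.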
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+  // only enable the preclude feature if the parameter is true
+ if (enable_preclude_last_level) {
+ options.preclude_last_level_data_seconds = 2000;
+ }
+ options.max_background_jobs = 8;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT(
+ "PrecludeLastLevelTest::LastLevelOnlyCompactionConflit:StartWrite");
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ for (; sst_num < kNumTrigger * 2; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ // the value needs to be big enough to trigger full compaction
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+
+ manual_compaction_thread.join();
+
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ if (enable_preclude_last_level) {
+ ASSERT_NE("0,0,0,0,0,1,1", FilesPerLevel());
+ } else {
+ ASSERT_EQ("0,0,0,0,0,1,1", FilesPerLevel());
+ }
+ ASSERT_TRUE(verified_compaction_order);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ stop_token.reset();
+
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(PrecludeLastLevelTestWithParms,
+ PrecludeLastLevelTestWithParms, testing::Bool());
+
+// partition the SST into 3 ranges [0, 19] [20, 39] [40, ...]
+class ThreeRangesPartitioner : public SstPartitioner {
+ public:
+ const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+ PartitionerResult ShouldPartition(
+ const PartitionerRequest& request) override {
+ if ((cmp->CompareWithoutTimestamp(*request.current_user_key,
+ DBTestBase::Key(20)) >= 0 &&
+ cmp->CompareWithoutTimestamp(*request.prev_user_key,
+ DBTestBase::Key(20)) < 0) ||
+ (cmp->CompareWithoutTimestamp(*request.current_user_key,
+ DBTestBase::Key(40)) >= 0 &&
+ cmp->CompareWithoutTimestamp(*request.prev_user_key,
+ DBTestBase::Key(40)) < 0)) {
+ return kRequired;
+ } else {
+ return kNotRequired;
+ }
+ }
+
+ bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+ const Slice& /*largest_user_key*/) override {
+ return false;
+ }
+
+ const Comparator* cmp = BytewiseComparator();
+};
+
+class ThreeRangesPartitionerFactory : public SstPartitionerFactory {
+ public:
+ static const char* kClassName() {
+ return "TombstoneTestSstPartitionerFactory";
+ }
+ const char* Name() const override { return kClassName(); }
+
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const override {
+ return std::unique_ptr<SstPartitioner>(new ThreeRangesPartitioner());
+ }
+};
+
+TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompaction) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.preserve_internal_time_seconds = 10000;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+ Random rnd(301);
+
+ for (int i = 0; i < 300; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+ }
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // make sure all data is compacted to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // Create 3 L5 files
+ auto factory = std::make_shared<ThreeRangesPartitionerFactory>();
+ options.sst_partitioner_factory = factory;
+
+ Reopen(options);
+
+ for (int i = 0; i < kNumTrigger - 1; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // L5: [0,19] [20,39] [40,299]
+ // L6: [0, 299]
+ ASSERT_EQ("0,0,0,0,0,3,1", FilesPerLevel());
+
+ // enable tiered storage feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+ options.statistics = CreateDBStatistics();
+ Reopen(options);
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[5].files.size(), 3);
+ ASSERT_EQ(meta.levels[6].files.size(), 1);
+ ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0));
+ ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(299));
+
+ std::string file_path = meta.levels[5].files[1].db_path;
+ std::vector<std::string> files;
+ // pick 3rd file @L5 + file@L6 for compaction
+ files.push_back(file_path + "/" + meta.levels[5].files[2].name);
+ files.push_back(file_path + "/" + meta.levels[6].files[0].name);
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), files, 6));
+
+  // The compaction only moved part of the hot data to the hot tier; range
+  // [0,39] is unsafe to move up, otherwise it would overlap with the existing
+  // files @L5.
+ // The output should be:
+ // L5: [0,19] [20,39] [40,299] <-- Temperature::kUnknown
+ // L6: [0,19] [20,39] <-- Temperature::kCold
+ // L6 file is split because of the customized partitioner
+ ASSERT_EQ("0,0,0,0,0,3,2", FilesPerLevel());
+
+  // even though all the data is hot, not all of it is moved to the hot tier
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+
+ db_->GetColumnFamilyMetaData(&meta);
+ ASSERT_EQ(meta.levels[5].files.size(), 3);
+ ASSERT_EQ(meta.levels[6].files.size(), 2);
+ for (const auto& file : meta.levels[5].files) {
+ ASSERT_EQ(file.temperature, Temperature::kUnknown);
+ }
+ for (const auto& file : meta.levels[6].files) {
+ ASSERT_EQ(file.temperature, Temperature::kCold);
+ }
+ ASSERT_EQ(meta.levels[6].files[0].smallestkey, Key(0));
+ ASSERT_EQ(meta.levels[6].files[0].largestkey, Key(19));
+ ASSERT_EQ(meta.levels[6].files[1].smallestkey, Key(20));
+ ASSERT_EQ(meta.levels[6].files[1].largestkey, Key(39));
+
+ Close();
+}
+
+struct TestPropertiesCollector : public TablePropertiesCollector {
+ Status AddUserKey(const Slice& key, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ if (cmp->Compare(key, DBTestBase::Key(100)) == 0) {
+ has_key_100 = true;
+ }
+ if (cmp->Compare(key, DBTestBase::Key(200)) == 0) {
+ has_key_200 = true;
+ }
+
+ return Status::OK();
+ }
+
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ UserCollectedProperties ret;
+ return ret;
+ }
+
+ Status Finish(UserCollectedProperties* /*properties*/) override {
+ // The LSM tree would be like:
+ // L5: [0,19] [20,39] [40,299]
+ // L6: [0, 299]
+    // the 3rd file @L5 has both 100 and 200, so it will be marked for
+    // compaction.
+    // Also avoid marking a freshly flushed SST for compaction: it won't have
+    // both 100 and 200.
+ if (has_key_100 && has_key_200) {
+ need_compact_ = true;
+ } else {
+ need_compact_ = false;
+ }
+ has_key_100 = false;
+ has_key_200 = false;
+ return Status::OK();
+ }
+
+ bool NeedCompact() const override { return need_compact_; }
+
+ const Comparator* cmp = BytewiseComparator();
+
+ private:
+ bool has_key_100 = false;
+ bool has_key_200 = false;
+
+ bool need_compact_ = false;
+};
+
+class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new TestPropertiesCollector;
+ }
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+};
+
+TEST_F(PrecludeLastLevelTest, PartialPenultimateLevelCompactionWithRangeDel) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.env = mock_env_.get();
+ options.level0_file_num_compaction_trigger = kNumTrigger;
+ options.preserve_internal_time_seconds = 10000;
+ options.num_levels = kNumLevels;
+ // set a small max_compaction_bytes to avoid input level expansion
+ options.max_compaction_bytes = 30000;
+ options.ignore_max_compaction_bytes_for_input = false;
+ DestroyAndReopen(options);
+
+  // pass some time first, otherwise the write time of the first few keys would
+  // be zero, and internally zero has a special meaning: kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+ Random rnd(301);
+
+ for (int i = 0; i < 300; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kKeyPerSec); });
+ }
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // make sure all data is compacted to the last level
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+
+ // Create 3 L5 files
+ auto factory = std::make_shared<ThreeRangesPartitionerFactory>();
+ options.sst_partitioner_factory = factory;
+
+  // the user-defined properties collector will mark the 3rd file for compaction
+ auto collector_factory = std::make_shared<TestPropertiesCollectorFactory>();
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+ // enable tiered storage feature
+ options.preclude_last_level_data_seconds = 10000;
+ options.last_level_temperature = Temperature::kCold;
+ Reopen(options);
+
+ for (int i = 0; i < kNumTrigger - 2; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j), rnd.RandomString(10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+  // make sure there is one and only one compaction that supports per-key
+  // placement but has the penultimate-level output disabled.
+ std::atomic_int per_key_comp_num = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ auto compaction = static_cast<Compaction*>(arg);
+ if (compaction->SupportsPerKeyPlacement()) {
+ ASSERT_EQ(compaction->GetPenultimateOutputRangeType(),
+ Compaction::PenultimateOutputRangeType::kDisabled);
+ per_key_comp_num++;
+ }
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(200 + j), rnd.RandomString(10)));
+ }
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(32), Key(40)));
+ ASSERT_OK(Flush());
+
+  // Before the per-key placement compaction, the LSM tree should be like:
+ // L5: [0,19] [20,40] [40,299]
+ // L6: [0, 299]
+ // The 2nd file @L5 has the largest key 40 because of range del
+
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ ASSERT_EQ(per_key_comp_num, 1);
+
+ // the compaction won't move any data to the penultimate level
+ ASSERT_EQ("0,0,0,0,0,2,3", FilesPerLevel());
+
+ Close();
+}
+
+#endif // !defined(ROCKSDB_LITE)
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/comparator_db_test.cc b/src/rocksdb/db/comparator_db_test.cc
new file mode 100644
index 000000000..e5e3493b3
--- /dev/null
+++ b/src/rocksdb/db/comparator_db_test.cc
@@ -0,0 +1,678 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <array>
+#include <map>
+#include <string>
+
+#include "memtable/stl_wrappers.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/kv_map.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+
+static const Comparator* kTestComparator = nullptr;
+
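+// KVIter adapts a stl_wrappers::KVMap (ordered by the test comparator) to the
+// Iterator interface, so the map can serve as a reference implementation for
+// the DB iterator.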
+class KVIter : public Iterator {
+ public:
+ explicit KVIter(const stl_wrappers::KVMap* map)
+ : map_(map), iter_(map_->end()) {}
+ bool Valid() const override { return iter_ != map_->end(); }
+ void SeekToFirst() override { iter_ = map_->begin(); }
+ void SeekToLast() override {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ void Seek(const Slice& k) override {
+ iter_ = map_->lower_bound(k.ToString());
+ }
+ void SeekForPrev(const Slice& k) override {
+ iter_ = map_->upper_bound(k.ToString());
+ Prev();
+ }
+ void Next() override { ++iter_; }
+ void Prev() override {
+ if (iter_ == map_->begin()) {
+ iter_ = map_->end();
+ return;
+ }
+ --iter_;
+ }
+
+ Slice key() const override { return iter_->first; }
+ Slice value() const override { return iter_->second; }
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const stl_wrappers::KVMap* const map_;
+ stl_wrappers::KVMap::const_iterator iter_;
+};
+
+void AssertItersEqual(Iterator* iter1, Iterator* iter2) {
+ ASSERT_EQ(iter1->Valid(), iter2->Valid());
+ if (iter1->Valid()) {
+ ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString());
+ ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString());
+ }
+}
+
+// Run random operations against a DB (expected to be empty initially) and
+// mirror them on an in-memory map; source_strings are the candidate keys.
+void DoRandomIteraratorTest(DB* db, std::vector<std::string> source_strings,
+ Random* rnd, int num_writes, int num_iter_ops,
+ int num_trigger_flush) {
+ stl_wrappers::KVMap map((stl_wrappers::LessOfComparator(kTestComparator)));
+
+ for (int i = 0; i < num_writes; i++) {
+ if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) {
+ db->Flush(FlushOptions());
+ }
+
+ int type = rnd->Uniform(2);
+ int index = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto& key = source_strings[index];
+ switch (type) {
+ case 0:
+ // put
+ map[key] = key;
+ ASSERT_OK(db->Put(WriteOptions(), key, key));
+ break;
+ case 1:
+ // delete
+ if (map.find(key) != map.end()) {
+ map.erase(key);
+ }
+ ASSERT_OK(db->Delete(WriteOptions(), key));
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(ReadOptions()));
+ std::unique_ptr<Iterator> result_iter(new KVIter(&map));
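+  // iter walks the actual DB while result_iter walks the in-memory reference
+  // map; every operation below is mirrored on both and the results compared.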
+
+ bool is_valid = false;
+ for (int i = 0; i < num_iter_ops; i++) {
+    // Random walk and make sure iter and result_iter return the
+    // same key and value
+ int type = rnd->Uniform(6);
+ ASSERT_OK(iter->status());
+ switch (type) {
+ case 0:
+ // Seek to First
+ iter->SeekToFirst();
+ result_iter->SeekToFirst();
+ break;
+ case 1:
+ // Seek to last
+ iter->SeekToLast();
+ result_iter->SeekToLast();
+ break;
+ case 2: {
+ // Seek to random key
+ auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto key = source_strings[key_idx];
+ iter->Seek(key);
+ result_iter->Seek(key);
+ break;
+ }
+ case 3:
+ // Next
+ if (is_valid) {
+ iter->Next();
+ result_iter->Next();
+ } else {
+ continue;
+ }
+ break;
+ case 4:
+ // Prev
+ if (is_valid) {
+ iter->Prev();
+ result_iter->Prev();
+ } else {
+ continue;
+ }
+ break;
+ default: {
+ assert(type == 5);
+ auto key_idx = rnd->Uniform(static_cast<int>(source_strings.size()));
+ auto key = source_strings[key_idx];
+ std::string result;
+ auto status = db->Get(ReadOptions(), key, &result);
+ if (map.find(key) == map.end()) {
+ ASSERT_TRUE(status.IsNotFound());
+ } else {
+ ASSERT_EQ(map[key], result);
+ }
+ break;
+ }
+ }
+ AssertItersEqual(iter.get(), result_iter.get());
+ is_valid = iter->Valid();
+ }
+}
+
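+// Orders keys by their numeric (double) value, falling back to a bytewise
+// comparison for ties so that distinct keys never compare equal.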
+class DoubleComparator : public Comparator {
+ public:
+ DoubleComparator() {}
+
+ const char* Name() const override { return "DoubleComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+#ifndef CYGWIN
+ double da = std::stod(a.ToString());
+ double db = std::stod(b.ToString());
+#else
+ double da = std::strtod(a.ToString().c_str(), 0 /* endptr */);
+    double db = std::strtod(b.ToString().c_str(), 0 /* endptr */);
+#endif
+ if (da == db) {
+ return a.compare(b);
+ } else if (da > db) {
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class HashComparator : public Comparator {
+ public:
+ HashComparator() {}
+
+ const char* Name() const override { return "HashComparator"; }
+
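+  // Orders keys by their 32-bit hash (seed 66); hash collisions fall back to
+  // bytewise comparison.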
+ int Compare(const Slice& a, const Slice& b) const override {
+ uint32_t ha = Hash(a.data(), a.size(), 66);
+ uint32_t hb = Hash(b.data(), b.size(), 66);
+ if (ha == hb) {
+ return a.compare(b);
+ } else if (ha > hb) {
+ return 1;
+ } else {
+ return -1;
+ }
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class TwoStrComparator : public Comparator {
+ public:
+ TwoStrComparator() {}
+
+ const char* Name() const override { return "TwoStrComparator"; }
+
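+  // Keys are encoded as [len1][len2][str1][str2], where len1 and len2 are
+  // single length bytes. Keys are compared by str1 first; str2 is used only
+  // to break ties.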
+ int Compare(const Slice& a, const Slice& b) const override {
+ assert(a.size() >= 2);
+ assert(b.size() >= 2);
+ size_t size_a1 = static_cast<size_t>(a[0]);
+ size_t size_b1 = static_cast<size_t>(b[0]);
+ size_t size_a2 = static_cast<size_t>(a[1]);
+ size_t size_b2 = static_cast<size_t>(b[1]);
+ assert(size_a1 + size_a2 + 2 == a.size());
+ assert(size_b1 + size_b2 + 2 == b.size());
+
+ Slice a1 = Slice(a.data() + 2, size_a1);
+ Slice b1 = Slice(b.data() + 2, size_b1);
+ Slice a2 = Slice(a.data() + 2 + size_a1, size_a2);
+ Slice b2 = Slice(b.data() + 2 + size_b1, size_b2);
+
+ if (a1 != b1) {
+ return a1.compare(b1);
+ }
+ return a2.compare(b2);
+ }
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+} // anonymous namespace
+
+class ComparatorDBTest
+ : public testing::Test,
+ virtual public ::testing::WithParamInterface<uint32_t> {
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+ Options last_options_;
+ std::unique_ptr<const Comparator> comparator_guard;
+
+ public:
+ ComparatorDBTest() : env_(Env::Default()), db_(nullptr) {
+ kTestComparator = BytewiseComparator();
+ dbname_ = test::PerThreadDBPath("comparator_db_test");
+ BlockBasedTableOptions toptions;
+ toptions.format_version = GetParam();
+ last_options_.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(toptions));
+ EXPECT_OK(DestroyDB(dbname_, last_options_));
+ }
+
+ ~ComparatorDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, last_options_));
+ kTestComparator = BytewiseComparator();
+ }
+
+ DB* GetDB() { return db_; }
+
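+  // Installs `cmp` as the comparator used by the test. Ownership of `cmp` is
+  // taken only when `owner` is true; pass false for singleton comparators
+  // such as test::Uint64Comparator().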
+ void SetOwnedComparator(const Comparator* cmp, bool owner = true) {
+ if (owner) {
+ comparator_guard.reset(cmp);
+ } else {
+ comparator_guard.reset();
+ }
+ kTestComparator = cmp;
+ last_options_.comparator = cmp;
+ }
+
+ // Return the current option configuration.
+ Options* GetOptions() { return &last_options_; }
+
+ void DestroyAndReopen() {
+ // Destroy using last options
+ Destroy();
+ ASSERT_OK(TryReopen());
+ }
+
+ void Destroy() {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, last_options_));
+ }
+
+ Status TryReopen() {
+ delete db_;
+ db_ = nullptr;
+ last_options_.create_if_missing = true;
+
+ return DB::Open(last_options_, dbname_, &db_);
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(FormatDef, ComparatorDBTest,
+ testing::Values(test::kDefaultFormatVersion));
+INSTANTIATE_TEST_CASE_P(FormatLatest, ComparatorDBTest,
+ testing::Values(kLatestFormatVersion));
+
+TEST_P(ComparatorDBTest, Bytewise) {
+ for (int rand_seed = 301; rand_seed < 306; rand_seed++) {
+ DestroyAndReopen();
+ Random rnd(rand_seed);
+ DoRandomIteraratorTest(GetDB(),
+ {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd,
+ 8, 100, 3);
+ }
+}
+
+TEST_P(ComparatorDBTest, SimpleSuffixReverseComparator) {
+ SetOwnedComparator(new test::SimpleSuffixReverseComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ std::vector<std::string> source_prefixes;
+ // Randomly generate 5 prefixes
+ for (int i = 0; i < 5; i++) {
+ source_prefixes.push_back(rnd.HumanReadableString(8));
+ }
+ for (int j = 0; j < 20; j++) {
+ int prefix_index = rnd.Uniform(static_cast<int>(source_prefixes.size()));
+ std::string key = source_prefixes[prefix_index] +
+ rnd.HumanReadableString(rnd.Uniform(8));
+ source_strings.push_back(key);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, Uint64Comparator) {
+ SetOwnedComparator(test::Uint64Comparator(), false /* owner */);
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+ Random64 rnd64(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ uint64_t r = rnd64.Next();
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&r), 8);
+ source_strings.push_back(str);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, DoubleComparator) {
+ SetOwnedComparator(new DoubleComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ uint32_t r = rnd.Next();
+ uint32_t divide_order = rnd.Uniform(8);
+ double to_divide = 1.0;
+ for (uint32_t j = 0; j < divide_order; j++) {
+ to_divide *= 10.0;
+ }
+ source_strings.push_back(std::to_string(r / to_divide));
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, HashComparator) {
+ SetOwnedComparator(new HashComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ source_strings.push_back(test::RandomKey(&rnd, 8));
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+TEST_P(ComparatorDBTest, TwoStrComparator) {
+ SetOwnedComparator(new TwoStrComparator());
+
+ for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) {
+ Options* opt = GetOptions();
+ opt->comparator = kTestComparator;
+ DestroyAndReopen();
+ Random rnd(rnd_seed);
+
+ std::vector<std::string> source_strings;
+ // Randomly generate source keys
+ for (int i = 0; i < 100; i++) {
+ std::string str;
+ uint32_t size1 = rnd.Uniform(8);
+ uint32_t size2 = rnd.Uniform(8);
+ str.append(1, static_cast<char>(size1));
+ str.append(1, static_cast<char>(size2));
+ str.append(test::RandomKey(&rnd, size1));
+ str.append(test::RandomKey(&rnd, size2));
+ source_strings.push_back(str);
+ }
+
+ DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66);
+ }
+}
+
+namespace {
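+// Helpers asserting the expected IsSameLengthImmediateSuccessor() results for
+// both argument orders under the bytewise and reverse-bytewise comparators.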
+void VerifyNotSuccessor(const Slice& s, const Slice& t) {
+ auto bc = BytewiseComparator();
+ auto rbc = ReverseBytewiseComparator();
+ ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(s, t));
+ ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t));
+ ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s));
+ ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s));
+}
+
+void VerifySuccessor(const Slice& s, const Slice& t) {
+ auto bc = BytewiseComparator();
+ auto rbc = ReverseBytewiseComparator();
+ ASSERT_TRUE(bc->IsSameLengthImmediateSuccessor(s, t));
+ ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(s, t));
+ ASSERT_FALSE(bc->IsSameLengthImmediateSuccessor(t, s));
+ // Should be true but that increases exposure to a design bug in
+ // auto_prefix_mode, so currently set to FALSE
+ ASSERT_FALSE(rbc->IsSameLengthImmediateSuccessor(t, s));
+}
+
+} // anonymous namespace
+
+TEST_P(ComparatorDBTest, IsSameLengthImmediateSuccessor) {
+ {
+ // different length
+ Slice s("abcxy");
+ Slice t("abcxyz");
+ VerifyNotSuccessor(s, t);
+ }
+ {
+ Slice s("abcxyz");
+ Slice t("abcxy");
+ VerifyNotSuccessor(s, t);
+ }
+ {
+ // not last byte different
+ Slice s("abc1xyz");
+ Slice t("abc2xyz");
+ VerifyNotSuccessor(s, t);
+ }
+ {
+ // same string
+ Slice s("abcxyz");
+ Slice t("abcxyz");
+ VerifyNotSuccessor(s, t);
+ }
+ {
+ Slice s("abcxy");
+ Slice t("abcxz");
+ VerifySuccessor(s, t);
+ }
+ {
+ const char s_array[] = "\x50\x8a\xac";
+ const char t_array[] = "\x50\x8a\xad";
+ Slice s(s_array);
+ Slice t(t_array);
+ VerifySuccessor(s, t);
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff";
+ const char t_array[] = "\x50\x8b\x00";
+ Slice s(s_array, 3);
+ Slice t(t_array, 3);
+ VerifySuccessor(s, t);
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff\xff";
+ const char t_array[] = "\x50\x8b\x00\x00";
+ Slice s(s_array, 4);
+ Slice t(t_array, 4);
+ VerifySuccessor(s, t);
+ }
+ {
+ const char s_array[] = "\x50\x8a\xff\xff";
+ const char t_array[] = "\x50\x8b\x00\x01";
+ Slice s(s_array, 4);
+ Slice t(t_array, 4);
+ VerifyNotSuccessor(s, t);
+ }
+}
+
+TEST_P(ComparatorDBTest, FindShortestSeparator) {
+ std::string s1 = "abc1xyz";
+ std::string s2 = "abc3xy";
+
+ BytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc2", s1);
+
+ s1 = "abc5xyztt";
+
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc5", s1);
+
+ s1 = "abc3";
+ s2 = "abc2xy";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ s1 = "abc3xyz";
+ s2 = "abc2xy";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ s1 = "abc3xyz";
+ s2 = "abc2";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_EQ("abc3", s1);
+
+ std::string old_s1 = s1 = "abc2xy";
+ s2 = "abc2";
+ ReverseBytewiseComparator()->FindShortestSeparator(&s1, s2);
+ ASSERT_TRUE(old_s1 >= s1);
+ ASSERT_TRUE(s1 > s2);
+}
+
+TEST_P(ComparatorDBTest, SeparatorSuccessorRandomizeTest) {
+ // Char list for boundary cases.
+ std::array<unsigned char, 6> char_list{{0, 1, 2, 253, 254, 255}};
+ Random rnd(301);
+
+ for (int attempts = 0; attempts < 1000; attempts++) {
+ uint32_t size1 = rnd.Skewed(4);
+ uint32_t size2;
+
+ if (rnd.OneIn(2)) {
+      // Let size2 be a random size
+ size2 = rnd.Skewed(4);
+ } else {
+      // size2 is within [-2, +2] of size1
+ int diff = static_cast<int>(rnd.Uniform(5)) - 2;
+ int tmp_size2 = static_cast<int>(size1) + diff;
+ if (tmp_size2 < 0) {
+ tmp_size2 = 0;
+ }
+ size2 = static_cast<uint32_t>(tmp_size2);
+ }
+
+ std::string s1;
+ std::string s2;
+ for (uint32_t i = 0; i < size1; i++) {
+ if (rnd.OneIn(2)) {
+ // Use random byte
+ s1 += static_cast<char>(rnd.Uniform(256));
+ } else {
+ // Use one byte in char_list
+ char c = static_cast<char>(char_list[rnd.Uniform(sizeof(char_list))]);
+ s1 += c;
+ }
+ }
+
+ // First set s2 to be the same as s1, and then modify s2.
+ s2 = s1;
+ s2.resize(size2);
+ // We start from the back of the string
+ if (size2 > 0) {
+ uint32_t pos = size2 - 1;
+ do {
+ if (pos >= size1 || rnd.OneIn(4)) {
+ // For 1/4 chance, use random byte
+ s2[pos] = static_cast<char>(rnd.Uniform(256));
+ } else if (rnd.OneIn(4)) {
+ // In 1/4 chance, stop here.
+ break;
+ } else {
+ // Create a char within [-2, +2] of the matching char of s1.
+ int diff = static_cast<int>(rnd.Uniform(5)) - 2;
+ // char may be signed or unsigned based on platform.
+ int s1_char = static_cast<int>(static_cast<unsigned char>(s1[pos]));
+ int s2_char = s1_char + diff;
+ if (s2_char < 0) {
+ s2_char = 0;
+ }
+ if (s2_char > 255) {
+ s2_char = 255;
+ }
+ s2[pos] = static_cast<char>(s2_char);
+ }
+ } while (pos-- != 0);
+ }
+
+ // Test separators
+ for (int rev = 0; rev < 2; rev++) {
+ if (rev == 1) {
+ // switch s1 and s2
+ std::string t = s1;
+ s1 = s2;
+ s2 = t;
+ }
+ std::string separator = s1;
+ BytewiseComparator()->FindShortestSeparator(&separator, s2);
+ std::string rev_separator = s1;
+ ReverseBytewiseComparator()->FindShortestSeparator(&rev_separator, s2);
+
+ if (s1 == s2) {
+ ASSERT_EQ(s1, separator);
+ ASSERT_EQ(s2, rev_separator);
+ } else if (s1 < s2) {
+ ASSERT_TRUE(s1 <= separator);
+ ASSERT_TRUE(s2 > separator);
+ ASSERT_LE(separator.size(), std::max(s1.size(), s2.size()));
+ ASSERT_EQ(s1, rev_separator);
+ } else {
+ ASSERT_TRUE(s1 >= rev_separator);
+ ASSERT_TRUE(s2 < rev_separator);
+ ASSERT_LE(rev_separator.size(), std::max(s1.size(), s2.size()));
+ ASSERT_EQ(s1, separator);
+ }
+ }
+
+ // Test successors
+ std::string succ = s1;
+ BytewiseComparator()->FindShortSuccessor(&succ);
+ ASSERT_TRUE(succ >= s1);
+
+ succ = s1;
+ ReverseBytewiseComparator()->FindShortSuccessor(&succ);
+ ASSERT_TRUE(succ <= s1);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/convenience.cc b/src/rocksdb/db/convenience.cc
new file mode 100644
index 000000000..6344d356d
--- /dev/null
+++ b/src/rocksdb/db/convenience.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include "rocksdb/convenience.h"
+
+#include "db/db_impl/db_impl.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void CancelAllBackgroundWork(DB* db, bool wait) {
+ (static_cast_with_check<DBImpl>(db->GetRootDB()))
+ ->CancelAllBackgroundWork(wait);
+}
+
+Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ bool include_end) {
+ RangePtr range(begin, end);
+ return DeleteFilesInRanges(db, column_family, &range, 1, include_end);
+}
+
+Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n, bool include_end) {
+ return (static_cast_with_check<DBImpl>(db->GetRootDB()))
+ ->DeleteFilesInRanges(column_family, ranges, n, include_end);
+}
+
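+// Convenience overload using default ReadOptions. A minimal usage sketch (not
+// part of the library; `sst_path` is a hypothetical path to an existing table
+// file written with compatible Options):
+//
+//   Options options;
+//   Status s = VerifySstFileChecksum(options, EnvOptions(), sst_path);
+//   if (!s.ok()) {
+//     // checksum mismatch or I/O error
+//   }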
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const std::string& file_path) {
+ return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path);
+}
+Status VerifySstFileChecksum(const Options& options,
+ const EnvOptions& env_options,
+ const ReadOptions& read_options,
+ const std::string& file_path,
+ const SequenceNumber& largest_seqno) {
+ std::unique_ptr<FSRandomAccessFile> file;
+ uint64_t file_size;
+ InternalKeyComparator internal_comparator(options.comparator);
+ ImmutableOptions ioptions(options);
+
+ Status s = ioptions.fs->NewRandomAccessFile(
+ file_path, FileOptions(env_options), &file, nullptr);
+ if (s.ok()) {
+ s = ioptions.fs->GetFileSize(file_path, IOOptions(), &file_size, nullptr);
+ } else {
+ return s;
+ }
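+  // Build a TableReader over the file and delegate to its VerifyChecksum(),
+  // which validates the checksum of every block in the table.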
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(
+ std::move(file), file_path, ioptions.clock, nullptr /* io_tracer */,
+ nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */,
+ ioptions.rate_limiter.get()));
+ const bool kImmortal = true;
+ auto reader_options = TableReaderOptions(
+ ioptions, options.prefix_extractor, env_options, internal_comparator,
+ false /* skip_filters */, !kImmortal, false /* force_direct_prefetch */,
+ -1 /* level */);
+ reader_options.largest_seqno = largest_seqno;
+ s = ioptions.table_factory->NewTableReader(
+ reader_options, std::move(file_reader), file_size, &table_reader,
+ false /* prefetch_index_and_filter_in_cache */);
+ if (!s.ok()) {
+ return s;
+ }
+ s = table_reader->VerifyChecksum(read_options,
+ TableReaderCaller::kUserVerifyChecksum);
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/corruption_test.cc b/src/rocksdb/db/corruption_test.cc
new file mode 100644
index 000000000..8ccac6130
--- /dev/null
+++ b/src/rocksdb/db/corruption_test.cc
@@ -0,0 +1,1587 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "rocksdb/options.h"
+#ifndef ROCKSDB_LITE
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <cinttypes>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "rocksdb/write_batch.h"
+#include "table/block_based/block_based_table_builder.h"
+#include "table/meta_blocks.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static constexpr int kValueSize = 1000;
+namespace {
+// A wrapper that allows injection of errors.
+class ErrorEnv : public EnvWrapper {
+ public:
+ bool writable_file_error_;
+ int num_writable_file_errors_;
+
+ explicit ErrorEnv(Env* _target)
+ : EnvWrapper(_target),
+ writable_file_error_(false),
+ num_writable_file_errors_(0) {}
+ const char* Name() const override { return "ErrorEnv"; }
+
+ virtual Status NewWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& soptions) override {
+ result->reset();
+ if (writable_file_error_) {
+ ++num_writable_file_errors_;
+ return Status::IOError(fname, "fake error");
+ }
+ return target()->NewWritableFile(fname, result, soptions);
+ }
+};
+} // anonymous namespace
+class CorruptionTest : public testing::Test {
+ public:
+ std::shared_ptr<Env> env_guard_;
+ ErrorEnv* env_;
+ std::string dbname_;
+ std::shared_ptr<Cache> tiny_cache_;
+ Options options_;
+ DB* db_;
+
+ CorruptionTest() {
+    // If the LRU cache shard bits value is smaller than 2 (or -1, which will
+    // automatically set it to 0), test SequenceNumberRecovery will fail,
+    // likely because of a bug in recovery code. Keep it at 4 for now to make
+    // the test pass.
+ tiny_cache_ = NewLRUCache(100, 4);
+ Env* base_env = Env::Default();
+ EXPECT_OK(
+ test::CreateEnvFromSystem(ConfigOptions(), &base_env, &env_guard_));
+ EXPECT_NE(base_env, nullptr);
+ env_ = new ErrorEnv(base_env);
+ options_.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+ options_.env = env_;
+ dbname_ = test::PerThreadDBPath(env_, "corruption_test");
+ Status s = DestroyDB(dbname_, options_);
+ EXPECT_OK(s);
+
+ db_ = nullptr;
+ options_.create_if_missing = true;
+ BlockBasedTableOptions table_options;
+ table_options.block_size_deviation = 0; // make unit test pass for now
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen();
+ options_.create_if_missing = false;
+ }
+
+ ~CorruptionTest() override {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({});
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ delete db_;
+ db_ = nullptr;
+ if (getenv("KEEP_DB")) {
+ fprintf(stdout, "db is still at %s\n", dbname_.c_str());
+ } else {
+ Options opts;
+ opts.env = env_->target();
+ EXPECT_OK(DestroyDB(dbname_, opts));
+ }
+ delete env_;
+ }
+
+ void CloseDb() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status TryReopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opt = (options ? *options : options_);
+ if (opt.env == Options().env) {
+ // If env is not overridden, replace it with ErrorEnv.
+ // Otherwise, the test already uses a non-default Env.
+ opt.env = env_;
+ }
+ opt.arena_block_size = 4096;
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = tiny_cache_;
+ table_options.block_size_deviation = 0;
+ opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ return DB::Open(opt, dbname_, &db_);
+ }
+
+ void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
+
+ void RepairDB() {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(::ROCKSDB_NAMESPACE::RepairDB(dbname_, options_));
+ }
+
+ void Build(int n, int start, int flush_every) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = 0; i < n; i++) {
+ if (flush_every != 0 && i != 0 && i % flush_every == 0) {
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ }
+ // if ((i % 100) == 0) fprintf(stderr, "@ %d of %d\n", i, n);
+ Slice key = Key(i + start, &key_space);
+ batch.Clear();
+ ASSERT_OK(batch.Put(key, Value(i + start, &value_space)));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ }
+ }
+
+ void Build(int n, int flush_every = 0) { Build(n, 0, flush_every); }
+
+ void Check(int min_expected, int max_expected) {
+ uint64_t next_expected = 0;
+ uint64_t missed = 0;
+ int bad_keys = 0;
+ int bad_values = 0;
+ int correct = 0;
+ std::string value_space;
+ // Do not verify checksums. If we verify checksums then the
+ // db itself will raise errors because data is corrupted.
+ // Instead, we want the reads to be successful and this test
+ // will detect whether the appropriate corruptions have
+ // occurred.
+ Iterator* iter = db_->NewIterator(ReadOptions(false, true));
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ uint64_t key;
+ Slice in(iter->key());
+ if (!ConsumeDecimalNumber(&in, &key) || !in.empty() ||
+ key < next_expected) {
+ bad_keys++;
+ continue;
+ }
+ missed += (key - next_expected);
+ next_expected = key + 1;
+ if (iter->value() != Value(static_cast<int>(key), &value_space)) {
+ bad_values++;
+ } else {
+ correct++;
+ }
+ }
+ iter->status().PermitUncheckedError();
+ delete iter;
+
+ fprintf(
+ stderr,
+ "expected=%d..%d; got=%d; bad_keys=%d; bad_values=%d; missed=%llu\n",
+ min_expected, max_expected, correct, bad_keys, bad_values,
+ static_cast<unsigned long long>(missed));
+ ASSERT_LE(min_expected, correct);
+ ASSERT_GE(max_expected, correct);
+ }
+
+ void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) {
+ // Pick file to corrupt
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ uint64_t number;
+ FileType type;
+ std::string fname;
+ int picked_number = -1;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type) && type == filetype &&
+ static_cast<int>(number) > picked_number) { // Pick latest file
+ fname = dbname_ + "/" + filenames[i];
+ picked_number = static_cast<int>(number);
+ }
+ }
+ ASSERT_TRUE(!fname.empty()) << filetype;
+
+ ASSERT_OK(test::CorruptFile(env_, fname, offset, bytes_to_corrupt));
+ }
+
+  // Corrupts exactly one file at level `level`. Asserts if no file is found
+  // at that level.
+ void CorruptTableFileAtLevel(int level, int offset, int bytes_to_corrupt) {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ for (const auto& m : metadata) {
+ if (m.level == level) {
+ ASSERT_OK(test::CorruptFile(env_, dbname_ + "/" + m.name, offset,
+ bytes_to_corrupt));
+ return;
+ }
+ }
+ FAIL() << "no file found at level";
+ }
+
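+  // Returns the integer value of DB property `name`, or -1 if the property is
+  // unavailable or not numeric.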
+ int Property(const std::string& name) {
+ std::string property;
+ int result;
+ if (db_->GetProperty(name, &property) &&
+ sscanf(property.c_str(), "%d", &result) == 1) {
+ return result;
+ } else {
+ return -1;
+ }
+ }
+
+ // Return the ith key
+ Slice Key(int i, std::string* storage) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", i);
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) {
+ if (k == 0) {
+ // Ugh. Random seed of 0 used to produce no entropy. This code
+ // preserves the implementation that was in place when all of the
+ // magic values in this file were picked.
+ *storage = std::string(kValueSize, ' ');
+ } else {
+ Random r(k);
+ *storage = r.RandomString(kValueSize);
+ }
+ return Slice(*storage);
+ }
+
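+  // Collects the file numbers of all WAL files in the DB directory, sorted in
+  // ascending order.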
+ void GetSortedWalFiles(std::vector<uint64_t>& file_nums) {
+ std::vector<std::string> tmp_files;
+ ASSERT_OK(env_->GetChildren(dbname_, &tmp_files));
+ FileType type = kWalFile;
+ for (const auto& file : tmp_files) {
+ uint64_t number = 0;
+ if (ParseFileName(file, &number, &type) && type == kWalFile) {
+ file_nums.push_back(number);
+ }
+ }
+ std::sort(file_nums.begin(), file_nums.end());
+ }
+
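+  // Truncates the given WAL file by `bytes_to_truncate` bytes (0 truncates the
+  // file to empty) to simulate an incomplete write, e.g. after a power loss.
+  // Only kWalFile is currently supported.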
+ void CorruptFileWithTruncation(FileType file, uint64_t number,
+ uint64_t bytes_to_truncate = 0) {
+ std::string path;
+ switch (file) {
+ case FileType::kWalFile:
+ path = LogFileName(dbname_, number);
+ break;
+ // TODO: Add other file types as this method is being used for those file
+ // types.
+ default:
+ return;
+ }
+ uint64_t old_size = 0;
+ ASSERT_OK(env_->GetFileSize(path, &old_size));
+ assert(old_size > bytes_to_truncate);
+ uint64_t new_size = old_size - bytes_to_truncate;
+ // If bytes_to_truncate == 0, it will do full truncation.
+ if (bytes_to_truncate == 0) {
+ new_size = 0;
+ }
+ ASSERT_OK(test::TruncateFile(env_, path, new_size));
+ }
+};
+
+TEST_F(CorruptionTest, Recovery) {
+ Build(100);
+ Check(100, 100);
+#ifdef OS_WIN
+  // On Windows the OS disk cache does not behave properly:
+  // we do not call FlushBuffers on every Flush. If we do not close
+  // the log file prior to the corruption, we end up with the first
+  // block not corrupted but only the second. Under the debugger
+  // things work just fine, but the test never passes when running normally.
+  // For that reason one may want to run with unbuffered I/O, but that option
+  // is not available for the WAL.
+ CloseDb();
+#endif
+ Corrupt(kWalFile, 19, 1); // WriteBatch tag for first record
+ Corrupt(kWalFile, log::kBlockSize + 1000, 1); // Somewhere in second block
+ ASSERT_TRUE(!TryReopen().ok());
+ options_.paranoid_checks = false;
+ Reopen(&options_);
+
+ // The 64 records in the first two log blocks are completely lost.
+ Check(36, 36);
+}
+
+TEST_F(CorruptionTest, PostPITRCorruptionWALsRetained) {
+ // Repro for bug where WALs following the point-in-time recovery were not
+ // retained leading to the next recovery failing.
+ CloseDb();
+
+ options_.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+
+ const std::string test_cf_name = "test_cf";
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+ cf_descs.emplace_back(test_cf_name, ColumnFamilyOptions());
+
+ uint64_t log_num;
+ {
+ options_.create_missing_column_families = true;
+ std::vector<ColumnFamilyHandle*> cfhs;
+ ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+
+ ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k", "v"));
+ ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k", "v"));
+ ASSERT_OK(db_->Put(WriteOptions(), cfhs[0], "k2", "v2"));
+ std::vector<uint64_t> file_nums;
+ GetSortedWalFiles(file_nums);
+ log_num = file_nums.back();
+ for (auto* cfh : cfhs) {
+ delete cfh;
+ }
+ CloseDb();
+ }
+
+ CorruptFileWithTruncation(FileType::kWalFile, log_num,
+ /*bytes_to_truncate=*/1);
+
+ {
+ // Recover "k" -> "v" for both CFs. "k2" -> "v2" is lost due to truncation.
+ options_.avoid_flush_during_recovery = true;
+ std::vector<ColumnFamilyHandle*> cfhs;
+ ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+
+ // Flush one but not both CFs and write some data so there's a seqno gap
+ // between the PITR corruption and the next DB session's first WAL.
+ ASSERT_OK(db_->Put(WriteOptions(), cfhs[1], "k2", "v2"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cfhs[1]));
+
+ for (auto* cfh : cfhs) {
+ delete cfh;
+ }
+ CloseDb();
+ }
+
+ // With the bug, this DB open would remove the WALs following the PITR
+ // corruption. Then, the next recovery would fail.
+ for (int i = 0; i < 2; ++i) {
+ std::vector<ColumnFamilyHandle*> cfhs;
+ ASSERT_OK(DB::Open(options_, dbname_, cf_descs, &cfhs, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+
+ for (auto* cfh : cfhs) {
+ delete cfh;
+ }
+ CloseDb();
+ }
+}
+
+TEST_F(CorruptionTest, RecoverWriteError) {
+ env_->writable_file_error_ = true;
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+}
+
+TEST_F(CorruptionTest, NewFileErrorDuringWrite) {
+ // Do enough writing to force minor compaction
+ env_->writable_file_error_ = true;
+ const int num =
+ static_cast<int>(3 + (Options().write_buffer_size / kValueSize));
+ std::string value_storage;
+ Status s;
+ bool failed = false;
+ for (int i = 0; i < num; i++) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("a", Value(100, &value_storage)));
+ s = db_->Write(WriteOptions(), &batch);
+ if (!s.ok()) {
+ failed = true;
+ }
+ ASSERT_TRUE(!failed || !s.ok());
+ }
+ ASSERT_TRUE(!s.ok());
+ ASSERT_GE(env_->num_writable_file_errors_, 1);
+ env_->writable_file_error_ = false;
+ Reopen();
+}
+
+TEST_F(CorruptionTest, TableFile) {
+ Build(100);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
+
+ Corrupt(kTableFile, 100, 1);
+ Check(99, 99);
+ ASSERT_NOK(dbi->VerifyChecksum());
+}
+
+TEST_F(CorruptionTest, VerifyChecksumReadahead) {
+ Options options;
+ SpecialEnv senv(env_->target());
+ options.env = &senv;
+ // Disable block cache as we are going to check checksum for
+ // the same file twice and measure number of reads.
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc));
+
+ Reopen(&options);
+
+ Build(10000);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
+
+ senv.count_random_reads_ = true;
+ senv.random_read_counter_.Reset();
+ ASSERT_OK(dbi->VerifyChecksum());
+
+ // Make sure the counter is enabled.
+ ASSERT_GT(senv.random_read_counter_.Read(), 0);
+
+ // The SST file is about 10MB. Default readahead size is 256KB.
+  // Give a conservative 20 reads for metadata blocks. The number
+  // of random reads should be within 10 MB / 256KB + 20 = 60.
+ ASSERT_LT(senv.random_read_counter_.Read(), 60);
+
+ senv.random_read_bytes_counter_ = 0;
+ ReadOptions ro;
+ ro.readahead_size = size_t{32 * 1024};
+ ASSERT_OK(dbi->VerifyChecksum(ro));
+ // The SST file is about 10MB. We set readahead size to 32KB.
+  // Give 0 to 20 reads for metadata blocks, and allow real reads
+  // to range from 24KB to 48KB. The lower bound would be:
+ // 10MB / 48KB + 0 = 213
+ // The higher bound is
+ // 10MB / 24KB + 20 = 447.
+ ASSERT_GE(senv.random_read_counter_.Read(), 213);
+ ASSERT_LE(senv.random_read_counter_.Read(), 447);
+
+ // Test readahead shouldn't break mmap mode (where it should be
+ // disabled).
+ options.allow_mmap_reads = true;
+ Reopen(&options);
+ dbi = static_cast<DBImpl*>(db_);
+ ASSERT_OK(dbi->VerifyChecksum(ro));
+
+ CloseDb();
+}
+
+TEST_F(CorruptionTest, TableFileIndexData) {
+ Options options;
+ // very big, we'll trigger flushes manually
+ options.write_buffer_size = 100 * 1024 * 1024;
+ Reopen(&options);
+ // build 2 tables, flush at 5000
+ Build(10000, 5000);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ // corrupt an index block of an entire file
+ Corrupt(kTableFile, -2000, 500);
+ options.paranoid_checks = false;
+ Reopen(&options);
+ dbi = static_cast_with_check<DBImpl>(db_);
+  // One full file may be readable, since only one was corrupted;
+  // the other file should be fully non-readable, since its index was corrupted.
+ Check(0, 5000);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // In paranoid mode, the db cannot be opened due to the corrupted file.
+ ASSERT_TRUE(TryReopen().IsCorruption());
+}
+
+TEST_F(CorruptionTest, MissingDescriptor) {
+ Build(1000);
+ RepairDB();
+ Reopen();
+ Check(1000, 1000);
+}
+
+TEST_F(CorruptionTest, SequenceNumberRecovery) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v2"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v3"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v4"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v5"));
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v5", v);
+ // Write something. If sequence number was not recovered properly,
+ // it will be hidden by an earlier write.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "v6"));
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+ Reopen();
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("v6", v);
+}
+
+TEST_F(CorruptionTest, CorruptedDescriptor) {
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "hello"));
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+
+ Corrupt(kDescriptorFile, 0, 1000);
+ Status s = TryReopen();
+ ASSERT_TRUE(!s.ok());
+
+ RepairDB();
+ Reopen();
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("hello", v);
+}
+
+TEST_F(CorruptionTest, CompactionInputError) {
+ Options options;
+ options.env = env_;
+ Reopen(&options);
+ Build(10);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(dbi->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_OK(dbi->TEST_CompactRange(1, nullptr, nullptr));
+ ASSERT_EQ(1, Property("rocksdb.num-files-at-level2"));
+
+ Corrupt(kTableFile, 100, 1);
+ Check(9, 9);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // Force compactions by writing lots of values
+ Build(10000);
+ Check(10000, 10000);
+ ASSERT_NOK(dbi->VerifyChecksum());
+}
+
+TEST_F(CorruptionTest, CompactionInputErrorParanoid) {
+ Options options;
+ options.env = env_;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 131072;
+ options.max_write_buffer_number = 2;
+ Reopen(&options);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+
+ // Fill levels >= 1
+ for (int level = 1; level < dbi->NumberLevels(); level++) {
+ ASSERT_OK(dbi->Put(WriteOptions(), "", "begin"));
+ ASSERT_OK(dbi->Put(WriteOptions(), "~", "end"));
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ for (int comp_level = 0; comp_level < dbi->NumberLevels() - level;
+ ++comp_level) {
+ ASSERT_OK(dbi->TEST_CompactRange(comp_level, nullptr, nullptr));
+ }
+ }
+
+ Reopen(&options);
+
+ dbi = static_cast_with_check<DBImpl>(db_);
+ Build(10);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(dbi->TEST_WaitForCompact());
+ ASSERT_EQ(1, Property("rocksdb.num-files-at-level0"));
+
+ CorruptTableFileAtLevel(0, 100, 1);
+ Check(9, 9);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ // Write must eventually fail because of corrupted table
+ Status s;
+ std::string tmp1, tmp2;
+ bool failed = false;
+ for (int i = 0; i < 10000; i++) {
+ s = db_->Put(WriteOptions(), Key(i, &tmp1), Value(i, &tmp2));
+ if (!s.ok()) {
+ failed = true;
+ }
+ // if one write failed, every subsequent write must fail, too
+ ASSERT_TRUE(!failed || !s.ok()) << "write did not fail in a corrupted db";
+ }
+ ASSERT_TRUE(!s.ok()) << "write did not fail in corrupted paranoid db";
+}
+
+TEST_F(CorruptionTest, UnrelatedKeys) {
+ Build(10);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ Corrupt(kTableFile, 100, 1);
+ ASSERT_NOK(dbi->VerifyChecksum());
+
+ std::string tmp1, tmp2;
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2)));
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(db_->Get(ReadOptions(), Key(1000, &tmp1), &v));
+ ASSERT_EQ(Value(1000, &tmp2).ToString(), v);
+}
+
+TEST_F(CorruptionTest, RangeDeletionCorrupted) {
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(static_cast<size_t>(1), metadata.size());
+ std::string filename = dbname_ + metadata[0].name;
+
+ FileOptions file_opts;
+ const auto& fs = options_.env->GetFileSystem();
+ std::unique_ptr<RandomAccessFileReader> file_reader;
+ ASSERT_OK(RandomAccessFileReader::Create(fs, filename, file_opts,
+ &file_reader, nullptr));
+
+ uint64_t file_size;
+ ASSERT_OK(
+ fs->GetFileSize(filename, file_opts.io_options, &file_size, nullptr));
+
+ BlockHandle range_del_handle;
+ ASSERT_OK(FindMetaBlockInFile(
+ file_reader.get(), file_size, kBlockBasedTableMagicNumber,
+ ImmutableOptions(options_), kRangeDelBlockName, &range_del_handle));
+
+ ASSERT_OK(TryReopen());
+ ASSERT_OK(test::CorruptFile(env_, filename,
+ static_cast<int>(range_del_handle.offset()), 1));
+ ASSERT_TRUE(TryReopen().IsCorruption());
+}
+
+TEST_F(CorruptionTest, FileSystemStateCorrupted) {
+ for (int iter = 0; iter < 2; ++iter) {
+ Options options;
+ options.env = env_;
+ options.paranoid_checks = true;
+ options.create_if_missing = true;
+ Reopen(&options);
+ Build(10);
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ std::vector<LiveFileMetaData> metadata;
+ dbi->GetLiveFilesMetaData(&metadata);
+ ASSERT_GT(metadata.size(), 0);
+ std::string filename = dbname_ + metadata[0].name;
+
+ delete db_;
+ db_ = nullptr;
+
+ if (iter == 0) { // corrupt file size
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(env_->NewWritableFile(filename, &file, EnvOptions()));
+ ASSERT_OK(file->Append(Slice("corrupted sst")));
+ file.reset();
+ Status x = TryReopen(&options);
+ ASSERT_TRUE(x.IsCorruption());
+ } else { // delete the file
+ ASSERT_OK(env_->DeleteFile(filename));
+ Status x = TryReopen(&options);
+ ASSERT_TRUE(x.IsCorruption());
+ }
+
+ ASSERT_OK(DestroyDB(dbname_, options_));
+ }
+}
+
+static const auto& corruption_modes = {
+ mock::MockTableFactory::kCorruptNone, mock::MockTableFactory::kCorruptKey,
+ mock::MockTableFactory::kCorruptValue,
+ mock::MockTableFactory::kCorruptReorderKey};
+
+TEST_F(CorruptionTest, ParanoidFileChecksOnFlush) {
+ Options options;
+ options.env = env_;
+ options.check_flush_compaction_key_order = false;
+ options.paranoid_file_checks = true;
+ options.create_if_missing = true;
+ Status s;
+ for (const auto& mode : corruption_modes) {
+ delete db_;
+ db_ = nullptr;
+ s = DestroyDB(dbname_, options);
+ ASSERT_OK(s);
+ std::shared_ptr<mock::MockTableFactory> mock =
+ std::make_shared<mock::MockTableFactory>();
+ options.table_factory = mock;
+ mock->SetCorruptionMode(mode);
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ Build(10);
+ s = db_->Flush(FlushOptions());
+ if (mode == mock::MockTableFactory::kCorruptNone) {
+ ASSERT_OK(s);
+ } else {
+ ASSERT_NOK(s);
+ }
+ }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksOnCompact) {
+ Options options;
+ options.env = env_;
+ options.paranoid_file_checks = true;
+ options.create_if_missing = true;
+ options.check_flush_compaction_key_order = false;
+ Status s;
+ for (const auto& mode : corruption_modes) {
+ delete db_;
+ db_ = nullptr;
+ s = DestroyDB(dbname_, options);
+ ASSERT_OK(s);
+ std::shared_ptr<mock::MockTableFactory> mock =
+ std::make_shared<mock::MockTableFactory>();
+ options.table_factory = mock;
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ Build(100, 2);
+ // ASSERT_OK(db_->Flush(FlushOptions()));
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ mock->SetCorruptionMode(mode);
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ s = dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr);
+ if (mode == mock::MockTableFactory::kCorruptNone) {
+ ASSERT_OK(s);
+ } else {
+ ASSERT_NOK(s);
+ }
+ }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeFirst) {
+ Options options;
+ options.env = env_;
+ options.check_flush_compaction_key_order = false;
+ options.paranoid_file_checks = true;
+ options.create_if_missing = true;
+ for (bool do_flush : {true, false}) {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ std::string start, end;
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(3, &start), Key(7, &end)));
+ auto snap = db_->GetSnapshot();
+ ASSERT_NE(snap, nullptr);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(8, &start), Key(9, &end)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(2, &start), Key(5, &end)));
+ Build(10);
+ if (do_flush) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ } else {
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+ }
+ db_->ReleaseSnapshot(snap);
+ }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRange) {
+ Options options;
+ options.env = env_;
+ options.check_flush_compaction_key_order = false;
+ options.paranoid_file_checks = true;
+ options.create_if_missing = true;
+ for (bool do_flush : {true, false}) {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ Build(10, 0, 0);
+ std::string start, end;
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(5, &start), Key(15, &end)));
+ auto snap = db_->GetSnapshot();
+ ASSERT_NE(snap, nullptr);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(8, &start), Key(9, &end)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(12, &start), Key(17, &end)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(2, &start), Key(4, &end)));
+ Build(10, 10, 0);
+ if (do_flush) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ } else {
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+ }
+ db_->ReleaseSnapshot(snap);
+ }
+}
+
+TEST_F(CorruptionTest, ParanoidFileChecksWithDeleteRangeLast) {
+ Options options;
+ options.env = env_;
+ options.check_flush_compaction_key_order = false;
+ options.paranoid_file_checks = true;
+ options.create_if_missing = true;
+ for (bool do_flush : {true, false}) {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ std::string start, end;
+ Build(10);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(3, &start), Key(7, &end)));
+ auto snap = db_->GetSnapshot();
+ ASSERT_NE(snap, nullptr);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(6, &start), Key(8, &end)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(2, &start), Key(5, &end)));
+ if (do_flush) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ } else {
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+ }
+ db_->ReleaseSnapshot(snap);
+ }
+}
+
+TEST_F(CorruptionTest, LogCorruptionErrorsInCompactionIterator) {
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.allow_data_in_errors = true;
+ auto mode = mock::MockTableFactory::kCorruptKey;
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+
+ std::shared_ptr<mock::MockTableFactory> mock =
+ std::make_shared<mock::MockTableFactory>();
+ mock->SetCorruptionMode(mode);
+ options.table_factory = mock;
+
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ Build(100, 2);
+
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ Status s =
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr);
+ ASSERT_NOK(s);
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(CorruptionTest, CompactionKeyOrderCheck) {
+ Options options;
+ options.env = env_;
+ options.paranoid_file_checks = false;
+ options.create_if_missing = true;
+ options.check_flush_compaction_key_order = false;
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ std::shared_ptr<mock::MockTableFactory> mock =
+ std::make_shared<mock::MockTableFactory>();
+ options.table_factory = mock;
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ assert(db_ != nullptr); // suppress false clang-analyze report
+ mock->SetCorruptionMode(mock::MockTableFactory::kCorruptReorderKey);
+ Build(100, 2);
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ mock->SetCorruptionMode(mock::MockTableFactory::kCorruptNone);
+ ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}}));
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_NOK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+}
+
+TEST_F(CorruptionTest, FlushKeyOrderCheck) {
+ Options options;
+ options.env = env_;
+ options.paranoid_file_checks = false;
+ options.create_if_missing = true;
+ ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "true"}}));
+
+ ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1"));
+
+ int cnt = 0;
+ // Generate some out of order keys from the memtable
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTableIterator::Next:0", [&](void* arg) {
+ MemTableRep::Iterator* mem_iter =
+ static_cast<MemTableRep::Iterator*>(arg);
+ if (++cnt == 3) {
+ mem_iter->Prev();
+ mem_iter->Prev();
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Status s = static_cast_with_check<DBImpl>(db_)->TEST_FlushMemTable();
+ ASSERT_NOK(s);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(CorruptionTest, DisableKeyOrderCheck) {
+ ASSERT_OK(db_->SetOptions({{"check_flush_compaction_key_order", "false"}}));
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "OutputValidator::Add:order_check",
+ [&](void* /*arg*/) { ASSERT_TRUE(false); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(db_->Put(WriteOptions(), "foo1", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo3", "v1"));
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ ASSERT_OK(db_->Put(WriteOptions(), "foo2", "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo4", "v1"));
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(
+ dbi->CompactRange(cro, dbi->DefaultColumnFamily(), nullptr, nullptr));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(CorruptionTest, VerifyWholeTableChecksum) {
+ CloseDb();
+ Options options;
+ options.env = env_;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ options.create_if_missing = true;
+ options.file_checksum_gen_factory =
+ ROCKSDB_NAMESPACE::GetFileChecksumGenCrc32cFactory();
+ Reopen(&options);
+
+ Build(10, 5);
+
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+ CloseDb();
+
+  // Corrupt the first byte of each table file; this must be a data block.
+ Corrupt(kTableFile, 0, 1);
+
+ ASSERT_OK(TryReopen(&options));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ int count{0};
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::VerifyFullFileChecksum:mismatch", [&](void* arg) {
+ auto* s = reinterpret_cast<Status*>(arg);
+ ASSERT_NE(s, nullptr);
+ ++count;
+ ASSERT_NOK(*s);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsCorruption());
+ ASSERT_EQ(1, count);
+}
+
+class CrashDuringRecoveryWithCorruptionTest
+ : public CorruptionTest,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ explicit CrashDuringRecoveryWithCorruptionTest()
+ : CorruptionTest(),
+ avoid_flush_during_recovery_(std::get<0>(GetParam())),
+ track_and_verify_wals_in_manifest_(std::get<1>(GetParam())) {}
+
+ protected:
+ const bool avoid_flush_during_recovery_;
+ const bool track_and_verify_wals_in_manifest_;
+};
+
+INSTANTIATE_TEST_CASE_P(CorruptionTest, CrashDuringRecoveryWithCorruptionTest,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, false),
+ std::make_tuple(true, true),
+ std::make_tuple(false, true)));
+
+// In the case of a non-TransactionDB with avoid_flush_during_recovery = true,
+// RocksDB won't flush the data from the WAL to L0 for all column families if
+// possible. As a result, not all column families can increase their
+// log_numbers, and min_log_number_to_keep won't change.
+// RocksDB may prematurely persist a new MANIFEST that advances log_numbers for
+// some column families even before we can declare the DB to be in a consistent
+// state after recovery (which is only the case once the new WAL is synced).
+//
+// If there is a power failure before we sync the new WAL, we end up in a
+// situation in which, after persisting the MANIFEST, RocksDB will see some
+// column families' log_numbers larger than the corrupted WAL, and the
+// "Column family inconsistency: SST file contains data beyond the point of
+// corruption" error will be hit, causing recovery to fail.
+//
+// After adding the fix, RocksDB persists a new MANIFEST with the column
+// families only after the new WAL is synced, to ensure RocksDB is in a
+// consistent state. RocksDB writes an empty WriteBatch as a sentinel to the
+// new WAL, which is synced immediately afterwards. The sequence number of the
+// sentinel WriteBatch will be the next sequence number immediately after the
+// largest sequence number recovered from previous WALs and the MANIFEST,
+// because of which the DB will be in a consistent state.
+// If a future recovery starts from the new MANIFEST, it means the new WAL was
+// successfully synced. Due to the sentinel empty write batch at the beginning,
+// kPointInTimeRecovery of the WAL is guaranteed to go after this point.
+// If a future recovery starts from the old MANIFEST, it means writing the new
+// MANIFEST failed. It won't have the "SST ahead of WAL" error.
+//
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecovery) {
+ CloseDb();
+ Options options;
+ options.track_and_verify_wals_in_manifest =
+ track_and_verify_wals_in_manifest_;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.avoid_flush_during_recovery = false;
+ options.env = env_;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ options.create_if_missing = true;
+ options.max_write_buffer_number = 8;
+
+ Reopen(&options);
+ Status s;
+ const std::string test_cf_name = "test_cf";
+ ColumnFamilyHandle* cfh = nullptr;
+ s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
+ ASSERT_OK(s);
+ delete cfh;
+ CloseDb();
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, options);
+ std::vector<ColumnFamilyHandle*> handles;
+
+  // 1. Open and populate the DB. Write to and flush default_cf several times
+  // to advance the WAL number so that some column families have an advanced
+  // log_number while others don't.
+ {
+ ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+ auto* dbimpl = static_cast_with_check<DBImpl>(db_);
+ assert(dbimpl);
+
+ // Write one key to test_cf.
+ ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
+ ASSERT_OK(db_->Flush(FlushOptions(), handles[1]));
+
+    // Write to default_cf and flush this cf several times to advance the WAL
+    // number. TEST_SwitchMemtable makes sure WALs are not synced so the test
+    // can corrupt an un-synced WAL.
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i),
+ "value" + std::to_string(i)));
+ ASSERT_OK(dbimpl->TEST_SwitchMemtable());
+ }
+
+ for (auto* h : handles) {
+ delete h;
+ }
+ handles.clear();
+ CloseDb();
+ }
+
+  // 2. Corrupt the second-to-last un-synced WAL file to emulate a power reset
+  // that causes the DB to lose the un-synced WAL.
+ {
+ std::vector<uint64_t> file_nums;
+ GetSortedWalFiles(file_nums);
+ size_t size = file_nums.size();
+ assert(size >= 2);
+ uint64_t log_num = file_nums[size - 2];
+ CorruptFileWithTruncation(FileType::kWalFile, log_num,
+ /*bytes_to_truncate=*/8);
+ }
+
+  // 3. After the first crash, reopen the DB, which contains a corrupted WAL.
+  // The default family has a higher log number than the corrupted WAL's
+  // number.
+ //
+ // Case1: If avoid_flush_during_recovery = true, RocksDB won't flush the data
+ // from WAL to L0 for all column families (test_cf_name in this case). As a
+ // result, not all column families can increase their log_numbers, and
+ // min_log_number_to_keep won't change.
+ //
+ // Case2: If avoid_flush_during_recovery = false, all column families have
+ // flushed their data from WAL to L0 during recovery, and none of them will
+ // ever need to read the WALs again.
+
+ // 4. Fault is injected to fail the recovery.
+ {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
+ auto* tmp_s = reinterpret_cast<Status*>(arg);
+ assert(tmp_s);
+ *tmp_s = Status::IOError("Injected");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ handles.clear();
+ options.avoid_flush_during_recovery = true;
+ s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_EQ("IO error: Injected", s.ToString());
+ for (auto* h : handles) {
+ delete h;
+ }
+ CloseDb();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+
+  // 5. After the second crash, reopen the DB with the second corruption. The
+  // default family has a higher log number than the corrupted WAL's number.
+ //
+ // Case1: If avoid_flush_during_recovery = true, we persist a new
+ // MANIFEST with advanced log_numbers for some column families only after
+ // syncing the WAL. So during second crash, RocksDB will skip the corrupted
+ // WAL files as they have been moved to different folder. Since newly synced
+ // WAL file's sequence number (sentinel WriteBatch) will be the next
+ // sequence number immediately after the largest sequence number recovered
+ // from previous WALs and MANIFEST, db will be in consistent state and opens
+ // successfully.
+ //
+ // Case2: If avoid_flush_during_recovery = false, the corrupted WAL is below
+ // this number. So during a second crash after persisting the new MANIFEST,
+ // RocksDB will skip the corrupted WAL(s) because they are all below this
+ // bound. Therefore, we won't hit the "column family inconsistency" error
+ // message.
+ {
+ options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
+ ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+
+ // Verify that data is not lost.
+ {
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v));
+ ASSERT_EQ("dontcare", v);
+
+ v.clear();
+ ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(0), &v));
+ ASSERT_EQ("value" + std::to_string(0), v);
+
+ // Since the second last WAL was corrupted, the key below is not found.
+ v.clear();
+ ASSERT_EQ(db_->Get(ReadOptions(), "key" + std::to_string(1), &v),
+ Status::NotFound());
+ }
+
+ for (auto* h : handles) {
+ delete h;
+ }
+ handles.clear();
+ CloseDb();
+ }
+}
+
+// In case of TransactionDB, two-phase commit is enabled. The prepare section
+// of an uncommitted transaction always needs to be kept. Even if we perform a
+// flush during recovery, we may still need to hold an old WAL. The
+// min_log_number_to_keep won't change, and the "Column family inconsistency:
+// SST file contains data beyond the point of corruption" error will be hit,
+// causing recovery to fail.
+//
+// After adding the fix, RocksDB persists a new MANIFEST with the column
+// families only after the new WAL is synced, to ensure RocksDB is in a
+// consistent state. RocksDB writes an empty WriteBatch as a sentinel to the
+// new WAL, which is synced immediately afterwards. The sequence number of the
+// sentinel WriteBatch will be the next sequence number immediately after the
+// largest sequence number recovered from previous WALs and the MANIFEST, so
+// the DB will be in a consistent state.
+// If a future recovery starts from the new MANIFEST, then it means the new
+// WAL was successfully synced. Due to the sentinel empty write batch at the
+// beginning, kPointInTimeRecovery of the WAL is guaranteed to go past this
+// point. If a future recovery starts from the old MANIFEST, it means writing
+// the new MANIFEST failed. It won't have the "SST ahead of WAL" error.
+//
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest, TxnDbCrashDuringRecovery) {
+ CloseDb();
+ Options options;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.track_and_verify_wals_in_manifest =
+ track_and_verify_wals_in_manifest_;
+ options.avoid_flush_during_recovery = false;
+ options.env = env_;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ options.create_if_missing = true;
+ options.max_write_buffer_number = 3;
+ Reopen(&options);
+
+ // Create cf test_cf_name.
+ ColumnFamilyHandle* cfh = nullptr;
+ const std::string test_cf_name = "test_cf";
+ Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
+ ASSERT_OK(s);
+ delete cfh;
+ CloseDb();
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, options);
+ std::vector<ColumnFamilyHandle*> handles;
+
+ TransactionDB* txn_db = nullptr;
+ TransactionDBOptions txn_db_opts;
+
+ // 1. Open and populate the DB. Write and flush default_cf several times to
+ // advance the WAL number so that some column families have an advanced
+ // log_number while others don't.
+ {
+ ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
+ &handles, &txn_db));
+
+ auto* txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions());
+ // Put cf1
+ ASSERT_OK(txn->Put(handles[1], "foo", "value"));
+ ASSERT_OK(txn->SetName("txn0"));
+ ASSERT_OK(txn->Prepare());
+ ASSERT_OK(txn_db->Flush(FlushOptions()));
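+ // Note: txn0 is prepared but not committed, so the WAL holding its prepare
+ // record must be kept alive even after the flush above (see the comment at
+ // the top of this test).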
+
+ delete txn;
+ txn = nullptr;
+
+ auto* dbimpl = static_cast_with_check<DBImpl>(txn_db->GetRootDB());
+ assert(dbimpl);
+
+ // Put and flush cf0
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(txn_db->Put(WriteOptions(), "key" + std::to_string(i),
+ "value" + std::to_string(i)));
+ ASSERT_OK(dbimpl->TEST_SwitchMemtable());
+ }
+
+ // Put cf1
+ txn = txn_db->BeginTransaction(WriteOptions(), TransactionOptions());
+ ASSERT_OK(txn->Put(handles[1], "foo1", "value1"));
+ ASSERT_OK(txn->Commit());
+
+ delete txn;
+ txn = nullptr;
+
+ for (auto* h : handles) {
+ delete h;
+ }
+ handles.clear();
+ delete txn_db;
+ }
+
+ // 2. Corrupt the second last WAL file to emulate a power reset which caused
+ // the DB to lose the un-synced WAL.
+ {
+ std::vector<uint64_t> file_nums;
+ GetSortedWalFiles(file_nums);
+ size_t size = file_nums.size();
+ assert(size >= 2);
+ uint64_t log_num = file_nums[size - 2];
+ CorruptFileWithTruncation(FileType::kWalFile, log_num,
+ /*bytes_to_truncate=*/8);
+ }
+
+ // 3. After the first crash, reopen the DB which contains the corrupted WAL.
+ // The default family has a higher log number than the corrupted WAL's
+ // number. There may be old WAL files that must not be deleted because they
+ // can contain data of uncommitted transactions. As a result,
+ // min_log_number_to_keep won't change.
+
+ {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
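+ // The "DBImpl::Open::BeforeSyncWAL" sync point fires right before the newly
+ // created WAL is synced during recovery; overwriting the passed-in Status
+ // with an IOError emulates a crash at that point, i.e. before the fixed code
+ // would persist the new MANIFEST.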
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::Open::BeforeSyncWAL", [&](void* arg) {
+ auto* tmp_s = reinterpret_cast<Status*>(arg);
+ assert(tmp_s);
+ *tmp_s = Status::IOError("Injected");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ handles.clear();
+ s = TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs, &handles,
+ &txn_db);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_EQ("IO error: Injected", s.ToString());
+ for (auto* h : handles) {
+ delete h;
+ }
+ CloseDb();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+
+ // 4. Corrupt the latest WAL file (max_wal_num).
+ {
+ std::vector<uint64_t> file_nums;
+ GetSortedWalFiles(file_nums);
+ size_t size = file_nums.size();
+ uint64_t log_num = file_nums[size - 1];
+ CorruptFileWithTruncation(FileType::kWalFile, log_num);
+ }
+
+ // 5. After the second crash, reopen the db with the second corruption. The
+ // default family has a higher log number than the corrupted WAL's number.
+ // We persist a new MANIFEST with advanced log_numbers for some column
+ // families only after syncing the WAL. So during the second crash, RocksDB
+ // will skip the corrupted WAL files as they have been moved to a different
+ // folder. Since the newly synced WAL file's sequence number (sentinel
+ // WriteBatch) will be the next sequence number immediately after the largest
+ // sequence number recovered from previous WALs and MANIFEST, the db will be
+ // in a consistent state and opens successfully.
+ {
+ ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, cf_descs,
+ &handles, &txn_db));
+
+ // Verify that data is not lost.
+ {
+ std::string v;
+ // Key not visible since it's not committed.
+ ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo", &v),
+ Status::NotFound());
+
+ v.clear();
+ ASSERT_OK(txn_db->Get(ReadOptions(), "key" + std::to_string(0), &v));
+ ASSERT_EQ("value" + std::to_string(0), v);
+
+ // The last WAL, which contains the two keys below, is corrupted.
+ v.clear();
+ ASSERT_EQ(txn_db->Get(ReadOptions(), "key" + std::to_string(1), &v),
+ Status::NotFound());
+ v.clear();
+ ASSERT_EQ(txn_db->Get(ReadOptions(), handles[1], "foo1", &v),
+ Status::NotFound());
+ }
+
+ for (auto* h : handles) {
+ delete h;
+ }
+ delete txn_db;
+ }
+}
+
+// This test is similar to
+// CrashDuringRecoveryWithCorruptionTest.CrashDuringRecovery except that it
+// calls flush and corrupts the last WAL. Flush syncs some of the WALs; the
+// remaining WALs are un-synced, one of which is then corrupted to simulate a
+// crash.
+//
+// In case of a non-TransactionDB with avoid_flush_during_recovery = true,
+// RocksDB won't flush the data from the WAL to L0 for all column families if
+// possible. As a result, not all column families can increase their
+// log_numbers, and min_log_number_to_keep won't change.
+// Without the fix, RocksDB may prematurely persist a new MANIFEST that
+// advances log_numbers for some column families, even before we can declare
+// the DB to be in a consistent state after recovery (which is when the new
+// WAL is synced).
+//
+// If there is a power failure before we sync the new WAL, we will end up in
+// a situation in which, after persisting the MANIFEST, RocksDB will see some
+// column families' log_numbers larger than the corrupted WAL, and the
+// "Column family inconsistency: SST file contains data beyond the point of
+// corruption" error will be hit, causing recovery to fail.
+//
+// After adding the fix, RocksDB persists a new MANIFEST with the column
+// families only after the new WAL is synced, to ensure RocksDB is in a
+// consistent state. RocksDB writes an empty WriteBatch as a sentinel to the
+// new WAL, which is synced immediately afterwards. The sequence number of the
+// sentinel WriteBatch will be the next sequence number immediately after the
+// largest sequence number recovered from previous WALs and the MANIFEST, so
+// the DB will be in a consistent state.
+// If a future recovery starts from the new MANIFEST, then it means the new
+// WAL was successfully synced. Due to the sentinel empty write batch at the
+// beginning, kPointInTimeRecovery of the WAL is guaranteed to go past this
+// point. If a future recovery starts from the old MANIFEST, it means writing
+// the new MANIFEST failed. It won't have the "SST ahead of WAL" error.
+//
+// The combination of corrupting a WAL and injecting an error during subsequent
+// re-open exposes the bug of prematurely persisting a new MANIFEST with
+// advanced ColumnFamilyData::log_number.
+TEST_P(CrashDuringRecoveryWithCorruptionTest, CrashDuringRecoveryWithFlush) {
+ CloseDb();
+ Options options;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.avoid_flush_during_recovery = false;
+ options.env = env_;
+ options.create_if_missing = true;
+
+ ASSERT_OK(DestroyDB(dbname_, options));
+ Reopen(&options);
+
+ ColumnFamilyHandle* cfh = nullptr;
+ const std::string test_cf_name = "test_cf";
+ Status s = db_->CreateColumnFamily(options, test_cf_name, &cfh);
+ ASSERT_OK(s);
+ delete cfh;
+
+ CloseDb();
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back(test_cf_name, options);
+ std::vector<ColumnFamilyHandle*> handles;
+
+ {
+ ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+
+ // Write one key to test_cf.
+ ASSERT_OK(db_->Put(WriteOptions(), handles[1], "old_key", "dontcare"));
+
+ // Write to default_cf and flush this cf several times to advance wal
+ // number.
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), "key" + std::to_string(i),
+ "value" + std::to_string(i)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+
+ ASSERT_OK(db_->Put(WriteOptions(), handles[1], "dontcare", "dontcare"));
+ for (auto* h : handles) {
+ delete h;
+ }
+ handles.clear();
+ CloseDb();
+ }
+
+ // Corrupt the last un-synced WAL file to emulate a power reset which
+ // caused the DB to lose the un-synced WAL.
+ {
+ std::vector<uint64_t> file_nums;
+ GetSortedWalFiles(file_nums);
+ size_t size = file_nums.size();
+ uint64_t log_num = file_nums[size - 1];
+ CorruptFileWithTruncation(FileType::kWalFile, log_num,
+ /*bytes_to_truncate=*/8);
+ }
+
+ // Fault is injected to fail the recovery.
+ {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::GetLogSizeAndMaybeTruncate:0", [&](void* arg) {
+ auto* tmp_s = reinterpret_cast<Status*>(arg);
+ assert(tmp_s);
+ *tmp_s = Status::IOError("Injected");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ handles.clear();
+ options.avoid_flush_during_recovery = true;
+ s = DB::Open(options, dbname_, cf_descs, &handles, &db_);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_EQ("IO error: Injected", s.ToString());
+ for (auto* h : handles) {
+ delete h;
+ }
+ CloseDb();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+
+ // Reopen the DB again.
+ {
+ options.avoid_flush_during_recovery = avoid_flush_during_recovery_;
+ ASSERT_OK(DB::Open(options, dbname_, cf_descs, &handles, &db_));
+
+ // Verify that data is not lost.
+ {
+ std::string v;
+ ASSERT_OK(db_->Get(ReadOptions(), handles[1], "old_key", &v));
+ ASSERT_EQ("dontcare", v);
+
+ for (int i = 0; i < 2; ++i) {
+ v.clear();
+ ASSERT_OK(db_->Get(ReadOptions(), "key" + std::to_string(i), &v));
+ ASSERT_EQ("value" + std::to_string(i), v);
+ }
+
+ // Since the last WAL, written after the flushes, was corrupted, the key
+ // below is not found.
+ v.clear();
+ ASSERT_EQ(db_->Get(ReadOptions(), handles[1], "dontcare", &v),
+ Status::NotFound());
+ }
+
+ for (auto* h : handles) {
+ delete h;
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as RepairDB() is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/cuckoo_table_db_test.cc b/src/rocksdb/db/cuckoo_table_db_test.cc
new file mode 100644
index 000000000..868b798ea
--- /dev/null
+++ b/src/rocksdb/db/cuckoo_table_db_test.cc
@@ -0,0 +1,361 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "table/cuckoo/cuckoo_table_factory.h"
+#include "table/cuckoo/cuckoo_table_reader.h"
+#include "table/meta_blocks.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class CuckooTableDBTest : public testing::Test {
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+
+ public:
+ CuckooTableDBTest() : env_(Env::Default()) {
+ dbname_ = test::PerThreadDBPath("cuckoo_table_db_test");
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ db_ = nullptr;
+ Reopen();
+ }
+
+ ~CuckooTableDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ Options CurrentOptions() {
+ Options options;
+ options.table_factory.reset(NewCuckooTableFactory());
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+ options.allow_mmap_reads = true;
+ options.create_if_missing = true;
+ options.allow_concurrent_memtable_write = false;
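+ // Note: the cuckoo table reader requires mmap reads, and the HashLinkList
+ // memtable rep does not support concurrent memtable writes, hence the
+ // settings above.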
+ return options;
+ }
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+ // The following util methods are copied from plain_table_db_test.
+ void Reopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opts;
+ if (options != nullptr) {
+ opts = *options;
+ } else {
+ opts = CurrentOptions();
+ opts.create_if_missing = true;
+ }
+ ASSERT_OK(DB::Open(opts, dbname_, &db_));
+ }
+
+ void DestroyAndReopen(Options* options) {
+ assert(options);
+ ASSERT_OK(db_->Close());
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, *options));
+ Reopen(options);
+ }
+
+ Status Put(const Slice& k, const Slice& v) {
+ return db_->Put(WriteOptions(), k, v);
+ }
+
+ Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+ std::string Get(const std::string& k) {
+ ReadOptions options;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level) {
+ std::string property;
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(level), &property));
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
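+ // e.g. "1" means one file in L0; "0,2" means no L0 files and two L1 files.
+ // Trailing levels with zero files are trimmed from the string.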
+ std::string FilesPerLevel() {
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int f = NumTableFilesAtLevel(level);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+};
+
+TEST_F(CuckooTableDBTest, Flush) {
+ // Try with empty DB first.
+ ASSERT_TRUE(dbfull() != nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("key2"));
+
+ // Add some values to db.
+ Options options = CurrentOptions();
+ Reopen(&options);
+
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key3", "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ TablePropertiesCollection ptc;
+ ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+ VerifySstUniqueIds(ptc);
+ ASSERT_EQ(1U, ptc.size());
+ ASSERT_EQ(3U, ptc.begin()->second->num_entries);
+ ASSERT_EQ("1", FilesPerLevel());
+
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+
+ // Now add more keys and flush.
+ ASSERT_OK(Put("key4", "v4"));
+ ASSERT_OK(Put("key5", "v5"));
+ ASSERT_OK(Put("key6", "v6"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+ VerifySstUniqueIds(ptc);
+ ASSERT_EQ(2U, ptc.size());
+ auto row = ptc.begin();
+ ASSERT_EQ(3U, row->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("v4", Get("key4"));
+ ASSERT_EQ("v5", Get("key5"));
+ ASSERT_EQ("v6", Get("key6"));
+
+ ASSERT_OK(Delete("key6"));
+ ASSERT_OK(Delete("key5"));
+ ASSERT_OK(Delete("key4"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+ VerifySstUniqueIds(ptc);
+ ASSERT_EQ(3U, ptc.size());
+ row = ptc.begin();
+ ASSERT_EQ(3U, row->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ(3U, (++row)->second->num_entries);
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_EQ("v1", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("NOT_FOUND", Get("key4"));
+ ASSERT_EQ("NOT_FOUND", Get("key5"));
+ ASSERT_EQ("NOT_FOUND", Get("key6"));
+}
+
+TEST_F(CuckooTableDBTest, FlushWithDuplicateKeys) {
+ Options options = CurrentOptions();
+ Reopen(&options);
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key1", "v3")); // Duplicate
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ TablePropertiesCollection ptc;
+ ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(&ptc));
+ VerifySstUniqueIds(ptc);
+ ASSERT_EQ(1U, ptc.size());
+ ASSERT_EQ(2U, ptc.begin()->second->num_entries);
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_EQ("v3", Get("key1"));
+ ASSERT_EQ("v2", Get("key2"));
+}
+
+namespace {
+static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key_______%06d", i);
+ return std::string(buf);
+}
+static std::string Uint64Key(uint64_t i) {
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&i), 8);
+ return str;
+}
+} // namespace
+
+TEST_F(CuckooTableDBTest, Uint64Comparator) {
+ Options options = CurrentOptions();
+ options.comparator = test::Uint64Comparator();
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put(Uint64Key(1), "v1"));
+ ASSERT_OK(Put(Uint64Key(2), "v2"));
+ ASSERT_OK(Put(Uint64Key(3), "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_EQ("v1", Get(Uint64Key(1)));
+ ASSERT_EQ("v2", Get(Uint64Key(2)));
+ ASSERT_EQ("v3", Get(Uint64Key(3)));
+ ASSERT_EQ("NOT_FOUND", Get(Uint64Key(4)));
+
+ // Add more keys.
+ ASSERT_OK(Delete(Uint64Key(2))); // Delete.
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(Put(Uint64Key(3), "v0")); // Update.
+ ASSERT_OK(Put(Uint64Key(4), "v4"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v1", Get(Uint64Key(1)));
+ ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2)));
+ ASSERT_EQ("v0", Get(Uint64Key(3)));
+ ASSERT_EQ("v4", Get(Uint64Key(4)));
+}
+
+TEST_F(CuckooTableDBTest, CompactionIntoMultipleFiles) {
+ // Create a big L0 file and check it compacts into multiple files in L1.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 270 << 10;
+ // Two SST files should be created, each containing 14 keys.
+ // Number of buckets will be 16. Total size ~156 KB.
+ options.target_file_size_base = 160 << 10;
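+ // Roughly: 28 values of ~10 KB each (~280 KB total) are written below, which
+ // exceeds target_file_size_base, so the full compaction output is expected
+ // to be split across two files.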
+ Reopen(&options);
+
+ // Write 28 values, each 10016 B ~ 10KB
+ for (int idx = 0; idx < 28; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ("1", FilesPerLevel());
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */));
+ ASSERT_EQ("0,2", FilesPerLevel());
+ for (int idx = 0; idx < 28; ++idx) {
+ ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+ }
+}
+
+TEST_F(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) {
+ // Insert same key twice so that they go to different SST files. Then wait for
+ // compaction and check if the latest value is stored and old value removed.
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 2;
+ Reopen(&options);
+
+ // Write 11 values, each 10016 B
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a')));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ("1", FilesPerLevel());
+
+ // Generate one more file in level-0, and should trigger level-0 compaction
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + char(idx))));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+
+ ASSERT_EQ("0,1", FilesPerLevel());
+ for (int idx = 0; idx < 11; ++idx) {
+ ASSERT_EQ(std::string(10000, 'a' + char(idx)), Get(Key(idx)));
+ }
+}
+
+TEST_F(CuckooTableDBTest, AdaptiveTable) {
+ Options options = CurrentOptions();
+
+ // Ensure options compatible with PlainTable
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+
+ // Write some keys using cuckoo table.
+ options.table_factory.reset(NewCuckooTableFactory());
+ Reopen(&options);
+
+ ASSERT_OK(Put("key1", "v1"));
+ ASSERT_OK(Put("key2", "v2"));
+ ASSERT_OK(Put("key3", "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ // Write some keys using plain table.
+ std::shared_ptr<TableFactory> block_based_factory(
+ NewBlockBasedTableFactory());
+ std::shared_ptr<TableFactory> plain_table_factory(NewPlainTableFactory());
+ std::shared_ptr<TableFactory> cuckoo_table_factory(NewCuckooTableFactory());
+ options.create_if_missing = false;
+ options.table_factory.reset(
+ NewAdaptiveTableFactory(plain_table_factory, block_based_factory,
+ plain_table_factory, cuckoo_table_factory));
+ Reopen(&options);
+ ASSERT_OK(Put("key4", "v4"));
+ ASSERT_OK(Put("key1", "v5"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ // Write some keys using block based table.
+ options.table_factory.reset(
+ NewAdaptiveTableFactory(block_based_factory, block_based_factory,
+ plain_table_factory, cuckoo_table_factory));
+ Reopen(&options);
+ ASSERT_OK(Put("key5", "v6"));
+ ASSERT_OK(Put("key2", "v7"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_EQ("v5", Get("key1"));
+ ASSERT_EQ("v7", Get("key2"));
+ ASSERT_EQ("v3", Get("key3"));
+ ASSERT_EQ("v4", Get("key4"));
+ ASSERT_EQ("v6", Get("key5"));
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ if (ROCKSDB_NAMESPACE::port::kLittleEndian) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+ } else {
+ fprintf(stderr, "SKIPPED as Cuckoo table doesn't support Big Endian\n");
+ return 0;
+ }
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as Cuckoo table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_basic_test.cc b/src/rocksdb/db/db_basic_test.cc
new file mode 100644
index 000000000..a28ac2b88
--- /dev/null
+++ b/src/rocksdb/db/db_basic_test.cc
@@ -0,0 +1,4643 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cstring>
+
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "util/file_checksum_helper.h"
+#include "util/random.h"
+#include "utilities/counted_fs.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBasicTest : public DBTestBase {
+ public:
+ DBBasicTest() : DBTestBase("db_basic_test", /*env_do_fsync=*/false) {}
+};
+
+TEST_F(DBBasicTest, OpenWhenOpen) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ DB* db2 = nullptr;
+ Status s = DB::Open(options, dbname_, &db2);
+ ASSERT_NOK(s) << [db2]() {
+ delete db2;
+ return "db2 open: ok";
+ }();
+ ASSERT_EQ(Status::Code::kIOError, s.code());
+ ASSERT_EQ(Status::SubCode::kNone, s.subcode());
+ ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr);
+
+ delete db2;
+}
+
+TEST_F(DBBasicTest, EnableDirectIOWithZeroBuf) {
+ if (!IsDirectIOSupported()) {
+ ROCKSDB_GTEST_BYPASS("Direct IO not supported");
+ return;
+ }
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ options.writable_file_max_buffer_size = 0;
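+ // Direct I/O writes go through an aligned internal buffer, so a zero
+ // writable_file_max_buffer_size is expected to be rejected as
+ // InvalidArgument, both at open time and via SetDBOptions below.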
+ ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+
+ options.writable_file_max_buffer_size = 1024;
+ Reopen(options);
+ const std::unordered_map<std::string, std::string> new_db_opts = {
+ {"writable_file_max_buffer_size", "0"}};
+ ASSERT_TRUE(db_->SetDBOptions(new_db_opts).IsInvalidArgument());
+}
+
+TEST_F(DBBasicTest, UniqueSession) {
+ Options options = CurrentOptions();
+ std::string sid1, sid2, sid3, sid4;
+
+ ASSERT_OK(db_->GetDbSessionId(sid1));
+ Reopen(options);
+ ASSERT_OK(db_->GetDbSessionId(sid2));
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(db_->GetDbSessionId(sid4));
+ Reopen(options);
+ ASSERT_OK(db_->GetDbSessionId(sid3));
+
+ ASSERT_NE(sid1, sid2);
+ ASSERT_NE(sid1, sid3);
+ ASSERT_NE(sid2, sid3);
+
+ ASSERT_EQ(sid2, sid4);
+
+ // Expected compact format for session ids (see notes in implementation)
+ TestRegex expected("[0-9A-Z]{20}");
+ EXPECT_MATCHES_REGEX(sid1, expected);
+ EXPECT_MATCHES_REGEX(sid2, expected);
+ EXPECT_MATCHES_REGEX(sid3, expected);
+
+#ifndef ROCKSDB_LITE
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_OK(db_->GetDbSessionId(sid1));
+ // Test uniqueness between readonly open (sid1) and regular open (sid3)
+ ASSERT_NE(sid1, sid3);
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_OK(db_->GetDbSessionId(sid2));
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(db_->GetDbSessionId(sid3));
+
+ ASSERT_NE(sid1, sid2);
+
+ ASSERT_EQ(sid2, sid3);
+#endif // ROCKSDB_LITE
+
+ CreateAndReopenWithCF({"goku"}, options);
+ ASSERT_OK(db_->GetDbSessionId(sid1));
+ ASSERT_OK(Put("bar", "e1"));
+ ASSERT_OK(db_->GetDbSessionId(sid2));
+ ASSERT_EQ("e1", Get("bar"));
+ ASSERT_OK(db_->GetDbSessionId(sid3));
+ ReopenWithColumnFamilies({"default", "goku"}, options);
+ ASSERT_OK(db_->GetDbSessionId(sid4));
+
+ ASSERT_EQ(sid1, sid2);
+ ASSERT_EQ(sid2, sid3);
+
+ ASSERT_NE(sid1, sid4);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, ReadOnlyDB) {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ Close();
+
+ auto verify_one_iter = [&](Iterator* iter) {
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+ // Always expect two keys: "foo" and "bar"
+ ASSERT_EQ(count, 2);
+ };
+
+ auto verify_all_iters = [&]() {
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ verify_one_iter(iter);
+ delete iter;
+
+ std::vector<Iterator*> iters;
+ ASSERT_OK(db_->NewIterators(ReadOptions(),
+ {dbfull()->DefaultColumnFamily()}, &iters));
+ ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
+ verify_one_iter(iters[0]);
+ delete iters[0];
+ };
+
+ auto options = CurrentOptions();
+ assert(options.env == env_);
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ verify_all_iters();
+ Close();
+
+ // Reopen and flush memtable.
+ Reopen(options);
+ ASSERT_OK(Flush());
+ Close();
+ // Now check keys in read only mode.
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ verify_all_iters();
+ ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+}
+
+// TODO akanksha: Update the test to check that combination
+// does not actually write to FS (use open read-only with
+// CompositeEnvWrapper+ReadOnlyFileSystem).
+TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) {
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ Close();
+
+ auto options = CurrentOptions();
+ options.write_dbid_to_manifest = true;
+ assert(options.env == env_);
+ ASSERT_OK(ReadOnlyReopen(options));
+ std::string db_id1;
+ ASSERT_OK(db_->GetDbIdentity(db_id1));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 2);
+ delete iter;
+ Close();
+
+ // Reopen and flush memtable.
+ Reopen(options);
+ ASSERT_OK(Flush());
+ Close();
+ // Now check keys in read only mode.
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v2", Get("bar"));
+ ASSERT_TRUE(db_->SyncWAL().IsNotSupported());
+ std::string db_id2;
+ ASSERT_OK(db_->GetDbIdentity(db_id2));
+ ASSERT_EQ(db_id1, db_id2);
+}
+
+TEST_F(DBBasicTest, CompactedDB) {
+ const uint64_t kFileSize = 1 << 20;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = kFileSize;
+ options.target_file_size_base = kFileSize;
+ options.max_bytes_for_level_base = 1 << 30;
+ options.compression = kNoCompression;
+ Reopen(options);
+ // 1 L0 file, use CompactedDB if max_open_files = -1
+ ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1')));
+ ASSERT_OK(Flush());
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ Status s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+ ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+ Close();
+ options.max_open_files = -1;
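+ // With max_open_files == -1 and all live data in a single compacted run,
+ // the read-only open path can use the CompactedDB implementation, which
+ // reports writes as "Not supported in compacted db mode".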
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported in compacted db mode.");
+ ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa"));
+ Close();
+ Reopen(options);
+ // Add more L0 files
+ ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2')));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a')));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b')));
+ ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e')));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("something_not_flushed", "x"));
+ Close();
+
+ ASSERT_OK(ReadOnlyReopen(options));
+ // Fallback to read-only DB
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+
+ // TODO: validate that other write ops return NotImplemented
+ // (DBImplReadOnly is missing some overrides)
+
+ // Ensure no deadlock on flush triggered by another API function
+ // (Old deadlock bug depends on something_not_flushed above.)
+ std::vector<std::string> files;
+ uint64_t manifest_file_size;
+ ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
+ LiveFilesStorageInfoOptions lfsi_opts;
+ lfsi_opts.wal_size_for_flush = 0; // always
+ std::vector<LiveFileStorageInfo> files2;
+ ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
+
+ Close();
+
+ // Full compaction
+ Reopen(options);
+ // Add more keys
+ ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+ ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h')));
+ ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i')));
+ ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j')));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+ Close();
+
+ // CompactedDB
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported in compacted db mode.");
+ ASSERT_EQ("NOT_FOUND", Get("abc"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb"));
+ ASSERT_EQ("NOT_FOUND", Get("ccc"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff"));
+ ASSERT_EQ("NOT_FOUND", Get("ggg"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii"));
+ ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj"));
+ ASSERT_EQ("NOT_FOUND", Get("kkk"));
+
+ // TODO: validate that other write ops return NotImplemented
+ // (CompactedDB is missing some overrides)
+
+ // Ensure no deadlock on flush triggered by another API function
+ ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true));
+ ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2));
+
+ // MultiGet
+ std::vector<std::string> values;
+ std::vector<Status> status_list = dbfull()->MultiGet(
+ ReadOptions(),
+ std::vector<Slice>({Slice("aaa"), Slice("ccc"), Slice("eee"),
+ Slice("ggg"), Slice("iii"), Slice("kkk")}),
+ &values);
+ ASSERT_EQ(status_list.size(), static_cast<uint64_t>(6));
+ ASSERT_EQ(values.size(), static_cast<uint64_t>(6));
+ ASSERT_OK(status_list[0]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]);
+ ASSERT_TRUE(status_list[1].IsNotFound());
+ ASSERT_OK(status_list[2]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]);
+ ASSERT_TRUE(status_list[3].IsNotFound());
+ ASSERT_OK(status_list[4]);
+ ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]);
+ ASSERT_TRUE(status_list[5].IsNotFound());
+
+ Reopen(options);
+ // Add a key
+ ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f')));
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ s = Put("new", "value");
+ ASSERT_EQ(s.ToString(),
+ "Not implemented: Not supported operation in read only mode.");
+}
+
+TEST_F(DBBasicTest, LevelLimitReopen) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const std::string value(1024 * 1024, ' ');
+ int i = 0;
+ while (NumTableFilesAtLevel(2, 1) == 0) {
+ ASSERT_OK(Put(1, Key(i++), value));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ options.num_levels = 1;
+ options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(s.IsInvalidArgument(), true);
+ ASSERT_EQ(s.ToString(),
+ "Invalid argument: db has more levels than options.num_levels");
+
+ options.num_levels = 10;
+ options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, PutDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBBasicTest, PutSingleDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo2", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo2"));
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, EmptyFlush) {
+ // It is possible to produce empty flushes when using single deletes. Tests
+ // whether empty flushes cause issues.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "a", Slice()));
+ ASSERT_OK(SingleDelete(1, "a"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+ // Skip FIFO and universal compaction as they do not apply to the test
+ // case. Skip MergePut because merges cannot be combined with single
+ // deletions.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBBasicTest, GetFromVersions) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ } while (ChangeOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, GetSnapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ // Try with both a short key and a long key
+ for (int i = 0; i < 2; i++) {
+ std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x');
+ ASSERT_OK(Put(1, key, "v1"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(Put(1, key, "v2"));
+ ASSERT_EQ("v2", Get(1, key));
+ ASSERT_EQ("v1", Get(1, key, s1));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, key));
+ ASSERT_EQ("v1", Get(1, key, s1));
+ db_->ReleaseSnapshot(s1);
+ }
+ } while (ChangeOptions());
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBBasicTest, CheckLock) {
+ do {
+ DB* localdb = nullptr;
+ Options options = CurrentOptions();
+ ASSERT_OK(TryReopen(options));
+
+ // second open should fail
+ Status s = DB::Open(options, dbname_, &localdb);
+ ASSERT_NOK(s) << [localdb]() {
+ delete localdb;
+ return "localdb open: ok";
+ }();
+#ifdef OS_LINUX
+ ASSERT_TRUE(s.ToString().find("lock ") != std::string::npos);
+#endif // OS_LINUX
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushMultipleMemtable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.max_write_buffer_size_to_maintain = -1;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+ ASSERT_OK(Flush(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushEmptyColumnFamily) {
+ // Block flush thread and disable compaction thread
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ Options options = CurrentOptions();
+ // disable compaction
+ options.disable_auto_compactions = true;
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 2;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(options.write_buffer_size);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Compaction can still go through even if no thread can flush the
+ // mem table.
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Flush(1));
+
+ // Insert can go through
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(0, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+
+ // Flush can still go through.
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Flush(1));
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBBasicTest, Flush) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ SetPerfLevel(kEnableTime);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ // this will now also flush the last 2 writes
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ get_perf_context()->Reset();
+ Get(1, "foo");
+ ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+ ASSERT_EQ(2, (int)get_perf_context()->get_read_bytes);
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+ ASSERT_OK(Flush(1));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v2", Get(1, "bar"));
+ get_perf_context()->Reset();
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0);
+
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+ ASSERT_OK(Flush(1));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ // 'foo' should be there because its put
+ // has WAL enabled.
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, ManifestRollOver) {
+ do {
+ Options options;
+ options.max_manifest_file_size = 10; // 10 bytes
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ {
+ ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1')));
+ ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2')));
+ ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3')));
+ uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo();
+ ASSERT_OK(Flush(1)); // This should trigger LogAndApply.
+ uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo();
+ ASSERT_GT(manifest_after_flush, manifest_before_flush);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush);
+ // check if a new manifest file got inserted or not.
+ ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1"));
+ ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2"));
+ ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3"));
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, IdentityAcrossRestarts) {
+ constexpr size_t kMinIdSize = 10;
+ do {
+ for (bool with_manifest : {false, true}) {
+ std::string idfilename = IdentityFileName(dbname_);
+ std::string id1, tmp;
+ ASSERT_OK(db_->GetDbIdentity(id1));
+ ASSERT_GE(id1.size(), kMinIdSize);
+
+ Options options = CurrentOptions();
+ options.write_dbid_to_manifest = with_manifest;
+ Reopen(options);
+ std::string id2;
+ ASSERT_OK(db_->GetDbIdentity(id2));
+ // id2 should match id1 because identity was not regenerated
+ ASSERT_EQ(id1, id2);
+ ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+ ASSERT_EQ(tmp, id2);
+
+ // Recover from deleted/missing IDENTITY
+ ASSERT_OK(env_->DeleteFile(idfilename));
+ Reopen(options);
+ std::string id3;
+ ASSERT_OK(db_->GetDbIdentity(id3));
+ if (with_manifest) {
+ // id3 should match id1 because identity was restored from manifest
+ ASSERT_EQ(id1, id3);
+ } else {
+ // id3 should NOT match id1 because identity was regenerated
+ ASSERT_NE(id1, id3);
+ ASSERT_GE(id3.size(), kMinIdSize);
+ }
+ ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+ ASSERT_EQ(tmp, id3);
+
+ // Recover from truncated IDENTITY
+ {
+ std::unique_ptr<WritableFile> w;
+ ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
+ ASSERT_OK(w->Close());
+ }
+ Reopen(options);
+ std::string id4;
+ ASSERT_OK(db_->GetDbIdentity(id4));
+ if (with_manifest) {
+ // id4 should match id1 because identity was restored from manifest
+ ASSERT_EQ(id1, id4);
+ } else {
+ // id4 should NOT match id1 because identity was regenerated
+ ASSERT_NE(id1, id4);
+ ASSERT_GE(id4.size(), kMinIdSize);
+ }
+ ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+ ASSERT_EQ(tmp, id4);
+
+ // Recover from overwritten IDENTITY
+ std::string silly_id = "asdf123456789";
+ {
+ std::unique_ptr<WritableFile> w;
+ ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions()));
+ ASSERT_OK(w->Append(silly_id));
+ ASSERT_OK(w->Close());
+ }
+ Reopen(options);
+ std::string id5;
+ ASSERT_OK(db_->GetDbIdentity(id5));
+ if (with_manifest) {
+ // id5 should match id1 because identity was restored from manifest
+ ASSERT_EQ(id1, id5);
+ } else {
+ ASSERT_EQ(id5, silly_id);
+ }
+ ASSERT_OK(ReadFileToString(env_, idfilename, &tmp));
+ ASSERT_EQ(tmp, id5);
+ }
+ } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, Snapshot) {
+ env_->SetMockSleep();
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ ASSERT_OK(Put(0, "foo", "0v1"));
+ ASSERT_OK(Put(1, "foo", "1v1"));
+
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_EQ(1U, GetNumSnapshots());
+ uint64_t time_snap1 = GetTimeOldestSnapshots();
+ ASSERT_GT(time_snap1, 0U);
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_OK(Put(0, "foo", "0v2"));
+ ASSERT_OK(Put(1, "foo", "1v2"));
+
+ env_->MockSleepForSeconds(1);
+
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_OK(Put(0, "foo", "0v3"));
+ ASSERT_OK(Put(1, "foo", "1v3"));
+
+ {
+ ManagedSnapshot s3(db_);
+ ASSERT_EQ(3U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+
+ ASSERT_OK(Put(0, "foo", "0v4"));
+ ASSERT_OK(Put(1, "foo", "1v4"));
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+ ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ }
+
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+
+ db_->ReleaseSnapshot(s1);
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ ASSERT_EQ(1U, GetNumSnapshots());
+ ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
+
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ(0U, GetNumSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+#endif // ROCKSDB_LITE
+
+class DBBasicMultiConfigs : public DBBasicTest,
+ public ::testing::WithParamInterface<int> {
+ public:
+ DBBasicMultiConfigs() { option_config_ = GetParam(); }
+
+ static std::vector<int> GenerateOptionConfigs() {
+ std::vector<int> option_configs;
+ for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+ if (!ShouldSkipOptions(option_config, kSkipFIFOCompaction)) {
+ option_configs.push_back(option_config);
+ }
+ }
+ return option_configs;
+ }
+};
+
+TEST_P(DBBasicMultiConfigs, CompactBetweenSnapshots) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ FillLevels("a", "z", 1);
+
+ ASSERT_OK(Put(1, "foo", "first"));
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ ASSERT_OK(Put(1, "foo", "second"));
+ ASSERT_OK(Put(1, "foo", "third"));
+ ASSERT_OK(Put(1, "foo", "fourth"));
+ const Snapshot* snapshot2 = db_->GetSnapshot();
+ ASSERT_OK(Put(1, "foo", "fifth"));
+ ASSERT_OK(Put(1, "foo", "sixth"));
+
+ // All entries (including duplicates) exist
+ // before any compaction or flush is triggered.
+ ASSERT_EQ(AllEntriesFor("foo", 1),
+ "[ sixth, fifth, fourth, third, second, first ]");
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+ ASSERT_EQ("first", Get(1, "foo", snapshot1));
+
+ // After a flush, "second", "third" and "fifth" should
+ // be removed
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]");
+
+ // after we release the snapshot1, only two values left
+ db_->ReleaseSnapshot(snapshot1);
+ FillLevels("a", "z", 1);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr));
+
+ // We have only one valid snapshot, snapshot2. Since snapshot1 is
+ // not valid anymore, "first" should be removed by a compaction.
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ("fourth", Get(1, "foo", snapshot2));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]");
+
+ // after we release the snapshot2, only one value should be left
+ db_->ReleaseSnapshot(snapshot2);
+ FillLevels("a", "z", 1);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr));
+ ASSERT_EQ("sixth", Get(1, "foo"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]");
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DBBasicMultiConfigs, DBBasicMultiConfigs,
+ ::testing::ValuesIn(DBBasicMultiConfigs::GenerateOptionConfigs()));
+
+TEST_F(DBBasicTest, DBOpen_Options) {
+ Options options = CurrentOptions();
+ Close();
+ Destroy(options);
+
+ // Does not exist, and create_if_missing == false: error
+ DB* db = nullptr;
+ options.create_if_missing = false;
+ Status s = DB::Open(options, dbname_, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does not exist, and create_if_missing == true: OK
+ options.create_if_missing = true;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+
+ // Does exist, and error_if_exists == true: error
+ options.create_if_missing = false;
+ options.error_if_exists = true;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does exist, and error_if_exists == false: OK
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ s = DB::Open(options, dbname_, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+}
+
+TEST_F(DBBasicTest, CompactOnFlush) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]");
+
+ // Write two new keys
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+
+ // Case 1: Delete followed by a put
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+
+ // After the current memtable is flushed, the DEL should
+ // have been removed
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+
+ // Case 2: Delete followed by another delete
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]");
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 3: Put followed by a delete
+ ASSERT_OK(Put(1, "foo", "v3"));
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]");
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 4: Put followed by another Put
+ ASSERT_OK(Put(1, "foo", "v4"));
+ ASSERT_OK(Put(1, "foo", "v5"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]");
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]");
+
+ // clear database
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 5: Put followed by snapshot followed by another Put
+ // Both puts should remain.
+ ASSERT_OK(Put(1, "foo", "v6"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(Put(1, "foo", "v7"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]");
+ db_->ReleaseSnapshot(snapshot);
+
+ // clear database
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+
+ // Case 6: snapshot followed by a put followed by another Put
+ // Only the last put should remain.
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ ASSERT_OK(Put(1, "foo", "v8"));
+ ASSERT_OK(Put(1, "foo", "v9"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]");
+ db_->ReleaseSnapshot(snapshot1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, FlushOneColumnFamily) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ ASSERT_OK(Put(0, "Default", "Default"));
+ ASSERT_OK(Put(1, "pikachu", "pikachu"));
+ ASSERT_OK(Put(2, "ilya", "ilya"));
+ ASSERT_OK(Put(3, "muromec", "muromec"));
+ ASSERT_OK(Put(4, "dobrynia", "dobrynia"));
+ ASSERT_OK(Put(5, "nikitich", "nikitich"));
+ ASSERT_OK(Put(6, "alyosha", "alyosha"));
+ ASSERT_OK(Put(7, "popovich", "popovich"));
+
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), i + 1U);
+ }
+}
+
+TEST_F(DBBasicTest, MultiGetSimple) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+
+ std::vector<std::string> values(20, "Temporary data to be overwritten");
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+ get_perf_context()->Reset();
+ std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(values[0], "v1");
+ ASSERT_EQ(values[1], "v2");
+ ASSERT_EQ(values[2], "v3");
+ ASSERT_EQ(values[4], "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_OK(s[0]);
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_TRUE(s[3].IsNotFound());
+ ASSERT_OK(s[4]);
+ ASSERT_TRUE(s[5].IsNotFound());
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBBasicTest, MultiGetEmpty) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Empty Key Set
+ std::vector<Slice> keys;
+ std::vector<std::string> values;
+ std::vector<ColumnFamilyHandle*> cfs;
+ std::vector<Status> s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(s.size(), 0U);
+
+ // Empty Database, Empty Key Set
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(s.size(), 0U);
+
+ // Empty Database, Search for Keys
+ keys.resize(2);
+ keys[0] = "a";
+ keys[1] = "b";
+ cfs.push_back(handles_[0]);
+ cfs.push_back(handles_[1]);
+ s = db_->MultiGet(ReadOptions(), cfs, keys, &values);
+ ASSERT_EQ(static_cast<int>(s.size()), 2);
+ ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound());
+ } while (ChangeCompactOptions());
+}
+
+class DBBlockChecksumTest : public DBBasicTest,
+ public testing::WithParamInterface<uint32_t> {};
+
+INSTANTIATE_TEST_CASE_P(FormatVersions, DBBlockChecksumTest,
+ testing::ValuesIn(test::kFooterFormatVersionsToTest));
+
+TEST_P(DBBlockChecksumTest, BlockChecksumTest) {
+ BlockBasedTableOptions table_options;
+ table_options.format_version = GetParam();
+ Options options = CurrentOptions();
+ const int kNumPerFile = 2;
+
+ const auto algs = GetSupportedChecksums();
+ const int algs_size = static_cast<int>(algs.size());
+
+ // generate one table with each type of checksum
+ for (int i = 0; i < algs_size; ++i) {
+ table_options.checksum = algs[i];
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ for (int j = 0; j < kNumPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), Key(i * kNumPerFile + j)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // with each valid checksum type setting...
+ for (int i = 0; i < algs_size; ++i) {
+ table_options.checksum = algs[i];
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+    // verify reads across files written with every checksum type
+    // (verification should succeed regardless of the current setting)
+ for (int j = 0; j < algs_size * kNumPerFile; ++j) {
+ ASSERT_EQ(Key(j), Get(Key(j)));
+ }
+ }
+
+ // Now test invalid checksum type
+ table_options.checksum = static_cast<ChecksumType>(123);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+}
+
+// On Windows a file can be opened for either memory-mapped or unbuffered
+// access, but not both, so this combination asserts and the test does not
+// make sense to run
+#ifndef OS_WIN
+TEST_F(DBBasicTest, MmapAndBufferOptions) {
+ if (!IsMemoryMappedAccessSupported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+
+ options.use_direct_reads = true;
+ options.allow_mmap_reads = true;
+ ASSERT_NOK(TryReopen(options));
+
+ // All other combinations are acceptable
+ options.use_direct_reads = false;
+ ASSERT_OK(TryReopen(options));
+
+ if (IsDirectIOSupported()) {
+ options.use_direct_reads = true;
+ options.allow_mmap_reads = false;
+ ASSERT_OK(TryReopen(options));
+ }
+
+ options.use_direct_reads = false;
+ ASSERT_OK(TryReopen(options));
+}
+#endif
+
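+// Env wrapper whose loggers report an IOError from Close() and count how many
+// times Close() is invoked, so tests can verify that DB teardown closes the
+// info log and propagates the error.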
+class TestEnv : public EnvWrapper {
+ public:
+ explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {}
+ static const char* kClassName() { return "TestEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ class TestLogger : public Logger {
+ public:
+ using Logger::Logv;
+ explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; }
+ ~TestLogger() override {
+ if (!closed_) {
+ CloseHelper().PermitUncheckedError();
+ }
+ }
+ void Logv(const char* /*format*/, va_list /*ap*/) override {}
+
+ protected:
+ Status CloseImpl() override { return CloseHelper(); }
+
+ private:
+ Status CloseHelper() {
+ env->CloseCountInc();
+ return Status::IOError();
+ }
+ TestEnv* env;
+ };
+
+ void CloseCountInc() { close_count++; }
+
+ int GetCloseCount() { return close_count; }
+
+ Status NewLogger(const std::string& /*fname*/,
+ std::shared_ptr<Logger>* result) override {
+ result->reset(new TestLogger(this));
+ return Status::OK();
+ }
+
+ private:
+ int close_count;
+};
+
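+// Verify that DB::Close() closes the DB-owned info log and propagates the
+// logger's close error, that deleting the DB without an explicit Close() still
+// closes the logger, and that a user-provided info_log is left for the caller.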
+TEST_F(DBBasicTest, DBClose) {
+ Options options = GetDefaultOptions();
+ std::string dbname = test::PerThreadDBPath("db_close_test");
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ DB* db = nullptr;
+ TestEnv* env = new TestEnv(env_);
+ std::unique_ptr<TestEnv> local_env_guard(env);
+ options.create_if_missing = true;
+ options.env = env;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ s = db->Close();
+ ASSERT_EQ(env->GetCloseCount(), 1);
+ ASSERT_EQ(s, Status::IOError());
+
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 1);
+
+ // Do not call DB::Close() and ensure our logger Close() still gets called
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 2);
+
+ // Provide our own logger and ensure DB::Close() does not close it
+ options.info_log.reset(new TestEnv::TestLogger(env));
+ options.create_if_missing = false;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ s = db->Close();
+ ASSERT_EQ(s, Status::OK());
+ delete db;
+ ASSERT_EQ(env->GetCloseCount(), 2);
+ options.info_log.reset();
+ ASSERT_EQ(env->GetCloseCount(), 3);
+}
+
+TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) {
+ Options options = GetDefaultOptions();
+ std::string dbname = test::PerThreadDBPath("db_close_all_dir_fds_test");
+ // Configure a specific WAL directory
+ options.wal_dir = dbname + "_wal_dir";
+ // Configure 3 different data directories
+ options.db_paths.emplace_back(dbname + "_1", 512 * 1024);
+ options.db_paths.emplace_back(dbname + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname + "_3", 1024 * 1024 * 1024);
+
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ DB* db = nullptr;
+ std::unique_ptr<Env> env = NewCompositeEnv(
+ std::make_shared<CountedFileSystem>(FileSystem::Default()));
+ options.create_if_missing = true;
+ options.env = env.get();
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+  // Explicitly close the database to ensure the open and close counters for
+  // directories are equal
+ s = db->Close();
+ auto* counted_fs =
+ options.env->GetFileSystem()->CheckedCast<CountedFileSystem>();
+ ASSERT_TRUE(counted_fs != nullptr);
+ ASSERT_EQ(counted_fs->counters()->dir_opens,
+ counted_fs->counters()->dir_closes);
+ ASSERT_OK(s);
+ delete db;
+}
+
+TEST_F(DBBasicTest, DBCloseFlushError) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.manual_wal_flush = true;
+ options.write_buffer_size = 100;
+ options.env = fault_injection_env.get();
+
+ Reopen(options);
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ ASSERT_OK(Put("key3", "value3"));
+ fault_injection_env->SetFilesystemActive(false);
+ Status s = dbfull()->Close();
+ ASSERT_NE(s, Status::OK());
+ // retry should return the same error
+ s = dbfull()->Close();
+ ASSERT_NE(s, Status::OK());
+ fault_injection_env->SetFilesystemActive(true);
+  // Retrying Close() is a no-op even after the file system is back. This
+  // could be improved if Close() were retry-able: #9029
+ s = dbfull()->Close();
+ ASSERT_NE(s, Status::OK());
+ Destroy(options);
+}
+
+class DBMultiGetTestWithParam
+ : public DBBasicTest,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {};
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+ // <CF, key, value> tuples
+ std::vector<std::tuple<int, std::string, std::string>> cf_kv_vec;
+ static const int num_keys = 24;
+ cf_kv_vec.reserve(num_keys);
+
+ for (int i = 0; i < num_keys; ++i) {
+ int cf = i / 3;
+    int cf_key = i % 3;
+ cf_kv_vec.emplace_back(std::make_tuple(
+ cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key),
+ "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key)));
+ ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+ std::get<2>(cf_kv_vec[i])));
+ }
+
+ int get_sv_count = 0;
+ ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (++get_sv_count == 2) {
+ // After MultiGet refs a couple of CFs, flush all CFs so MultiGet
+ // is forced to repeat the process
+ for (int i = 0; i < num_keys; ++i) {
+ int cf = i / 3;
+ int cf_key = i % 8;
+ if (cf_key == 0) {
+ ASSERT_OK(Flush(cf));
+ }
+ ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]),
+ std::get<2>(cf_kv_vec[i]) + "_2"));
+ }
+ }
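+        // By the time get_sv_count reaches 11, MultiGet is expected to have
+        // re-acquired and pinned a SuperVersion for every column family
+        // (kSVInUse), which the checks below verify.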
+ if (get_sv_count == 11) {
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+ db->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < num_keys; ++i) {
+ cfs.push_back(std::get<0>(cf_kv_vec[i]));
+ keys.push_back(std::get<1>(cf_kv_vec[i]));
+ }
+
+ values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+ std::get<1>(GetParam()));
+ ASSERT_EQ(values.size(), num_keys);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2");
+ }
+
+ keys.clear();
+ cfs.clear();
+ cfs.push_back(std::get<0>(cf_kv_vec[0]));
+ keys.push_back(std::get<1>(cf_kv_vec[0]));
+ cfs.push_back(std::get<0>(cf_kv_vec[3]));
+ keys.push_back(std::get<1>(cf_kv_vec[3]));
+ cfs.push_back(std::get<0>(cf_kv_vec[4]));
+ keys.push_back(std::get<1>(cf_kv_vec[4]));
+ values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+ std::get<1>(GetParam()));
+ ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2");
+ ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2");
+ ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2");
+
+ keys.clear();
+ cfs.clear();
+ cfs.push_back(std::get<0>(cf_kv_vec[7]));
+ keys.push_back(std::get<1>(cf_kv_vec[7]));
+ cfs.push_back(std::get<0>(cf_kv_vec[6]));
+ keys.push_back(std::get<1>(cf_kv_vec[6]));
+ cfs.push_back(std::get<0>(cf_kv_vec[1]));
+ keys.push_back(std::get<1>(cf_kv_vec[1]));
+ values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+ std::get<1>(GetParam()));
+ ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2");
+ ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2");
+ ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2");
+
+ for (int cf = 0; cf < 8; ++cf) {
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(
+ static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(cf))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val"));
+ }
+
+ int get_sv_count = 0;
+ int retries = 0;
+ bool last_try = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::LastTry", [&](void* /*arg*/) {
+ last_try = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (last_try) {
+ return;
+ }
+ if (++get_sv_count == 2) {
+ ++retries;
+ get_sv_count = 0;
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(Put(
+ i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val" + std::to_string(retries)));
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < 8; ++i) {
+ cfs.push_back(i);
+ keys.push_back("cf" + std::to_string(i) + "_key");
+ }
+
+ values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()),
+ std::get<1>(GetParam()));
+ ASSERT_TRUE(last_try);
+ ASSERT_EQ(values.size(), 8);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j],
+ "cf" + std::to_string(j) + "_val" + std::to_string(retries));
+ }
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(
+ static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich",
+ "alyosha", "popovich"},
+ options);
+
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val"));
+ }
+
+ int get_sv_count = 0;
+ ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check<DBImpl>(db_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MultiGet::AfterRefSV", [&](void* /*arg*/) {
+ if (++get_sv_count == 2) {
+ for (int i = 0; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key",
+ "cf" + std::to_string(i) + "_val2"));
+ }
+ }
+ if (get_sv_count == 8) {
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+ db->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_TRUE(
+ (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVInUse) ||
+ (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVObsolete));
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<int> cfs;
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 0; i < 8; ++i) {
+ cfs.push_back(i);
+ keys.push_back("cf" + std::to_string(i) + "_key");
+ }
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+ values = MultiGet(cfs, keys, snapshot, std::get<0>(GetParam()),
+ std::get<1>(GetParam()));
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(values.size(), 8);
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val");
+ }
+ for (int i = 0; i < 8; ++i) {
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(
+ static_cast_with_check<DBImpl>(db_)->GetColumnFamilyHandle(i))
+ ->cfd();
+ ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFUnsorted) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"one", "two"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(2, "baz", "xyz"));
+ ASSERT_OK(Put(1, "abc", "def"));
+
+ // Note: keys for the same CF do not form a consecutive range
+ std::vector<int> cfs{1, 2, 1};
+ std::vector<std::string> keys{"foo", "baz", "abc"};
+ std::vector<std::string> values;
+
+ values = MultiGet(cfs, keys, /* snapshot */ nullptr,
+ /* batched */ std::get<0>(GetParam()),
+ /* async */ std::get<1>(GetParam()));
+
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_EQ(values[0], "bar");
+ ASSERT_EQ(values[1], "xyz");
+ ASSERT_EQ(values[2], "def");
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSimpleUnsorted) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k2", "k1"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = std::get<1>(GetParam());
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v2");
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_TRUE(s[2].IsNotFound());
+ ASSERT_OK(s[3]);
+ ASSERT_OK(s[4]);
+ ASSERT_OK(s[5]);
+
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSortedMultiFile) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+    // To make the test more thorough, generate more than one table file and
+    // also mix in memtable entries
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ ASSERT_OK(Delete(1, "no_key"));
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"k1", "k2", "k3", "k4", "k5", "no_key"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = std::get<1>(GetParam());
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), true);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v2");
+ ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v5");
+ // four kv pairs * two bytes per value
+ ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_OK(s[0]);
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_TRUE(s[3].IsNotFound());
+ ASSERT_OK(s[4]);
+ ASSERT_TRUE(s[5].IsNotFound());
+
+ SetPerfLevel(kDisable);
+ } while (ChangeOptions());
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedDuplicateKeys) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ Options opts = CurrentOptions();
+ opts.merge_operator = MergeOperators::CreateStringAppendOperator();
+ CreateAndReopenWithCF({"pikachu"}, opts);
+ SetPerfLevel(kEnableCount);
+  // To make the test more thorough, generate more than one table file and
+  // also mix in memtable entries
+ ASSERT_OK(Merge(1, "k1", "v1"));
+ ASSERT_OK(Merge(1, "k2", "v2"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+ ASSERT_OK(Merge(1, "k3", "v3"));
+ ASSERT_OK(Merge(1, "k4", "v4"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+ ASSERT_OK(Merge(1, "k4", "v4_2"));
+ ASSERT_OK(Merge(1, "k6", "v6"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+ ASSERT_OK(Merge(1, "k7", "v7"));
+ ASSERT_OK(Merge(1, "k8", "v8"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"k8", "k8", "k8", "k4", "k4", "k1", "k3"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = std::get<1>(GetParam());
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v8");
+ ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v8");
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v4,v4_2");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v4,v4_2");
+ ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1");
+ ASSERT_EQ(std::string(values[6].data(), values[6].size()), "v3");
+ ASSERT_EQ(24, (int)get_perf_context()->multiget_read_bytes);
+
+ for (Status& status : s) {
+ ASSERT_OK(status);
+ }
+
+ SetPerfLevel(kDisable);
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevel) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ int num_keys = 0;
+
+ for (int i = 0; i < 128; ++i) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 128; i += 3) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 128; i += 5) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ ASSERT_EQ(0, num_keys);
+
+ for (int i = 0; i < 128; i += 9) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+ }
+
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 64; i < 80; ++i) {
+ keys.push_back("key_" + std::to_string(i));
+ }
+
+ values = MultiGet(keys, nullptr, std::get<1>(GetParam()));
+ ASSERT_EQ(values.size(), 16);
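+  // The most recent write for each key wins: memtable overrides L0, which
+  // overrides L1, which overrides L2.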
+ for (unsigned int j = 0; j < values.size(); ++j) {
+ int key = j + 64;
+ if (key % 9 == 0) {
+ ASSERT_EQ(values[j], "val_mem_" + std::to_string(key));
+ } else if (key % 5 == 0) {
+ ASSERT_EQ(values[j], "val_l0_" + std::to_string(key));
+ } else if (key % 3 == 0) {
+ ASSERT_EQ(values[j], "val_l1_" + std::to_string(key));
+ } else {
+ ASSERT_EQ(values[j], "val_l2_" + std::to_string(key));
+ }
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevelMerge) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ int num_keys = 0;
+
+ for (int i = 0; i < 128; ++i) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 128; i += 3) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 128; i += 5) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ ASSERT_EQ(0, num_keys);
+
+ for (int i = 0; i < 128; i += 9) {
+ ASSERT_OK(
+ Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+ }
+
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+
+ for (int i = 32; i < 80; ++i) {
+ keys.push_back("key_" + std::to_string(i));
+ }
+
+ values = MultiGet(keys, nullptr, std::get<1>(GetParam()));
+ ASSERT_EQ(values.size(), keys.size());
+ for (unsigned int j = 0; j < 48; ++j) {
+ int key = j + 32;
+ std::string value;
+ value.append("val_l2_" + std::to_string(key));
+ if (key % 3 == 0) {
+ value.append(",");
+ value.append("val_l1_" + std::to_string(key));
+ }
+ if (key % 5 == 0) {
+ value.append(",");
+ value.append("val_l0_" + std::to_string(key));
+ }
+ if (key % 9 == 0) {
+ value.append(",");
+ value.append("val_mem_" + std::to_string(key));
+ }
+ ASSERT_EQ(values[j], value);
+ }
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeInMemory) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v_1"));
+ ASSERT_OK(Put(1, "k2", "v_2"));
+ ASSERT_OK(Put(1, "k3", "v_3"));
+ ASSERT_OK(Put(1, "k4", "v_4"));
+ ASSERT_OK(Put(1, "k5", "v_5"));
+ ASSERT_OK(Put(1, "k6", "v_6"));
+ std::vector<Slice> keys = {"k1", "k2", "k3", "k4", "k5", "k6"};
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<Status> s(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+
+ get_perf_context()->Reset();
+ ReadOptions ro;
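+  // value_size_soft_limit caps the cumulative size of values returned by one
+  // MultiGet batch; keys processed after the limit is reached are Aborted.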
+ ro.value_size_soft_limit = 11;
+ ro.async_io = std::get<1>(GetParam());
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ for (unsigned int i = 0; i < 4; i++) {
+ ASSERT_EQ(std::string(values[i].data(), values[i].size()),
+ "v_" + std::to_string(i + 1));
+ }
+
+ for (unsigned int i = 4; i < 6; i++) {
+ ASSERT_TRUE(s[i].IsAborted());
+ }
+
+ ASSERT_EQ(12, (int)get_perf_context()->multiget_read_bytes);
+ SetPerfLevel(kDisable);
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSize) {
+#ifndef USE_COROUTINES
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ return;
+ }
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ SetPerfLevel(kEnableCount);
+
+ ASSERT_OK(Put(1, "k6", "v6"));
+ ASSERT_OK(Put(1, "k7", "v7_"));
+ ASSERT_OK(Put(1, "k3", "v3_"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Put(1, "k11", "v11"));
+ ASSERT_OK(Delete(1, "no_key"));
+ ASSERT_OK(Put(1, "k8", "v8_"));
+ ASSERT_OK(Put(1, "k13", "v13"));
+ ASSERT_OK(Put(1, "k14", "v14"));
+ ASSERT_OK(Put(1, "k15", "v15"));
+ ASSERT_OK(Put(1, "k16", "v16"));
+ ASSERT_OK(Put(1, "k17", "v17"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(Put(1, "k1", "v1_"));
+ ASSERT_OK(Put(1, "k2", "v2_"));
+ ASSERT_OK(Put(1, "k5", "v5_"));
+ ASSERT_OK(Put(1, "k9", "v9_"));
+ ASSERT_OK(Put(1, "k10", "v10"));
+ ASSERT_OK(Delete(1, "k2"));
+ ASSERT_OK(Delete(1, "k6"));
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"k1", "k10", "k11", "k12", "k13", "k14", "k15",
+ "k16", "k17", "k2", "k3", "k4", "k5", "k6", "k7",
+ "k8", "k9", "no_key"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ ro.value_size_soft_limit = 20;
+ ro.async_io = std::get<1>(GetParam());
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+
+ // In memory keys
+ ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1_");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v10");
+ ASSERT_TRUE(s[9].IsNotFound()); // k2
+ ASSERT_EQ(std::string(values[12].data(), values[12].size()), "v5_");
+ ASSERT_TRUE(s[13].IsNotFound()); // k6
+ ASSERT_EQ(std::string(values[16].data(), values[16].size()), "v9_");
+
+ // In sst files
+    ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v11");
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v13");
+ ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v14");
+
+    // Remaining keys are aborted once value_size_soft_limit is exceeded.
+ ASSERT_TRUE(s[3].IsAborted());
+ ASSERT_TRUE(s[6].IsAborted());
+ ASSERT_TRUE(s[7].IsAborted());
+ ASSERT_TRUE(s[8].IsAborted());
+ ASSERT_TRUE(s[10].IsAborted());
+ ASSERT_TRUE(s[11].IsAborted());
+ ASSERT_TRUE(s[14].IsAborted());
+ ASSERT_TRUE(s[15].IsAborted());
+ ASSERT_TRUE(s[17].IsAborted());
+
+    // 7 kv pairs * 3 bytes per value (i.e. 21)
+ ASSERT_EQ(21, (int)get_perf_context()->multiget_read_bytes);
+ SetPerfLevel(kDisable);
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeMultiLevelMerge) {
+ if (std::get<1>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test needs to be fixed for async IO");
+ return;
+ }
+ // Skip for unbatched MultiGet
+ if (!std::get<0>(GetParam())) {
+ ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet");
+ return;
+ }
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ int num_keys = 0;
+
+ for (int i = 0; i < 64; ++i) {
+ ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 64; i += 3) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 64; i += 5) {
+ ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ ASSERT_OK(Flush());
+ num_keys = 0;
+ }
+ ASSERT_EQ(0, num_keys);
+
+ for (int i = 0; i < 64; i += 9) {
+ ASSERT_OK(
+ Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i)));
+ }
+
+ std::vector<std::string> keys_str;
+ for (int i = 10; i < 50; ++i) {
+ keys_str.push_back("key_" + std::to_string(i));
+ }
+
+ std::vector<Slice> keys(keys_str.size());
+ for (int i = 0; i < 40; i++) {
+ keys[i] = Slice(keys_str[i]);
+ }
+
+ std::vector<PinnableSlice> values(keys_str.size());
+ std::vector<Status> statuses(keys_str.size());
+ ReadOptions read_options;
+ read_options.verify_checksums = true;
+ read_options.value_size_soft_limit = 380;
+ read_options.async_io = std::get<1>(GetParam());
+ db_->MultiGet(read_options, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+
+ ASSERT_EQ(values.size(), keys.size());
+
+ for (unsigned int j = 0; j < 26; ++j) {
+ int key = j + 10;
+ std::string value;
+ value.append("val_l2_" + std::to_string(key));
+ if (key % 3 == 0) {
+ value.append(",");
+ value.append("val_l1_" + std::to_string(key));
+ }
+ if (key % 5 == 0) {
+ value.append(",");
+ value.append("val_l0_" + std::to_string(key));
+ }
+ if (key % 9 == 0) {
+ value.append(",");
+ value.append("val_mem_" + std::to_string(key));
+ }
+ ASSERT_EQ(values[j], value);
+ ASSERT_OK(statuses[j]);
+ }
+
+  // All remaining keys' statuses are set to Status::Aborted()
+ for (unsigned int j = 26; j < 40; j++) {
+ ASSERT_TRUE(statuses[j].IsAborted());
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam,
+ testing::Combine(testing::Bool(), testing::Bool()));
+
+#if USE_COROUTINES
+class DBMultiGetAsyncIOTest : public DBBasicTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBMultiGetAsyncIOTest()
+ : DBBasicTest(), statistics_(ROCKSDB_NAMESPACE::CreateDBStatistics()) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+ options_ = CurrentOptions();
+ options_.disable_auto_compactions = true;
+ options_.statistics = statistics_;
+ options_.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options_);
+ int num_keys = 0;
+
+ // Put all keys in the bottommost level, and overwrite some keys
+ // in L0 and L1
+ for (int i = 0; i < 256; ++i) {
+ EXPECT_OK(Put(Key(i), "val_l2_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 128; i += 3) {
+ EXPECT_OK(Put(Key(i), "val_l1_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ // Put some range deletes in L1
+ for (int i = 128; i < 256; i += 32) {
+ std::string range_begin = Key(i);
+ std::string range_end = Key(i + 16);
+ EXPECT_OK(dbfull()->DeleteRange(WriteOptions(),
+ dbfull()->DefaultColumnFamily(),
+ range_begin, range_end));
+ // Also do some Puts to force creation of bloom filter
+ for (int j = i + 16; j < i + 32; ++j) {
+ if (j % 3 == 0) {
+ EXPECT_OK(Put(Key(j), "val_l1_" + std::to_string(j)));
+ }
+ }
+ EXPECT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 128; i += 5) {
+ EXPECT_OK(Put(Key(i), "val_l0_" + std::to_string(i)));
+ num_keys++;
+ if (num_keys == 8) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ }
+ if (num_keys > 0) {
+ EXPECT_OK(Flush());
+ num_keys = 0;
+ }
+ EXPECT_EQ(0, num_keys);
+ }
+
+ const std::shared_ptr<Statistics>& statistics() { return statistics_; }
+
+ protected:
+ void ReopenDB() { Reopen(options_); }
+
+ private:
+ std::shared_ptr<Statistics> statistics_;
+ Options options_;
+};
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL0) {
+ // All 3 keys in L0. The L0 files should be read serially.
+ std::vector<std::string> key_strs{Key(0), Key(40), Key(80)};
+ std::vector<Slice> keys{key_strs[0], key_strs[1], key_strs[2]};
+ std::vector<PinnableSlice> values(key_strs.size());
+ std::vector<Status> statuses(key_strs.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_OK(statuses[0]);
+ ASSERT_OK(statuses[1]);
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[0], "val_l0_" + std::to_string(0));
+ ASSERT_EQ(values[1], "val_l0_" + std::to_string(40));
+ ASSERT_EQ(values[2], "val_l0_" + std::to_string(80));
+
+ HistogramData multiget_io_batch_size;
+
+ statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+ // With async IO, lookups will happen in parallel for each key
+ if (GetParam()) {
+ ASSERT_EQ(multiget_io_batch_size.count, 1);
+ ASSERT_EQ(multiget_io_batch_size.max, 3);
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
+ } else {
+ // Without Async IO, MultiGet will call MultiRead 3 times, once for each
+ // L0 file
+ ASSERT_EQ(multiget_io_batch_size.count, 3);
+ }
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+ key_strs.push_back(Key(33));
+ key_strs.push_back(Key(54));
+ key_strs.push_back(Key(102));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ keys.push_back(key_strs[2]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(statuses[2], Status::OK());
+ ASSERT_EQ(values[0], "val_l1_" + std::to_string(33));
+ ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
+ ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
+
+ HistogramData multiget_io_batch_size;
+
+ statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+ // A batch of 3 async IOs is expected, one for each overlapping file in L1
+ ASSERT_EQ(multiget_io_batch_size.count, 1);
+ ASSERT_EQ(multiget_io_batch_size.max, 3);
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1Error) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+ key_strs.push_back(Key(33));
+ key_strs.push_back(Key(54));
+ key_strs.push_back(Key(102));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ keys.push_back(key_strs[2]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::GetTableReader:BeforeOpenFile", [&](void* status) {
+ static int count = 0;
+ count++;
+ // Fail the last table reader open, which is the 6th SST file
+ // since 3 overlapping L0 files + 3 L1 files containing the keys
+ if (count == 6) {
+ Status* s = static_cast<Status*>(status);
+ *s = Status::IOError();
+ }
+ });
+ // DB open will create table readers unless we reduce the table cache
+ // capacity.
+  // SanitizeOptions will set max_open_files to a minimum of 20. The table
+  // cache is allocated with max_open_files - 10 as its capacity. So override
+  // max_open_files to 11 so the table cache capacity becomes 1. This prevents
+  // files from being opened during DB open and forces them to be opened
+  // during MultiGet
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ReopenDB();
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(statuses[2], Status::IOError());
+
+ HistogramData multiget_io_batch_size;
+
+ statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+ // A batch of 3 async IOs is expected, one for each overlapping file in L1
+ ASSERT_EQ(multiget_io_batch_size.count, 1);
+ ASSERT_EQ(multiget_io_batch_size.max, 2);
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+ // 21 is the last key in the first L1 file
+ key_strs.push_back(Key(21));
+ key_strs.push_back(Key(54));
+ key_strs.push_back(Key(102));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ keys.push_back(key_strs[2]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(statuses[2], Status::OK());
+ ASSERT_EQ(values[0], "val_l1_" + std::to_string(21));
+ ASSERT_EQ(values[1], "val_l1_" + std::to_string(54));
+ ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
+
+ HistogramData multiget_io_batch_size;
+
+ statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+ // Since the first MultiGet key is the last key in a file, the MultiGet is
+ // expected to lookup in that file first, before moving on to other files.
+ // So the first file lookup will issue one async read, and the next lookup
+ // will lookup 2 files in parallel and issue 2 async reads
+ ASSERT_EQ(multiget_io_batch_size.count, 2);
+ ASSERT_EQ(multiget_io_batch_size.max, 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+ // 33 and 102 are in L1, and 56 is in L2
+ key_strs.push_back(Key(33));
+ key_strs.push_back(Key(56));
+ key_strs.push_back(Key(102));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ keys.push_back(key_strs[2]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 3);
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(statuses[2], Status::OK());
+ ASSERT_EQ(values[0], "val_l1_" + std::to_string(33));
+ ASSERT_EQ(values[1], "val_l2_" + std::to_string(56));
+ ASSERT_EQ(values[2], "val_l1_" + std::to_string(102));
+
+ HistogramData multiget_io_batch_size;
+
+ statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size);
+
+  // There are 2 keys in L1 in two separate files, and 1 in L2. With
+ // optimize_multiget_for_io, all three lookups will happen in parallel.
+ // Otherwise, the L2 lookup will happen after L1.
+ ASSERT_EQ(multiget_io_batch_size.count, GetParam() ? 1 : 2);
+ ASSERT_EQ(multiget_io_batch_size.max, GetParam() ? 3 : 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+ // 19 and 26 are in L2, but overlap with L0 and L1 file ranges
+ key_strs.push_back(Key(19));
+ key_strs.push_back(Key(26));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 2);
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(values[0], "val_l2_" + std::to_string(19));
+ ASSERT_EQ(values[1], "val_l2_" + std::to_string(26));
+
+ // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+  // 139 and 163 are in L2, but overlap with range deletes in L1
+ key_strs.push_back(Key(139));
+ key_strs.push_back(Key(163));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), 2);
+ ASSERT_EQ(statuses[0], Status::NotFound());
+ ASSERT_EQ(statuses[1], Status::NotFound());
+
+ // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2);
+}
+
+TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2WithRangeDelInL1) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ std::vector<PinnableSlice> values;
+ std::vector<Status> statuses;
+
+  // 139 and 163 are in L2, but overlap with range deletes in L1
+ key_strs.push_back(Key(139));
+ key_strs.push_back(Key(144));
+ key_strs.push_back(Key(163));
+ keys.push_back(key_strs[0]);
+ keys.push_back(key_strs[1]);
+ keys.push_back(key_strs[2]);
+ values.resize(keys.size());
+ statuses.resize(keys.size());
+
+ ReadOptions ro;
+ ro.async_io = true;
+ ro.optimize_multiget_for_io = GetParam();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data());
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(statuses[0], Status::NotFound());
+ ASSERT_EQ(statuses[1], Status::OK());
+ ASSERT_EQ(values[1], "val_l1_" + std::to_string(144));
+ ASSERT_EQ(statuses[2], Status::NotFound());
+
+ // Bloom filters in L0/L1 will avoid the coroutine calls in those levels
+ ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3);
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetAsyncIOTest, DBMultiGetAsyncIOTest,
+ testing::Bool());
+#endif // USE_COROUTINES
+
+TEST_F(DBBasicTest, MultiGetStats) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ table_options.partition_filters = true;
+ table_options.no_block_cache = true;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ int total_keys = 2000;
+ std::vector<std::string> keys_str(total_keys);
+ std::vector<Slice> keys(total_keys);
+ static size_t kMultiGetBatchSize = 100;
+ std::vector<PinnableSlice> values(kMultiGetBatchSize);
+ std::vector<Status> s(kMultiGetBatchSize);
+ ReadOptions read_opts;
+
+ Random rnd(309);
+  // Create multiple SST files at multiple levels.
+ for (int i = 0; i < 500; ++i) {
+ keys_str[i] = "k" + std::to_string(i);
+ keys[i] = Slice(keys_str[i]);
+ ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+ if (i % 100 == 0) {
+ ASSERT_OK(Flush(1));
+ }
+ }
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ for (int i = 501; i < 1000; ++i) {
+ keys_str[i] = "k" + std::to_string(i);
+ keys[i] = Slice(keys_str[i]);
+ ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+ if (i % 100 == 0) {
+ ASSERT_OK(Flush(1));
+ }
+ }
+
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ for (int i = 1001; i < total_keys; ++i) {
+ keys_str[i] = "k" + std::to_string(i);
+ keys[i] = Slice(keys_str[i]);
+ ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000)));
+ if (i % 100 == 0) {
+ ASSERT_OK(Flush(1));
+ }
+ }
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(1, 1);
+ Close();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_OK(options.statistics->Reset());
+
+ db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[1250],
+ values.data(), s.data(), false);
+
+ ASSERT_EQ(values.size(), kMultiGetBatchSize);
+ HistogramData hist_level;
+ HistogramData hist_index_and_filter_blocks;
+ HistogramData hist_sst;
+
+ options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level);
+ options.statistics->histogramData(NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+ &hist_index_and_filter_blocks);
+ options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst);
+
+ // Maximum number of blocks read from a file system in a level.
+ ASSERT_EQ(hist_level.max, 1);
+ ASSERT_GT(hist_index_and_filter_blocks.max, 0);
+  // Maximum number of sst files read from the file system in a level.
+ ASSERT_EQ(hist_sst.max, 2);
+
+  // Minimum number of blocks read in a level.
+ ASSERT_EQ(hist_level.min, 1);
+ ASSERT_GT(hist_index_and_filter_blocks.min, 0);
+  // Minimum number of sst files read in a level.
+ ASSERT_EQ(hist_sst.min, 1);
+
+ for (PinnableSlice& value : values) {
+ value.Reset();
+ }
+ for (Status& status : s) {
+ status = Status::OK();
+ }
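+  // The second batch starts at keys[950] and spans keys that were moved to
+  // L2 as well as keys that were moved to L1, so reads should touch two
+  // levels.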
+ db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[950],
+ values.data(), s.data(), false);
+ options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level);
+ ASSERT_EQ(hist_level.max, 2);
+}
+
+// Test class for batched MultiGet with prefix extractor
+// Param bool - If true, use partitioned filters
+// If false, use full filter block
+class MultiGetPrefixExtractorTest : public DBBasicTest,
+ public ::testing::WithParamInterface<bool> {
+};
+
+TEST_P(MultiGetPrefixExtractorTest, Batched) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ options.memtable_prefix_bloom_size_ratio = 10;
+ BlockBasedTableOptions bbto;
+ if (GetParam()) {
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ bbto.partition_filters = true;
+ }
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ bbto.cache_index_and_filter_blocks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+
+ ASSERT_OK(Put("k", "v0"));
+ ASSERT_OK(Put("kk1", "v1"));
+ ASSERT_OK(Put("kk2", "v2"));
+ ASSERT_OK(Put("kk3", "v3"));
+ ASSERT_OK(Put("kk4", "v4"));
+ std::vector<std::string> keys(
+ {"k", "kk1", "kk2", "kk3", "kk4", "rofl", "lmho"});
+ std::vector<std::string> expected(
+ {"v0", "v1", "v2", "v3", "v4", "NOT_FOUND", "NOT_FOUND"});
+ std::vector<std::string> values;
+ values = MultiGet(keys, nullptr);
+ ASSERT_EQ(values, expected);
+ // One key ("k") is not queried against the filter because it is outside
+ // the prefix_extractor domain, leaving 6 keys with queried prefixes.
+ ASSERT_EQ(get_perf_context()->bloom_memtable_miss_count, 2);
+ ASSERT_EQ(get_perf_context()->bloom_memtable_hit_count, 4);
+ ASSERT_OK(Flush());
+
+ get_perf_context()->Reset();
+ values = MultiGet(keys, nullptr);
+ ASSERT_EQ(values, expected);
+ ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2);
+ ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
+
+ // Also check Get stat
+ get_perf_context()->Reset();
+ for (size_t i = 0; i < keys.size(); ++i) {
+ values[i] = Get(keys[i]);
+ }
+ ASSERT_EQ(values, expected);
+ ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2);
+ ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4);
+}
+
+INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest,
+ ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+class DBMultiGetRowCacheTest : public DBBasicTest,
+ public ::testing::WithParamInterface<bool> {};
+
+TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) {
+ do {
+ option_config_ = kRowCache;
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SetPerfLevel(kEnableCount);
+ ASSERT_OK(Put(1, "k1", "v1"));
+ ASSERT_OK(Put(1, "k2", "v2"));
+ ASSERT_OK(Put(1, "k3", "v3"));
+ ASSERT_OK(Put(1, "k4", "v4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "k5", "v5"));
+ const Snapshot* snap1 = dbfull()->GetSnapshot();
+ ASSERT_OK(Delete(1, "k4"));
+ ASSERT_OK(Flush(1));
+ const Snapshot* snap2 = dbfull()->GetSnapshot();
+
+ get_perf_context()->Reset();
+
+ std::vector<Slice> keys({"no_key", "k5", "k4", "k3", "k1"});
+ std::vector<PinnableSlice> values(keys.size());
+ std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]);
+ std::vector<Status> s(keys.size());
+
+ ReadOptions ro;
+ bool use_snapshots = GetParam();
+ if (use_snapshots) {
+ ro.snapshot = snap2;
+ }
+ db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(),
+ s.data(), false);
+
+ ASSERT_EQ(values.size(), keys.size());
+ ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v1");
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+    // three kv pairs * two bytes per value
+ ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_TRUE(s[2].IsNotFound());
+ ASSERT_OK(s[3]);
+ ASSERT_OK(s[4]);
+
+ // Call MultiGet() again with some intersection with the previous set of
+ // keys. Those should already be in the row cache.
+ keys.assign({"no_key", "k5", "k3", "k2"});
+ for (size_t i = 0; i < keys.size(); ++i) {
+ values[i].Reset();
+ s[i] = Status::OK();
+ }
+ get_perf_context()->Reset();
+
+ if (use_snapshots) {
+ ro.snapshot = snap1;
+ }
+ db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(),
+ values.data(), s.data(), false);
+
+ ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v2");
+ ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3");
+ ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5");
+    // three kv pairs * two bytes per value
+ ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes);
+
+ ASSERT_TRUE(s[0].IsNotFound());
+ ASSERT_OK(s[1]);
+ ASSERT_OK(s[2]);
+ ASSERT_OK(s[3]);
+ if (use_snapshots) {
+ // Only reads from the first SST file would have been cached, since
+ // snapshot seq no is > fd.largest_seqno
+ ASSERT_EQ(1, TestGetTickerCount(options, ROW_CACHE_HIT));
+ } else {
+ ASSERT_EQ(2, TestGetTickerCount(options, ROW_CACHE_HIT));
+ }
+
+ SetPerfLevel(kDisable);
+ dbfull()->ReleaseSnapshot(snap1);
+ dbfull()->ReleaseSnapshot(snap2);
+ } while (ChangeCompactOptions());
+}
+
+INSTANTIATE_TEST_CASE_P(DBMultiGetRowCacheTest, DBMultiGetRowCacheTest,
+ testing::Values(true, false));
+
+TEST_F(DBBasicTest, GetAllKeyVersions) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ const size_t kNumInserts = 4;
+ const size_t kNumDeletes = 4;
+ const size_t kNumUpdates = 4;
+
+ // Check default column family
+ for (size_t i = 0; i != kNumInserts; ++i) {
+ ASSERT_OK(Put(std::to_string(i), "value"));
+ }
+ for (size_t i = 0; i != kNumUpdates; ++i) {
+ ASSERT_OK(Put(std::to_string(i), "value1"));
+ }
+ for (size_t i = 0; i != kNumDeletes; ++i) {
+ ASSERT_OK(Delete(std::to_string(i)));
+ }
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
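+  // Entries come back in internal key order, i.e. newest version first for
+  // each user key: the deletion, then the "value1" put, then the original
+  // put, so every third entry should be a deletion.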
+ for (size_t i = 0; i < kNumInserts + kNumDeletes + kNumUpdates; i++) {
+ if (i % 3 == 0) {
+ ASSERT_EQ(key_versions[i].GetTypeName(), "TypeDeletion");
+ } else {
+ ASSERT_EQ(key_versions[i].GetTypeName(), "TypeValue");
+ }
+ }
+ ASSERT_OK(GetAllKeyVersions(db_, handles_[0], Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size());
+
+ // Check non-default column family
+ for (size_t i = 0; i + 1 != kNumInserts; ++i) {
+ ASSERT_OK(Put(1, std::to_string(i), "value"));
+ }
+ for (size_t i = 0; i + 1 != kNumUpdates; ++i) {
+ ASSERT_OK(Put(1, std::to_string(i), "value1"));
+ }
+ for (size_t i = 0; i + 1 != kNumDeletes; ++i) {
+ ASSERT_OK(Delete(1, std::to_string(i)));
+ }
+ ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+ ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size());
+}
+
+TEST_F(DBBasicTest, ValueTypeString) {
+ KeyVersion key_version;
+  // When adding a new type, please also update `value_type_string_map`
+ for (unsigned char i = ValueType::kTypeDeletion; i < ValueType::kTypeMaxValid;
+ i++) {
+ key_version.type = i;
+ ASSERT_TRUE(key_version.GetTypeName() != "Invalid");
+ }
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBasicTest, MultiGetIOBufferOverrun) {
+ Options options = CurrentOptions();
+ Random rnd(301);
+ BlockBasedTableOptions table_options;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.block_size = 16 * 1024;
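+  // Use blocks larger than MultiGet's stack read buffer so the read path has
+  // to fall back to a heap-allocated IO buffer.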
+ ASSERT_TRUE(table_options.block_size >
+ BlockBasedTable::kMultiGetReadStackBufSize);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ std::string zero_str(128, '\0');
+ for (int i = 0; i < 100; ++i) {
+ // Make the value compressible. A purely random string doesn't compress
+ // and the resultant data block will not be compressed
+ std::string value(rnd.RandomString(128) + zero_str);
+ assert(Put(Key(i), value) == Status::OK());
+ }
+ ASSERT_OK(Flush());
+
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+}
+
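+// With the WAL disabled and every SST file left intact, best-efforts recovery
+// should recover all flushed data in all column families.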
+TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions write_opts;
+ write_opts.disableWAL = true;
+ for (size_t cf = 0; cf != num_cfs; ++cf) {
+ for (size_t i = 0; i != 10000; ++i) {
+ std::string key_str = Key(static_cast<int>(i));
+ std::string value_str = std::to_string(cf) + "_" + std::to_string(i);
+
+ ASSERT_OK(Put(static_cast<int>(cf), key_str, value_str));
+ if (0 == (i % 1000)) {
+ ASSERT_OK(Flush(static_cast<int>(cf)));
+ }
+ }
+ }
+ for (size_t cf = 0; cf != num_cfs; ++cf) {
+ ASSERT_OK(Flush(static_cast<int>(cf)));
+ }
+ Close();
+ options.best_efforts_recovery = true;
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ for (size_t cf = 0; cf != num_cfs; ++cf) {
+ for (int i = 0; i != 10000; ++i) {
+ std::string key_str = Key(static_cast<int>(i));
+ std::string expected_value_str =
+ std::to_string(cf) + "_" + std::to_string(i);
+ ASSERT_EQ(expected_value_str, Get(static_cast<int>(cf), key_str));
+ }
+ }
+}
+
+TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ *(reinterpret_cast<Status*>(arg)) =
+ Status::Corruption("Inject corruption");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.best_efforts_recovery = true;
+ Status s = TryReopen(options);
+ ASSERT_TRUE(s.IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
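+// Records the paths of table files created for each column family so tests
+// can locate, corrupt or delete them on disk later.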
+class TableFileListener : public EventListener {
+ public:
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ InstrumentedMutexLock lock(&mutex_);
+ cf_to_paths_[info.cf_name].push_back(info.file_path);
+ }
+ std::vector<std::string>& GetFiles(const std::string& cf_name) {
+ InstrumentedMutexLock lock(&mutex_);
+ return cf_to_paths_[cf_name];
+ }
+
+ private:
+ InstrumentedMutex mutex_;
+ std::unordered_map<std::string, std::vector<std::string>> cf_to_paths_;
+};
+} // anonymous namespace
+
+TEST_F(DBBasicTest, LastSstFileNotInManifest) {
+ // If the last sst file is not tracked in MANIFEST,
+ // or the VersionEdit for the last sst file is not synced,
+ // on recovery, the last sst file should be deleted,
+ // and new sst files shouldn't reuse its file number.
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ Close();
+
+ // Manually add a sst file.
+ constexpr uint64_t kSstFileNumber = 100;
+ const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber);
+ ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content",
+ /* fname = */ kSstFile,
+ /* should_sync = */ true));
+ ASSERT_OK(env_->FileExists(kSstFile));
+
+ TableFileListener* listener = new TableFileListener();
+ options.listeners.emplace_back(listener);
+ Reopen(options);
+ // kSstFile should already be deleted.
+ ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound());
+
+ ASSERT_OK(Put("k", "v"));
+ ASSERT_OK(Flush());
+ // New sst file should have file number > kSstFileNumber.
+ std::vector<std::string>& files =
+ listener->GetFiles(kDefaultColumnFamilyName);
+ ASSERT_EQ(files.size(), 1);
+ const std::string fname = files[0].erase(0, (dbname_ + "/").size());
+ uint64_t number = 0;
+ FileType type = kTableFile;
+ ASSERT_TRUE(ParseFileName(fname, &number, &type));
+ ASSERT_EQ(type, kTableFile);
+ ASSERT_GT(number, kSstFileNumber);
+}
+
+TEST_F(DBBasicTest, RecoverWithMissingFiles) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ TableFileListener* listener = new TableFileListener();
+ // Disable auto compaction to simplify SST file name tracking.
+ options.disable_auto_compactions = true;
+ options.listeners.emplace_back(listener);
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<std::string> all_cf_names = {kDefaultColumnFamilyName, "pikachu",
+ "eevee"};
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ for (size_t cf = 0; cf != num_cfs; ++cf) {
+ ASSERT_OK(Put(static_cast<int>(cf), "a", "0_value"));
+ ASSERT_OK(Flush(static_cast<int>(cf)));
+ ASSERT_OK(Put(static_cast<int>(cf), "b", "0_value"));
+ ASSERT_OK(Flush(static_cast<int>(cf)));
+ ASSERT_OK(Put(static_cast<int>(cf), "c", "0_value"));
+ ASSERT_OK(Flush(static_cast<int>(cf)));
+ }
+
+ // Delete and corrupt files
+ for (size_t i = 0; i < all_cf_names.size(); ++i) {
+ std::vector<std::string>& files = listener->GetFiles(all_cf_names[i]);
+ ASSERT_EQ(3, files.size());
+ std::string corrupted_data;
+ ASSERT_OK(ReadFileToString(env_, files[files.size() - 1], &corrupted_data));
+ ASSERT_OK(WriteStringToFile(
+ env_, corrupted_data.substr(0, corrupted_data.size() - 2),
+ files[files.size() - 1], /*should_sync=*/true));
+ for (int j = static_cast<int>(files.size() - 2); j >= static_cast<int>(i);
+ --j) {
+ ASSERT_OK(env_->DeleteFile(files[j]));
+ }
+ }
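+  // At this point the default CF has no intact SST file left, "pikachu" keeps
+  // only its first file ("a"), and "eevee" keeps its first two files ("a" and
+  // "b"); best-efforts recovery should roll each CF back accordingly.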
+ options.best_efforts_recovery = true;
+ ReopenWithColumnFamilies(all_cf_names, options);
+ // Verify data
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[0]));
+ iter->SeekToFirst();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter.reset(db_->NewIterator(read_opts, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter.reset(db_->NewIterator(read_opts, handles_[2]));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a", iter->key());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("b", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+}
+
+TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value0"));
+ ASSERT_OK(Flush());
+ Close();
+ {
+ // Hack by adding a new MANIFEST with high file number
+ std::string garbage(10, '\0');
+ ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/MANIFEST-001000",
+ /*should_sync=*/true));
+ }
+ {
+ // Hack by adding a corrupted SST not referenced by any MANIFEST
+ std::string garbage(10, '\0');
+ ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/001001.sst",
+ /*should_sync=*/true));
+ }
+
+ options.best_efforts_recovery = true;
+
+ Reopen(options);
+ ASSERT_OK(Put("bar", "value"));
+}
+
+TEST_F(DBBasicTest, RecoverWithNoCurrentFile) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ options.best_efforts_recovery = true;
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put(1, "bar", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Flush(1));
+ Close();
+ ASSERT_OK(env_->DeleteFile(CurrentFileName(dbname_)));
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ std::vector<std::string> cf_names;
+ ASSERT_OK(DB::ListColumnFamilies(DBOptions(options), dbname_, &cf_names));
+ ASSERT_EQ(2, cf_names.size());
+ for (const auto& name : cf_names) {
+ ASSERT_TRUE(name == kDefaultColumnFamilyName || name == "pikachu");
+ }
+}
+
+TEST_F(DBBasicTest, RecoverWithNoManifest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Flush());
+ Close();
+ {
+ // Delete all MANIFEST.
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& file : files) {
+ uint64_t number = 0;
+ FileType type = kWalFile;
+ if (ParseFileName(file, &number, &type) && type == kDescriptorFile) {
+ ASSERT_OK(env_->DeleteFile(dbname_ + "/" + file));
+ }
+ }
+ }
+ options.best_efforts_recovery = true;
+ options.create_if_missing = false;
+ Status s = TryReopen(options);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ options.create_if_missing = true;
+ Reopen(options);
+ // Since no MANIFEST exists, best-efforts recovery creates a new, empty db.
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ TableFileListener* listener = new TableFileListener();
+ options.listeners.emplace_back(listener);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<std::string> kAllCfNames = {kDefaultColumnFamilyName, "pikachu"};
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ for (int cf = 0; cf < static_cast<int>(kAllCfNames.size()); ++cf) {
+ ASSERT_OK(Put(cf, "a", "0_value"));
+ ASSERT_OK(Flush(cf));
+ ASSERT_OK(Put(cf, "b", "0_value"));
+ }
+ // Delete files
+ for (size_t i = 0; i < kAllCfNames.size(); ++i) {
+ std::vector<std::string>& files = listener->GetFiles(kAllCfNames[i]);
+ ASSERT_EQ(1, files.size());
+ for (int j = static_cast<int>(files.size() - 1); j >= static_cast<int>(i);
+ --j) {
+ ASSERT_OK(env_->DeleteFile(files[j]));
+ }
+ }
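+  // The default CF loses its only SST file while "pikachu" keeps its file.
+  // Recovery has to roll back past the missing file, so the unflushed "b"
+  // entries in the WAL must not be applied.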
+ options.best_efforts_recovery = true;
+ ReopenWithColumnFamilies(kAllCfNames, options);
+ // Verify WAL is not applied
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts, handles_[0]));
+ iter->SeekToFirst();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter.reset(db_->NewIterator(read_opts, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("a", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+}
+
+TEST_F(DBBasicTest, DisableTrackWal) {
+ // If WAL tracking was enabled, and then disabled during reopen,
+ // the previously tracked WALs should be removed from MANIFEST.
+
+ Options options = CurrentOptions();
+ options.track_and_verify_wals_in_manifest = true;
+ // extremely small write buffer size,
+ // so that new WALs are created more frequently.
+ options.write_buffer_size = 100;
+ options.env = env_;
+ DestroyAndReopen(options);
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put("foo" + std::to_string(i), "value" + std::to_string(i)));
+ }
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ ASSERT_OK(db_->SyncWAL());
+ // Some WALs are tracked.
+ ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+ Close();
+
+ // Disable WAL tracking.
+ options.track_and_verify_wals_in_manifest = false;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ // Previously tracked WALs are cleared.
+ ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+ Close();
+
+ // Re-enable WAL tracking again.
+ options.track_and_verify_wals_in_manifest = true;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBBasicTest, ManifestChecksumMismatch) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("bar", "value"));
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
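+  // Corrupt the CRC of log records written while this callback is active.
+  // Since the WAL is disabled for the next write, the corruption lands in the
+  // MANIFEST record written by the flush, so the final reopen should fail.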
+ SyncPoint::GetInstance()->SetCallBack(
+ "LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", [&](void* arg) {
+ auto* crc = reinterpret_cast<uint32_t*>(arg);
+ *crc = *crc + 1;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions write_opts;
+ write_opts.disableWAL = true;
+ Status s = db_->Put(write_opts, "foo", "value");
+ ASSERT_OK(s);
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ ASSERT_OK(Put("foo", "value1"));
+ ASSERT_OK(Flush());
+ s = TryReopen(options);
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBBasicTest, ConcurrentlyCloseDB) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::vector<std::thread> workers;
+ for (int i = 0; i < 10; i++) {
+ workers.push_back(std::thread([&]() {
+ auto s = db_->Close();
+ ASSERT_OK(s);
+ }));
+ }
+ for (auto& w : workers) {
+ w.join();
+ }
+}
+
+#ifndef ROCKSDB_LITE
+class DBBasicTestTrackWal : public DBTestBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBBasicTestTrackWal()
+ : DBTestBase("db_basic_test_track_wal", /*env_do_fsync=*/false) {}
+
+ int CountWalFiles() {
+ VectorLogPtr log_files;
+ EXPECT_OK(dbfull()->GetSortedWalFiles(log_files));
+ return static_cast<int>(log_files.size());
+ };
+};
+
+TEST_P(DBBasicTestTrackWal, DoNotTrackObsoleteWal) {
+  // If a WAL becomes obsolete after flushing but has not been deleted from
+  // disk yet, a subsequent SyncWAL should not track the obsolete WAL in
+  // MANIFEST.
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.track_and_verify_wals_in_manifest = true;
+ options.atomic_flush = GetParam();
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"cf"}, options);
+ ASSERT_EQ(handles_.size(), 2); // default, cf
+ // Do not delete WALs.
+ ASSERT_OK(db_->DisableFileDeletions());
+ constexpr int n = 10;
+ std::vector<std::unique_ptr<LogFile>> wals(n);
+ for (size_t i = 0; i < n; i++) {
+ // Generate a new WAL for each key-value.
+ const int cf = i % 2;
+ ASSERT_OK(db_->GetCurrentWalFile(&wals[i]));
+ ASSERT_OK(Put(cf, "k" + std::to_string(i), "v" + std::to_string(i)));
+ ASSERT_OK(Flush({0, 1}));
+ }
+ ASSERT_EQ(CountWalFiles(), n);
+ // Since all WALs are obsolete, no WAL should be tracked in MANIFEST.
+ ASSERT_OK(db_->SyncWAL());
+
+ // Manually delete all WALs.
+ Close();
+ for (const auto& wal : wals) {
+ ASSERT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber())));
+ }
+
+ // If SyncWAL tracks the obsolete WALs in MANIFEST,
+ // reopen will fail because the WALs are missing from disk.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "cf"}, options));
+ Destroy(options);
+}
+
+INSTANTIATE_TEST_CASE_P(DBBasicTestTrackWal, DBBasicTestTrackWal,
+ testing::Bool());
+#endif // ROCKSDB_LITE
+
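+// Shared fixture for the MultiGet tests below. It populates one or more
+// column families with compressible and incompressible values, wraps the
+// block caches so that lookups, hits and inserts can be counted, and cuts a
+// new data block every 10 keys via a custom flush block policy.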
+class DBBasicTestMultiGet : public DBTestBase {
+ public:
+ DBBasicTestMultiGet(std::string test_dir, int num_cfs, bool compressed_cache,
+ bool uncompressed_cache, bool _compression_enabled,
+ bool _fill_cache, uint32_t compression_parallel_threads)
+ : DBTestBase(test_dir, /*env_do_fsync=*/false) {
+ compression_enabled_ = _compression_enabled;
+ fill_cache_ = _fill_cache;
+
+ if (compressed_cache) {
+ std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+ compressed_cache_ = std::make_shared<MyBlockCache>(cache);
+ }
+ if (uncompressed_cache) {
+ std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+ uncompressed_cache_ = std::make_shared<MyBlockCache>(cache);
+ }
+
+ env_->count_random_reads_ = true;
+
+ Options options = CurrentOptions();
+ Random rnd(301);
+ BlockBasedTableOptions table_options;
+
+#ifndef ROCKSDB_LITE
+ if (compression_enabled_) {
+ std::vector<CompressionType> compression_types;
+ compression_types = GetSupportedCompressions();
+ // Not every platform may have compression libraries available, so
+ // dynamically pick based on what's available
+ CompressionType tmp_type = kNoCompression;
+ for (auto c_type : compression_types) {
+ if (c_type != kNoCompression) {
+ tmp_type = c_type;
+ break;
+ }
+ }
+ if (tmp_type != kNoCompression) {
+ options.compression = tmp_type;
+ } else {
+ compression_enabled_ = false;
+ }
+ }
+#else
+ // GetSupportedCompressions() is not available in LITE build
+ if (!Snappy_Supported()) {
+ compression_enabled_ = false;
+ }
+#endif // ROCKSDB_LITE
+
+ table_options.block_cache = uncompressed_cache_;
+ if (table_options.block_cache == nullptr) {
+ table_options.no_block_cache = true;
+ } else {
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ }
+ table_options.block_cache_compressed = compressed_cache_;
+ table_options.flush_block_policy_factory.reset(
+ new MyFlushBlockPolicyFactory());
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ if (!compression_enabled_) {
+ options.compression = kNoCompression;
+ } else {
+ options.compression_opts.parallel_threads = compression_parallel_threads;
+ }
+ options_ = options;
+ Reopen(options);
+
+ if (num_cfs > 1) {
+ for (int cf = 0; cf < num_cfs; ++cf) {
+ cf_names_.emplace_back("cf" + std::to_string(cf));
+ }
+ CreateColumnFamilies(cf_names_, options);
+ cf_names_.emplace_back("default");
+ }
+
+ std::string zero_str(128, '\0');
+ for (int cf = 0; cf < num_cfs; ++cf) {
+ for (int i = 0; i < 100; ++i) {
+ // Make the value compressible. A purely random string doesn't compress
+ // and the resultant data block will not be compressed
+ values_.emplace_back(rnd.RandomString(128) + zero_str);
+ assert(((num_cfs == 1) ? Put(Key(i), values_[i])
+ : Put(cf, Key(i), values_[i])) == Status::OK());
+ }
+ if (num_cfs == 1) {
+ EXPECT_OK(Flush());
+ } else {
+ EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
+ }
+
+ for (int i = 0; i < 100; ++i) {
+        // Make the value incompressible so the block cannot gain space by
+        // compression
+ uncompressable_values_.emplace_back(rnd.RandomString(256) + '\0');
+ std::string tmp_key = "a" + Key(i);
+ assert(((num_cfs == 1) ? Put(tmp_key, uncompressable_values_[i])
+ : Put(cf, tmp_key, uncompressable_values_[i])) ==
+ Status::OK());
+ }
+ if (num_cfs == 1) {
+ EXPECT_OK(Flush());
+ } else {
+ EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
+ }
+ }
+ // Clear compressed cache, which is always pre-populated
+ if (compressed_cache_) {
+ compressed_cache_->SetCapacity(0);
+ compressed_cache_->SetCapacity(1048576);
+ }
+ }
+
+ bool CheckValue(int i, const std::string& value) {
+ if (values_[i].compare(value) == 0) {
+ return true;
+ }
+ return false;
+ }
+
+ bool CheckUncompressableValue(int i, const std::string& value) {
+ if (uncompressable_values_[i].compare(value) == 0) {
+ return true;
+ }
+ return false;
+ }
+
+ const std::vector<std::string>& GetCFNames() const { return cf_names_; }
+
+ int num_lookups() { return uncompressed_cache_->num_lookups(); }
+ int num_found() { return uncompressed_cache_->num_found(); }
+ int num_inserts() { return uncompressed_cache_->num_inserts(); }
+
+ int num_lookups_compressed() { return compressed_cache_->num_lookups(); }
+ int num_found_compressed() { return compressed_cache_->num_found(); }
+ int num_inserts_compressed() { return compressed_cache_->num_inserts(); }
+
+ bool fill_cache() { return fill_cache_; }
+ bool compression_enabled() { return compression_enabled_; }
+ bool has_compressed_cache() { return compressed_cache_ != nullptr; }
+ bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; }
+ Options get_options() { return options_; }
+
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ protected:
+ class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ MyFlushBlockPolicyFactory() {}
+
+ virtual const char* Name() const override {
+ return "MyFlushBlockPolicyFactory";
+ }
+
+ virtual FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& /*table_options*/,
+ const BlockBuilder& data_block_builder) const override {
+ return new MyFlushBlockPolicy(data_block_builder);
+ }
+ };
+
+ class MyFlushBlockPolicy : public FlushBlockPolicy {
+ public:
+ explicit MyFlushBlockPolicy(const BlockBuilder& data_block_builder)
+ : num_keys_(0), data_block_builder_(data_block_builder) {}
+
+ bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+ if (data_block_builder_.empty()) {
+ // First key in this block
+ num_keys_ = 1;
+ return false;
+ }
+ // Flush every 10 keys
+ if (num_keys_ == 10) {
+ num_keys_ = 1;
+ return true;
+ }
+ num_keys_++;
+ return false;
+ }
+
+ private:
+ int num_keys_;
+ const BlockBuilder& data_block_builder_;
+ };
+
+ class MyBlockCache : public CacheWrapper {
+ public:
+ explicit MyBlockCache(std::shared_ptr<Cache> target)
+ : CacheWrapper(target),
+ num_lookups_(0),
+ num_found_(0),
+ num_inserts_(0) {}
+
+ const char* Name() const override { return "MyBlockCache"; }
+
+ using Cache::Insert;
+ Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value),
+ Handle** handle = nullptr,
+ Priority priority = Priority::LOW) override {
+ num_inserts_++;
+ return target_->Insert(key, value, charge, deleter, handle, priority);
+ }
+
+ using Cache::Lookup;
+ Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override {
+ num_lookups_++;
+ Handle* handle = target_->Lookup(key, stats);
+ if (handle != nullptr) {
+ num_found_++;
+ }
+ return handle;
+ }
+ int num_lookups() { return num_lookups_; }
+
+ int num_found() { return num_found_; }
+
+ int num_inserts() { return num_inserts_; }
+
+ private:
+ int num_lookups_;
+ int num_found_;
+ int num_inserts_;
+ };
+
+ std::shared_ptr<MyBlockCache> compressed_cache_;
+ std::shared_ptr<MyBlockCache> uncompressed_cache_;
+ Options options_;
+ bool compression_enabled_;
+ std::vector<std::string> values_;
+ std::vector<std::string> uncompressable_values_;
+ bool fill_cache_;
+ std::vector<std::string> cf_names_;
+};
+
+class DBBasicTestWithParallelIO
+ : public DBBasicTestMultiGet,
+ public testing::WithParamInterface<
+ std::tuple<bool, bool, bool, bool, uint32_t>> {
+ public:
+ DBBasicTestWithParallelIO()
+ : DBBasicTestMultiGet("/db_basic_test_with_parallel_io", 1,
+ std::get<0>(GetParam()), std::get<1>(GetParam()),
+ std::get<2>(GetParam()), std::get<3>(GetParam()),
+ std::get<4>(GetParam())) {}
+};
+
+TEST_P(DBBasicTestWithParallelIO, MultiGet) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+
+ int random_reads = env_->random_read_counter_.Read();
+ key_data[0] = Key(1);
+ key_data[1] = Key(51);
+ keys[0] = Slice(key_data[0]);
+ keys[1] = Slice(key_data[1]);
+ values[0].Reset();
+ values[1].Reset();
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(1, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(51, values[1].ToString()));
+
+ bool read_from_cache = false;
+ if (fill_cache()) {
+ if (has_uncompressed_cache()) {
+ read_from_cache = true;
+ } else if (has_compressed_cache() && compression_enabled()) {
+ read_from_cache = true;
+ }
+ }
+
+ int expected_reads = random_reads + (read_from_cache ? 0 : 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(10);
+ statuses.resize(10);
+ std::vector<int> key_ints{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
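+  // With the flush block policy cutting a block every 10 keys, these keys
+  // should span four data blocks ({1,2}, {15,16}, {55}, {81..85}); the blocks
+  // holding keys 1 and 55 were already warmed up by the lookups above.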
+ for (size_t i = 0; i < key_ints.size(); ++i) {
+ key_data[i] = Key(key_ints[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_ints.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 2 : 3);
+ } else {
+ expected_reads += (read_from_cache ? 2 : 4);
+ }
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(10);
+ statuses.resize(10);
+ std::vector<int> key_uncmp{1, 2, 15, 16, 55, 81, 82, 83, 84, 85};
+ for (size_t i = 0; i < key_uncmp.size(); ++i) {
+ key_data[i] = "a" + Key(key_uncmp[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_uncmp.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckUncompressableValue(key_uncmp[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 3 : 3);
+ } else {
+ expected_reads += (read_from_cache ? 4 : 4);
+ }
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+
+ keys.resize(5);
+ statuses.resize(5);
+ std::vector<int> key_tr{1, 2, 15, 16, 55};
+ for (size_t i = 0; i < key_tr.size(); ++i) {
+ key_data[i] = "a" + Key(key_tr[i]);
+ keys[i] = Slice(key_data[i]);
+ statuses[i] = Status::OK();
+ values[i].Reset();
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ for (size_t i = 0; i < key_tr.size(); ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_TRUE(CheckUncompressableValue(key_tr[i], values[i].ToString()));
+ }
+ if (compression_enabled() && !has_compressed_cache()) {
+ expected_reads += (read_from_cache ? 0 : 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+ } else {
+ if (has_uncompressed_cache()) {
+ expected_reads += (read_from_cache ? 0 : 3);
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+ } else {
+        // A rare case: even with block compression enabled, some data
+        // blocks may not be compressed due to their content. If only the
+        // compressed cache is enabled, the uncompressed blocks will not be
+        // cached, and block reads will be triggered. The number of reads
+        // depends on the compression algorithm.
+ ASSERT_TRUE(env_->random_read_counter_.Read() >= expected_reads);
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) {
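+  // An Env that claims to do direct IO while actually issuing buffered reads,
+  // so the direct IO MultiGet path can be exercised on any filesystem.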
+ class FakeDirectIOEnv : public EnvWrapper {
+ class FakeDirectIOSequentialFile;
+ class FakeDirectIORandomAccessFile;
+
+ public:
+ FakeDirectIOEnv(Env* env) : EnvWrapper(env) {}
+ static const char* kClassName() { return "FakeDirectIOEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ Status NewRandomAccessFile(const std::string& fname,
+ std::unique_ptr<RandomAccessFile>* result,
+ const EnvOptions& options) override {
+ std::unique_ptr<RandomAccessFile> file;
+ assert(options.use_direct_reads);
+ EnvOptions opts = options;
+ opts.use_direct_reads = false;
+ Status s = target()->NewRandomAccessFile(fname, &file, opts);
+ if (!s.ok()) {
+ return s;
+ }
+ result->reset(new FakeDirectIORandomAccessFile(std::move(file)));
+ return s;
+ }
+
+ private:
+ class FakeDirectIOSequentialFile : public SequentialFileWrapper {
+ public:
+ FakeDirectIOSequentialFile(std::unique_ptr<SequentialFile>&& file)
+ : SequentialFileWrapper(file.get()), file_(std::move(file)) {}
+ ~FakeDirectIOSequentialFile() {}
+
+ bool use_direct_io() const override { return true; }
+ size_t GetRequiredBufferAlignment() const override { return 1; }
+
+ private:
+ std::unique_ptr<SequentialFile> file_;
+ };
+
+ class FakeDirectIORandomAccessFile : public RandomAccessFileWrapper {
+ public:
+ FakeDirectIORandomAccessFile(std::unique_ptr<RandomAccessFile>&& file)
+ : RandomAccessFileWrapper(file.get()), file_(std::move(file)) {}
+ ~FakeDirectIORandomAccessFile() {}
+
+ bool use_direct_io() const override { return true; }
+ size_t GetRequiredBufferAlignment() const override { return 1; }
+
+ private:
+ std::unique_ptr<RandomAccessFile> file_;
+ };
+ };
+
+ std::unique_ptr<FakeDirectIOEnv> env(new FakeDirectIOEnv(env_));
+ Options opts = get_options();
+ opts.env = env.get();
+ opts.use_direct_reads = true;
+ Reopen(opts);
+
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+
+ int random_reads = env_->random_read_counter_.Read();
+ key_data[0] = Key(1);
+ key_data[1] = Key(51);
+ keys[0] = Slice(key_data[0]);
+ keys[1] = Slice(key_data[1]);
+ values[0].Reset();
+ values[1].Reset();
+ if (uncompressed_cache_) {
+ uncompressed_cache_->SetCapacity(0);
+ uncompressed_cache_->SetCapacity(1048576);
+ }
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(1, values[0].ToString()));
+ ASSERT_TRUE(CheckValue(51, values[1].ToString()));
+
+ bool read_from_cache = false;
+ if (fill_cache()) {
+ if (has_uncompressed_cache()) {
+ read_from_cache = true;
+ } else if (has_compressed_cache() && compression_enabled()) {
+ read_from_cache = true;
+ }
+ }
+
+ int expected_reads = random_reads;
+ if (!compression_enabled() || !has_compressed_cache()) {
+ expected_reads += 2;
+ } else {
+ expected_reads += (read_from_cache ? 0 : 2);
+ }
+ if (env_->random_read_counter_.Read() != expected_reads) {
+ ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads);
+ }
+ Close();
+}
+#endif // ROCKSDB_LITE
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ int read_count = 0;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "RetrieveMultipleBlocks:VerifyChecksum", [&](void* status) {
+ Status* s = static_cast<Status*>(status);
+ read_count++;
+ if (read_count == 2) {
+ *s = Status::Corruption();
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_TRUE(CheckValue(0, values[0].ToString()));
+ // ASSERT_TRUE(CheckValue(50, values[1].ToString()));
+ ASSERT_EQ(statuses[0], Status::OK());
+ ASSERT_EQ(statuses[1], Status::Corruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) {
+ std::vector<std::string> key_data(10);
+ std::vector<Slice> keys;
+ // We cannot resize a PinnableSlice vector, so just set initial size to
+ // largest we think we will need
+ std::vector<PinnableSlice> values(10);
+ std::vector<Status> statuses;
+ ReadOptions ro;
+ ro.fill_cache = fill_cache();
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::MultiGet:FindTable", [&](void* status) {
+ Status* s = static_cast<Status*>(status);
+ *s = Status::IOError();
+ });
+ // DB open will create table readers unless we reduce the table cache
+ // capacity.
+ // SanitizeOptions will set max_open_files to minimum of 20. Table cache
+ // is allocated with max_open_files - 10 as capacity. So override
+ // max_open_files to 11 so table cache capacity will become 1. This will
+ // prevent file open during DB open and force the file to be opened
+ // during MultiGet
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(CurrentOptions());
+
+ // Warm up the cache first
+ key_data.emplace_back(Key(0));
+ keys.emplace_back(Slice(key_data.back()));
+ key_data.emplace_back(Key(50));
+ keys.emplace_back(Slice(key_data.back()));
+ statuses.resize(keys.size());
+
+ dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), values.data(), statuses.data(), true);
+ ASSERT_EQ(statuses[0], Status::IOError());
+ ASSERT_EQ(statuses[1], Status::IOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(ParallelIO, DBBasicTestWithParallelIO,
+ // Params are as follows -
+ // Param 0 - Compressed cache enabled
+ // Param 1 - Uncompressed cache enabled
+ // Param 2 - Data compression enabled
+ // Param 3 - ReadOptions::fill_cache
+ // Param 4 - CompressionOptions::parallel_threads
+ ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool(), ::testing::Bool(),
+ ::testing::Values(1, 4)));
+
+// Forward declaration
+class DeadlineFS;
+
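+// A random access file wrapper that verifies the IO timeout passed down in
+// IOOptions and lets DeadlineFS decide whether to inject a delay on each read.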
+class DeadlineRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
+ public:
+ DeadlineRandomAccessFile(DeadlineFS& fs,
+ std::unique_ptr<FSRandomAccessFile>& file)
+ : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {}
+
+ IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts,
+ Slice* result, char* scratch,
+ IODebugContext* dbg) const override;
+
+ IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
+ const IOOptions& options, IODebugContext* dbg) override;
+
+ IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts,
+ std::function<void(const FSReadRequest&, void*)> cb,
+ void* cb_arg, void** io_handle, IOHandleDeleter* del_fn,
+ IODebugContext* dbg) override;
+
+ private:
+ DeadlineFS& fs_;
+ std::unique_ptr<FSRandomAccessFile> file_;
+};
+
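+// A FileSystem wrapper that injects an artificial delay on a configurable IO
+// so that ReadOptions::deadline and io_timeout handling can be exercised
+// deterministically.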
+class DeadlineFS : public FileSystemWrapper {
+ public:
+  // The error_on_delay parameter specifies whether an IOStatus::TimedOut()
+  // status should be returned after delaying the IO to exceed the timeout,
+  // or to simply delay but return success anyway. The latter mimics the
+  // behavior of PosixFileSystem, which does not enforce any timeout.
+ explicit DeadlineFS(SpecialEnv* env, bool error_on_delay)
+ : FileSystemWrapper(env->GetFileSystem()),
+ deadline_(std::chrono::microseconds::zero()),
+ io_timeout_(std::chrono::microseconds::zero()),
+ env_(env),
+ timedout_(false),
+ ignore_deadline_(false),
+ error_on_delay_(error_on_delay) {}
+
+ static const char* kClassName() { return "DeadlineFileSystem"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& opts,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) override {
+ std::unique_ptr<FSRandomAccessFile> file;
+ IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
+ EXPECT_OK(s);
+ result->reset(new DeadlineRandomAccessFile(*this, file));
+
+ const std::chrono::microseconds deadline = GetDeadline();
+ const std::chrono::microseconds io_timeout = GetIOTimeout();
+ if (deadline.count() || io_timeout.count()) {
+ AssertDeadline(deadline, io_timeout, opts.io_options);
+ }
+ return ShouldDelay(opts.io_options);
+ }
+
+  // Set the deadline and IO timeout to enforce, and the IO count at which to
+  // inject a delay that exceeds them
+ void SetDelayTrigger(const std::chrono::microseconds deadline,
+ const std::chrono::microseconds io_timeout,
+ const int trigger) {
+ delay_trigger_ = trigger;
+ io_count_ = 0;
+ deadline_ = deadline;
+ io_timeout_ = io_timeout;
+ timedout_ = false;
+ }
+
+  // Increment the IO counter and, when the delay trigger is reached, delay
+  // the IO past the configured timeout, optionally returning
+  // IOStatus::TimedOut()
+ IOStatus ShouldDelay(const IOOptions& opts) {
+ if (timedout_) {
+ return IOStatus::TimedOut();
+ } else if (!deadline_.count() && !io_timeout_.count()) {
+ return IOStatus::OK();
+ }
+ if (!ignore_deadline_ && delay_trigger_ == io_count_++) {
+ env_->SleepForMicroseconds(static_cast<int>(opts.timeout.count() + 1));
+ timedout_ = true;
+ if (error_on_delay_) {
+ return IOStatus::TimedOut();
+ }
+ }
+ return IOStatus::OK();
+ }
+
+ const std::chrono::microseconds GetDeadline() {
+ return ignore_deadline_ ? std::chrono::microseconds::zero() : deadline_;
+ }
+
+ const std::chrono::microseconds GetIOTimeout() {
+ return ignore_deadline_ ? std::chrono::microseconds::zero() : io_timeout_;
+ }
+
+ bool TimedOut() { return timedout_; }
+
+ void IgnoreDeadline(bool ignore) { ignore_deadline_ = ignore; }
+
+ void AssertDeadline(const std::chrono::microseconds deadline,
+ const std::chrono::microseconds io_timeout,
+ const IOOptions& opts) const {
+ // Give a leeway of +- 10us as it can take some time for the Get/
+ // MultiGet call to reach here, in order to avoid false alarms
+ std::chrono::microseconds now =
+ std::chrono::microseconds(env_->NowMicros());
+ std::chrono::microseconds timeout;
+ if (deadline.count()) {
+ timeout = deadline - now;
+ if (io_timeout.count()) {
+ timeout = std::min(timeout, io_timeout);
+ }
+ } else {
+ timeout = io_timeout;
+ }
+ if (opts.timeout != timeout) {
+ ASSERT_EQ(timeout, opts.timeout);
+ }
+ }
+
+ private:
+ // The number of IOs to trigger the delay after
+ int delay_trigger_;
+ // Current IO count
+ int io_count_;
+ // ReadOptions deadline for the Get/MultiGet/Iterator
+ std::chrono::microseconds deadline_;
+ // ReadOptions io_timeout for the Get/MultiGet/Iterator
+ std::chrono::microseconds io_timeout_;
+ SpecialEnv* env_;
+ // Flag to indicate whether we injected a delay
+ bool timedout_;
+ // Temporarily ignore deadlines/timeouts
+ bool ignore_deadline_;
+ // Return IOStatus::TimedOut() or IOStatus::OK()
+ bool error_on_delay_;
+};
+
+IOStatus DeadlineRandomAccessFile::Read(uint64_t offset, size_t len,
+ const IOOptions& opts, Slice* result,
+ char* scratch,
+ IODebugContext* dbg) const {
+ const std::chrono::microseconds deadline = fs_.GetDeadline();
+ const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+ IOStatus s;
+ if (deadline.count() || io_timeout.count()) {
+ fs_.AssertDeadline(deadline, io_timeout, opts);
+ }
+ if (s.ok()) {
+ s = FSRandomAccessFileWrapper::Read(offset, len, opts, result, scratch,
+ dbg);
+ }
+ if (s.ok()) {
+ s = fs_.ShouldDelay(opts);
+ }
+ return s;
+}
+
+IOStatus DeadlineRandomAccessFile::ReadAsync(
+ FSReadRequest& req, const IOOptions& opts,
+ std::function<void(const FSReadRequest&, void*)> cb, void* cb_arg,
+ void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) {
+ const std::chrono::microseconds deadline = fs_.GetDeadline();
+ const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+ IOStatus s;
+ if (deadline.count() || io_timeout.count()) {
+ fs_.AssertDeadline(deadline, io_timeout, opts);
+ }
+ if (s.ok()) {
+ s = FSRandomAccessFileWrapper::ReadAsync(req, opts, cb, cb_arg, io_handle,
+ del_fn, dbg);
+ }
+ if (s.ok()) {
+ s = fs_.ShouldDelay(opts);
+ }
+ return s;
+}
+
+IOStatus DeadlineRandomAccessFile::MultiRead(FSReadRequest* reqs,
+ size_t num_reqs,
+ const IOOptions& options,
+ IODebugContext* dbg) {
+ const std::chrono::microseconds deadline = fs_.GetDeadline();
+ const std::chrono::microseconds io_timeout = fs_.GetIOTimeout();
+ IOStatus s;
+ if (deadline.count() || io_timeout.count()) {
+ fs_.AssertDeadline(deadline, io_timeout, options);
+ }
+ if (s.ok()) {
+ s = FSRandomAccessFileWrapper::MultiRead(reqs, num_reqs, options, dbg);
+ }
+ if (s.ok()) {
+ s = fs_.ShouldDelay(options);
+ }
+ return s;
+}
+
+// A test class for intercepting random reads and injecting artificial
+// delays. Used for testing the MultiGet deadline feature
+class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBBasicTestMultiGetDeadline()
+ : DBBasicTestMultiGet(
+ "db_basic_test_multiget_deadline" /*Test dir*/,
+ 10 /*# of column families*/, false /*compressed cache enabled*/,
+ true /*uncompressed cache enabled*/, true /*compression enabled*/,
+ true /*ReadOptions.fill_cache*/,
+ 1 /*# of parallel compression threads*/) {}
+
+ inline void CheckStatus(std::vector<Status>& statuses, size_t num_ok) {
+ for (size_t i = 0; i < statuses.size(); ++i) {
+ if (i < num_ok) {
+ EXPECT_OK(statuses[i]);
+ } else {
+ if (statuses[i] != Status::TimedOut()) {
+ EXPECT_EQ(statuses[i], Status::TimedOut());
+ }
+ }
+ }
+ }
+};
+
+TEST_P(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
+#ifndef USE_COROUTINES
+ if (GetParam()) {
+ ROCKSDB_GTEST_SKIP("This test requires coroutine support");
+ return;
+ }
+#endif // USE_COROUTINES
+ std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, false);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+ Options options = CurrentOptions();
+
+ std::shared_ptr<Cache> cache = NewLRUCache(1048576);
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.env = env.get();
+ SetTimeElapseOnlySleepOnReopen(&options);
+ ReopenWithColumnFamilies(GetCFNames(), options);
+
+ // Test the non-batched version of MultiGet with multiple column
+ // families
+ std::vector<std::string> key_str;
+ size_t i;
+ for (i = 0; i < 5; ++i) {
+ key_str.emplace_back(Key(static_cast<int>(i)));
+ }
+  std::vector<ColumnFamilyHandle*> cfs(key_str.size());
+ std::vector<Slice> keys(key_str.size());
+ std::vector<std::string> values(key_str.size());
+ for (i = 0; i < key_str.size(); ++i) {
+ cfs[i] = handles_[i];
+ keys[i] = Slice(key_str[i].data(), key_str[i].size());
+ }
+
+ ReadOptions ro;
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ ro.async_io = GetParam();
+ // Delay the first IO
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0);
+
+ std::vector<Status> statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
+ // The first key is successful because we check after the lookup, but
+ // subsequent keys fail due to deadline exceeded
+ CheckStatus(statuses, 1);
+
+ // Clear the cache
+ cache->SetCapacity(0);
+ cache->SetCapacity(1048576);
+ // Test non-batched Multiget with multiple column families and
+ // introducing an IO delay in one of the middle CFs
+ key_str.clear();
+ for (i = 0; i < 10; ++i) {
+ key_str.emplace_back(Key(static_cast<int>(i)));
+ }
+ cfs.resize(key_str.size());
+ keys.resize(key_str.size());
+ values.resize(key_str.size());
+ for (i = 0; i < key_str.size(); ++i) {
+ // 2 keys per CF
+ cfs[i] = handles_[i / 2];
+ keys[i] = Slice(key_str[i].data(), key_str[i].size());
+ }
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1);
+ statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
+ CheckStatus(statuses, 3);
+
+ // Test batched MultiGet with an IO delay in the first data block read.
+ // Both keys in the first CF should succeed as they're in the same data
+ // block and would form one batch, and we check for deadline between
+ // batches.
+ std::vector<PinnableSlice> pin_values(keys.size());
+ cache->SetCapacity(0);
+ cache->SetCapacity(1048576);
+ statuses.clear();
+ statuses.resize(keys.size());
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0);
+ dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+ pin_values.data(), statuses.data());
+ CheckStatus(statuses, 2);
+
+ // Similar to the previous one, but an IO delay in the third CF data block
+ // read
+ for (PinnableSlice& value : pin_values) {
+ value.Reset();
+ }
+ cache->SetCapacity(0);
+ cache->SetCapacity(1048576);
+ statuses.clear();
+ statuses.resize(keys.size());
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 2);
+ dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+ pin_values.data(), statuses.data());
+ CheckStatus(statuses, 6);
+
+ // Similar to the previous one, but an IO delay in the last but one CF
+ for (PinnableSlice& value : pin_values) {
+ value.Reset();
+ }
+ cache->SetCapacity(0);
+ cache->SetCapacity(1048576);
+ statuses.clear();
+ statuses.resize(keys.size());
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 3);
+ dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
+ pin_values.data(), statuses.data());
+ CheckStatus(statuses, 8);
+
+  // Test batched MultiGet with a single CF and lots of keys. Inject a delay
+  // into the second batch of keys. As each batch is 32 keys, the first 64
+  // keys, i.e. the first two batches, should succeed and the rest should
+  // time out
+ for (PinnableSlice& value : pin_values) {
+ value.Reset();
+ }
+ cache->SetCapacity(0);
+ cache->SetCapacity(1048576);
+ key_str.clear();
+ for (i = 0; i < 100; ++i) {
+ key_str.emplace_back(Key(static_cast<int>(i)));
+ }
+ keys.resize(key_str.size());
+ pin_values.clear();
+ pin_values.resize(key_str.size());
+ for (i = 0; i < key_str.size(); ++i) {
+ keys[i] = Slice(key_str[i].data(), key_str[i].size());
+ }
+ statuses.clear();
+ statuses.resize(keys.size());
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1);
+ dbfull()->MultiGet(ro, handles_[0], keys.size(), keys.data(),
+ pin_values.data(), statuses.data());
+ CheckStatus(statuses, 64);
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DeadlineIO, DBBasicTestMultiGetDeadline,
+ ::testing::Bool());
+
+TEST_F(DBBasicTest, ManifestWriteFailure) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ auto* s = reinterpret_cast<Status*>(arg);
+ ASSERT_OK(*s);
+ // Manually overwrite return status
+ *s = Status::IOError();
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("key", "value"));
+ ASSERT_NOK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+}
+
+TEST_F(DBBasicTest, DestroyDefaultCfHandle) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ for (const auto* h : handles_) {
+ ASSERT_NE(db_->DefaultColumnFamily(), h);
+ }
+
+ // We have two handles to the default column family. The two handles point to
+ // different ColumnFamilyHandle objects.
+ assert(db_->DefaultColumnFamily());
+ ASSERT_EQ(0U, db_->DefaultColumnFamily()->GetID());
+ assert(handles_[0]);
+ ASSERT_EQ(0U, handles_[0]->GetID());
+
+ // You can destroy handles_[...].
+ for (auto* h : handles_) {
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(h));
+ }
+ handles_.clear();
+
+ // But you should not destroy db_->DefaultColumnFamily(), since it's going to
+ // be deleted in `DBImpl::CloseHelper()`. Before that, it may be used
+ // elsewhere internally too.
+ ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+ ASSERT_TRUE(db_->DestroyColumnFamilyHandle(default_cf).IsInvalidArgument());
+}
+
+TEST_F(DBBasicTest, FailOpenIfLoggerCreationFail) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) {
+ auto* s = reinterpret_cast<Status*>(arg);
+ assert(s);
+ *s = Status::IOError("Injected");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s = TryReopen(options);
+ ASSERT_EQ(nullptr, options.info_log);
+ ASSERT_TRUE(s.IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTest, VerifyFileChecksums) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument());
+
+ options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ Reopen(options);
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+
+ // Write an L0 with checksum computed.
+ ASSERT_OK(Put("b", "value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+
+ // Does the right thing but with the wrong name -- using it should lead to an
+ // error.
+ class MisnamedFileChecksumGenerator : public FileChecksumGenCrc32c {
+ public:
+ MisnamedFileChecksumGenerator(const FileChecksumGenContext& context)
+ : FileChecksumGenCrc32c(context) {}
+
+ const char* Name() const override { return "sha1"; }
+ };
+
+ class MisnamedFileChecksumGenFactory : public FileChecksumGenCrc32cFactory {
+ public:
+ std::unique_ptr<FileChecksumGenerator> CreateFileChecksumGenerator(
+ const FileChecksumGenContext& context) override {
+ return std::unique_ptr<FileChecksumGenerator>(
+ new MisnamedFileChecksumGenerator(context));
+ }
+ };
+
+ options.file_checksum_gen_factory.reset(new MisnamedFileChecksumGenFactory());
+ Reopen(options);
+ ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument());
+}
+
+// TODO: re-enable after we provide finer-grained control for WAL tracking to
+// meet the needs of different use cases, durability levels and recovery modes.
+TEST_F(DBBasicTest, DISABLED_ManualWalSync) {
+ Options options = CurrentOptions();
+ options.track_and_verify_wals_in_manifest = true;
+ options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("x", "y"));
+ // This does not create a new WAL.
+ ASSERT_OK(db_->SyncWAL());
+ EXPECT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+
+ std::unique_ptr<LogFile> wal;
+ Status s = db_->GetCurrentWalFile(&wal);
+ ASSERT_OK(s);
+ Close();
+
+ EXPECT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber())));
+
+ ASSERT_TRUE(TryReopen(options).IsCorruption());
+}
+#endif // !ROCKSDB_LITE
+
+// A test class for intercepting random reads and injecting artificial
+// delays. Used for testing the deadline/timeout feature
+class DBBasicTestDeadline
+ : public DBBasicTest,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {};
+
+TEST_P(DBBasicTestDeadline, PointLookupDeadline) {
+ std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, true);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+ bool set_deadline = std::get<0>(GetParam());
+ bool set_timeout = std::get<1>(GetParam());
+
+ for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+ if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) {
+ continue;
+ }
+ option_config_ = option_config;
+ Options options = CurrentOptions();
+ if (options.use_direct_reads) {
+ continue;
+ }
+ options.env = env.get();
+ options.disable_auto_compactions = true;
+ Cache* block_cache = nullptr;
+    // Filter block reads currently don't cause the request to get
+    // aborted on a read timeout, so it's possible those block reads
+    // may get issued even if the deadline is past
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Get:BeforeFilterMatch",
+ [&](void* /*arg*/) { fs->IgnoreDeadline(true); });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Get:AfterFilterMatch",
+ [&](void* /*arg*/) { fs->IgnoreDeadline(false); });
+    // DB open will create table readers unless we reduce the table cache
+    // capacity.
+    // SanitizeOptions will set max_open_files to a minimum of 20. Table cache
+    // is allocated with max_open_files - 10 as capacity. So override
+    // max_open_files to 11 so table cache capacity will become 1. This will
+    // prevent file open during DB open and force the file to be opened
+    // during Get
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ SetTimeElapseOnlySleepOnReopen(&options);
+ Reopen(options);
+
+ if (options.table_factory) {
+ block_cache = options.table_factory->GetOptions<Cache>(
+ TableFactory::kBlockCacheOpts());
+ }
+
+ Random rnd(301);
+ for (int i = 0; i < 400; ++i) {
+ std::string key = "k" + std::to_string(i);
+ ASSERT_OK(Put(key, rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ bool timedout = true;
+ // A timeout will be forced when the IO counter reaches this value
+ int io_deadline_trigger = 0;
+ // Keep incrementing io_deadline_trigger and calling Get() until there is
+ // an iteration that doesn't cause a timeout. This ensures that we cover
+ // all file reads in the point lookup path that can potentially time out
+ // and cause the Get() to fail.
+ while (timedout) {
+ ReadOptions ro;
+ if (set_deadline) {
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ }
+ if (set_timeout) {
+ ro.io_timeout = std::chrono::microseconds{5000};
+ }
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger);
+
+ block_cache->SetCapacity(0);
+ block_cache->SetCapacity(1048576);
+
+ std::string value;
+ Status s = dbfull()->Get(ro, "k50", &value);
+ if (fs->TimedOut()) {
+ ASSERT_EQ(s, Status::TimedOut());
+ } else {
+ timedout = false;
+ ASSERT_OK(s);
+ }
+ io_deadline_trigger++;
+ }
+ // Reset the delay sequence in order to avoid false alarms during Reopen
+ fs->SetDelayTrigger(std::chrono::microseconds::zero(),
+ std::chrono::microseconds::zero(), 0);
+ }
+ Close();
+}
+
+TEST_P(DBBasicTestDeadline, IteratorDeadline) {
+ std::shared_ptr<DeadlineFS> fs = std::make_shared<DeadlineFS>(env_, true);
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+ bool set_deadline = std::get<0>(GetParam());
+ bool set_timeout = std::get<1>(GetParam());
+
+ for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+ if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) {
+ continue;
+ }
+ Options options = CurrentOptions();
+ if (options.use_direct_reads) {
+ continue;
+ }
+ options.env = env.get();
+ options.disable_auto_compactions = true;
+ Cache* block_cache = nullptr;
+ // DB open will create table readers unless we reduce the table cache
+ // capacity.
+ // SanitizeOptions will set max_open_files to a minimum of 20. The table
+ // cache is allocated with max_open_files - 10 as its capacity, so override
+ // max_open_files to 11 to make the table cache capacity 1. This prevents
+ // the file from being opened during DB open and forces it to be opened
+ // during iteration.
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = (int*)arg;
+ *max_open_files = 11;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ SetTimeElapseOnlySleepOnReopen(&options);
+ Reopen(options);
+
+ if (options.table_factory) {
+ block_cache = options.table_factory->GetOptions<Cache>(
+ TableFactory::kBlockCacheOpts());
+ }
+
+ Random rnd(301);
+ for (int i = 0; i < 400; ++i) {
+ std::string key = "k" + std::to_string(i);
+ ASSERT_OK(Put(key, rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ bool timedout = true;
+ // A timeout will be forced when the IO counter reaches this value
+ int io_deadline_trigger = 0;
+ // Keep incrementing io_deadline_trigger and re-running the iteration until
+ // there is a pass that doesn't cause a timeout. This ensures that we cover
+ // all file reads in the iterator path that can potentially time out.
+ while (timedout) {
+ ReadOptions ro;
+ if (set_deadline) {
+ ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
+ }
+ if (set_timeout) {
+ ro.io_timeout = std::chrono::microseconds{5000};
+ }
+ fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger);
+
+ block_cache->SetCapacity(0);
+ block_cache->SetCapacity(1048576);
+
+ Iterator* iter = dbfull()->NewIterator(ro);
+ int count = 0;
+ iter->Seek("k50");
+ while (iter->Valid() && count++ < 100) {
+ iter->Next();
+ }
+ if (fs->TimedOut()) {
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(iter->status(), Status::TimedOut());
+ } else {
+ timedout = false;
+ ASSERT_OK(iter->status());
+ }
+ delete iter;
+ io_deadline_trigger++;
+ }
+ // Reset the delay sequence in order to avoid false alarms during Reopen
+ fs->SetDelayTrigger(std::chrono::microseconds::zero(),
+ std::chrono::microseconds::zero(), 0);
+ }
+ Close();
+}
+
+// Param 0: If true, set read_options.deadline
+// Param 1: If true, set read_options.io_timeout
+INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, true),
+ std::make_tuple(true, true)));
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_block_cache_test.cc b/src/rocksdb/db/db_block_cache_test.cc
new file mode 100644
index 000000000..db80b82cb
--- /dev/null
+++ b/src/rocksdb/db/db_block_cache_test.cc
@@ -0,0 +1,2313 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cstdlib>
+#include <functional>
+#include <memory>
+#include <unordered_set>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_key.h"
+#include "cache/lru_cache.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "env/unique_id_gen.h"
+#include "port/stack_trace.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/unique_id_impl.h"
+#include "util/compression.h"
+#include "util/defer.h"
+#include "util/hash.h"
+#include "util/math.h"
+#include "util/random.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBBlockCacheTest : public DBTestBase {
+ private:
+ size_t miss_count_ = 0;
+ size_t hit_count_ = 0;
+ size_t insert_count_ = 0;
+ size_t failure_count_ = 0;
+ size_t compression_dict_miss_count_ = 0;
+ size_t compression_dict_hit_count_ = 0;
+ size_t compression_dict_insert_count_ = 0;
+ size_t compressed_miss_count_ = 0;
+ size_t compressed_hit_count_ = 0;
+ size_t compressed_insert_count_ = 0;
+ size_t compressed_failure_count_ = 0;
+
+ public:
+ const size_t kNumBlocks = 10;
+ const size_t kValueSize = 100;
+
+ DBBlockCacheTest()
+ : DBTestBase("db_block_cache_test", /*env_do_fsync=*/true) {}
+
+ BlockBasedTableOptions GetTableOptions() {
+ BlockBasedTableOptions table_options;
+ // Set a small enough block size so that each key-value pair gets its own
+ // block.
+ table_options.block_size = 1;
+ return table_options;
+ }
+
+ Options GetOptions(const BlockBasedTableOptions& table_options) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.avoid_flush_during_recovery = false;
+ // options.compression = kNoCompression;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ return options;
+ }
+
+ void InitTable(const Options& /*options*/) {
+ std::string value(kValueSize, 'a');
+ for (size_t i = 0; i < kNumBlocks; i++) {
+ ASSERT_OK(Put(std::to_string(i), value.c_str()));
+ }
+ }
+
+ void RecordCacheCounters(const Options& options) {
+ miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+ hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+ compressed_miss_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+ compressed_hit_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+ compressed_insert_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+ compressed_failure_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+ }
+
+ void RecordCacheCountersForCompressionDict(const Options& options) {
+ compression_dict_miss_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+ compression_dict_hit_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+ compression_dict_insert_count_ =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+ }
+
+ void CheckCacheCounters(const Options& options, size_t expected_misses,
+ size_t expected_hits, size_t expected_inserts,
+ size_t expected_failures) {
+ size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS);
+ size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ size_t new_failure_count =
+ TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES);
+ ASSERT_EQ(miss_count_ + expected_misses, new_miss_count);
+ ASSERT_EQ(hit_count_ + expected_hits, new_hit_count);
+ ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count);
+ ASSERT_EQ(failure_count_ + expected_failures, new_failure_count);
+ miss_count_ = new_miss_count;
+ hit_count_ = new_hit_count;
+ insert_count_ = new_insert_count;
+ failure_count_ = new_failure_count;
+ }
+
+ void CheckCacheCountersForCompressionDict(
+ const Options& options, size_t expected_compression_dict_misses,
+ size_t expected_compression_dict_hits,
+ size_t expected_compression_dict_inserts) {
+ size_t new_compression_dict_miss_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+ size_t new_compression_dict_hit_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_HIT);
+ size_t new_compression_dict_insert_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_ADD);
+ ASSERT_EQ(compression_dict_miss_count_ + expected_compression_dict_misses,
+ new_compression_dict_miss_count);
+ ASSERT_EQ(compression_dict_hit_count_ + expected_compression_dict_hits,
+ new_compression_dict_hit_count);
+ ASSERT_EQ(
+ compression_dict_insert_count_ + expected_compression_dict_inserts,
+ new_compression_dict_insert_count);
+ compression_dict_miss_count_ = new_compression_dict_miss_count;
+ compression_dict_hit_count_ = new_compression_dict_hit_count;
+ compression_dict_insert_count_ = new_compression_dict_insert_count;
+ }
+
+ void CheckCompressedCacheCounters(const Options& options,
+ size_t expected_misses,
+ size_t expected_hits,
+ size_t expected_inserts,
+ size_t expected_failures) {
+ size_t new_miss_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
+ size_t new_hit_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
+ size_t new_insert_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD);
+ size_t new_failure_count =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES);
+ ASSERT_EQ(compressed_miss_count_ + expected_misses, new_miss_count);
+ ASSERT_EQ(compressed_hit_count_ + expected_hits, new_hit_count);
+ ASSERT_EQ(compressed_insert_count_ + expected_inserts, new_insert_count);
+ ASSERT_EQ(compressed_failure_count_ + expected_failures, new_failure_count);
+ compressed_miss_count_ = new_miss_count;
+ compressed_hit_count_ = new_hit_count;
+ compressed_insert_count_ = new_insert_count;
+ compressed_failure_count_ = new_failure_count;
+ }
+
+#ifndef ROCKSDB_LITE
+ const std::array<size_t, kNumCacheEntryRoles> GetCacheEntryRoleCountsBg() {
+ // Read the current cache entry role counts (for verification) from the
+ // fast block cache entry stats property.
+ std::array<size_t, kNumCacheEntryRoles> cache_entry_role_counts;
+ std::map<std::string, std::string> values;
+ EXPECT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+ &values));
+ for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+ auto role = static_cast<CacheEntryRole>(i);
+ cache_entry_role_counts[i] =
+ ParseSizeT(values[BlockCacheEntryStatsMapKeys::EntryCount(role)]);
+ }
+ return cache_entry_role_counts;
+ }
+#endif // ROCKSDB_LITE
+};
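+
+// Illustrative sketch (not a test): the common way the tests below attach a
+// block cache is through BlockBasedTableOptions. Capacity and sharding values
+// here are arbitrary.
+//
+//   LRUCacheOptions co;
+//   co.capacity = 1 << 20;
+//   co.num_shard_bits = 0;
+//   co.metadata_charge_policy = kDontChargeCacheMetadata;
+//   BlockBasedTableOptions table_options;
+//   table_options.block_cache = NewLRUCache(co);
+//   Options options = CurrentOptions();
+//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+//   Reopen(options);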
+
+TEST_F(DBBlockCacheTest, IteratorBlockCacheUsage) {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ LRUCacheOptions co;
+ co.capacity = 0;
+ co.num_shard_bits = 0;
+ co.strict_capacity_limit = false;
+ // Needed so that the entry stats collector is not counted toward usage
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = NewLRUCache(co);
+ table_options.block_cache = cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+ Iterator* iter = nullptr;
+
+ ASSERT_EQ(0, cache->GetUsage());
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(0));
+ ASSERT_LT(0, cache->GetUsage());
+ delete iter;
+ iter = nullptr;
+ ASSERT_EQ(0, cache->GetUsage());
+}
+
+TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) {
+ ReadOptions read_options;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ LRUCacheOptions co;
+ co.capacity = 0;
+ co.num_shard_bits = 0;
+ co.strict_capacity_limit = false;
+ // Needed so that the entry stats collector is not counted toward usage
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = NewLRUCache(co);
+ table_options.block_cache = cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ std::vector<std::unique_ptr<Iterator>> iterators(kNumBlocks - 1);
+ Iterator* iter = nullptr;
+
+ // Load blocks into cache.
+ for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(i));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ iterators[i].reset(iter);
+ }
+ size_t usage = cache->GetUsage();
+ ASSERT_LT(0, usage);
+ cache->SetCapacity(usage);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+
+ // Test with strict capacity limit.
+ cache->SetStrictCapacityLimit(true);
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(kNumBlocks - 1));
+ ASSERT_TRUE(iter->status().IsMemoryLimit());
+ CheckCacheCounters(options, 1, 0, 0, 1);
+ delete iter;
+ iter = nullptr;
+
+ // Release iterators and access cache again.
+ for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+ iterators[i].reset();
+ CheckCacheCounters(options, 0, 0, 0, 0);
+ }
+ ASSERT_EQ(0, cache->GetPinnedUsage());
+ for (size_t i = 0; i + 1 < kNumBlocks; i++) {
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(i));
+ ASSERT_OK(iter->status());
+ CheckCacheCounters(options, 0, 1, 0, 0);
+ iterators[i].reset(iter);
+ }
+}
+
+#ifdef SNAPPY
+TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = nullptr;
+ table_options.block_size = 1;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ table_options.cache_index_and_filter_blocks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.compression = CompressionType::kSnappyCompression;
+
+ DestroyAndReopen(options);
+
+ std::string value(kValueSize, 'a');
+ for (size_t i = 0; i < kNumBlocks; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Flush());
+ }
+
+ ReadOptions read_options;
+ std::shared_ptr<Cache> compressed_cache = NewLRUCache(1 << 25, 0, false);
+ LRUCacheOptions co;
+ co.capacity = 0;
+ co.num_shard_bits = 0;
+ co.strict_capacity_limit = false;
+ // Needed so that the entry stats collector is not counted toward usage
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = NewLRUCache(co);
+ table_options.block_cache = cache;
+ table_options.no_block_cache = false;
+ table_options.block_cache_compressed = compressed_cache;
+ table_options.max_auto_readahead_size = 0;
+ table_options.cache_index_and_filter_blocks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ RecordCacheCounters(options);
+
+ // Load blocks into cache.
+ for (size_t i = 0; i < kNumBlocks - 1; i++) {
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+ }
+
+ size_t usage = cache->GetUsage();
+ ASSERT_EQ(0, usage);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+ size_t compressed_usage = compressed_cache->GetUsage();
+ ASSERT_LT(0, compressed_usage);
+ // Compressed block cache cannot be pinned.
+ ASSERT_EQ(0, compressed_cache->GetPinnedUsage());
+
+ // Set the strict capacity limit flag. Now blocks will only load into the
+ // compressed block cache.
+ cache->SetCapacity(usage);
+ cache->SetStrictCapacityLimit(true);
+ ASSERT_EQ(usage, cache->GetPinnedUsage());
+
+ // Load last key block.
+ ASSERT_EQ(
+ "Operation aborted: Memory limit reached: Insert failed due to LRU cache "
+ "being full.",
+ Get(std::to_string(kNumBlocks - 1)));
+ // Failure will also record the miss counter.
+ CheckCacheCounters(options, 1, 0, 0, 1);
+ CheckCompressedCacheCounters(options, 1, 0, 1, 0);
+
+ // Clear strict capacity limit flag. This time we shall hit compressed block
+ // cache and load into block cache.
+ cache->SetStrictCapacityLimit(false);
+ // Load last key block.
+ ASSERT_EQ(value, Get(std::to_string(kNumBlocks - 1)));
+ CheckCacheCounters(options, 1, 0, 1, 0);
+ CheckCompressedCacheCounters(options, 0, 1, 0, 0);
+}
+
+namespace {
+class PersistentCacheFromCache : public PersistentCache {
+ public:
+ PersistentCacheFromCache(std::shared_ptr<Cache> cache, bool read_only)
+ : cache_(cache), read_only_(read_only) {}
+
+ Status Insert(const Slice& key, const char* data,
+ const size_t size) override {
+ if (read_only_) {
+ return Status::NotSupported();
+ }
+ std::unique_ptr<char[]> copy{new char[size]};
+ std::copy_n(data, size, copy.get());
+ Status s = cache_->Insert(
+ key, copy.get(), size,
+ GetCacheEntryDeleterForRole<char[], CacheEntryRole::kMisc>());
+ if (s.ok()) {
+ copy.release();
+ }
+ return s;
+ }
+
+ Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
+ size_t* size) override {
+ auto handle = cache_->Lookup(key);
+ if (handle) {
+ char* ptr = static_cast<char*>(cache_->Value(handle));
+ *size = cache_->GetCharge(handle);
+ data->reset(new char[*size]);
+ std::copy_n(ptr, *size, data->get());
+ cache_->Release(handle);
+ return Status::OK();
+ } else {
+ return Status::NotFound();
+ }
+ }
+
+ bool IsCompressed() override { return false; }
+
+ StatsType Stats() override { return StatsType(); }
+
+ std::string GetPrintableOptions() const override { return ""; }
+
+ uint64_t NewId() override { return cache_->NewId(); }
+
+ private:
+ std::shared_ptr<Cache> cache_;
+ bool read_only_;
+};
+
+class ReadOnlyCacheWrapper : public CacheWrapper {
+ using CacheWrapper::CacheWrapper;
+
+ using Cache::Insert;
+ Status Insert(const Slice& /*key*/, void* /*value*/, size_t /*charge*/,
+ void (*)(const Slice& key, void* value) /*deleter*/,
+ Handle** /*handle*/, Priority /*priority*/) override {
+ return Status::NotSupported();
+ }
+};
+
+} // anonymous namespace
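+
+// Illustrative sketch (not a test): a PersistentCache implementation like the
+// one above is attached through BlockBasedTableOptions, as the cases below do.
+//
+//   auto pcache = std::make_shared<PersistentCacheFromCache>(
+//       NewLRUCache(1 << 20), /*read_only=*/false);
+//   BlockBasedTableOptions table_options;
+//   table_options.persistent_cache = pcache;
+//   options.table_factory.reset(NewBlockBasedTableFactory(table_options));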
+
+TEST_F(DBBlockCacheTest, TestWithSameCompressed) {
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ std::shared_ptr<Cache> rw_cache{NewLRUCache(1000000)};
+ std::shared_ptr<PersistentCacheFromCache> rw_pcache{
+ new PersistentCacheFromCache(rw_cache, /*read_only*/ false)};
+ // Exercise some obscure behavior with read-only wrappers
+ std::shared_ptr<Cache> ro_cache{new ReadOnlyCacheWrapper(rw_cache)};
+ std::shared_ptr<PersistentCacheFromCache> ro_pcache{
+ new PersistentCacheFromCache(rw_cache, /*read_only*/ true)};
+
+ // Simple same pointer
+ table_options.block_cache = rw_cache;
+ table_options.block_cache_compressed = rw_cache;
+ table_options.persistent_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: block_cache same as block_cache_compressed not "
+ "currently supported, and would be bad for performance anyway");
+
+ // Other cases
+ table_options.block_cache = ro_cache;
+ table_options.block_cache_compressed = rw_cache;
+ table_options.persistent_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: block_cache and block_cache_compressed share "
+ "the same key space, which is not supported");
+
+ table_options.block_cache = rw_cache;
+ table_options.block_cache_compressed = ro_cache;
+ table_options.persistent_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: block_cache_compressed and block_cache share "
+ "the same key space, which is not supported");
+
+ table_options.block_cache = ro_cache;
+ table_options.block_cache_compressed.reset();
+ table_options.persistent_cache = rw_pcache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: block_cache and persistent_cache share the same "
+ "key space, which is not supported");
+
+ table_options.block_cache = rw_cache;
+ table_options.block_cache_compressed.reset();
+ table_options.persistent_cache = ro_pcache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: persistent_cache and block_cache share the same "
+ "key space, which is not supported");
+
+ table_options.block_cache.reset();
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = ro_cache;
+ table_options.persistent_cache = rw_pcache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: block_cache_compressed and persistent_cache "
+ "share the same key space, which is not supported");
+
+ table_options.block_cache.reset();
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = rw_cache;
+ table_options.persistent_cache = ro_pcache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_EQ(TryReopen(options).ToString(),
+ "Invalid argument: persistent_cache and block_cache_compressed "
+ "share the same key space, which is not supported");
+}
+#endif // SNAPPY
+
+#ifndef ROCKSDB_LITE
+
+// Make sure that when options.block_cache is set, after a new table is
+// created its index/filter blocks are added to block cache.
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, /* only index/filter were added */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+ uint64_t int_num;
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ // Make sure filter block is in cache.
+ std::string value;
+ ReadOptions ropt;
+ db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+
+ // Miss count should remain the same.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value);
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Make sure index block is in cache.
+ auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(index_block_hit + 1,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(index_block_hit + 2,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+// With fill_cache = false, fill up the cache, then iterate over the entire
+// db to verify that the dummy entries inserted in
+// `BlockBasedTable::NewDataBlockIterator` do not cause heap-use-after-free
+// errors in COMPILE_WITH_ASAN=1 runs.
+TEST_F(DBBlockCacheTest, FillCacheAndIterateDB) {
+ ReadOptions read_options;
+ read_options.fill_cache = false;
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ InitTable(options);
+
+ std::shared_ptr<Cache> cache = NewLRUCache(10, 0, true);
+ table_options.block_cache = cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("key1", "val1"));
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key3", "val3"));
+ ASSERT_OK(Put("key4", "val4"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key5", "val5"));
+ ASSERT_OK(Put("key6", "val6"));
+ ASSERT_OK(Flush());
+
+ Iterator* iter = nullptr;
+
+ iter = db_->NewIterator(read_options);
+ iter->Seek(std::to_string(0));
+ while (iter->Valid()) {
+ iter->Next();
+ }
+ delete iter;
+ iter = nullptr;
+}
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ LRUCacheOptions co;
+ // 500 bytes are enough to hold the first two blocks
+ co.capacity = 500;
+ co.num_shard_bits = 0;
+ co.strict_capacity_limit = false;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = NewLRUCache(co);
+ table_options.block_cache = cache;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "longer_key", "val"));
+ // Create a new table
+ ASSERT_OK(Flush(1));
+ size_t index_bytes_insert =
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT);
+ size_t filter_bytes_insert =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT);
+ ASSERT_GT(index_bytes_insert, 0);
+ ASSERT_GT(filter_bytes_insert, 0);
+ ASSERT_EQ(cache->GetUsage(), index_bytes_insert + filter_bytes_insert);
+ // set the cache capacity to the current usage
+ cache->SetCapacity(index_bytes_insert + filter_bytes_insert);
+ // The index and filter eviction statistics were broken by the refactoring
+ // that moved the readers out of the block cache. Disabling these until we can
+ // bring the stats back.
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT), 0);
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT), 0);
+ // Note that the second key needs to be no longer than the first one.
+ // Otherwise the second index block may not fit in cache.
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table
+ ASSERT_OK(Flush(1));
+ // cache evicted old index and block entries
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_INSERT),
+ index_bytes_insert);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_INSERT),
+ filter_bytes_insert);
+ // The index and filter eviction statistics were broken by the refactoring
+ // that moved the readers out of the block cache. Disabling these until we can
+ // bring the stats back.
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_INDEX_BYTES_EVICT),
+ // index_bytes_insert);
+ // ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_FILTER_BYTES_EVICT),
+ // filter_bytes_insert);
+}
+
+#if (defined OS_LINUX || defined OS_WIN)
+TEST_F(DBBlockCacheTest, WarmCacheWithDataBlocksDuringFlush) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.prepopulate_block_cache =
+ BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ std::string value(kValueSize, 'a');
+ for (size_t i = 1; i <= kNumBlocks; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ }
+ // Verify compaction not counted
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+ EXPECT_EQ(kNumBlocks,
+ options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+}
+
+// This test caches data, index and filter blocks during flush.
+class DBBlockCacheTest1 : public DBTestBase,
+ public ::testing::WithParamInterface<uint32_t> {
+ public:
+ const size_t kNumBlocks = 10;
+ const size_t kValueSize = 100;
+ DBBlockCacheTest1() : DBTestBase("db_block_cache_test1", true) {}
+};
+
+INSTANTIATE_TEST_CASE_P(DBBlockCacheTest1, DBBlockCacheTest1,
+ ::testing::Values(1, 2));
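+
+// Illustrative sketch (not a test): the cache warming exercised by the
+// surrounding tests is controlled by one table option, and (as a later test
+// shows) can be toggled at runtime through SetOptions(). The option strings
+// are copied from those tests.
+//
+//   table_options.prepopulate_block_cache =
+//       BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+//   ...
+//   db->SetOptions(
+//       {{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}});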
+
+TEST_P(DBBlockCacheTest1, WarmCacheWithBlocksDuringFlush) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+
+ uint32_t filter_type = GetParam();
+ switch (filter_type) {
+ case 1: // partition_filter
+ table_options.partition_filters = true;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ break;
+ case 2: // full filter
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ break;
+ default:
+ assert(false);
+ }
+
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.prepopulate_block_cache =
+ BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ std::string value(kValueSize, 'a');
+ for (size_t i = 1; i <= kNumBlocks; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+ if (filter_type == 1) {
+ ASSERT_EQ(2 * i,
+ options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+ ASSERT_EQ(2 * i,
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+ } else {
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+ }
+ ASSERT_EQ(value, Get(std::to_string(i)));
+
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ ASSERT_EQ(i, options.statistics->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(i * 3, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT));
+ if (filter_type == 1) {
+ ASSERT_EQ(i * 3,
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+ } else {
+ ASSERT_EQ(i * 2,
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT));
+ }
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS));
+ }
+
+ // Verify compaction not counted
+ CompactRangeOptions cro;
+ // Ensure files are rewritten, not just trivially moved.
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, /*begin=*/nullptr, /*end=*/nullptr));
+ EXPECT_EQ(kNumBlocks,
+ options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+ // Index and filter blocks are automatically warmed when the new table file
+ // is opened at the end of compaction. This is not easily disabled, so it
+ // results in the new index and filter blocks being warmed as well.
+ if (filter_type == 1) {
+ EXPECT_EQ(2 * (1 + kNumBlocks),
+ options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+ EXPECT_EQ(2 * (1 + kNumBlocks),
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+ } else {
+ EXPECT_EQ(1 + kNumBlocks,
+ options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+ EXPECT_EQ(1 + kNumBlocks,
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+ }
+}
+
+TEST_F(DBBlockCacheTest, DynamicallyWarmCacheDuringFlush) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.prepopulate_block_cache =
+ BlockBasedTableOptions::PrepopulateBlockCache::kFlushOnly;
+
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ std::string value(kValueSize, 'a');
+
+ for (size_t i = 1; i <= 5; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(0,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+ ASSERT_EQ(
+ 0, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
+ ASSERT_EQ(1,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
+ }
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"block_based_table_factory", "{prepopulate_block_cache=kDisable;}"}}));
+
+ for (size_t i = 6; i <= kNumBlocks; i++) {
+ ASSERT_OK(Put(std::to_string(i), value));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(0,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+
+ ASSERT_EQ(value, Get(std::to_string(i)));
+ ASSERT_EQ(1,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_ADD));
+ ASSERT_EQ(
+ 1, options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_MISS));
+ ASSERT_EQ(0,
+ options.statistics->getAndResetTickerCount(BLOCK_CACHE_DATA_HIT));
+ }
+}
+#endif
+
+namespace {
+
+// A mock cache that wraps LRUCache and records how many entries have been
+// inserted for each priority.
+class MockCache : public LRUCache {
+ public:
+ static uint32_t high_pri_insert_count;
+ static uint32_t low_pri_insert_count;
+
+ MockCache()
+ : LRUCache((size_t)1 << 25 /*capacity*/, 0 /*num_shard_bits*/,
+ false /*strict_capacity_limit*/, 0.0 /*high_pri_pool_ratio*/,
+ 0.0 /*low_pri_pool_ratio*/) {}
+
+ using ShardedCache::Insert;
+
+ Status Insert(const Slice& key, void* value,
+ const Cache::CacheItemHelper* helper_cb, size_t charge,
+ Handle** handle, Priority priority) override {
+ DeleterFn delete_cb = helper_cb->del_cb;
+ if (priority == Priority::LOW) {
+ low_pri_insert_count++;
+ } else {
+ high_pri_insert_count++;
+ }
+ return LRUCache::Insert(key, value, charge, delete_cb, handle, priority);
+ }
+};
+
+uint32_t MockCache::high_pri_insert_count = 0;
+uint32_t MockCache::low_pri_insert_count = 0;
+
+} // anonymous namespace
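+
+// Illustrative sketch (not a test): to get real benefit from the high-priority
+// insertions counted by MockCache, an application would typically reserve a
+// high-priority pool in the LRU cache and opt index/filter blocks into it.
+// The ratio and capacity here are arbitrary.
+//
+//   LRUCacheOptions co;
+//   co.capacity = 1 << 25;
+//   co.high_pri_pool_ratio = 0.5;
+//   BlockBasedTableOptions table_options;
+//   table_options.block_cache = NewLRUCache(co);
+//   table_options.cache_index_and_filter_blocks = true;
+//   table_options.cache_index_and_filter_blocks_with_high_priority = true;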
+
+TEST_F(DBBlockCacheTest, IndexAndFilterBlocksCachePriority) {
+ for (auto priority : {Cache::Priority::LOW, Cache::Priority::HIGH}) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache.reset(new MockCache());
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ table_options.cache_index_and_filter_blocks_with_high_priority =
+ priority == Cache::Priority::HIGH ? true : false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ MockCache::high_pri_insert_count = 0;
+ MockCache::low_pri_insert_count = 0;
+
+ // Create a new table.
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put("bar", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(2, /* only index/filter were added */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+ if (priority == Cache::Priority::LOW) {
+ ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(2u, MockCache::low_pri_insert_count);
+ } else {
+ ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(0u, MockCache::low_pri_insert_count);
+ }
+
+ // Access data block.
+ ASSERT_EQ("value", Get("foo"));
+
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(3, /*adding data block*/
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+ // Data block should be inserted with low priority.
+ if (priority == Cache::Priority::LOW) {
+ ASSERT_EQ(0u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(3u, MockCache::low_pri_insert_count);
+ } else {
+ ASSERT_EQ(2u, MockCache::high_pri_insert_count);
+ ASSERT_EQ(1u, MockCache::low_pri_insert_count);
+ }
+ }
+}
+
+namespace {
+
+// An LRUCache wrapper that can falsely report "not found" on Lookup.
+// This allows us to manipulate BlockBasedTableReader into thinking
+// another thread inserted the data in between Lookup and Insert,
+// while mostly preserving the LRUCache interface/behavior.
+class LookupLiarCache : public CacheWrapper {
+ int nth_lookup_not_found_ = 0;
+
+ public:
+ explicit LookupLiarCache(std::shared_ptr<Cache> target)
+ : CacheWrapper(std::move(target)) {}
+
+ using Cache::Lookup;
+ Handle* Lookup(const Slice& key, Statistics* stats) override {
+ if (nth_lookup_not_found_ == 1) {
+ nth_lookup_not_found_ = 0;
+ return nullptr;
+ }
+ if (nth_lookup_not_found_ > 1) {
+ --nth_lookup_not_found_;
+ }
+ return CacheWrapper::Lookup(key, stats);
+ }
+
+ // 1 == next lookup, 2 == after next, etc.
+ void SetNthLookupNotFound(int n) { nth_lookup_not_found_ = n; }
+};
+
+} // anonymous namespace
+
+TEST_F(DBBlockCacheTest, AddRedundantStats) {
+ const size_t capacity = size_t{1} << 25;
+ const int num_shard_bits = 0; // 1 shard
+ int iterations_tested = 0;
+ for (std::shared_ptr<Cache> base_cache :
+ {NewLRUCache(capacity, num_shard_bits),
+ HyperClockCacheOptions(
+ capacity,
+ BlockBasedTableOptions().block_size /*estimated_value_size*/,
+ num_shard_bits)
+ .MakeSharedCache()}) {
+ if (!base_cache) {
+ // Skip clock cache when not supported
+ continue;
+ }
+ ++iterations_tested;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ std::shared_ptr<LookupLiarCache> cache =
+ std::make_shared<LookupLiarCache>(base_cache);
+
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache = cache;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(50));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ // Create a new table.
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put("bar", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // Normal access filter+index+data.
+ ASSERT_EQ("value", Get("foo"));
+
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+ // --------
+ ASSERT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+ // --------
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+ // Again access filter+index+data, but force a redundant load+insert on the
+ // index.
+ cache->SetNthLookupNotFound(2);
+ ASSERT_EQ("value", Get("bar"));
+
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+ // --------
+ ASSERT_EQ(4, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+ // --------
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+ // Access just filter (with high probability), and force redundant
+ // load+insert
+ cache->SetNthLookupNotFound(1);
+ ASSERT_EQ("NOT_FOUND", Get("this key was not added"));
+
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+ // --------
+ EXPECT_EQ(5, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+ EXPECT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+ // --------
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+
+ // Access just data, forcing redundant load+insert
+ ReadOptions read_options;
+ std::unique_ptr<Iterator> iter{db_->NewIterator(read_options)};
+ cache->SetNthLookupNotFound(1);
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), "bar");
+
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD));
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD));
+ EXPECT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD));
+ // --------
+ EXPECT_EQ(6, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_ADD_REDUNDANT));
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_ADD_REDUNDANT));
+ EXPECT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_DATA_ADD_REDUNDANT));
+ // --------
+ EXPECT_EQ(3, TestGetTickerCount(options, BLOCK_CACHE_ADD_REDUNDANT));
+ }
+ EXPECT_GE(iterations_tested, 1);
+}
+
+TEST_F(DBBlockCacheTest, ParanoidFileChecks) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.level0_file_num_compaction_trigger = 2;
+ options.paranoid_file_checks = true;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = false;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "1_key", "val"));
+ ASSERT_OK(Put(1, "9_key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(1, /* read and cache data block */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Put(1, "1_key2", "val2"));
+ ASSERT_OK(Put(1, "9_key2", "val2"));
+ // Create a new SST file. This will further trigger a compaction
+ // and generate another file.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(3, /* Totally 3 files created up to now */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ // After disabling options.paranoid_file_checks, no further block
+ // is added after generating a new file.
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "false"}}));
+
+ ASSERT_OK(Put(1, "1_key3", "val3"));
+ ASSERT_OK(Put(1, "9_key3", "val3"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "1_key4", "val4"));
+ ASSERT_OK(Put(1, "9_key4", "val4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(3, /* Totally 3 files created up to now */
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+}
+
+TEST_F(DBBlockCacheTest, CompressedCache) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ int num_iter = 80;
+
+ // Run this test in four iterations.
+ // Iteration 1: only an uncompressed block cache
+ // Iteration 2: only a compressed block cache
+ // Iteration 3: both block cache and compressed cache
+ // Iteration 4: both block cache and compressed cache, but DB is not
+ // compressed
+ for (int iter = 0; iter < 4; iter++) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024; // small write buffer
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ BlockBasedTableOptions table_options;
+ switch (iter) {
+ case 0:
+ // only uncompressed block cache
+ table_options.block_cache = NewLRUCache(8 * 1024);
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 1:
+ // no block cache, only compressed cache
+ table_options.no_block_cache = true;
+ table_options.block_cache = nullptr;
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 2:
+ // both compressed and uncompressed block cache
+ table_options.block_cache = NewLRUCache(1024);
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ break;
+ case 3:
+ // both block cache and compressed cache, but DB is not compressed
+ // also, make block cache sizes bigger, to trigger block cache hits
+ table_options.block_cache = NewLRUCache(1024 * 1024);
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.compression = kNoCompression;
+ break;
+ default:
+ FAIL();
+ }
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // default column family doesn't have block cache
+ Options no_block_cache_opts;
+ no_block_cache_opts.statistics = options.statistics;
+ no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ no_block_cache_opts.table_factory.reset(
+ NewBlockBasedTableFactory(table_options_no_bc));
+ ReopenWithColumnFamilies(
+ {"default", "pikachu"},
+ std::vector<Options>({no_block_cache_opts, options}));
+
+ Random rnd(301);
+
+ // Write 80 values of roughly 1KB each (each value is reused for 4
+ // consecutive keys to get a high compression ratio)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ std::string str;
+ for (int i = 0; i < num_iter; i++) {
+ if (i % 4 == 0) { // high compression ratio
+ str = rnd.RandomString(1000);
+ }
+ values.push_back(str);
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // flush all data from memtable so that reads are from block cache
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < num_iter; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+
+ // check that we triggered the appropriate code paths in the cache
+ switch (iter) {
+ case 0:
+ // only uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 1:
+ // no block cache, only compressed cache
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 2:
+ // both compressed and uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ break;
+ case 3:
+ // both compressed and uncompressed block cache
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_HIT), 0);
+ ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
+ // the compressed cache doesn't have any hits since blocks are not
+ // compressed on storage
+ ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT), 0);
+ break;
+ default:
+ FAIL();
+ }
+
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ }
+}
+
+TEST_F(DBBlockCacheTest, CacheCompressionDict) {
+ const int kNumFiles = 4;
+ const int kNumEntriesPerFile = 128;
+ const int kNumBytesPerEntry = 1024;
+
+ // Try all the available libraries that support dictionary compression
+ std::vector<CompressionType> compression_types;
+ if (Zlib_Supported()) {
+ compression_types.push_back(kZlibCompression);
+ }
+ if (LZ4_Supported()) {
+ compression_types.push_back(kLZ4Compression);
+ compression_types.push_back(kLZ4HCCompression);
+ }
+ if (ZSTD_Supported()) {
+ compression_types.push_back(kZSTD);
+ } else if (ZSTDNotFinal_Supported()) {
+ compression_types.push_back(kZSTDNotFinalCompression);
+ }
+ Random rnd(301);
+ for (auto compression_type : compression_types) {
+ Options options = CurrentOptions();
+ options.bottommost_compression = compression_type;
+ options.bottommost_compression_opts.max_dict_bytes = 4096;
+ options.bottommost_compression_opts.enabled = true;
+ options.create_if_missing = true;
+ options.num_levels = 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache.reset(new MockCache());
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ RecordCacheCountersForCompressionDict(options);
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ ASSERT_EQ(i, NumTableFilesAtLevel(0, 0));
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ std::string value = rnd.RandomString(kNumBytesPerEntry);
+ ASSERT_OK(Put(Key(j * kNumFiles + i), value.c_str()));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(1));
+
+ // Compression dictionary blocks are preloaded.
+ CheckCacheCountersForCompressionDict(
+ options, kNumFiles /* expected_compression_dict_misses */,
+ 0 /* expected_compression_dict_hits */,
+ kNumFiles /* expected_compression_dict_inserts */);
+
+ // Seek to a key in a file. It should cause the SST's dictionary meta-block
+ // to be read.
+ RecordCacheCounters(options);
+ RecordCacheCountersForCompressionDict(options);
+ ReadOptions read_options;
+ ASSERT_NE("NOT_FOUND", Get(Key(kNumFiles * kNumEntriesPerFile - 1)));
+ // Two block hits: index and dictionary since they are prefetched
+ // One block missed/added: data block
+ CheckCacheCounters(options, 1 /* expected_misses */, 2 /* expected_hits */,
+ 1 /* expected_inserts */, 0 /* expected_failures */);
+ CheckCacheCountersForCompressionDict(
+ options, 0 /* expected_compression_dict_misses */,
+ 1 /* expected_compression_dict_hits */,
+ 0 /* expected_compression_dict_inserts */);
+ }
+}
+
+static void ClearCache(Cache* cache) {
+ auto roles = CopyCacheDeleterRoleMap();
+ std::deque<std::string> keys;
+ Cache::ApplyToAllEntriesOptions opts;
+ auto callback = [&](const Slice& key, void* /*value*/, size_t /*charge*/,
+ Cache::DeleterFn deleter) {
+ if (roles.find(deleter) == roles.end()) {
+ // Keep the stats collector
+ return;
+ }
+ keys.push_back(key.ToString());
+ };
+ cache->ApplyToAllEntries(callback, opts);
+ for (auto& k : keys) {
+ cache->Erase(k);
+ }
+}
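+
+// Illustrative sketch (not a test): outside of the test helpers, per-role
+// block cache entry counts are typically read through the map property used
+// below; the role shown here is just an example.
+//
+//   std::map<std::string, std::string> stats;
+//   db->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &stats);
+//   const std::string& data_blocks =
+//       stats[BlockCacheEntryStatsMapKeys::EntryCount(CacheEntryRole::kDataBlock)];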
+
+TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
+ const size_t capacity = size_t{1} << 25;
+ int iterations_tested = 0;
+ for (bool partition : {false, true}) {
+ for (std::shared_ptr<Cache> cache :
+ {NewLRUCache(capacity),
+ HyperClockCacheOptions(
+ capacity,
+ BlockBasedTableOptions().block_size /*estimated_value_size*/)
+ .MakeSharedCache()}) {
+ ++iterations_tested;
+
+ Options options = CurrentOptions();
+ SetTimeElapseOnlySleepOnReopen(&options);
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.max_open_files = 13;
+ options.table_cache_numshardbits = 0;
+ // If this wakes up, it could interfere with the test
+ options.stats_dump_period_sec = 0;
+
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = cache;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(50));
+ if (partition) {
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+ table_options.partition_filters = true;
+ }
+ table_options.metadata_cache_options.top_level_index_pinning =
+ PinningTier::kNone;
+ table_options.metadata_cache_options.partition_pinning =
+ PinningTier::kNone;
+ table_options.metadata_cache_options.unpartitioned_pinning =
+ PinningTier::kNone;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ // Create a new table.
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put("bar", "value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("zfoo", "value"));
+ ASSERT_OK(Put("zbar", "value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+
+ // Fresh cache
+ ClearCache(cache.get());
+
+ std::array<size_t, kNumCacheEntryRoles> expected{};
+ // For CacheEntryStatsCollector
+ expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
+ EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+ std::array<size_t, kNumCacheEntryRoles> prev_expected = expected;
+
+ // First access only filters
+ ASSERT_EQ("NOT_FOUND", Get("different from any key added"));
+ expected[static_cast<size_t>(CacheEntryRole::kFilterBlock)] += 2;
+ if (partition) {
+ expected[static_cast<size_t>(CacheEntryRole::kFilterMetaBlock)] += 2;
+ }
+ // Within some time window, we will get cached entry stats
+ EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+ // Not enough to force a miss
+ env_->MockSleepForSeconds(45);
+ EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+ // Enough to force a miss
+ env_->MockSleepForSeconds(601);
+ EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+ // Now access index and data block
+ ASSERT_EQ("value", Get("foo"));
+ expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+ if (partition) {
+ // top-level
+ expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+ }
+ expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+ // Enough to force a miss
+ env_->MockSleepForSeconds(601);
+ // But inject a simulated long scan so that we need a longer
+ // interval to force a miss next time.
+ SyncPoint::GetInstance()->SetCallBack(
+ "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
+ [this](void*) {
+ // To spend no more than 0.2% of time scanning, we would need an
+ // interval of at least 10000s
+ env_->MockSleepForSeconds(20);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+ prev_expected = expected;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // The same for the other file
+ ASSERT_EQ("value", Get("zfoo"));
+ expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+ if (partition) {
+ // top-level
+ expected[static_cast<size_t>(CacheEntryRole::kIndexBlock)]++;
+ }
+ expected[static_cast<size_t>(CacheEntryRole::kDataBlock)]++;
+ // Because of the simulated long scan, this is not enough to force
+ // a miss
+ env_->MockSleepForSeconds(601);
+ EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+ // But this is enough
+ env_->MockSleepForSeconds(10000);
+ EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+ prev_expected = expected;
+
+ // Also check the GetProperty interface
+ std::map<std::string, std::string> values;
+ ASSERT_TRUE(
+ db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+
+ for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+ auto role = static_cast<CacheEntryRole>(i);
+ EXPECT_EQ(std::to_string(expected[i]),
+ values[BlockCacheEntryStatsMapKeys::EntryCount(role)]);
+ }
+
+ // Add one for kWriteBuffer
+ {
+ WriteBufferManager wbm(size_t{1} << 20, cache);
+ wbm.ReserveMem(1024);
+ expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]++;
+ // Now we check that the GetProperty interface is more aggressive about
+ // re-scanning stats, but not totally aggressive.
+ // Within some time window, we will get cached entry stats
+ env_->MockSleepForSeconds(1);
+ EXPECT_EQ(std::to_string(prev_expected[static_cast<size_t>(
+ CacheEntryRole::kWriteBuffer)]),
+ values[BlockCacheEntryStatsMapKeys::EntryCount(
+ CacheEntryRole::kWriteBuffer)]);
+ // Not enough for a "background" miss but enough for a "foreground" miss
+ env_->MockSleepForSeconds(45);
+
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats,
+ &values));
+ EXPECT_EQ(
+ std::to_string(
+ expected[static_cast<size_t>(CacheEntryRole::kWriteBuffer)]),
+ values[BlockCacheEntryStatsMapKeys::EntryCount(
+ CacheEntryRole::kWriteBuffer)]);
+ }
+ prev_expected = expected;
+
+ // With the collector pinned in the cache, we should be able to hit
+ // even if the cache is full
+ ClearCache(cache.get());
+ Cache::Handle* h = nullptr;
+ if (strcmp(cache->Name(), "LRUCache") == 0) {
+ ASSERT_OK(cache->Insert("Fill-it-up", nullptr, capacity + 1,
+ GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
+ &h, Cache::Priority::HIGH));
+ } else {
+ // For ClockCache we use a 16-byte key.
+ ASSERT_OK(cache->Insert("Fill-it-up-xxxxx", nullptr, capacity + 1,
+ GetNoopDeleterForRole<CacheEntryRole::kMisc>(),
+ &h, Cache::Priority::HIGH));
+ }
+ ASSERT_GT(cache->GetUsage(), cache->GetCapacity());
+ expected = {};
+ // For CacheEntryStatsCollector
+ expected[static_cast<size_t>(CacheEntryRole::kMisc)] = 1;
+ // For Fill-it-up
+ expected[static_cast<size_t>(CacheEntryRole::kMisc)]++;
+ // Still able to hit on saved stats
+ EXPECT_EQ(prev_expected, GetCacheEntryRoleCountsBg());
+ // Enough to force a miss
+ env_->MockSleepForSeconds(1000);
+ EXPECT_EQ(expected, GetCacheEntryRoleCountsBg());
+
+ cache->Release(h);
+
+ // Now we test that the DB mutex is not held during scans, for the ways
+ // we know how to (possibly) trigger them. Without a better way to
+ // check this, we simply inject an acquire & release of the DB mutex
+ // deep in the stat collection code. If we were already holding the
+ // mutex, that would be undefined behavior, which TSAN would at least catch.
+ int scan_count = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries",
+ [this, &scan_count](void*) {
+ dbfull()->TEST_LockMutex();
+ dbfull()->TEST_UnlockMutex();
+ ++scan_count;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Different things that might trigger a scan, with mock sleeps to
+ // force a miss.
+ env_->MockSleepForSeconds(10000);
+ dbfull()->DumpStats();
+ ASSERT_EQ(scan_count, 1);
+
+ env_->MockSleepForSeconds(60);
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+ &values));
+ ASSERT_EQ(scan_count, 1);
+ ASSERT_TRUE(
+ db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+ ASSERT_EQ(scan_count, 2);
+
+ env_->MockSleepForSeconds(10000);
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kFastBlockCacheEntryStats,
+ &values));
+ ASSERT_EQ(scan_count, 3);
+
+ env_->MockSleepForSeconds(60);
+ std::string value_str;
+ ASSERT_TRUE(db_->GetProperty(DB::Properties::kFastBlockCacheEntryStats,
+ &value_str));
+ ASSERT_EQ(scan_count, 3);
+ ASSERT_TRUE(
+ db_->GetProperty(DB::Properties::kBlockCacheEntryStats, &value_str));
+ ASSERT_EQ(scan_count, 4);
+
+ env_->MockSleepForSeconds(10000);
+ ASSERT_TRUE(db_->GetProperty(DB::Properties::kFastBlockCacheEntryStats,
+ &value_str));
+ ASSERT_EQ(scan_count, 5);
+
+ ASSERT_TRUE(db_->GetProperty(DB::Properties::kCFStats, &value_str));
+ // To match historical speed, querying this property no longer triggers
+ // a scan, even if results are old. But periodic dump stats should keep
+ // things reasonably updated.
+ ASSERT_EQ(scan_count, /*unchanged*/ 5);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+ EXPECT_GE(iterations_tested, 1);
+ }
+}
+
+namespace {
+
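+// Fills `cache` to capacity with dummy entries of roughly `entry_size`,
+// keeping a guard handle on each one in `handles` so they stay referenced
+// and cannot be evicted.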
+void DummyFillCache(Cache& cache, size_t entry_size,
+ std::vector<CacheHandleGuard<void>>& handles) {
+ // fprintf(stderr, "Entry size: %zu\n", entry_size);
+ handles.clear();
+ cache.EraseUnRefEntries();
+ void* fake_value = &cache;
+ size_t capacity = cache.GetCapacity();
+ OffsetableCacheKey ck{"abc", "abc", 42};
+ for (size_t my_usage = 0; my_usage < capacity;) {
+ size_t charge = std::min(entry_size, capacity - my_usage);
+ Cache::Handle* handle;
+ Status st = cache.Insert(ck.WithOffset(my_usage).AsSlice(), fake_value,
+ charge, /*deleter*/ nullptr, &handle);
+ ASSERT_OK(st);
+ handles.emplace_back(&cache, handle);
+ my_usage += charge;
+ }
+}
+
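+// Counts INFO/WARN/ERROR log lines whose format string mentions
+// "HyperClockCache", so the test below can check what DumpStats() reports.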
+class CountingLogger : public Logger {
+ public:
+ ~CountingLogger() override {}
+ using Logger::Logv;
+ void Logv(const InfoLogLevel log_level, const char* format,
+ va_list /*ap*/) override {
+ if (std::strstr(format, "HyperClockCache") == nullptr) {
+ // Not a match
+ return;
+ }
+ // static StderrLogger debug;
+ // debug.Logv(log_level, format, ap);
+ if (log_level == InfoLogLevel::INFO_LEVEL) {
+ ++info_count_;
+ } else if (log_level == InfoLogLevel::WARN_LEVEL) {
+ ++warn_count_;
+ } else if (log_level == InfoLogLevel::ERROR_LEVEL) {
+ ++error_count_;
+ }
+ }
+
+ std::array<int, 3> PopCounts() {
+ std::array<int, 3> rv{{info_count_, warn_count_, error_count_}};
+ info_count_ = warn_count_ = error_count_ = 0;
+ return rv;
+ }
+
+ private:
+ int info_count_{};
+ int warn_count_{};
+ int error_count_{};
+};
+
+} // namespace
+
+TEST_F(DBBlockCacheTest, HyperClockCacheReportProblems) {
+ size_t capacity = 1024 * 1024;
+ size_t value_size_est = 8 * 1024;
+ HyperClockCacheOptions hcc_opts{capacity, value_size_est};
+ hcc_opts.num_shard_bits = 2; // 4 shards
+ hcc_opts.metadata_charge_policy = kDontChargeCacheMetadata;
+ std::shared_ptr<Cache> cache = hcc_opts.MakeSharedCache();
+ std::shared_ptr<CountingLogger> logger = std::make_shared<CountingLogger>();
+
+ auto table_options = GetTableOptions();
+ auto options = GetOptions(table_options);
+ table_options.block_cache = cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.info_log = logger;
+ // Going to sample more directly
+ options.stats_dump_period_sec = 0;
+ Reopen(options);
+
+ std::vector<CacheHandleGuard<void>> handles;
+
+ // Clear anything from DB startup
+ logger->PopCounts();
+
+ // Fill the cache based on the expected entry size and check that we
+ // don't report anything relevant in the periodic stats dump
+ DummyFillCache(*cache, value_size_est, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+ // Same, within reasonable bounds
+ DummyFillCache(*cache, value_size_est - value_size_est / 4, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+ DummyFillCache(*cache, value_size_est + value_size_est / 3, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 0}}));
+
+ // Estimate too high (value size too low) eventually reports ERROR
+ DummyFillCache(*cache, value_size_est / 2, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+ DummyFillCache(*cache, value_size_est / 3, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 0, 1}}));
+
+ // Estimate too low (value size too high) starts with INFO
+ // and is only WARNING in the worst case
+ DummyFillCache(*cache, value_size_est * 2, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{1, 0, 0}}));
+
+ DummyFillCache(*cache, value_size_est * 3, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+
+ DummyFillCache(*cache, value_size_est * 20, handles);
+ dbfull()->DumpStats();
+ EXPECT_EQ(logger->PopCounts(), (std::array<int, 3>{{0, 1, 0}}));
+}
+
+#endif // ROCKSDB_LITE
+
+class DBBlockCacheKeyTest
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ DBBlockCacheKeyTest()
+ : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {}
+
+ void SetUp() override {
+ use_compressed_cache_ = std::get<0>(GetParam());
+ exclude_file_numbers_ = std::get<1>(GetParam());
+ }
+
+ bool use_compressed_cache_;
+ bool exclude_file_numbers_;
+};
+
+// Disable LinkFile so that we can physically copy a DB using Checkpoint.
+// Disable file GetUniqueId to enable stable cache keys.
+class StableCacheKeyTestFS : public FaultInjectionTestFS {
+ public:
+ explicit StableCacheKeyTestFS(const std::shared_ptr<FileSystem>& base)
+ : FaultInjectionTestFS(base) {
+ SetFailGetUniqueId(true);
+ }
+
+ virtual ~StableCacheKeyTestFS() override {}
+
+ IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&,
+ IODebugContext*) override {
+ return IOStatus::NotSupported("Disabled");
+ }
+};
+
+TEST_P(DBBlockCacheKeyTest, StableCacheKeys) {
+ std::shared_ptr<StableCacheKeyTestFS> test_fs{
+ new StableCacheKeyTestFS(env_->GetFileSystem())};
+ std::unique_ptr<CompositeEnvWrapper> test_env{
+ new CompositeEnvWrapper(env_, test_fs)};
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.env = test_env.get();
+
+ // Corrupting the table properties corrupts the unique id.
+ // Ignore the unique id recorded in the manifest.
+ options.verify_sst_unique_id_in_manifest = false;
+
+ BlockBasedTableOptions table_options;
+
+ int key_count = 0;
+ uint64_t expected_stat = 0;
+
+ std::function<void()> verify_stats;
+ if (use_compressed_cache_) {
+ if (!Snappy_Supported()) {
+ ROCKSDB_GTEST_SKIP("Compressed cache test requires snappy support");
+ return;
+ }
+ options.compression = CompressionType::kSnappyCompression;
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false);
+ verify_stats = [&options, &expected_stat] {
+ // One for ordinary SST file and one for external SST file
+ ASSERT_EQ(expected_stat,
+ options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD));
+ };
+ } else {
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.block_cache = NewLRUCache(1 << 25, 0, false);
+ verify_stats = [&options, &expected_stat] {
+ ASSERT_EQ(expected_stat,
+ options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
+ ASSERT_EQ(expected_stat,
+ options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
+ ASSERT_EQ(expected_stat,
+ options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
+ };
+ }
+
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"koko"}, options);
+
+ if (exclude_file_numbers_) {
+ // Simulate something like old behavior without file numbers in properties.
+ // This is a "control" side of the test that also ensures safely degraded
+ // behavior on old files.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::BlockBasedTableBuilder:PreSetupBaseCacheKey",
+ [&](void* arg) {
+ TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+ props->orig_file_number = 0;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ }
+
+ std::function<void()> perform_gets = [&key_count, &expected_stat, this]() {
+ if (exclude_file_numbers_) {
+ // No cache key reuse should happen, because we can't rely on the current
+ // file number being stable
+ expected_stat += key_count;
+ } else {
+ // Cache keys should be stable
+ expected_stat = key_count;
+ }
+ for (int i = 0; i < key_count; ++i) {
+ ASSERT_EQ(Get(1, Key(i)), "abc");
+ }
+ };
+
+ // Ordinary SST files with same session id
+ const std::string something_compressible(500U, 'x');
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(Put(1, Key(key_count), "abc"));
+ ASSERT_OK(Put(1, Key(key_count) + "a", something_compressible));
+ ASSERT_OK(Flush(1));
+ ++key_count;
+ }
+
+#ifndef ROCKSDB_LITE
+ // Save an export of those ordinary SST files for later
+ std::string export_files_dir = dbname_ + "/exported";
+ ExportImportFilesMetaData* metadata_ptr_ = nullptr;
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+ checkpoint = nullptr;
+
+ // External SST files with same session id
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ std::vector<std::string> external;
+ for (int i = 0; i < 2; ++i) {
+ std::string f = dbname_ + "/external" + std::to_string(i) + ".sst";
+ external.push_back(f);
+ ASSERT_OK(sst_file_writer.Open(f));
+ ASSERT_OK(sst_file_writer.Put(Key(key_count), "abc"));
+ ASSERT_OK(
+ sst_file_writer.Put(Key(key_count) + "a", something_compressible));
+ ++key_count;
+ ExternalSstFileInfo external_info;
+ ASSERT_OK(sst_file_writer.Finish(&external_info));
+ IngestExternalFileOptions ingest_opts;
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {f}, ingest_opts));
+ }
+
+ if (exclude_file_numbers_) {
+ // FIXME(peterd): figure out where these extra ADDs are coming from
+ options.statistics->recordTick(BLOCK_CACHE_COMPRESSED_ADD,
+ uint64_t{0} - uint64_t{2});
+ }
+#endif
+
+ perform_gets();
+ verify_stats();
+
+ // Make sure we still get cache hits after re-open
+ ReopenWithColumnFamilies({"default", "koko"}, options);
+
+ perform_gets();
+ verify_stats();
+
+ // Make sure we still get cache hits even on a full copy of the DB. With
+ // StableCacheKeyTestFS, Checkpoint resorts to a full copy rather than a
+ // hard link. (Checkpoint is not available in LITE mode to test this.)
+#ifndef ROCKSDB_LITE
+ auto db_copy_name = dbname_ + "-copy";
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name));
+ delete checkpoint;
+
+ Close();
+ Destroy(options);
+
+ // Switch to the DB copy
+ SaveAndRestore<std::string> save_dbname(&dbname_, db_copy_name);
+ ReopenWithColumnFamilies({"default", "koko"}, options);
+
+ perform_gets();
+ verify_stats();
+
+ // And ensure that re-importing + ingesting the same files into a
+ // different DB uses the same cache keys
+ DestroyAndReopen(options);
+
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ *metadata_ptr_, &cfh));
+ ASSERT_NE(cfh, nullptr);
+ delete cfh;
+ cfh = nullptr;
+ delete metadata_ptr_;
+ metadata_ptr_ = nullptr;
+
+ ASSERT_OK(DestroyDB(export_files_dir, options));
+
+ ReopenWithColumnFamilies({"default", "yoyo"}, options);
+
+ IngestExternalFileOptions ingest_opts;
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {external}, ingest_opts));
+
+ perform_gets();
+ verify_stats();
+#endif // !ROCKSDB_LITE
+
+ Close();
+ Destroy(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class CacheKeyTest : public testing::Test {
+ public:
+ CacheKey GetBaseCacheKey() {
+ CacheKey rv = GetOffsetableCacheKey(0, /*min file_number*/ 1).WithOffset(0);
+ // Correct for file_number_ == 1
+ *reinterpret_cast<uint64_t*>(&rv) ^= ReverseBits(uint64_t{1});
+ return rv;
+ }
+ CacheKey GetCacheKey(uint64_t session_counter, uint64_t file_number,
+ uint64_t offset) {
+ OffsetableCacheKey offsetable =
+ GetOffsetableCacheKey(session_counter, file_number);
+ // * 4 to counteract optimization that strips lower 2 bits in encoding
+ // the offset in BlockBasedTable::GetCacheKey (which we prefer to include
+ // in unit tests to maximize functional coverage).
+ EXPECT_GE(offset * 4, offset); // no overflow
+ return BlockBasedTable::GetCacheKey(offsetable,
+ BlockHandle(offset * 4, /*size*/ 5));
+ }
+
+ protected:
+ OffsetableCacheKey GetOffsetableCacheKey(uint64_t session_counter,
+ uint64_t file_number) {
+ // Like SemiStructuredUniqueIdGen::GenerateNext
+ tp_.db_session_id = EncodeSessionId(base_session_upper_,
+ base_session_lower_ ^ session_counter);
+ tp_.db_id = std::to_string(db_id_);
+ tp_.orig_file_number = file_number;
+ bool is_stable;
+ std::string cur_session_id = ""; // ignored
+ uint64_t cur_file_number = 42; // ignored
+ OffsetableCacheKey rv;
+ BlockBasedTable::SetupBaseCacheKey(&tp_, cur_session_id, cur_file_number,
+ &rv, &is_stable);
+ EXPECT_TRUE(is_stable);
+ EXPECT_TRUE(!rv.IsEmpty());
+ // BEGIN some assertions in relation to SST unique IDs
+ std::string external_unique_id_str;
+ EXPECT_OK(GetUniqueIdFromTableProperties(tp_, &external_unique_id_str));
+ UniqueId64x2 sst_unique_id = {};
+ EXPECT_OK(DecodeUniqueIdBytes(external_unique_id_str, &sst_unique_id));
+ ExternalUniqueIdToInternal(&sst_unique_id);
+ OffsetableCacheKey ock =
+ OffsetableCacheKey::FromInternalUniqueId(&sst_unique_id);
+ EXPECT_EQ(rv.WithOffset(0).AsSlice(), ock.WithOffset(0).AsSlice());
+ EXPECT_EQ(ock.ToInternalUniqueId(), sst_unique_id);
+ // END some assertions in relation to SST unique IDs
+ return rv;
+ }
+
+ TableProperties tp_;
+ uint64_t base_session_upper_ = 0;
+ uint64_t base_session_lower_ = 0;
+ uint64_t db_id_ = 0;
+};
+
+TEST_F(CacheKeyTest, DBImplSessionIdStructure) {
+ // We have to generate our own session IDs for simulation purposes in other
+ // tests. Here we verify that the DBImpl implementation appears to match
+ // our construction here, which uses the lowest XORed-in bits as the
+ // "session counter."
+ std::string session_id1 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
+ std::string session_id2 = DBImpl::GenerateDbSessionId(/*env*/ nullptr);
+ uint64_t upper1, upper2, lower1, lower2;
+ ASSERT_OK(DecodeSessionId(session_id1, &upper1, &lower1));
+ ASSERT_OK(DecodeSessionId(session_id2, &upper2, &lower2));
+ // Because generated in same process
+ ASSERT_EQ(upper1, upper2);
+ // Unless we generate > 4 billion session IDs in this process...
+ ASSERT_EQ(Upper32of64(lower1), Upper32of64(lower2));
+ // But they must be different somewhere
+ ASSERT_NE(Lower32of64(lower1), Lower32of64(lower2));
+}
+
+namespace {
+// Deconstruct cache key, based on knowledge of implementation details.
+void DeconstructNonemptyCacheKey(const CacheKey& key, uint64_t* file_num_etc64,
+ uint64_t* offset_etc64) {
+ *file_num_etc64 = *reinterpret_cast<const uint64_t*>(key.AsSlice().data());
+ *offset_etc64 = *reinterpret_cast<const uint64_t*>(key.AsSlice().data() + 8);
+ assert(*file_num_etc64 != 0);
+ if (*offset_etc64 == 0) {
+ std::swap(*file_num_etc64, *offset_etc64);
+ }
+ assert(*offset_etc64 != 0);
+}
+
+// Make a bit mask of 0 to 64 bits
+uint64_t MakeMask64(int bits) {
+ if (bits >= 64) {
+ return uint64_t{0} - 1;
+ } else {
+ return (uint64_t{1} << bits) - 1;
+ }
+}
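+// For example (quick sanity sketch, not exercised by the tests below):
+//   MakeMask64(0) == 0, MakeMask64(3) == 0x7, MakeMask64(64) == ~uint64_t{0}.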
+
+// See CacheKeyTest::Encodings
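+// Given a base cache key, bit-width bounds on each field, and an encoded
+// cache key, recovers the session counter, file number, and offset.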
+struct CacheKeyDecoder {
+ // Inputs
+ uint64_t base_file_num_etc64, base_offset_etc64;
+ int session_counter_bits, file_number_bits, offset_bits;
+
+ // Derived
+ uint64_t session_counter_mask, file_number_mask, offset_mask;
+
+ // Outputs
+ uint64_t decoded_session_counter, decoded_file_num, decoded_offset;
+
+ void SetBaseCacheKey(const CacheKey& base) {
+ DeconstructNonemptyCacheKey(base, &base_file_num_etc64, &base_offset_etc64);
+ }
+
+ void SetRanges(int _session_counter_bits, int _file_number_bits,
+ int _offset_bits) {
+ session_counter_bits = _session_counter_bits;
+ session_counter_mask = MakeMask64(session_counter_bits);
+ file_number_bits = _file_number_bits;
+ file_number_mask = MakeMask64(file_number_bits);
+ offset_bits = _offset_bits;
+ offset_mask = MakeMask64(offset_bits);
+ }
+
+ void Decode(const CacheKey& key) {
+ uint64_t file_num_etc64, offset_etc64;
+ DeconstructNonemptyCacheKey(key, &file_num_etc64, &offset_etc64);
+
+ // First decode session counter
+ if (offset_bits + session_counter_bits <= 64) {
+ // fully recoverable from offset_etc64
+ decoded_session_counter =
+ ReverseBits((offset_etc64 ^ base_offset_etc64)) &
+ session_counter_mask;
+ } else if (file_number_bits + session_counter_bits <= 64) {
+ // fully recoverable from file_num_etc64
+ decoded_session_counter = DownwardInvolution(
+ (file_num_etc64 ^ base_file_num_etc64) & session_counter_mask);
+ } else {
+ // Need to combine parts from each word.
+ // Piece1 will contain some correct prefix of the bottom bits of
+ // session counter.
+ uint64_t piece1 =
+ ReverseBits((offset_etc64 ^ base_offset_etc64) & ~offset_mask);
+ int piece1_bits = 64 - offset_bits;
+ // Piece2 will contain involuted bits that we can combine with piece1
+ // to infer the rest of the session counter
+ int piece2_bits = std::min(64 - file_number_bits, 64 - piece1_bits);
+ ASSERT_LT(piece2_bits, 64);
+ uint64_t piece2_mask = MakeMask64(piece2_bits);
+ uint64_t piece2 = (file_num_etc64 ^ base_file_num_etc64) & piece2_mask;
+
+ // Cancel out the part of piece2 that we can infer from piece1
+ // (DownwardInvolution distributes over xor)
+ piece2 ^= DownwardInvolution(piece1) & piece2_mask;
+
+ // Now we need to solve for the unknown original bits in higher
+ // positions than piece1 provides. We use Gaussian elimination
+ // because we know that a piece2_bits X piece2_bits submatrix of
+ // the matrix underlying DownwardInvolution times the vector of
+ // unknown original bits equals piece2.
+ //
+ // Build an augmented row matrix for that submatrix, built column by
+ // column.
+ std::array<uint64_t, 64> aug_rows{};
+ for (int i = 0; i < piece2_bits; ++i) { // over columns
+ uint64_t col_i = DownwardInvolution(uint64_t{1} << piece1_bits << i);
+ ASSERT_NE(col_i & 1U, 0);
+ for (int j = 0; j < piece2_bits; ++j) { // over rows
+ aug_rows[j] |= (col_i & 1U) << i;
+ col_i >>= 1;
+ }
+ }
+ // Augment with right hand side
+ for (int j = 0; j < piece2_bits; ++j) { // over rows
+ aug_rows[j] |= (piece2 & 1U) << piece2_bits;
+ piece2 >>= 1;
+ }
+ // Run Gaussian elimination
+ for (int i = 0; i < piece2_bits; ++i) { // over columns
+ // Find a row that can be used to cancel others
+ uint64_t canceller = 0;
+ // Note: Rows 0 through i-1 contain 1s in columns already eliminated
+ for (int j = i; j < piece2_bits; ++j) { // over rows
+ if (aug_rows[j] & (uint64_t{1} << i)) {
+ // Swap into appropriate row
+ std::swap(aug_rows[i], aug_rows[j]);
+ // Keep a handy copy for row reductions
+ canceller = aug_rows[i];
+ break;
+ }
+ }
+ ASSERT_NE(canceller, 0);
+ for (int j = 0; j < piece2_bits; ++j) { // over rows
+ if (i != j && ((aug_rows[j] >> i) & 1) != 0) {
+ // Row reduction
+ aug_rows[j] ^= canceller;
+ }
+ }
+ }
+ // Extract result
+ decoded_session_counter = piece1;
+ for (int j = 0; j < piece2_bits; ++j) { // over rows
+ ASSERT_EQ(aug_rows[j] & piece2_mask, uint64_t{1} << j);
+ decoded_session_counter |= aug_rows[j] >> piece2_bits << piece1_bits
+ << j;
+ }
+ }
+
+ decoded_offset =
+ offset_etc64 ^ base_offset_etc64 ^ ReverseBits(decoded_session_counter);
+
+ decoded_file_num = ReverseBits(file_num_etc64 ^ base_file_num_etc64 ^
+ DownwardInvolution(decoded_session_counter));
+ }
+};
+} // anonymous namespace
+
+TEST_F(CacheKeyTest, Encodings) {
+ // This test primarily verifies this claim from cache_key.cc:
+ // // In fact, if DB ids were not involved, we would be guaranteed unique
+ // // cache keys for files generated in a single process until total bits for
+ // // biggest session_id_counter, orig_file_number, and offset_in_file
+ // // reach 128 bits.
+ //
+ // To demonstrate this, CacheKeyDecoder can reconstruct the structured inputs
+ // to the cache key when provided an output cache key, the unstructured
+ // inputs, and bounds on the structured inputs.
+ //
+ // See OffsetableCacheKey comments in cache_key.cc.
+
+ // We are going to randomly initialize some values that *should* not affect
+ // the result
+ Random64 r{std::random_device{}()};
+
+ CacheKeyDecoder decoder;
+ db_id_ = r.Next();
+ base_session_upper_ = r.Next();
+ base_session_lower_ = r.Next();
+ if (base_session_lower_ == 0) {
+ base_session_lower_ = 1;
+ }
+
+ decoder.SetBaseCacheKey(GetBaseCacheKey());
+
+ // Loop over configurations and test those
+ for (int session_counter_bits = 0; session_counter_bits <= 64;
+ ++session_counter_bits) {
+ for (int file_number_bits = 1; file_number_bits <= 64; ++file_number_bits) {
+ // 62 bits max because unoptimized offset will be 64 bits in that case
+ for (int offset_bits = 0; offset_bits <= 62; ++offset_bits) {
+ if (session_counter_bits + file_number_bits + offset_bits > 128) {
+ break;
+ }
+
+ decoder.SetRanges(session_counter_bits, file_number_bits, offset_bits);
+
+ uint64_t session_counter = r.Next() & decoder.session_counter_mask;
+ uint64_t file_number = r.Next() & decoder.file_number_mask;
+ if (file_number == 0) {
+ // Minimum
+ file_number = 1;
+ }
+ uint64_t offset = r.Next() & decoder.offset_mask;
+ decoder.Decode(GetCacheKey(session_counter, file_number, offset));
+
+ EXPECT_EQ(decoder.decoded_session_counter, session_counter);
+ EXPECT_EQ(decoder.decoded_file_num, file_number);
+ EXPECT_EQ(decoder.decoded_offset, offset);
+ }
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(DBBlockCacheKeyTest, DBBlockCacheKeyTest,
+ ::testing::Combine(::testing::Bool(),
+ ::testing::Bool()));
+
+class DBBlockCachePinningTest
+ : public DBTestBase,
+ public testing::WithParamInterface<
+ std::tuple<bool, PinningTier, PinningTier, PinningTier>> {
+ public:
+ DBBlockCachePinningTest()
+ : DBTestBase("db_block_cache_test", /*env_do_fsync=*/false) {}
+
+ void SetUp() override {
+ partition_index_and_filters_ = std::get<0>(GetParam());
+ top_level_index_pinning_ = std::get<1>(GetParam());
+ partition_pinning_ = std::get<2>(GetParam());
+ unpartitioned_pinning_ = std::get<3>(GetParam());
+ }
+
+ bool partition_index_and_filters_;
+ PinningTier top_level_index_pinning_;
+ PinningTier partition_pinning_;
+ PinningTier unpartitioned_pinning_;
+};
+
+TEST_P(DBBlockCachePinningTest, TwoLevelDB) {
+ // Creates one file in L0 and one file in L1. Both files have enough data that
+ // their index and filter blocks are partitioned. The L1 file will also have
+ // a compression dictionary (those are trained only during compaction), which
+ // must be unpartitioned.
+ const int kKeySize = 32;
+ const int kBlockSize = 128;
+ const int kNumBlocksPerFile = 128;
+ const int kNumKeysPerFile = kBlockSize * kNumBlocksPerFile / kKeySize;
+
+ Options options = CurrentOptions();
+ // `kNoCompression` makes the unit test more portable. But it relies on the
+ // current behavior of persisting/accessing the dictionary even when there's
+ // no (de)compression happening, which seems fairly likely to change over time.
+ options.compression = kNoCompression;
+ options.compression_opts.max_dict_bytes = 4 << 10;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = NewLRUCache(1 << 20 /* capacity */);
+ table_options.block_size = kBlockSize;
+ table_options.metadata_block_size = kBlockSize;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.metadata_cache_options.top_level_index_pinning =
+ top_level_index_pinning_;
+ table_options.metadata_cache_options.partition_pinning = partition_pinning_;
+ table_options.metadata_cache_options.unpartitioned_pinning =
+ unpartitioned_pinning_;
+ table_options.filter_policy.reset(
+ NewBloomFilterPolicy(10 /* bits_per_key */));
+ if (partition_index_and_filters_) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ table_options.partition_filters = true;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kKeySize)));
+ }
+ ASSERT_OK(Flush());
+ if (i == 0) {
+ // Prevent trivial move so file will be rewritten with dictionary and
+ // reopened with L1's pinning settings.
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+ }
+
+ // Clear all unpinned blocks so unpinned blocks will show up as cache misses
+ // when reading a key from a file.
+ table_options.block_cache->EraseUnRefEntries();
+
+ // Get base cache values
+ uint64_t filter_misses = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ uint64_t index_misses = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ uint64_t compression_dict_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS);
+
+ // Read a key from the L0 file
+ Get(Key(kNumKeysPerFile));
+ uint64_t expected_filter_misses = filter_misses;
+ uint64_t expected_index_misses = index_misses;
+ uint64_t expected_compression_dict_misses = compression_dict_misses;
+ if (partition_index_and_filters_) {
+ if (top_level_index_pinning_ == PinningTier::kNone) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ if (partition_pinning_ == PinningTier::kNone) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ } else {
+ if (unpartitioned_pinning_ == PinningTier::kNone) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ }
+ if (unpartitioned_pinning_ == PinningTier::kNone) {
+ ++expected_compression_dict_misses;
+ }
+ ASSERT_EQ(expected_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(expected_index_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(expected_compression_dict_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS));
+
+ // Clear all unpinned blocks so unpinned blocks will show up as cache misses
+ // when reading a key from a file.
+ table_options.block_cache->EraseUnRefEntries();
+
+ // Read a key from the L1 file
+ Get(Key(0));
+ if (partition_index_and_filters_) {
+ if (top_level_index_pinning_ == PinningTier::kNone ||
+ top_level_index_pinning_ == PinningTier::kFlushedAndSimilar) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ if (partition_pinning_ == PinningTier::kNone ||
+ partition_pinning_ == PinningTier::kFlushedAndSimilar) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ } else {
+ if (unpartitioned_pinning_ == PinningTier::kNone ||
+ unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) {
+ ++expected_filter_misses;
+ ++expected_index_misses;
+ }
+ }
+ if (unpartitioned_pinning_ == PinningTier::kNone ||
+ unpartitioned_pinning_ == PinningTier::kFlushedAndSimilar) {
+ ++expected_compression_dict_misses;
+ }
+ ASSERT_EQ(expected_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(expected_index_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(expected_compression_dict_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_MISS));
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DBBlockCachePinningTest, DBBlockCachePinningTest,
+ ::testing::Combine(
+ ::testing::Bool(),
+ ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar,
+ PinningTier::kAll),
+ ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar,
+ PinningTier::kAll),
+ ::testing::Values(PinningTier::kNone, PinningTier::kFlushedAndSimilar,
+ PinningTier::kAll)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_bloom_filter_test.cc b/src/rocksdb/db/db_bloom_filter_test.cc
new file mode 100644
index 000000000..d68ab6115
--- /dev/null
+++ b/src/rocksdb/db/db_bloom_filter_test.cc
@@ -0,0 +1,3498 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cstring>
+#include <iomanip>
+#include <sstream>
+#include <string>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_reservation_manager.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/filter_policy_internal.h"
+#include "table/format.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
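+// Constructs, by name, the Bloom-like filter policy variants exercised by
+// these tests.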
+std::shared_ptr<const FilterPolicy> Create(double bits_per_key,
+ const std::string& name) {
+ return BloomLikeFilterPolicy::Create(name, bits_per_key);
+}
+const std::string kLegacyBloom = test::LegacyBloomFilterPolicy::kClassName();
+const std::string kFastLocalBloom =
+ test::FastLocalBloomFilterPolicy::kClassName();
+const std::string kStandard128Ribbon =
+ test::Standard128RibbonFilterPolicy::kClassName();
+const std::string kAutoBloom = BloomFilterPolicy::kClassName();
+const std::string kAutoRibbon = RibbonFilterPolicy::kClassName();
+} // anonymous namespace
+
+// DB tests related to bloom filter.
+
+class DBBloomFilterTest : public DBTestBase {
+ public:
+ DBBloomFilterTest()
+ : DBTestBase("db_bloom_filter_test", /*env_do_fsync=*/true) {}
+};
+
+class DBBloomFilterTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<
+ std::tuple<std::string, bool, uint32_t>> {
+ // public testing::WithParamInterface<bool> {
+ protected:
+ std::string bfp_impl_;
+ bool partition_filters_;
+ uint32_t format_version_;
+
+ public:
+ DBBloomFilterTestWithParam()
+ : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {}
+
+ ~DBBloomFilterTestWithParam() override {}
+
+ void SetUp() override {
+ bfp_impl_ = std::get<0>(GetParam());
+ partition_filters_ = std::get<1>(GetParam());
+ format_version_ = std::get<2>(GetParam());
+ }
+};
+
+class DBBloomFilterTestDefFormatVersion : public DBBloomFilterTestWithParam {};
+
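+// A prefix extractor whose domain is limited to keys of at least 5 bytes;
+// the prefix is always the first 5 bytes of the key.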
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+ const char* Name() const override {
+ return "SliceTransformLimitedDomainGeneric";
+ }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 5);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ // prefix will be x????
+ return src.size() >= 5;
+ }
+
+ bool InRange(const Slice& dst) const override {
+ // prefix will be x????
+ return dst.size() == 5;
+ }
+};
+
+// KeyMayExist can lead to a few false positives, but not false negatives.
+// To make the test deterministic, use a much larger number of bits per key
+// (20) than bits in the key, so that false positives are eliminated.
+TEST_P(DBBloomFilterTestDefFormatVersion, KeyMayExist) {
+ do {
+ ReadOptions ropts;
+ std::string value;
+ anon::OptionsOverride options_override;
+ options_override.filter_policy = Create(20, bfp_impl_);
+ options_override.partition_filters = partition_filters_;
+ options_override.metadata_block_size = 32;
+ options_override.full_block_cache = true;
+ Options options = CurrentOptions(options_override);
+ if (partition_filters_) {
+ auto* table_options =
+ options.table_factory->GetOptions<BlockBasedTableOptions>();
+ if (table_options != nullptr &&
+ table_options->index_type !=
+ BlockBasedTableOptions::kTwoLevelIndexSearch) {
+ // In the current implementation partitioned filters depend on
+ // partitioned indexes
+ continue;
+ }
+ }
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+
+ ASSERT_OK(Put(1, "a", "b"));
+ bool value_found = false;
+ ASSERT_TRUE(
+ db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+ ASSERT_TRUE(value_found);
+ ASSERT_EQ("b", value);
+
+ ASSERT_OK(Flush(1));
+ value.clear();
+
+ uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(
+ db_->KeyMayExist(ropts, handles_[1], "a", &value, &value_found));
+ ASSERT_TRUE(!value_found);
+ // assert that no new files were opened and no new blocks were
+ // read into block cache.
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Delete(1, "a"));
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow trivial move */));
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ ASSERT_OK(Delete(1, "c"));
+
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "c", &value));
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ // The KeyMayExist function only checks data in the block cache, which is
+ // not used by the plain table format.
+ } while (
+ ChangeOptions(kSkipPlainTable | kSkipHashIndex | kSkipFIFOCompaction));
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloomCustomPrefixExtractor) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor =
+ std::make_shared<SliceTransformLimitedDomainGeneric>();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+ ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+ ASSERT_OK(dbfull()->Flush(fo));
+
+ ASSERT_EQ("foo", Get("barbarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ("foo2", Get("barbarbar2"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ(
+ 0,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ(
+ 2,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ ro.total_order_seek = true;
+ // NOTE: total_order_seek no longer affects Get()
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ(
+ 3,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ // No bloom on extractor changed
+#ifndef ROCKSDB_LITE
+ ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}}));
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ(
+ 3,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+#endif // ROCKSDB_LITE
+
+ // No bloom on extractor changed, after re-open
+ options.prefix_extractor.reset(NewCappedPrefixTransform(10));
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ(
+ 3,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_F(DBBloomFilterTest, GetFilterByPrefixBloom) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo"));
+ ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2"));
+ ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar"));
+
+ ASSERT_OK(dbfull()->Flush(fo));
+
+ ASSERT_EQ("foo", Get("barbarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("foo2", Get("barbarbar2"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("NOT_FOUND", Get("barbarbar3"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+ ASSERT_EQ("NOT_FOUND", Get("barfoofoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+
+ ro.total_order_seek = true;
+ // NOTE: total_order_seek no longer affects Get()
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ(
+ 3,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+
+ // No bloom on extractor changed
+#ifndef ROCKSDB_LITE
+ ASSERT_OK(db_->SetOptions({{"prefix_extractor", "capped:10"}}));
+ ASSERT_EQ("NOT_FOUND", Get("foobarbar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ(
+ 3,
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+#endif // ROCKSDB_LITE
+
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_F(DBBloomFilterTest, WholeKeyFilterProp) {
+ for (bool partition_filters : {true, false}) {
+ Options options = last_options_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = false;
+ if (partition_filters) {
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ ReadOptions ro;
+ FlushOptions fo;
+ fo.wait = true;
+ std::string value;
+
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+ // Need to insert some keys to make sure files are not filtered out by key
+ // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ ASSERT_OK(dbfull()->Flush(fo));
+
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+ // Reopen with whole key filtering enabled and the prefix extractor set to
+ // NULL. Bloom filtering should be off for both whole key and prefix
+ // bloom.
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.prefix_extractor.reset();
+ Reopen(options);
+
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ // Write DB with only full key filtering.
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+ // Need to insert some keys to make sure files are not filtered out by key
+ // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Reopen with whole key filtering off and the prefix extractor enabled.
+ // Still, no bloom filter should be used.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+
+ // Try to create a DB with mixed files:
+ ASSERT_OK(dbfull()->Put(wo, "foobar", "foo"));
+ // Need to insert some keys to make sure files are not filtered out by key
+ // ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ options.prefix_extractor.reset();
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ // Try to create a DB with mixed files.
+ ASSERT_OK(dbfull()->Put(wo, "barfoo", "bar"));
+ // In this case we need to insert some keys to make sure files are
+ // not filtered out by key ranges.
+ ASSERT_OK(dbfull()->Put(wo, "aaa", ""));
+ ASSERT_OK(dbfull()->Put(wo, "zzz", ""));
+ ASSERT_OK(Flush());
+
+ // Now we have two files:
+ // File 1: An older file with prefix bloom.
+ // File 2: A newer file with a whole key bloom filter.
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 3);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+
+ // Reopen with the same setting: only whole key filtering is used
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 4);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 5);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 6);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+
+ // Restart with both filters allowed
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 7);
+ // File 1 will have it filtered out.
+ // File 2 will not, as the prefix `foo` exists in the file.
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 8);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 10);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+
+ // Restart with only prefix bloom allowed.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 11);
+ ASSERT_EQ("NOT_FOUND", Get("bar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ ASSERT_EQ("foo", Get("foobar"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ ASSERT_EQ("bar", Get("barfoo"));
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 12);
+ uint64_t bloom_filter_useful_all_levels = 0;
+ for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+ if (kv.second.bloom_filter_useful > 0) {
+ bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+ }
+ }
+ ASSERT_EQ(12, bloom_filter_useful_all_levels);
+ get_perf_context()->Reset();
+ }
+}
+
+TEST_P(DBBloomFilterTestWithParam, BloomFilter) {
+ do {
+ Options options = CurrentOptions();
+ env_->count_random_reads_ = true;
+ options.env = env_;
+ // ChangeCompactOptions() only changes compaction style, which does not
+ // trigger reset of table_factory
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.filter_policy = Create(10, bfp_impl_);
+ table_options.partition_filters = partition_filters_;
+ if (partition_filters_) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.format_version = format_version_;
+ if (format_version_ >= 4) {
+ // value delta encoding is challenged more with an index interval > 1
+ table_options.index_block_restart_interval = 8;
+ }
+ table_options.metadata_block_size = 32;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Populate multiple layers
+ const int N = 10000;
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ Compact(1, "a", "z");
+ for (int i = 0; i < N; i += 100) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ ASSERT_OK(Flush(1));
+
+ // Prevent auto compactions triggered by seeks
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+
+ // Lookup present keys. Should rarely read from small sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ int reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d present => %d reads\n", N, reads);
+ ASSERT_GE(reads, N);
+ if (partition_filters_) {
+ // Without a block cache, we read an extra partition filter per level per
+ // read, plus a partition index per read
+ ASSERT_LE(reads, 4 * N + 2 * N / 100);
+ } else {
+ ASSERT_LE(reads, N + 2 * N / 100);
+ }
+
+ // Lookup missing keys. Should rarely read from either sstable.
+ env_->random_read_counter_.Reset();
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i) + ".missing"));
+ }
+ reads = env_->random_read_counter_.Read();
+ fprintf(stderr, "%d missing => %d reads\n", N, reads);
+ if (partition_filters_) {
+ // With partitioned filters we read one extra filter per level for each
+ // missed read.
+ ASSERT_LE(reads, 2 * N + 3 * N / 100);
+ } else {
+ ASSERT_LE(reads, 3 * N / 100);
+ }
+
+#ifndef ROCKSDB_LITE
+ // Sanity check some table properties
+ std::map<std::string, std::string> props;
+ ASSERT_TRUE(db_->GetMapProperty(
+ handles_[1], DB::Properties::kAggregatedTableProperties, &props));
+ uint64_t nkeys = N + N / 100;
+ uint64_t filter_size = ParseUint64(props["filter_size"]);
+ EXPECT_LE(filter_size,
+ (partition_filters_ ? 12 : 11) * nkeys / /*bits / byte*/ 8);
+ if (bfp_impl_ == kAutoRibbon) {
+ // Sometimes using Ribbon filter which is more space-efficient
+ EXPECT_GE(filter_size, 7 * nkeys / /*bits / byte*/ 8);
+ } else {
+ // Always Bloom
+ EXPECT_GE(filter_size, 10 * nkeys / /*bits / byte*/ 8);
+ }
+
+ uint64_t num_filter_entries = ParseUint64(props["num_filter_entries"]);
+ EXPECT_EQ(num_filter_entries, nkeys);
+#endif // ROCKSDB_LITE
+
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+ Close();
+ } while (ChangeCompactOptions());
+}
+
+namespace {
+
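+// Builds a degenerate filter that ignores added keys and always answers
+// "may exist" (100% false positive rate); see Finish() below.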
+class AlwaysTrueBitsBuilder : public FilterBitsBuilder {
+ public:
+ void AddKey(const Slice&) override {}
+ size_t EstimateEntriesAdded() override { return 0U; }
+ Slice Finish(std::unique_ptr<const char[]>* /* buf */) override {
+ // Interpreted as "always true" filter (0 probes over 1 byte of
+ // payload, 5 bytes metadata)
+ return Slice("\0\0\0\0\0\0", 6);
+ }
+ using FilterBitsBuilder::Finish;
+ size_t ApproximateNumEntries(size_t) override { return SIZE_MAX; }
+};
+
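+// With skip == true, returns no builder so no filter block is written at all;
+// with skip == false, writes an AlwaysTrueBitsBuilder filter that is read but
+// never filters anything out.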
+class AlwaysTrueFilterPolicy : public ReadOnlyBuiltinFilterPolicy {
+ public:
+ explicit AlwaysTrueFilterPolicy(bool skip) : skip_(skip) {}
+
+ FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext&) const override {
+ if (skip_) {
+ return nullptr;
+ } else {
+ return new AlwaysTrueBitsBuilder();
+ }
+ }
+
+ private:
+ bool skip_;
+};
+
+} // anonymous namespace
+
+TEST_P(DBBloomFilterTestWithParam, SkipFilterOnEssentiallyZeroBpk) {
+ constexpr int maxKey = 10;
+ auto PutFn = [&]() {
+ int i;
+ // Put
+ for (i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+ Flush();
+ };
+ auto GetFn = [&]() {
+ int i;
+ // Get OK
+ for (i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(Key(i)));
+ }
+ // Get NotFound
+ for (; i < maxKey * 2; i++) {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ };
+ auto PutAndGetFn = [&]() {
+ PutFn();
+ GetFn();
+ };
+#ifndef ROCKSDB_LITE
+ std::map<std::string, std::string> props;
+ const auto& kAggTableProps = DB::Properties::kAggregatedTableProperties;
+#endif // ROCKSDB_LITE
+
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.partition_filters = partition_filters_;
+ if (partition_filters_) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.format_version = format_version_;
+
+ // Test 1: bits per key < 0.5 means skip filters -> no filter
+ // constructed or read.
+ table_options.filter_policy = Create(0.4, bfp_impl_);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ PutAndGetFn();
+
+ // Verify no filter access nor construction
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0);
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0);
+
+#ifndef ROCKSDB_LITE
+ props.clear();
+ ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+ EXPECT_EQ(props["filter_size"], "0");
+#endif // ROCKSDB_LITE
+
+ // Test 2: use custom API to skip filters -> no filter constructed
+ // or read.
+ table_options.filter_policy.reset(
+ new AlwaysTrueFilterPolicy(/* skip */ true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ PutAndGetFn();
+
+ // Verify no filter access nor construction
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0);
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0);
+
+#ifndef ROCKSDB_LITE
+ props.clear();
+ ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+ EXPECT_EQ(props["filter_size"], "0");
+#endif // ROCKSDB_LITE
+
+ // Control test: using an actual filter with 100% FP rate -> the filter
+ // is constructed and checked on read.
+ table_options.filter_policy.reset(
+ new AlwaysTrueFilterPolicy(/* skip */ false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ PutAndGetFn();
+
+ // Verify filter is accessed (and constructed)
+ EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE),
+ maxKey * 2);
+ EXPECT_EQ(
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+ maxKey);
+#ifndef ROCKSDB_LITE
+ props.clear();
+ ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+ EXPECT_NE(props["filter_size"], "0");
+#endif // ROCKSDB_LITE
+
+ // Test 3 (options test): Able to read existing filters with longstanding
+ // generated options file entry `filter_policy=rocksdb.BuiltinBloomFilter`
+ ASSERT_OK(FilterPolicy::CreateFromString(ConfigOptions(),
+ "rocksdb.BuiltinBloomFilter",
+ &table_options.filter_policy));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ GetFn();
+
+ // Verify filter is accessed
+ EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE),
+ maxKey * 2);
+ EXPECT_EQ(
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+ maxKey);
+
+ // But new filters are not generated (configuration details unknown)
+ DestroyAndReopen(options);
+ PutAndGetFn();
+
+ // Verify no filter access nor construction
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE), 0);
+ EXPECT_EQ(TestGetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE), 0);
+
+#ifndef ROCKSDB_LITE
+ props.clear();
+ ASSERT_TRUE(db_->GetMapProperty(kAggTableProps, &props));
+ EXPECT_EQ(props["filter_size"], "0");
+#endif // ROCKSDB_LITE
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, DBBloomFilterTestDefFormatVersion,
+ ::testing::Values(
+ std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion),
+ std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion),
+ std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+ FormatDef, DBBloomFilterTestWithParam,
+ ::testing::Values(
+ std::make_tuple(kAutoBloom, true, test::kDefaultFormatVersion),
+ std::make_tuple(kAutoBloom, false, test::kDefaultFormatVersion),
+ std::make_tuple(kAutoRibbon, false, test::kDefaultFormatVersion)));
+
+INSTANTIATE_TEST_CASE_P(
+ FormatLatest, DBBloomFilterTestWithParam,
+ ::testing::Values(std::make_tuple(kAutoBloom, true, kLatestFormatVersion),
+ std::make_tuple(kAutoBloom, false, kLatestFormatVersion),
+ std::make_tuple(kAutoRibbon, false,
+ kLatestFormatVersion)));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(DBBloomFilterTest, BloomFilterRate) {
+ while (ChangeFilterOptions()) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->EnablePerLevelPerfContext();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ // Add a large key to make the file contain a wide range
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+
+ // Check if they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+
+ // Check if filter is useful
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey * 0.98);
+ ASSERT_GE(
+ (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful,
+ maxKey * 0.98);
+ get_perf_context()->Reset();
+ }
+}
+
+namespace {
+struct CompatibilityConfig {
+ std::shared_ptr<const FilterPolicy> policy;
+ bool partitioned;
+ uint32_t format_version;
+
+ void SetInTableOptions(BlockBasedTableOptions* table_options) {
+ table_options->filter_policy = policy;
+ table_options->partition_filters = partitioned;
+ if (partitioned) {
+ table_options->index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ } else {
+ table_options->index_type =
+ BlockBasedTableOptions::IndexType::kBinarySearch;
+ }
+ table_options->format_version = format_version;
+ }
+};
+// High bits per key -> almost no FPs
+std::shared_ptr<const FilterPolicy> kCompatibilityBloomPolicy{
+ NewBloomFilterPolicy(20)};
+// bloom_before_level=-1 -> always use Ribbon
+std::shared_ptr<const FilterPolicy> kCompatibilityRibbonPolicy{
+ NewRibbonFilterPolicy(20, -1)};
+
+std::vector<CompatibilityConfig> kCompatibilityConfigs = {
+ {kCompatibilityBloomPolicy, false, BlockBasedTableOptions().format_version},
+ {kCompatibilityBloomPolicy, true, BlockBasedTableOptions().format_version},
+ {kCompatibilityBloomPolicy, false, /* legacy Bloom */ 4U},
+ {kCompatibilityRibbonPolicy, false,
+ BlockBasedTableOptions().format_version},
+ {kCompatibilityRibbonPolicy, true, BlockBasedTableOptions().format_version},
+};
+} // anonymous namespace
+
+TEST_F(DBBloomFilterTest, BloomFilterCompatibility) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.level0_file_num_compaction_trigger =
+ static_cast<int>(kCompatibilityConfigs.size()) + 1;
+ options.max_open_files = -1;
+
+ Close();
+
+ // Create one file for each kind of filter. Each file covers a distinct key
+ // range.
+ for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) {
+ BlockBasedTableOptions table_options;
+ kCompatibilityConfigs[i].SetInTableOptions(&table_options);
+ ASSERT_TRUE(table_options.filter_policy != nullptr);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ std::string prefix = std::to_string(i) + "_";
+ ASSERT_OK(Put(prefix + "A", "val"));
+ ASSERT_OK(Put(prefix + "Z", "val"));
+ ASSERT_OK(Flush());
+ }
+
+ // Test filter is used between each pair of {reader,writer} configurations,
+ // because any built-in FilterPolicy should be able to read filters from any
+ // other built-in FilterPolicy
+ for (size_t i = 0; i < kCompatibilityConfigs.size(); ++i) {
+ BlockBasedTableOptions table_options;
+ kCompatibilityConfigs[i].SetInTableOptions(&table_options);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ for (size_t j = 0; j < kCompatibilityConfigs.size(); ++j) {
+ std::string prefix = std::to_string(j) + "_";
+ ASSERT_EQ("val", Get(prefix + "A")); // Filter positive
+ ASSERT_EQ("val", Get(prefix + "Z")); // Filter positive
+ // Filter negative, with high probability
+ ASSERT_EQ("NOT_FOUND", Get(prefix + "Q"));
+ EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE),
+ 2);
+ EXPECT_EQ(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), 1);
+ }
+ }
+}
+
+// To align with the type of hash entry being reserved in implementation.
+using FilterConstructionReserveMemoryHash = uint64_t;
+
+class ChargeFilterConstructionTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<
+ CacheEntryRoleOptions::Decision, std::string, bool, bool>> {
+ public:
+ ChargeFilterConstructionTestWithParam()
+ : DBTestBase("db_bloom_filter_tests",
+ /*env_do_fsync=*/true),
+ num_key_(0),
+ charge_filter_construction_(std::get<0>(GetParam())),
+ policy_(std::get<1>(GetParam())),
+ partition_filters_(std::get<2>(GetParam())),
+ detect_filter_construct_corruption_(std::get<3>(GetParam())) {
+ if (charge_filter_construction_ ==
+ CacheEntryRoleOptions::Decision::kDisabled ||
+ policy_ == kLegacyBloom) {
+ // For these cases, we are only interested in whether filter construction
+ // cache charging happens at all, not in its accuracy. Therefore we don't
+ // need many keys.
+ num_key_ = 5;
+ } else if (partition_filters_) {
+ // For PartitionFilter case, since we set
+ // table_options.metadata_block_size big enough such that each partition
+ // triggers at least 1 dummy entry reservation each for hash entries and
+ // final filter, we need a large number of keys to ensure we have at least
+ // two partitions.
+ num_key_ = 18 *
+ CacheReservationManagerImpl<
+ CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+ sizeof(FilterConstructionReserveMemoryHash);
+ } else if (policy_ == kFastLocalBloom) {
+ // For the Bloom Filter + FullFilter case, since we design num_key_ to
+ // make the hash entry cache charging a multiple of dummy entries,
+ // correctly charging the final filter on top of it will trigger at
+ // least one more dummy entry insertion. Therefore we can assert that
+ // behavior, and we don't need a large number of keys to verify that we
+ // indeed charge the final filter in cache, even though the final
+ // filter is a lot smaller than the hash entries.
+ num_key_ = 1 *
+ CacheReservationManagerImpl<
+ CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+ sizeof(FilterConstructionReserveMemoryHash);
+ } else {
+ // For the Ribbon Filter + FullFilter case, we need a large enough number
+ // of keys so that charging the final filter after releasing the hash
+ // entry reservation triggers at least one more dummy entry (or
+ // equivalently, causes another peak in cache charging), since the banding
+ // reservation might not be a multiple of the dummy entry size.
+ num_key_ = 12 *
+ CacheReservationManagerImpl<
+ CacheEntryRole::kFilterConstruction>::GetDummyEntrySize() /
+ sizeof(FilterConstructionReserveMemoryHash);
+ }
+ }
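+
+ // Worked example of the num_key_ choices above (a sketch, not used by the
+ // test): assuming GetDummyEntrySize() returns 256 KiB and each reserved
+ // hash entry is sizeof(FilterConstructionReserveMemoryHash) = 8 bytes
+ // (both are illustrative assumptions, not asserted here), the three
+ // key-count branches above yield roughly:
+ //   PartitionedFilter:         num_key_ = 18 * 262144 / 8 = 589824
+ //   kFastLocalBloom + Full:    num_key_ =  1 * 262144 / 8 =  32768
+ //   kStandard128Ribbon + Full: num_key_ = 12 * 262144 / 8 = 393216
+ // so the hash entry reservation is an exact multiple (18x, 1x, 12x) of one
+ // dummy entry, which is what the predictions in the test body rely on.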
+
+ BlockBasedTableOptions GetBlockBasedTableOptions() {
+ BlockBasedTableOptions table_options;
+
+ // We set the cache capacity big enough to prevent the cache from becoming
+ // full, for convenience in calculation.
+ constexpr std::size_t kCacheCapacity = 100 * 1024 * 1024;
+
+ table_options.cache_usage_options.options_overrides.insert(
+ {CacheEntryRole::kFilterConstruction,
+ {/*.charged = */ charge_filter_construction_}});
+ table_options.filter_policy = Create(10, policy_);
+ table_options.partition_filters = partition_filters_;
+ if (table_options.partition_filters) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ // We set table_options.metadata_block_size big enough so that each
+ // partition triggers at least 1 dummy entry insertion each for hash
+ // entries and final filter.
+ table_options.metadata_block_size = 409000;
+ }
+ table_options.detect_filter_construct_corruption =
+ detect_filter_construct_corruption_;
+
+ LRUCacheOptions lo;
+ lo.capacity = kCacheCapacity;
+ lo.num_shard_bits = 0; // 2^0 shard
+ lo.strict_capacity_limit = true;
+ cache_ = std::make_shared<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>(
+ (NewLRUCache(lo)));
+ table_options.block_cache = cache_;
+
+ return table_options;
+ }
+
+ std::size_t GetNumKey() { return num_key_; }
+
+ CacheEntryRoleOptions::Decision ChargeFilterConstructMemory() {
+ return charge_filter_construction_;
+ }
+
+ std::string GetFilterPolicy() { return policy_; }
+
+ bool PartitionFilters() { return partition_filters_; }
+
+ std::shared_ptr<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+ GetCache() {
+ return cache_;
+ }
+
+ private:
+ std::size_t num_key_;
+ CacheEntryRoleOptions::Decision charge_filter_construction_;
+ std::string policy_;
+ bool partition_filters_;
+ std::shared_ptr<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+ cache_;
+ bool detect_filter_construct_corruption_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ ChargeFilterConstructionTestWithParam,
+ ChargeFilterConstructionTestWithParam,
+ ::testing::Values(
+ std::make_tuple(CacheEntryRoleOptions::Decision::kDisabled,
+ kFastLocalBloom, false, false),
+
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kFastLocalBloom, false, false),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kFastLocalBloom, false, true),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kFastLocalBloom, true, false),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kFastLocalBloom, true, true),
+
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kStandard128Ribbon, false, false),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kStandard128Ribbon, false, true),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kStandard128Ribbon, true, false),
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled,
+ kStandard128Ribbon, true, true),
+
+ std::make_tuple(CacheEntryRoleOptions::Decision::kEnabled, kLegacyBloom,
+ false, false)));
+
+// TODO: Speed up this test, and reduce disk space usage (~700MB)
+// The current test inserts many keys (on the scale of the dummy entry size)
+// in order to make the small memory users (e.g., final filter, partitioned
+// hash entries/filter/banding), which are proportional to the number of
+// keys, big enough that their cache charging triggers dummy entry insertion
+// and becomes observable in the test.
+//
+// However, inserting that many keys slows down this test and leaves future
+// developers an opportunity to speed it up.
+//
+// Possible approaches & challenges:
+// 1. Use sync point during cache charging of filter construction
+//
+// Benefit: It does not rely on triggering dummy entry insertion
+// but the sync point to verify small memory user is charged correctly.
+//
+// Challenge: this approach is intrusive.
+//
+// 2. Make dummy entry size configurable and set it small in the test
+//
+// Benefit: It increases the precision of cache charging and therefore
+// small memory usage can still trigger insertion of dummy entry.
+//
+// Challenge: it requires changing CacheReservationManager-related APIs,
+// and a hack might be needed to control the size of the dummy entry of
+// the CacheReservationManager used in filter construction for testing,
+// since CacheReservationManager is not exposed at the high level.
+//
+TEST_P(ChargeFilterConstructionTestWithParam, Basic) {
+ Options options = CurrentOptions();
+ // We set write_buffer_size big enough so that, when filter construction
+ // cache charging is in effect, no flush is triggered before we manually
+ // trigger it, for clean testing
+ options.write_buffer_size = 640 << 20;
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ std::shared_ptr<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>
+ cache = GetCache();
+ options.create_if_missing = true;
+ // Disable auto compaction to prevent its unexpected side effects
+ // on the number of keys per partition we designed for in the test
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ int num_key = static_cast<int>(GetNumKey());
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+
+ ASSERT_EQ(cache->GetChargedCacheIncrementSum(), 0)
+ << "Flush was triggered too early in the test case with filter "
+ "construction cache charging - please make sure no flush triggered "
+ "during the key insertions above";
+
+ ASSERT_OK(Flush());
+
+ bool charge_filter_construction = (ChargeFilterConstructMemory() ==
+ CacheEntryRoleOptions::Decision::kEnabled);
+ std::string policy = GetFilterPolicy();
+ bool partition_filters = PartitionFilters();
+ bool detect_filter_construct_corruption =
+ table_options.detect_filter_construct_corruption;
+
+ std::deque<std::size_t> filter_construction_cache_res_peaks =
+ cache->GetChargedCachePeaks();
+ std::size_t filter_construction_cache_res_increments_sum =
+ cache->GetChargedCacheIncrementSum();
+
+ if (!charge_filter_construction) {
+ EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0);
+ return;
+ }
+
+ if (policy == kLegacyBloom) {
+ EXPECT_EQ(filter_construction_cache_res_peaks.size(), 0)
+ << "There shouldn't be filter construction cache charging as this "
+ "feature does not support kLegacyBloom";
+ return;
+ }
+
+ const std::size_t kDummyEntrySize = CacheReservationManagerImpl<
+ CacheEntryRole::kFilterConstruction>::GetDummyEntrySize();
+
+ const std::size_t predicted_hash_entries_cache_res =
+ num_key * sizeof(FilterConstructionReserveMemoryHash);
+ ASSERT_EQ(predicted_hash_entries_cache_res % kDummyEntrySize, 0)
+ << "It's by this test's design that predicted_hash_entries_cache_res is "
+ "a multipe of dummy entry";
+
+ const std::size_t predicted_hash_entries_cache_res_dummy_entry_num =
+ predicted_hash_entries_cache_res / kDummyEntrySize;
+ const std::size_t predicted_final_filter_cache_res =
+ static_cast<std::size_t>(
+ std::ceil(1.0 * predicted_hash_entries_cache_res_dummy_entry_num / 6 *
+ (policy == kStandard128Ribbon ? 0.7 : 1))) *
+ kDummyEntrySize;
+ const std::size_t predicted_banding_cache_res =
+ static_cast<std::size_t>(
+ std::ceil(predicted_hash_entries_cache_res_dummy_entry_num * 2.5)) *
+ kDummyEntrySize;
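+
+ // Rough numeric illustration of the predictions above (a sketch, assuming
+ // a 256 KiB dummy entry and 8-byte hash entries, which are illustrative
+ // assumptions): for kStandard128Ribbon + FullFilter, the hash entries
+ // amount to 12 dummy entries, so
+ //   predicted_final_filter_cache_res = ceil(12 / 6 * 0.7) = 2 dummy entries
+ //   predicted_banding_cache_res      = ceil(12 * 2.5)     = 30 dummy entries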
+
+ if (policy == kFastLocalBloom) {
+ /* kFastLocalBloom + FullFilter
+ * p0
+ * / \
+ * b / \
+ * / \
+ * / \
+ * 0/ \
+ * hash entries = b - 0, final filter = p0 - b
+ * p0 = hash entries + final filter
+ *
+ * The test is designed in a way such that the reservation for b is a
+ * multiple of dummy entries so that reservation for (p0 - b)
+ * will trigger at least another dummy entry insertion.
+ *
+ * kFastLocalBloom + FullFilter +
+ * detect_filter_construct_corruption
+ * The peak p0 stays the same as
+ * (kFastLocalBloom + FullFilter) but just lasts
+ * longer since we release hash entries reservation later.
+ *
+ * kFastLocalBloom + PartitionedFilter
+ * p1
+ * / \
+ * p0 b'/ \
+ * / \ / \
+ * b / \ / \
+ * / \ / \
+ * / a \
+ * 0/ \
+ * partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a
+ * partitioned final filter1 = p0 - b, partitioned final filter2 = p1 - b'
+ *
+ * (increment p0 - 0) + (increment p1 - a)
+ * = partitioned hash entries1 + partitioned hash entries2
+ * + partitioned final filter1 + partitioned final filter2
+ * = hash entries + final filter
+ *
+ * kFastLocalBloom + PartitionedFilter +
+ * detect_filter_construct_corruption
+ * The peaks p0 and p1 stay the same as
+ * (kFastLocalBloom + PartitionedFilter) but just
+ * last longer since we release hash entries reservation later.
+ *
+ */
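+ // Numeric illustration for the FullFilter case (same illustrative size
+ // assumptions as above): the hash entries reserve exactly 1 dummy entry
+ // (b), and charging the final filter (~1/6 of that, rounded up to a whole
+ // dummy entry) raises the reservation to about 2 dummy entries at p0,
+ // which is why the peak is expected to exceed
+ // predicted_hash_entries_cache_res below.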
+ if (!partition_filters) {
+ EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
+ << "Filter construction cache charging should have only 1 peak in "
+ "case: kFastLocalBloom + FullFilter";
+ std::size_t filter_construction_cache_res_peak =
+ filter_construction_cache_res_peaks[0];
+ EXPECT_GT(filter_construction_cache_res_peak,
+ predicted_hash_entries_cache_res)
+ << "The testing number of hash entries is designed to make hash "
+ "entries cache charging be multiples of dummy entries"
+ " so the correct behavior of charging final filter on top of it"
+ " should've triggered at least another dummy entry insertion";
+
+ std::size_t predicted_filter_construction_cache_res_peak =
+ predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+ EXPECT_GE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 0.9);
+ EXPECT_LE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 1.1);
+ return;
+ } else {
+ EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
+ << "Filter construction cache charging should have multiple peaks "
+ "in case: kFastLocalBloom + "
+ "PartitionedFilter";
+ std::size_t predicted_filter_construction_cache_res_increments_sum =
+ predicted_hash_entries_cache_res + predicted_final_filter_cache_res;
+ EXPECT_GE(filter_construction_cache_res_increments_sum,
+ predicted_filter_construction_cache_res_increments_sum * 0.9);
+ EXPECT_LE(filter_construction_cache_res_increments_sum,
+ predicted_filter_construction_cache_res_increments_sum * 1.1);
+ return;
+ }
+ }
+
+ if (policy == kStandard128Ribbon) {
+ /* kStandard128Ribbon + FullFilter
+ * p0
+ * / \ p1
+ * / \/\
+ * b / b' \
+ * / \
+ * 0/ \
+ * hash entries = b - 0, banding = p0 - b, final filter = p1 - b'
+ * p0 = hash entries + banding
+ *
+ * The test is designed in a way such that the reservation for (p1 - b')
+ * will trigger at least another dummy entry insertion
+ * (or equivalently, creating another peak).
+ *
+ * kStandard128Ribbon + FullFilter +
+ * detect_filter_construct_corruption
+ *
+ * new p0
+ * / \
+ * / \
+ * pre p0 \
+ * / \
+ * / \
+ * b / \
+ * / \
+ * 0/ \
+ * hash entries = b - 0, banding = pre p0 - b,
+ * final filter = new p0 - pre p0
+ * new p0 = hash entries + banding + final filter
+ *
+ * The previous p0 will no longer be a peak since under
+ * detect_filter_construct_corruption == true, we do not release hash
+ * entries reservation (like p0 - b' previously) until after final filter
+ * creation and post-verification
+ *
+ * kStandard128Ribbon + PartitionedFilter
+ * p3
+ * p0 /\ p4
+ * / \ p1 / \ /\
+ * / \/\ b''/ a' \
+ * b / b' \ / \
+ * / \ / \
+ * 0/ a \
+ * partitioned hash entries1 = b - 0, partitioned hash entries2 = b'' - a
+ * partitioned banding1 = p0 - b, partitioned banding2 = p3 - b''
+ * partitioned final filter1 = p1 - b', partitioned final filter2 = p4 - a'
+ *
+ * (increment p0 - 0) + (increment p1 - b')
+ * + (increment p3 - a) + (increment p4 - a')
+ * = partitioned hash entries1 + partitioned hash entries2
+ * + partitioned banding1 + partitioned banding2
+ * + partitioned final filter1 + partitioned final filter2
+ * = hash entries + banding + final filter
+ *
+ * kStandard128Ribbon + PartitionedFilter +
+ * detect_filter_construct_corruption
+ *
+ * new p3
+ * / \
+ * pre p3 \
+ * new p0 / \
+ * / \ / \
+ * pre p0 \ / \
+ * / \ b'/ \
+ * / \ / \
+ * b / \ / \
+ * / \a \
+ * 0/ \
+ * partitioned hash entries1 = b - 0, partitioned hash entries2 = b' - a
+ * partitioned banding1 = pre p0 - b, partitioned banding2 = pre p3 - b'
+ * partitioned final filter1 = new p0 - pre p0,
+ * partitioned final filter2 = new p3 - pre p3
+ *
+ * The previous p0 and p3 will no longer be peaks since, under
+ * detect_filter_construct_corruption == true, we do not release hash
+ * entries reservation (like p0 - b', p3 - a' previously) until after
+ * partitioned final filter creation and post-verification
+ *
+ * However, the increments sum stays the same, as shown below:
+ * (increment new p0 - 0) + (increment new p3 - a)
+ * = partitioned hash entries1 + partitioned hash entries2
+ * + partitioned banding1 + partitioned banding2
+ * + partitioned final filter1 + partitioned final filter2
+ * = hash entries + banding + final filter
+ *
+ */
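+ // Numeric illustration for the FullFilter case (same illustrative size
+ // assumptions as above): hash entries reserve about 12 dummy entries (b),
+ // banding adds about 30 more (p0), and the final filter adds about 2.
+ // Without corruption detection the final filter is charged after the hash
+ // entries are released, producing the separate smaller peak p1; with
+ // detection the hash entries are held, so everything folds into one
+ // larger peak.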
+ if (!partition_filters) {
+ ASSERT_GE(
+ std::floor(
+ 1.0 * predicted_final_filter_cache_res /
+ CacheReservationManagerImpl<
+ CacheEntryRole::kFilterConstruction>::GetDummyEntrySize()),
+ 1)
+ << "Final filter cache charging too small for this test - please "
+ "increase the number of keys";
+ if (!detect_filter_construct_corruption) {
+ EXPECT_EQ(filter_construction_cache_res_peaks.size(), 2)
+ << "Filter construction cache charging should have 2 peaks in "
+ "case: kStandard128Ribbon + "
+ "FullFilter. "
+ "The second peak is resulted from charging the final filter "
+ "after "
+ "decreasing the hash entry reservation since the testing final "
+ "filter reservation is designed to be at least 1 dummy entry "
+ "size";
+
+ std::size_t filter_construction_cache_res_peak =
+ filter_construction_cache_res_peaks[0];
+ std::size_t predicted_filter_construction_cache_res_peak =
+ predicted_hash_entries_cache_res + predicted_banding_cache_res;
+ EXPECT_GE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 0.9);
+ EXPECT_LE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 1.1);
+ } else {
+ EXPECT_EQ(filter_construction_cache_res_peaks.size(), 1)
+ << "Filter construction cache charging should have 1 peaks in "
+ "case: kStandard128Ribbon + FullFilter "
+ "+ detect_filter_construct_corruption. "
+ "The previous second peak now disappears since we don't "
+ "decrease the hash entry reservation"
+ "until after final filter reservation and post-verification";
+
+ std::size_t filter_construction_cache_res_peak =
+ filter_construction_cache_res_peaks[0];
+ std::size_t predicted_filter_construction_cache_res_peak =
+ predicted_hash_entries_cache_res + predicted_banding_cache_res +
+ predicted_final_filter_cache_res;
+ EXPECT_GE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 0.9);
+ EXPECT_LE(filter_construction_cache_res_peak,
+ predicted_filter_construction_cache_res_peak * 1.1);
+ }
+ return;
+ } else {
+ if (!detect_filter_construct_corruption) {
+ EXPECT_GE(filter_construction_cache_res_peaks.size(), 3)
+ << "Filter construction cache charging should have more than 3 "
+ "peaks "
+ "in case: kStandard128Ribbon + "
+ "PartitionedFilter";
+ } else {
+ EXPECT_GE(filter_construction_cache_res_peaks.size(), 2)
+ << "Filter construction cache charging should have more than 2 "
+ "peaks "
+ "in case: kStandard128Ribbon + "
+ "PartitionedFilter + detect_filter_construct_corruption";
+ }
+ std::size_t predicted_filter_construction_cache_res_increments_sum =
+ predicted_hash_entries_cache_res + predicted_banding_cache_res +
+ predicted_final_filter_cache_res;
+ EXPECT_GE(filter_construction_cache_res_increments_sum,
+ predicted_filter_construction_cache_res_increments_sum * 0.9);
+ EXPECT_LE(filter_construction_cache_res_increments_sum,
+ predicted_filter_construction_cache_res_increments_sum * 1.1);
+ return;
+ }
+ }
+}
+
+class DBFilterConstructionCorruptionTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<
+ std::tuple<bool /* detect_filter_construct_corruption */, std::string,
+ bool /* partition_filters */>> {
+ public:
+ DBFilterConstructionCorruptionTestWithParam()
+ : DBTestBase("db_bloom_filter_tests",
+ /*env_do_fsync=*/true) {}
+
+ BlockBasedTableOptions GetBlockBasedTableOptions() {
+ BlockBasedTableOptions table_options;
+ table_options.detect_filter_construct_corruption = std::get<0>(GetParam());
+ table_options.filter_policy = Create(10, std::get<1>(GetParam()));
+ table_options.partition_filters = std::get<2>(GetParam());
+ if (table_options.partition_filters) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ // We set table_options.metadata_block_size small enough so we can
+ // trigger filter partitioning with GetNumKey() keys
+ table_options.metadata_block_size = 10;
+ }
+
+ return table_options;
+ }
+
+ // Return an appropriate number of keys for testing
+ // to generate a long filter (i.e., size >= 8 + kMetadataLen)
+ std::size_t GetNumKey() { return 5000; }
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBFilterConstructionCorruptionTestWithParam,
+ DBFilterConstructionCorruptionTestWithParam,
+ ::testing::Values(std::make_tuple(false, kFastLocalBloom, false),
+ std::make_tuple(true, kFastLocalBloom, false),
+ std::make_tuple(true, kFastLocalBloom, true),
+ std::make_tuple(true, kStandard128Ribbon, false),
+ std::make_tuple(true, kStandard128Ribbon, true)));
+
+TEST_P(DBFilterConstructionCorruptionTestWithParam, DetectCorruption) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ int num_key = static_cast<int>(GetNumKey());
+ Status s;
+
+ // Case 1: No corruption in filter construction
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+ s = Flush();
+ EXPECT_TRUE(s.ok());
+
+ // Case 2: Corruption of hash entries in filter construction
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+ std::deque<uint64_t>* hash_entries_to_corrupt =
+ (std::deque<uint64_t>*)arg;
+ assert(!hash_entries_to_corrupt->empty());
+ *(hash_entries_to_corrupt->begin()) =
+ *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ s = Flush();
+
+ if (table_options.detect_filter_construct_corruption) {
+ EXPECT_TRUE(s.IsCorruption());
+ EXPECT_TRUE(
+ s.ToString().find("Filter's hash entries checksum mismatched") !=
+ std::string::npos);
+ } else {
+ EXPECT_TRUE(s.ok());
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearCallBack(
+ "XXPH3FilterBitsBuilder::Finish::"
+ "TamperHashEntries");
+
+ // Case 3: Corruption of filter content in filter construction
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "XXPH3FilterBitsBuilder::Finish::TamperFilter", [&](void* arg) {
+ std::pair<std::unique_ptr<char[]>*, std::size_t>* TEST_arg_pair =
+ (std::pair<std::unique_ptr<char[]>*, std::size_t>*)arg;
+ std::size_t filter_size = TEST_arg_pair->second;
+ // 8 is the number of bytes zeroed out below and 5 is kMetadataLen; the
+ // filter must be long enough that the corruption hits filter content
+ // rather than the metadata trailer
+ assert(filter_size >= 8 + 5);
+ std::unique_ptr<char[]>* filter_content_to_corrupt =
+ TEST_arg_pair->first;
+ std::memset(filter_content_to_corrupt->get(), '\0', 8);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ s = Flush();
+
+ if (table_options.detect_filter_construct_corruption) {
+ EXPECT_TRUE(s.IsCorruption());
+ EXPECT_TRUE(s.ToString().find("Corrupted filter content") !=
+ std::string::npos);
+ } else {
+ EXPECT_TRUE(s.ok());
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearCallBack(
+ "XXPH3FilterBitsBuilder::Finish::"
+ "TamperFilter");
+}
+
+// RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+TEST_P(DBFilterConstructionCorruptionTestWithParam,
+ DynamicallyTurnOnAndOffDetectConstructCorruption) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
+ // We intend to turn on
+ // table_options.detect_filter_construct_corruption dynamically;
+ // therefore we override this test parameter's value
+ table_options.detect_filter_construct_corruption = false;
+
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.create_if_missing = true;
+
+ int num_key = static_cast<int>(GetNumKey());
+ Status s;
+
+ DestroyAndReopen(options);
+
+ // Case 1: !table_options.detect_filter_construct_corruption
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+ std::deque<uint64_t>* hash_entries_to_corrupt =
+ (std::deque<uint64_t>*)arg;
+ assert(!hash_entries_to_corrupt->empty());
+ *(hash_entries_to_corrupt->begin()) =
+ *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ s = Flush();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearCallBack(
+ "XXPH3FilterBitsBuilder::Finish::"
+ "TamperHashEntries");
+
+ ASSERT_FALSE(table_options.detect_filter_construct_corruption);
+ EXPECT_TRUE(s.ok());
+
+ // Case 2: dynamically turn on
+ // table_options.detect_filter_construct_corruption
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+ "{detect_filter_construct_corruption=true;}"}}));
+
+ for (int i = 0; i < num_key; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "XXPH3FilterBitsBuilder::Finish::TamperHashEntries", [&](void* arg) {
+ std::deque<uint64_t>* hash_entries_to_corrupt =
+ (std::deque<uint64_t>*)arg;
+ assert(!hash_entries_to_corrupt->empty());
+ *(hash_entries_to_corrupt->begin()) =
+ *(hash_entries_to_corrupt->begin()) ^ uint64_t { 1 };
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ s = Flush();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearCallBack(
+ "XXPH3FilterBitsBuilder::Finish::"
+ "TamperHashEntries");
+
+ auto updated_table_options =
+ db_->GetOptions().table_factory->GetOptions<BlockBasedTableOptions>();
+ EXPECT_TRUE(updated_table_options->detect_filter_construct_corruption);
+ EXPECT_TRUE(s.IsCorruption());
+ EXPECT_TRUE(s.ToString().find("Filter's hash entries checksum mismatched") !=
+ std::string::npos);
+
+ // Case 3: dynamically turn off
+ // table_options.detect_filter_construct_corruption
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+ "{detect_filter_construct_corruption=false;}"}}));
+ updated_table_options =
+ db_->GetOptions().table_factory->GetOptions<BlockBasedTableOptions>();
+ EXPECT_FALSE(updated_table_options->detect_filter_construct_corruption);
+}
+#endif // ROCKSDB_LITE
+
+namespace {
+// NOTE: This class is referenced by HISTORY.md as a model for a wrapper
+// FilterPolicy selecting among configurations based on context.
+class LevelAndStyleCustomFilterPolicy : public FilterPolicy {
+ public:
+ explicit LevelAndStyleCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+ int bpk_otherwise)
+ : policy_fifo_(NewBloomFilterPolicy(bpk_fifo)),
+ policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)),
+ policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {}
+
+ const char* Name() const override {
+ return "LevelAndStyleCustomFilterPolicy";
+ }
+
+ // OK to use built-in policy name because we are deferring to a
+ // built-in builder. We aren't changing the serialized format.
+ const char* CompatibilityName() const override {
+ return policy_fifo_->CompatibilityName();
+ }
+
+ FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext& context) const override {
+ if (context.compaction_style == kCompactionStyleFIFO) {
+ return policy_fifo_->GetBuilderWithContext(context);
+ } else if (context.level_at_creation == 0) {
+ return policy_l0_other_->GetBuilderWithContext(context);
+ } else {
+ return policy_otherwise_->GetBuilderWithContext(context);
+ }
+ }
+
+ FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override {
+ // OK to defer to any of them; they all can parse built-in filters
+ // from any settings.
+ return policy_fifo_->GetFilterBitsReader(contents);
+ }
+
+ private:
+ const std::unique_ptr<const FilterPolicy> policy_fifo_;
+ const std::unique_ptr<const FilterPolicy> policy_l0_other_;
+ const std::unique_ptr<const FilterPolicy> policy_otherwise_;
+};
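+
+// A minimal usage sketch for the wrapper policy above (illustration only,
+// not exercised by these tests; the bits-per-key values are arbitrary):
+//
+//   Options options;
+//   BlockBasedTableOptions bbto;
+//   bbto.filter_policy = std::make_shared<LevelAndStyleCustomFilterPolicy>(
+//       /*bpk_fifo=*/15, /*bpk_l0_other=*/8, /*bpk_otherwise=*/5);
+//   options.table_factory.reset(NewBlockBasedTableFactory(bbto));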
+
+static std::map<TableFileCreationReason, std::string>
+ table_file_creation_reason_to_string{
+ {TableFileCreationReason::kCompaction, "kCompaction"},
+ {TableFileCreationReason::kFlush, "kFlush"},
+ {TableFileCreationReason::kMisc, "kMisc"},
+ {TableFileCreationReason::kRecovery, "kRecovery"},
+ };
+
+class TestingContextCustomFilterPolicy
+ : public LevelAndStyleCustomFilterPolicy {
+ public:
+ explicit TestingContextCustomFilterPolicy(int bpk_fifo, int bpk_l0_other,
+ int bpk_otherwise)
+ : LevelAndStyleCustomFilterPolicy(bpk_fifo, bpk_l0_other, bpk_otherwise) {
+ }
+
+ FilterBitsBuilder* GetBuilderWithContext(
+ const FilterBuildingContext& context) const override {
+ test_report_ += "cf=";
+ test_report_ += context.column_family_name;
+ test_report_ += ",s=";
+ test_report_ +=
+ OptionsHelper::compaction_style_to_string[context.compaction_style];
+ test_report_ += ",n=";
+ test_report_ += std::to_string(context.num_levels);
+ test_report_ += ",l=";
+ test_report_ += std::to_string(context.level_at_creation);
+ test_report_ += ",b=";
+ test_report_ += std::to_string(int{context.is_bottommost});
+ test_report_ += ",r=";
+ test_report_ += table_file_creation_reason_to_string[context.reason];
+ test_report_ += "\n";
+
+ return LevelAndStyleCustomFilterPolicy::GetBuilderWithContext(context);
+ }
+
+ std::string DumpTestReport() {
+ std::string rv;
+ std::swap(rv, test_report_);
+ return rv;
+ }
+
+ private:
+ mutable std::string test_report_;
+};
+} // anonymous namespace
+
+TEST_F(DBBloomFilterTest, ContextCustomFilterPolicy) {
+ auto policy = std::make_shared<TestingContextCustomFilterPolicy>(15, 8, 5);
+ Options options;
+ for (bool fifo : {true, false}) {
+ options = CurrentOptions();
+ options.max_open_files = fifo ? -1 : options.max_open_files;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.compaction_style =
+ fifo ? kCompactionStyleFIFO : kCompactionStyleLevel;
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy = policy;
+ table_options.format_version = 5;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ TryReopen(options);
+ CreateAndReopenWithCF({fifo ? "abe" : "bob"}, options);
+
+ const int maxKey = 10000;
+ for (int i = 0; i < maxKey / 2; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ // Add a large key to make the file contain a wide range
+ ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555)));
+ Flush(1);
+ EXPECT_EQ(policy->DumpTestReport(),
+ fifo ? "cf=abe,s=kCompactionStyleFIFO,n=7,l=0,b=0,r=kFlush\n"
+ : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n");
+
+ for (int i = maxKey / 2; i < maxKey; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ Flush(1);
+ EXPECT_EQ(policy->DumpTestReport(),
+ fifo ? "cf=abe,s=kCompactionStyleFIFO,n=7,l=0,b=0,r=kFlush\n"
+ : "cf=bob,s=kCompactionStyleLevel,n=7,l=0,b=0,r=kFlush\n");
+
+ // Check that they can be found
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ(Key(i), Get(1, Key(i)));
+ }
+ // Since we have two tables / two filters, we might have Bloom checks on
+ // our queries, but no more than one "useful" per query on a found key.
+ EXPECT_LE(TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey);
+
+ // Check that we have two filters, each about
+ // fifo: 0.12% FP rate (15 bits per key)
+ // level: 2.3% FP rate (8 bits per key)
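+ // (For reference: an idealized Bloom filter with b bits/key has a
+ // theoretical FP rate of roughly 0.6185^b, i.e. about 0.07% at 15 and
+ // about 2.1% at 8; the figures quoted above are the empirical rates of
+ // the actual implementation, which deviates slightly from the ideal.)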
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ {
+ auto useful_count =
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+ EXPECT_GE(useful_count, maxKey * 2 * (fifo ? 0.9980 : 0.975));
+ EXPECT_LE(useful_count, maxKey * 2 * (fifo ? 0.9995 : 0.98));
+ }
+
+ if (!fifo) { // FIFO doesn't fully support CompactRange
+ // Full compaction
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr,
+ nullptr));
+ EXPECT_EQ(policy->DumpTestReport(),
+ "cf=bob,s=kCompactionStyleLevel,n=7,l=1,b=1,r=kCompaction\n");
+
+ // Check that we now have one filter, about 9.2% FP rate (5 bits per key)
+ for (int i = 0; i < maxKey; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, Key(i + 33333)));
+ }
+ {
+ auto useful_count =
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+ EXPECT_GE(useful_count, maxKey * 0.90);
+ EXPECT_LE(useful_count, maxKey * 0.91);
+ }
+ } else {
+#ifndef ROCKSDB_LITE
+ // Also try external SST file
+ {
+ std::string file_path = dbname_ + "/external.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options, handles_[1]);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("key", "value"));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+ // Note: kCompactionStyleLevel is default, ignored if num_levels == -1
+ EXPECT_EQ(policy->DumpTestReport(),
+ "cf=abe,s=kCompactionStyleLevel,n=-1,l=-1,b=0,r=kMisc\n");
+#endif
+ }
+
+ // Destroy
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_[1] = nullptr;
+ }
+}
+
+class SliceTransformLimitedDomain : public SliceTransform {
+ const char* Name() const override { return "SliceTransformLimitedDomain"; }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 5);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ // prefix will be x????
+ return src.size() >= 5 && src[0] == 'x';
+ }
+
+ bool InRange(const Slice& dst) const override {
+ // prefix will be x????
+ return dst.size() == 5 && dst[0] == 'x';
+ }
+};
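+
+// Behavior sketch for the transform above (for reference, derived from the
+// code): Transform("x1111_AAAA") yields the 5-byte prefix "x1111", while
+// "zzzzz_AAAA" is not InDomain (it does not start with 'x'), so no prefix
+// for such keys is added to the prefix bloom filter.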
+
+TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter1) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = false;
+
+ Options options = CurrentOptions();
+ options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("x1111_AAAA", "val1"));
+ ASSERT_OK(Put("x1112_AAAA", "val2"));
+ ASSERT_OK(Put("x1113_AAAA", "val3"));
+ ASSERT_OK(Put("x1114_AAAA", "val4"));
+ // Not in domain, won't be added to filter
+ ASSERT_OK(Put("zzzzz_AAAA", "val5"));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("x1111_AAAA"), "val1");
+ ASSERT_EQ(Get("x1112_AAAA"), "val2");
+ ASSERT_EQ(Get("x1113_AAAA"), "val3");
+ ASSERT_EQ(Get("x1114_AAAA"), "val4");
+ // Was not added to the filter, but RocksDB will try to read it from the filter
+ ASSERT_EQ(Get("zzzzz_AAAA"), "val5");
+}
+
+TEST_F(DBBloomFilterTest, PrefixExtractorWithFilter2) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+
+ Options options = CurrentOptions();
+ options.prefix_extractor = std::make_shared<SliceTransformLimitedDomain>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("x1113_AAAA", "val3"));
+ ASSERT_OK(Put("x1114_AAAA", "val4"));
+ // Not in domain, won't be added to filter
+ ASSERT_OK(Put("zzzzz_AAAA", "val1"));
+ ASSERT_OK(Put("zzzzz_AAAB", "val2"));
+ ASSERT_OK(Put("zzzzz_AAAC", "val3"));
+ ASSERT_OK(Put("zzzzz_AAAD", "val4"));
+
+ ASSERT_OK(Flush());
+
+ std::vector<std::string> iter_res;
+ auto iter = db_->NewIterator(ReadOptions());
+ // Seek to a key that was not in Domain
+ for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) {
+ iter_res.emplace_back(iter->value().ToString());
+ }
+
+ std::vector<std::string> expected_res = {"val1", "val2", "val3", "val4"};
+ ASSERT_EQ(iter_res, expected_res);
+ delete iter;
+}
+
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilter) {
+ // Regression test for #2743: the range delete tombstones in memtable should
+ // be added even when Get() skips searching due to its prefix bloom filter
+ const int kMemtableSize = 1 << 20; // 1MB
+ const int kMemtablePrefixFilterSize = 1 << 13; // 8KB
+ const int kPrefixLen = 4;
+ Options options = CurrentOptions();
+ options.memtable_prefix_bloom_size_ratio =
+ static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+ options.write_buffer_size = kMemtableSize;
+ options.memtable_whole_key_filtering = false;
+ Reopen(options);
+ std::string key1("AAAABBBB");
+ std::string key2("AAAACCCC"); // not in DB
+ std::string key3("AAAADDDD");
+ std::string key4("AAAAEEEE");
+ std::string value1("Value1");
+ std::string value3("Value3");
+ std::string value4("Value4");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+
+ // check memtable bloom stats
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+ // same prefix, bloom filter false positive
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+ // enable whole key bloom filter
+ options.memtable_whole_key_filtering = true;
+ Reopen(options);
+ // check memtable bloom stats
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ // whole key bloom filter kicks in and determines it's a miss
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+
+ // verify whole key filtering does not depend on prefix_extractor
+ options.prefix_extractor.reset();
+ Reopen(options);
+ // check memtable bloom stats
+ ASSERT_OK(Put(key4, value4, WriteOptions()));
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ // whole key bloom filter kicks in and determines it's a miss
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+}
+
+TEST_F(DBBloomFilterTest, MemtableWholeKeyBloomFilterMultiGet) {
+ Options options = CurrentOptions();
+ options.memtable_prefix_bloom_size_ratio = 0.015;
+ options.memtable_whole_key_filtering = true;
+ Reopen(options);
+ std::string key1("AA");
+ std::string key2("BB");
+ std::string key3("CC");
+ std::string key4("DD");
+ std::string key_not("EE");
+ std::string value1("Value1");
+ std::string value2("Value2");
+ std::string value3("Value3");
+ std::string value4("Value4");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+ ASSERT_OK(Put(key2, value2, WriteOptions()));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(Put(key4, value4, WriteOptions()));
+
+ // Delete key2 and key3
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "BA", "CZ"));
+
+ // Read without snapshot
+ auto results = MultiGet({key_not, key1, key2, key3, key4});
+ ASSERT_EQ(results[0], "NOT_FOUND");
+ ASSERT_EQ(results[1], value1);
+ ASSERT_EQ(results[2], "NOT_FOUND");
+ ASSERT_EQ(results[3], "NOT_FOUND");
+ ASSERT_EQ(results[4], value4);
+
+ // Also check Get
+ ASSERT_EQ(Get(key1), value1);
+ ASSERT_EQ(Get(key2), "NOT_FOUND");
+ ASSERT_EQ(Get(key3), "NOT_FOUND");
+ ASSERT_EQ(Get(key4), value4);
+
+ // Read with snapshot
+ results = MultiGet({key_not, key1, key2, key3, key4}, snapshot);
+ ASSERT_EQ(results[0], "NOT_FOUND");
+ ASSERT_EQ(results[1], value1);
+ ASSERT_EQ(results[2], value2);
+ ASSERT_EQ(results[3], value3);
+ ASSERT_EQ(results[4], "NOT_FOUND");
+
+ // Also check Get
+ ASSERT_EQ(Get(key1, snapshot), value1);
+ ASSERT_EQ(Get(key2, snapshot), value2);
+ ASSERT_EQ(Get(key3, snapshot), value3);
+ ASSERT_EQ(Get(key4, snapshot), "NOT_FOUND");
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBBloomFilterTest, MemtablePrefixBloomOutOfDomain) {
+ constexpr size_t kPrefixSize = 8;
+ const std::string kKey = "key";
+ assert(kKey.size() < kPrefixSize);
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixSize));
+ options.memtable_prefix_bloom_size_ratio = 0.25;
+ Reopen(options);
+ ASSERT_OK(Put(kKey, "v"));
+ ASSERT_EQ("v", Get(kKey));
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+ iter->Seek(kKey);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(kKey, iter->key());
+ iter->SeekForPrev(kKey);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(kKey, iter->key());
+}
+
+class DBBloomFilterTestVaryPrefixAndFormatVer
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, uint32_t>> {
+ protected:
+ bool use_prefix_;
+ uint32_t format_version_;
+
+ public:
+ DBBloomFilterTestVaryPrefixAndFormatVer()
+ : DBTestBase("db_bloom_filter_tests", /*env_do_fsync=*/true) {}
+
+ ~DBBloomFilterTestVaryPrefixAndFormatVer() override {}
+
+ void SetUp() override {
+ use_prefix_ = std::get<0>(GetParam());
+ format_version_ = std::get<1>(GetParam());
+ }
+
+ static std::string UKey(uint32_t i) { return Key(static_cast<int>(i)); }
+};
+
+TEST_P(DBBloomFilterTestVaryPrefixAndFormatVer, PartitionedMultiGet) {
+ Options options = CurrentOptions();
+ if (use_prefix_) {
+ // Entire key from UKey()
+ options.prefix_extractor.reset(NewCappedPrefixTransform(9));
+ }
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(20));
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ bbto.whole_key_filtering = !use_prefix_;
+ if (use_prefix_) { // (not related to prefix, just alternating between)
+ // Make sure code appropriately deals with metadata block size setting
+ // that is "too small" (smaller than minimum size for filter builder)
+ bbto.metadata_block_size = 63;
+ } else {
+ // Make sure the test will work even on platforms with large minimum
+ // filter size, due to large cache line size.
+ // (Largest cache line size + 10+% overhead.)
+ bbto.metadata_block_size = 290;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+ ReadOptions ropts;
+
+ constexpr uint32_t N = 12000;
+ // Add N/2 evens
+ for (uint32_t i = 0; i < N; i += 2) {
+ ASSERT_OK(Put(UKey(i), UKey(i)));
+ }
+ ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(TotalTableFiles(), 1);
+#endif
+
+ constexpr uint32_t Q = 29;
+ // MultiGet In
+ std::array<std::string, Q> keys;
+ std::array<Slice, Q> key_slices;
+ std::array<ColumnFamilyHandle*, Q> column_families;
+ // MultiGet Out
+ std::array<Status, Q> statuses;
+ std::array<PinnableSlice, Q> values;
+
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL);
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_USEFUL);
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED);
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_POSITIVE);
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE);
+
+ // Check that initial clump of keys only loads one partition filter from
+ // block cache.
+ // And that spread out keys load many partition filters.
+ // In both cases, mix present vs. not present keys.
+ for (uint32_t stride : {uint32_t{1}, (N / Q) | 1}) {
+ for (uint32_t i = 0; i < Q; ++i) {
+ keys[i] = UKey(i * stride);
+ key_slices[i] = Slice(keys[i]);
+ column_families[i] = db_->DefaultColumnFamily();
+ statuses[i] = Status();
+ values[i] = PinnableSlice();
+ }
+
+ db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0],
+ /*timestamps=*/nullptr, &statuses[0], true);
+
+ // Confirm correct status results
+ uint32_t number_not_found = 0;
+ for (uint32_t i = 0; i < Q; ++i) {
+ if ((i * stride % 2) == 0) {
+ ASSERT_OK(statuses[i]);
+ } else {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ ++number_not_found;
+ }
+ }
+
+ // Confirm correct Bloom stats (no FPs)
+ uint64_t filter_useful = TestGetAndResetTickerCount(
+ options,
+ use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL);
+ uint64_t filter_checked =
+ TestGetAndResetTickerCount(options, use_prefix_
+ ? BLOOM_FILTER_PREFIX_CHECKED
+ : BLOOM_FILTER_FULL_POSITIVE) +
+ (use_prefix_ ? 0 : filter_useful);
+ EXPECT_EQ(filter_useful, number_not_found);
+ EXPECT_EQ(filter_checked, Q);
+ if (!use_prefix_) {
+ EXPECT_EQ(
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+ Q - number_not_found);
+ }
+
+ // Confirm no duplicate loading same filter partition
+ uint64_t filter_accesses =
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) +
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ if (stride == 1) {
+ EXPECT_EQ(filter_accesses, 1);
+ } else {
+ // for large stride
+ EXPECT_GE(filter_accesses, Q / 2 + 1);
+ }
+ }
+
+ // Check that a clump of keys (present and not) works when spanning
+ // two partitions
+ int found_spanning = 0;
+ for (uint32_t start = 0; start < N / 2;) {
+ for (uint32_t i = 0; i < Q; ++i) {
+ keys[i] = UKey(start + i);
+ key_slices[i] = Slice(keys[i]);
+ column_families[i] = db_->DefaultColumnFamily();
+ statuses[i] = Status();
+ values[i] = PinnableSlice();
+ }
+
+ db_->MultiGet(ropts, Q, &column_families[0], &key_slices[0], &values[0],
+ /*timestamps=*/nullptr, &statuses[0], true);
+
+ // Confirm correct status results
+ uint32_t number_not_found = 0;
+ for (uint32_t i = 0; i < Q; ++i) {
+ if (((start + i) % 2) == 0) {
+ ASSERT_OK(statuses[i]);
+ } else {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ ++number_not_found;
+ }
+ }
+
+ // Confirm correct Bloom stats (might see some FPs)
+ uint64_t filter_useful = TestGetAndResetTickerCount(
+ options,
+ use_prefix_ ? BLOOM_FILTER_PREFIX_USEFUL : BLOOM_FILTER_USEFUL);
+ uint64_t filter_checked =
+ TestGetAndResetTickerCount(options, use_prefix_
+ ? BLOOM_FILTER_PREFIX_CHECKED
+ : BLOOM_FILTER_FULL_POSITIVE) +
+ (use_prefix_ ? 0 : filter_useful);
+ EXPECT_GE(filter_useful, number_not_found - 2); // possible FP
+ EXPECT_EQ(filter_checked, Q);
+ if (!use_prefix_) {
+ EXPECT_EQ(
+ TestGetAndResetTickerCount(options, BLOOM_FILTER_FULL_TRUE_POSITIVE),
+ Q - number_not_found);
+ }
+
+ // Confirm no duplicate loading of same filter partition
+ uint64_t filter_accesses =
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_HIT) +
+ TestGetAndResetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ if (filter_accesses == 2) {
+ // Spanned across partitions.
+ ++found_spanning;
+ if (found_spanning >= 2) {
+ break;
+ } else {
+ // Ensure that at least once we have at least one present and
+ // one non-present key on both sides of partition boundary.
+ start += 2;
+ }
+ } else {
+ EXPECT_EQ(filter_accesses, 1);
+ // See explanation at "start += 2"
+ start += Q - 4;
+ }
+ }
+ EXPECT_TRUE(found_spanning >= 2);
+}
+
+INSTANTIATE_TEST_CASE_P(DBBloomFilterTestVaryPrefixAndFormatVer,
+ DBBloomFilterTestVaryPrefixAndFormatVer,
+ ::testing::Values(
+ // (use_prefix, format_version)
+ std::make_tuple(false, 2),
+ std::make_tuple(false, 3),
+ std::make_tuple(false, 4),
+ std::make_tuple(false, 5), std::make_tuple(true, 2),
+ std::make_tuple(true, 3), std::make_tuple(true, 4),
+ std::make_tuple(true, 5)));
+
+#ifndef ROCKSDB_LITE
+namespace {
+static const std::string kPlainTable = "test_PlainTableBloom";
+} // anonymous namespace
+
+class BloomStatsTestWithParam
+ : public DBBloomFilterTest,
+ public testing::WithParamInterface<std::tuple<std::string, bool>> {
+ public:
+ BloomStatsTestWithParam() {
+ bfp_impl_ = std::get<0>(GetParam());
+ partition_filters_ = std::get<1>(GetParam());
+
+ options_.create_if_missing = true;
+ options_.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(4));
+ options_.memtable_prefix_bloom_size_ratio =
+ 8.0 * 1024.0 / static_cast<double>(options_.write_buffer_size);
+ if (bfp_impl_ == kPlainTable) {
+ assert(!partition_filters_); // not supported in plain table
+ PlainTableOptions table_options;
+ options_.table_factory.reset(NewPlainTableFactory(table_options));
+ } else {
+ BlockBasedTableOptions table_options;
+ if (partition_filters_) {
+ table_options.partition_filters = partition_filters_;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.filter_policy = Create(10, bfp_impl_);
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ }
+ options_.env = env_;
+
+ get_perf_context()->Reset();
+ DestroyAndReopen(options_);
+ }
+
+ ~BloomStatsTestWithParam() override {
+ get_perf_context()->Reset();
+ Destroy(options_);
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ std::string bfp_impl_;
+ bool partition_filters_;
+ Options options_;
+};
+
+// 1 Insert 2 K-V pairs into DB
+// 2 Call Get() for both keys - expect memtable bloom hit stat to be 2
+// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1
+// 4 Call Flush() to create SST
+// 5 Call Get() for both keys - expect SST bloom hit stat to be 2
+// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1
+// Test both: block and plain SST
+TEST_P(BloomStatsTestWithParam, BloomStatsTest) {
+ std::string key1("AAAA");
+ std::string key2("RXDB"); // not in DB
+ std::string key3("ZBRA");
+ std::string value1("Value1");
+ std::string value3("Value3");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+ // check memtable bloom stats
+ ASSERT_EQ(value1, Get(key1));
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(value3, Get(key3));
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+ // sanity checks
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+ Flush();
+
+ // sanity checks
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_sst_miss_count);
+
+ // check SST bloom stats
+ ASSERT_EQ(value1, Get(key1));
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+ ASSERT_EQ(value3, Get(key3));
+ ASSERT_EQ(2, get_perf_context()->bloom_sst_hit_count);
+
+ ASSERT_EQ("NOT_FOUND", Get(key2));
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+}
+
+// Same scenario as in BloomStatsTest but using an iterator
+TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
+ std::string key1("AAAA");
+ std::string key2("RXDB"); // not in DB
+ std::string key3("ZBRA");
+ std::string value1("Value1");
+ std::string value3("Value3");
+
+ ASSERT_OK(Put(key1, value1, WriteOptions()));
+ ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+ std::unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+
+ // check memtable bloom stats
+ iter->Seek(key1);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value1, iter->value().ToString());
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ iter->Seek(key3);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value3, iter->value().ToString());
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+ ASSERT_EQ(0, get_perf_context()->bloom_memtable_miss_count);
+
+ iter->Seek(key2);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(1, get_perf_context()->bloom_memtable_miss_count);
+ ASSERT_EQ(2, get_perf_context()->bloom_memtable_hit_count);
+
+ Flush();
+
+ iter.reset(dbfull()->NewIterator(ReadOptions()));
+
+ // Check SST bloom stats
+ iter->Seek(key1);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value1, iter->value().ToString());
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_hit_count);
+
+ iter->Seek(key3);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(value3, iter->value().ToString());
+ uint64_t expected_hits = 2;
+ ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+
+ iter->Seek(key2);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(1, get_perf_context()->bloom_sst_miss_count);
+ ASSERT_EQ(expected_hits, get_perf_context()->bloom_sst_hit_count);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ BloomStatsTestWithParam, BloomStatsTestWithParam,
+ ::testing::Values(std::make_tuple(kLegacyBloom, false),
+ std::make_tuple(kLegacyBloom, true),
+ std::make_tuple(kFastLocalBloom, false),
+ std::make_tuple(kFastLocalBloom, true),
+ std::make_tuple(kPlainTable, false)));
+
+namespace {
+void PrefixScanInit(DBBloomFilterTest* dbtest) {
+ char buf[100];
+ std::string keystr;
+ const int small_range_sstfiles = 5;
+ const int big_range_sstfiles = 5;
+
+ // Generate 11 sst files with the following prefix ranges.
+ // GROUP 0: [0,10] (level 1)
+ // GROUP 1: [1,2], [2,3], [3,4], [4,5], [5, 6] (level 0)
+ // GROUP 2: [0,6], [0,7], [0,8], [0,9], [0,10] (level 0)
+ //
+ // A seek with the previous API would do 11 random I/Os (to all the
+ // files). With the new API and a prefix filter enabled, we should
+  // only do 2 random I/Os, to the 2 files containing the key.
+
+ // GROUP 0
+ snprintf(buf, sizeof(buf), "%02d______:start", 0);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", 10);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ ASSERT_OK(dbtest->Flush());
+ ASSERT_OK(dbtest->dbfull()->CompactRange(CompactRangeOptions(), nullptr,
+ nullptr)); // move to level 1
+
+ // GROUP 1
+ for (int i = 1; i <= small_range_sstfiles; i++) {
+ snprintf(buf, sizeof(buf), "%02d______:start", i);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", i + 1);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ dbtest->Flush();
+ }
+
+ // GROUP 2
+ for (int i = 1; i <= big_range_sstfiles; i++) {
+ snprintf(buf, sizeof(buf), "%02d______:start", 0);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ snprintf(buf, sizeof(buf), "%02d______:end", small_range_sstfiles + i + 1);
+ keystr = std::string(buf);
+ ASSERT_OK(dbtest->Put(keystr, keystr));
+ dbtest->Flush();
+ }
+}
+} // anonymous namespace
+
+TEST_F(DBBloomFilterTest, PrefixScan) {
+ while (ChangeFilterOptions()) {
+ int count;
+ Slice prefix;
+ Slice key;
+ char buf[100];
+ Iterator* iter;
+ snprintf(buf, sizeof(buf), "03______:");
+ prefix = Slice(buf, 8);
+ key = Slice(buf, 9);
+ ASSERT_EQ(key.difference_offset(prefix), 8);
+ ASSERT_EQ(prefix.difference_offset(key), 8);
+ // db configs
+ env_->count_random_reads_ = true;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.disable_auto_compactions = true;
+ options.max_background_compactions = 2;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+    // HashSkipList does not support concurrent memtable writes, and
+    // unordered_write is incompatible with
+    // allow_concurrent_memtable_write=false.
+    assert(!options.unordered_write);
+    options.allow_concurrent_memtable_write = false;
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ table_options.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+    // PrefixScanInit creates 11 SST files; without the prefix filter a seek
+    // would need 11 random I/Os.
+ DestroyAndReopen(options);
+ PrefixScanInit(this);
+ count = 0;
+ env_->random_read_counter_.Reset();
+ iter = db_->NewIterator(ReadOptions());
+ for (iter->Seek(prefix); iter->Valid(); iter->Next()) {
+ if (!iter->key().starts_with(prefix)) {
+ break;
+ }
+ count++;
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+ ASSERT_EQ(count, 2);
+ ASSERT_EQ(env_->random_read_counter_.Read(), 2);
+ Close();
+ } // end of while
+}
+
+TEST_F(DBBloomFilterTest, OptimizeFiltersForHits) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024;
+ options.arena_block_size = 4 * 1024;
+ options.target_file_size_base = 64 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.max_bytes_for_level_base = 256 * 1024;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 8;
+ options.max_background_flushes = 8;
+ options.compression = kNoCompression;
+ options.compaction_style = kCompactionStyleLevel;
+ options.level_compaction_dynamic_level_bytes = true;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.optimize_filters_for_hits = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ CreateAndReopenWithCF({"mypikachu"}, options);
+
+ int numkeys = 200000;
+
+ // Generate randomly shuffled keys, so the updates are almost
+ // random.
+ std::vector<int> keys;
+ keys.reserve(numkeys);
+ for (int i = 0; i < numkeys; i += 2) {
+ keys.push_back(i);
+ }
+ RandomShuffle(std::begin(keys), std::end(keys), /*seed*/ 42);
+ int num_inserted = 0;
+ for (int key : keys) {
+ ASSERT_OK(Put(1, Key(key), "val"));
+ if (++num_inserted % 1000 == 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ }
+ ASSERT_OK(Put(1, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(numkeys), "val"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ if (NumTableFilesAtLevel(0, 1) == 0) {
+ // No Level 0 file. Create one.
+ ASSERT_OK(Put(1, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(numkeys), "val"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ for (int i = 1; i < numkeys; i += 2) {
+ ASSERT_EQ(Get(1, Key(i)), "NOT_FOUND");
+ }
+
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+  // Now we have three sorted runs: L0, L5 and L6, where most files in L6 have
+  // no bloom filter. Most keys get their bloom filters checked twice.
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 65000 * 2);
+ ASSERT_LT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 120000 * 2);
+ uint64_t bloom_filter_useful_all_levels = 0;
+ for (auto& kv : (*(get_perf_context()->level_to_perf_context))) {
+ if (kv.second.bloom_filter_useful > 0) {
+ bloom_filter_useful_all_levels += kv.second.bloom_filter_useful;
+ }
+ }
+ ASSERT_GT(bloom_filter_useful_all_levels, 65000 * 2);
+ ASSERT_LT(bloom_filter_useful_all_levels, 120000 * 2);
+
+ for (int i = 0; i < numkeys; i += 2) {
+ ASSERT_EQ(Get(1, Key(i)), "val");
+ }
+
+ // Part 2 (read path): rewrite last level with blooms, then verify they get
+ // cached only if !optimize_filters_for_hits
+ options.disable_auto_compactions = true;
+ options.num_levels = 9;
+ options.optimize_filters_for_hits = false;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+ MoveFilesToLevel(7 /* level */, 1 /* column family index */);
+
+ std::string value = Get(1, Key(0));
+ uint64_t prev_cache_filter_hits =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ value = Get(1, Key(0));
+ ASSERT_EQ(prev_cache_filter_hits + 1,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Now that we know the filter blocks exist in the last level files, see if
+ // filter caching is skipped for this optimization
+ options.optimize_filters_for_hits = true;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ value = Get(1, Key(0));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(2 /* index and data block */,
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+
+ // Check filter block ignored for files preloaded during DB::Open()
+ options.max_open_files = -1;
+ options.statistics = CreateDBStatistics();
+ bbto.block_cache.reset();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ uint64_t prev_cache_filter_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ Get(1, Key(0));
+ ASSERT_EQ(prev_cache_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(prev_cache_filter_hits,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+
+ // Check filter block ignored for file trivially-moved to bottom level
+ bbto.block_cache.reset();
+ options.max_open_files = 100; // setting > -1 makes it not preload all files
+ options.statistics = CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ ASSERT_OK(Put(1, Key(numkeys + 1), "val"));
+ ASSERT_OK(Flush(1));
+
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CompactRangeOptions compact_options;
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kSkip;
+ compact_options.change_level = true;
+ compact_options.target_level = 7;
+ ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+ .IsNotSupported());
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ prev_cache_filter_hits = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ prev_cache_filter_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ value = Get(1, Key(numkeys + 1));
+ ASSERT_EQ(prev_cache_filter_hits,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(prev_cache_filter_misses,
+ TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+
+ // Check filter block not cached for iterator
+ bbto.block_cache.reset();
+ options.statistics = CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ ReopenWithColumnFamilies({"default", "mypikachu"}, options);
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions(), handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(2 /* index and data block */,
+ TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ get_perf_context()->Reset();
+}
+
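+// Seeks to key and counts how many entries the iterator yields before it
+// becomes invalid (bounded by prefix_same_as_start / iterate_upper_bound).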
+int CountIter(std::unique_ptr<Iterator>& iter, const Slice& key) {
+ int count = 0;
+ for (iter->Seek(key); iter->Valid(); iter->Next()) {
+ count++;
+ }
+ EXPECT_OK(iter->status());
+ return count;
+}
+
+// Use iterate_upper_bound to hint compatibility of existing bloom filters.
+// The BF is considered compatible if 1) upper bound and seek key transform
+// into the same string, or 2) the transformed seek key is of the same length
+// as the upper bound and two keys are adjacent according to the comparator.
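+// For example, with a capped:4 extractor, seek key "abcd0000" with upper
+// bound "abce" qualifies (both transform to 4 bytes and "abcd"/"abce" are
+// adjacent), while upper bound "abcey" does not (longer than the prefix).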
+TEST_F(DBBloomFilterTest, DynamicBloomFilterUpperBound) {
+ for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = CurrentOptions().env;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(4));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy = Create(10, bfp_impl);
+ table_options.index_shortening = BlockBasedTableOptions::
+ IndexShorteningMode::kShortenSeparatorsAndSuccessor;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("abcdxxx0", "val1"));
+ ASSERT_OK(Put("abcdxxx1", "val2"));
+ ASSERT_OK(Put("abcdxxx2", "val3"));
+ ASSERT_OK(Put("abcdxxx3", "val4"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ {
+ // prefix_extractor has not changed, BF will always be read
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+ }
+ {
+ Slice upper_bound("abcdzzzz");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcd0000"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}}));
+ ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+ "rocksdb.FixedPrefix.5");
+ {
+ // BF changed, [abcdxx00, abce) is a valid bound, will trigger BF read
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx00"), 4);
+ // should check bloom filter since upper bound meets requirement
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+      // [abcdxx01, abcey) is not a valid bound since the upper bound is too
+      // long for the BF in the SST (capped:4)
+ Slice upper_bound("abcey");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx01"), 4);
+ // should skip bloom filter since upper bound is too long
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+ // [abcdxx02, abcdy) is a valid bound since the prefix is the same
+ Slice upper_bound("abcdy");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abcdxx02"), 4);
+ // should check bloom filter since upper bound matches transformed seek
+ // key
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ {
+ // [aaaaaaaa, abce) is not a valid bound since 1) they don't share the
+ // same prefix, 2) the prefixes are not consecutive
+ Slice upper_bound("abce");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "aaaaaaaa"), 0);
+ // should skip bloom filter since mismatch is found
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:3"}}));
+ {
+ // [abc, abd) is not a valid bound since the upper bound is too short
+ // for BF (capped:4)
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ // Same with re-open
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ Reopen(options);
+ {
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ // Set back to capped:4 and verify BF is always read
+ options.prefix_extractor.reset(NewCappedPrefixTransform(4));
+ Reopen(options);
+ {
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ }
+    // Same if there's a problem initially loading the prefix transform
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
+ [&](void* arg) { *static_cast<bool*>(arg) = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+ {
+ Slice upper_bound("abd");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "abc"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+// Create multiple SST files each with a different prefix_extractor config,
+// verify iterators can read all SST files using the latest config.
+TEST_F(DBBloomFilterTest, DynamicBloomFilterMultipleSST) {
+ for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy = Create(10, bfp_impl);
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ Slice upper_bound("foz90000");
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+
+ // first SST with fixed:1 BF
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foq1", "bar1"));
+ ASSERT_OK(Put("fpa", "0"));
+ dbfull()->Flush(FlushOptions());
+ std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 1);
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+ "rocksdb.CappedPrefix.3");
+ read_options.iterate_upper_bound = &upper_bound;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "foo"), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+ ASSERT_EQ(CountIter(iter, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+ // second SST with capped:3 BF
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Put("foq5", "bar5"));
+ ASSERT_OK(Put("fpb", "1"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ {
+      // BF is capped:3 now
+ std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_tmp, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+      // both counters are incremented because the BF config is unchanged for
+      // 1 of the 2 SST files, so that filter is checked once and finds no
+      // match.
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 5);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ }
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
+ ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+ "rocksdb.FixedPrefix.2");
+ // third SST with fixed:2 BF
+ ASSERT_OK(Put("foo6", "bar6"));
+ ASSERT_OK(Put("foo7", "bar7"));
+ ASSERT_OK(Put("foq8", "bar8"));
+ ASSERT_OK(Put("fpc", "2"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ {
+ // BF is fixed:2 now
+ std::unique_ptr<Iterator> iter_tmp(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_tmp, "foo"), 9);
+ // the first and last BF are checked
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 7);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 1);
+ ASSERT_EQ(CountIter(iter_tmp, "gpk"), 0);
+ // only last BF is checked and not found
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 8);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+ }
+
+ // iter_old can only see the first SST, so checked plus 1
+ ASSERT_EQ(CountIter(iter_old, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9);
+ // iter was created after the first setoptions call so only full filter
+ // will check the filter
+ ASSERT_EQ(CountIter(iter, "foo"), 2);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 10);
+
+ {
+      // keys in all three SSTs are visible to the iterator
+      // The range [foo, foz90000) is compatible with (fixed:1) and (fixed:2),
+      // so +2 for the checked counter
+ std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_all, "foo"), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 2);
+ ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 13);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ }
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+ "rocksdb.CappedPrefix.3");
+ {
+ std::unique_ptr<Iterator> iter_all(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_all, "foo"), 6);
+      // all three SSTs are checked because the current prefix_extractor
+      // matches the remaining SST's (capped:3)
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 16);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ ASSERT_EQ(CountIter(iter_all, "gpk"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 17);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 4);
+ }
+    // TODO(Zhongyi): Maybe also need to add Get calls to test point lookups?
+ }
+}
+
+// Create a new column family in a running DB, change prefix_extractor
+// dynamically, verify the iterator created on the new column family behaves
+// as expected
+TEST_F(DBBloomFilterTest, DynamicBloomFilterNewColumnFamily) {
+ int iteration = 0;
+ for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy = Create(10, bfp_impl);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu" + std::to_string(iteration)}, options);
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ // create a new CF and set prefix_extractor dynamically
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ CreateColumnFamilies({"ramen_dojo_" + std::to_string(iteration)}, options);
+ ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(),
+ "rocksdb.CappedPrefix.3");
+ ASSERT_OK(Put(2, "foo3", "bar3"));
+ ASSERT_OK(Put(2, "foo4", "bar4"));
+ ASSERT_OK(Put(2, "foo5", "bar5"));
+ ASSERT_OK(Put(2, "foq6", "bar6"));
+ ASSERT_OK(Put(2, "fpq7", "bar7"));
+ dbfull()->Flush(FlushOptions());
+ {
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(read_options, handles_[2]));
+ ASSERT_EQ(CountIter(iter, "foo"), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[2], {{"prefix_extractor", "fixed:2"}}));
+ ASSERT_EQ(dbfull()->GetOptions(handles_[2]).prefix_extractor->AsString(),
+ "rocksdb.FixedPrefix.2");
+ {
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(read_options, handles_[2]));
+ ASSERT_EQ(CountIter(iter, "foo"), 4);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[2]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[2]));
+ handles_[2] = nullptr;
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_[1] = nullptr;
+ iteration++;
+ }
+}
+
+// Verify it's possible to change prefix_extractor at runtime and iterators
+// behave as expected
+TEST_F(DBBloomFilterTest, DynamicBloomFilterOptions) {
+ for (const auto& bfp_impl : BloomLikeFilterPolicy::GetAllFixedImpls()) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy = Create(10, bfp_impl);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("fpa", "0"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Put("foo5", "bar5"));
+ ASSERT_OK(Put("fpb", "1"));
+ dbfull()->Flush(FlushOptions());
+ ASSERT_OK(Put("foo6", "bar6"));
+ ASSERT_OK(Put("foo7", "bar7"));
+ ASSERT_OK(Put("foo8", "bar8"));
+ ASSERT_OK(Put("fpc", "2"));
+ dbfull()->Flush(FlushOptions());
+
+ ReadOptions read_options;
+ read_options.prefix_same_as_start = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 3);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+ std::unique_ptr<Iterator> iter_old(db_->NewIterator(read_options));
+ ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "capped:3"}}));
+ ASSERT_EQ(dbfull()->GetOptions().prefix_extractor->AsString(),
+ "rocksdb.CappedPrefix.3");
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ // "fp*" should be skipped
+ ASSERT_EQ(CountIter(iter, "foo"), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 6);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ }
+
+ // iterator created before should not be affected and see all keys
+ ASSERT_EQ(CountIter(iter_old, "foo"), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 9);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 0);
+ ASSERT_EQ(CountIter(iter_old, "abc"), 0);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED), 12);
+ ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_PREFIX_USEFUL), 3);
+ }
+}
+
+TEST_F(DBBloomFilterTest, SeekForPrevWithPartitionedFilters) {
+ Options options = CurrentOptions();
+ constexpr size_t kNumKeys = 10000;
+  static_assert(kNumKeys <= 10000, "kNumKeys has to be <= 10000");
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeys + 10));
+ options.create_if_missing = true;
+ constexpr size_t kPrefixLength = 4;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(kPrefixLength));
+ options.compression = kNoCompression;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(50));
+ bbto.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
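+  // Tiny data and metadata blocks force the filter to be split into many
+  // partitions, so the SeekForPrev calls below cross partition boundaries.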
+ bbto.block_size = 128;
+ bbto.metadata_block_size = 128;
+ bbto.partition_filters = true;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const std::string value(64, '\0');
+
+ WriteOptions write_opts;
+ write_opts.disableWAL = true;
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ std::ostringstream oss;
+ oss << std::setfill('0') << std::setw(4) << std::fixed << i;
+ ASSERT_OK(db_->Put(write_opts, oss.str(), value));
+ }
+ ASSERT_OK(Flush());
+
+ ReadOptions read_opts;
+ // Use legacy, implicit prefix seek
+ read_opts.total_order_seek = false;
+ read_opts.auto_prefix_mode = false;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ for (size_t i = 0; i < kNumKeys; ++i) {
+    // Seek with a key after each one added but with the same prefix. One will
+    // surely cross a partition boundary.
+ std::ostringstream oss;
+ oss << std::setfill('0') << std::setw(4) << std::fixed << i << "a";
+ it->SeekForPrev(oss.str());
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(it->Valid());
+ }
+ it.reset();
+}
+
+namespace {
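+// Compares keys byte-by-byte starting from their last byte; on a common
+// suffix, the shorter key orders first.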
+class BackwardBytewiseComparator : public Comparator {
+ public:
+ const char* Name() const override { return "BackwardBytewiseComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ int min_size_neg = -static_cast<int>(std::min(a.size(), b.size()));
+ const char* a_end = a.data() + a.size();
+ const char* b_end = b.data() + b.size();
+ for (int i = -1; i >= min_size_neg; --i) {
+ if (a_end[i] != b_end[i]) {
+ if (static_cast<unsigned char>(a_end[i]) <
+ static_cast<unsigned char>(b_end[i])) {
+ return -1;
+ } else {
+ return 1;
+ }
+ }
+ }
+ return static_cast<int>(a.size()) - static_cast<int>(b.size());
+ }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+const BackwardBytewiseComparator kBackwardBytewiseComparator{};
+
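+// Uses the last four bytes of a key as its "prefix"; keys shorter than four
+// bytes are out of domain.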
+class FixedSuffix4Transform : public SliceTransform {
+ const char* Name() const override { return "FixedSuffixTransform"; }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data() + src.size() - 4, 4);
+ }
+
+ bool InDomain(const Slice& src) const override { return src.size() >= 4; }
+};
+
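+// Returns {checked, useful} from the SST prefix bloom tickers when sst is
+// true, otherwise {hits + misses, misses} from the memtable bloom perf
+// context; both sources are reset as a side effect.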
+std::pair<uint64_t, uint64_t> GetBloomStat(const Options& options, bool sst) {
+ if (sst) {
+ return {
+ options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_CHECKED),
+ options.statistics->getAndResetTickerCount(BLOOM_FILTER_PREFIX_USEFUL)};
+ } else {
+ auto hit = std::exchange(get_perf_context()->bloom_memtable_hit_count, 0);
+ auto miss = std::exchange(get_perf_context()->bloom_memtable_miss_count, 0);
+ return {hit + miss, miss};
+ }
+}
+
+std::pair<uint64_t, uint64_t> CheckedAndUseful(uint64_t checked,
+ uint64_t useful) {
+ return {checked, useful};
+}
+} // anonymous namespace
+
+// This uses a prefix_extractor + comparator combination that violates
+// one of the old obsolete, unnecessary axioms of prefix extraction:
+// * key.starts_with(prefix(key))
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter1) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = false;
+
+ Options options = CurrentOptions();
+ options.comparator = &kBackwardBytewiseComparator;
+ options.prefix_extractor = std::make_shared<FixedSuffix4Transform>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.statistics = CreateDBStatistics();
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("321aaaa", "val1"));
+ ASSERT_OK(Put("112aaaa", "val2"));
+ ASSERT_OK(Put("009aaaa", "val3"));
+ ASSERT_OK(Put("baa", "val4")); // out of domain
+ ASSERT_OK(Put("321abaa", "val5"));
+ ASSERT_OK(Put("zzz", "val6")); // out of domain
+
+ for (auto flushed : {false, true}) {
+ SCOPED_TRACE("flushed=" + std::to_string(flushed));
+ if (flushed) {
+ ASSERT_OK(Flush());
+ }
+ ReadOptions read_options;
+ if (flushed) { // TODO: support auto_prefix_mode in memtable?
+ read_options.auto_prefix_mode = true;
+ }
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ {
+ Slice ub("999aaaa");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa"), 3);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ Slice ub("999abaa");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "abaa"), 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ Slice ub("999acaa");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "acaa"), 0);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+ }
+ {
+ Slice ub("zzzz");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "baa"), 3);
+ if (flushed) { // TODO: fix memtable case
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ }
+ }
+}
+
+// This uses a prefix_extractor + comparator combination that violates
+// one of the old obsolete, unnecessary axioms of prefix extraction:
+// * Compare(prefix(key), key) <= 0
+// This axiom is not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter2) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = false;
+
+ Options options = CurrentOptions();
+ options.comparator = ReverseBytewiseComparator();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(4));
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.statistics = CreateDBStatistics();
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("aaaa123", "val1"));
+ ASSERT_OK(Put("aaaa211", "val2"));
+ ASSERT_OK(Put("aaaa900", "val3"));
+ ASSERT_OK(Put("aab", "val4")); // out of domain
+ ASSERT_OK(Put("aaba123", "val5"));
+ ASSERT_OK(Put("qqqq123", "val7"));
+ ASSERT_OK(Put("qqqq", "val8"));
+ ASSERT_OK(Put("zzz", "val8")); // out of domain
+
+ for (auto flushed : {false, true}) {
+ SCOPED_TRACE("flushed=" + std::to_string(flushed));
+ if (flushed) {
+ ASSERT_OK(Flush());
+ }
+ ReadOptions read_options;
+ if (flushed) { // TODO: support auto_prefix_mode in memtable?
+ read_options.auto_prefix_mode = true;
+ } else {
+ // TODO: why needed?
+ get_perf_context()->bloom_memtable_hit_count = 0;
+ get_perf_context()->bloom_memtable_miss_count = 0;
+ }
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ {
+ Slice ub("aaaa000");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ // Note: prefix does work as upper bound
+ Slice ub("aaaa");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa999"), 3);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ // Note: prefix does not work here as seek key
+ Slice ub("aaaa500");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa"), 0);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ Slice ub("aaba000");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaba999"), 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ Slice ub("aaca000");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaca999"), 0);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+ }
+ {
+ Slice ub("aaaz");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "zzz"), 5);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ {
+ // Note: prefix does work here as seek key, but only finds key equal
+ // to prefix (others with same prefix are less)
+ read_options.auto_prefix_mode = false;
+ read_options.iterate_upper_bound = nullptr;
+ read_options.prefix_same_as_start = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "qqqq"), 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ }
+}
+
+namespace {
+// A weird comparator that in combination with NonIdempotentFixed4Transform
+// breaks an old axiom of prefix filtering.
+class WeirdComparator : public Comparator {
+ public:
+ const char* Name() const override { return "WeirdComparator"; }
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ bool a_in = a.size() >= 5;
+ bool b_in = b.size() >= 5;
+ if (a_in != b_in) {
+ // Order keys after prefixes
+ return a_in - b_in;
+ }
+ if (a_in) {
+ return BytewiseComparator()->Compare(a, b);
+ } else {
+ // Different ordering on the prefixes
+ return ReverseBytewiseComparator()->Compare(a, b);
+ }
+ }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+const WeirdComparator kWeirdComparator{};
+
+// Non-idempotent because the prefix is always 4 bytes, but that is
+// out-of-domain for keys to be assigned prefixes (>= 5 bytes)
+class NonIdempotentFixed4Transform : public SliceTransform {
+ const char* Name() const override { return "NonIdempotentFixed4Transform"; }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 4);
+ }
+
+ bool InDomain(const Slice& src) const override { return src.size() >= 5; }
+};
+} // anonymous namespace
+
+// This uses a prefix_extractor + comparator combination that violates
+// two of the old obsolete, unnecessary axioms of prefix extraction:
+// * prefix(prefix(key)) == prefix(key)
+// * If Compare(k1, k2) <= 0, then Compare(prefix(k1), prefix(k2)) <= 0
+// These axioms are not really needed, and we validate that here.
+TEST_F(DBBloomFilterTest, WeirdPrefixExtractorWithFilter3) {
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(ROCKSDB_NAMESPACE::NewBloomFilterPolicy(10));
+ bbto.whole_key_filtering = false;
+
+ Options options = CurrentOptions();
+ options.prefix_extractor = std::make_shared<NonIdempotentFixed4Transform>();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.statistics = CreateDBStatistics();
+
+ for (auto weird_comparator : {false, true}) {
+ if (weird_comparator) {
+ options.comparator = &kWeirdComparator;
+ }
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("aaaa123", "val1"));
+ ASSERT_OK(Put("aaaa211", "val2"));
+ ASSERT_OK(Put("aaaa900", "val3"));
+ ASSERT_OK(Put("aab", "val4")); // out of domain
+ ASSERT_OK(Put("aaba123", "val5"));
+ ASSERT_OK(Put("qqqq123", "val7"));
+ ASSERT_OK(Put("qqqq", "val8")); // out of domain
+ ASSERT_OK(Put("zzzz", "val8")); // out of domain
+
+ for (auto flushed : {false, true}) {
+ SCOPED_TRACE("flushed=" + std::to_string(flushed));
+ if (flushed) {
+ ASSERT_OK(Flush());
+ }
+ ReadOptions read_options;
+ if (flushed) { // TODO: support auto_prefix_mode in memtable?
+ read_options.auto_prefix_mode = true;
+ } else {
+ // TODO: why needed?
+ get_perf_context()->bloom_memtable_hit_count = 0;
+ get_perf_context()->bloom_memtable_miss_count = 0;
+ }
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ {
+ Slice ub("aaaa999");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa000"), 3);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ // Note: prefix as seek key is not bloom-optimized
+ // Note: the count works with weird_comparator because "aaaa" is
+ // ordered as the last of the prefixes
+ Slice ub("aaaa999");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaaa"), 3);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ {
+ Slice ub("aaba9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaba0"), 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ Slice ub("aaca9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aaca0"), 0);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 1));
+ }
+ {
+ Slice ub("qqqq9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "qqqq0"), 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(1, 0));
+ }
+ {
+ // Note: prefix as seek key is not bloom-optimized
+ Slice ub("qqqq9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "qqqq"), weird_comparator ? 7 : 2);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ {
+ // Note: prefix as seek key is not bloom-optimized
+ Slice ub("zzzz9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "zzzz"), weird_comparator ? 8 : 1);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ {
+ Slice ub("zzzz9");
+ read_options.iterate_upper_bound = &ub;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ EXPECT_EQ(CountIter(iter, "aab"), weird_comparator ? 6 : 5);
+ EXPECT_EQ(GetBloomStat(options, flushed), CheckedAndUseful(0, 0));
+ }
+ }
+ }
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_filter_test.cc b/src/rocksdb/db/db_compaction_filter_test.cc
new file mode 100644
index 000000000..be863d4f6
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_filter_test.cc
@@ -0,0 +1,1036 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
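+// Counters bumped by the test compaction filters below; individual tests
+// reset them inline.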
+static int cfilter_count = 0;
+static int cfilter_skips = 0;
+
+// Value that ChangeFilter substitutes for existing values
+// during the compaction process.
+static std::string NEW_VALUE = "NewValue";
+
+class DBTestCompactionFilter : public DBTestBase {
+ public:
+ DBTestCompactionFilter()
+ : DBTestBase("db_compaction_filter_test", /*env_do_fsync=*/true) {}
+};
+
+// Param variant of DBTestBase::ChangeCompactOptions
+class DBTestCompactionFilterWithCompactParam
+ : public DBTestCompactionFilter,
+ public ::testing::WithParamInterface<DBTestBase::OptionConfig> {
+ public:
+ DBTestCompactionFilterWithCompactParam() : DBTestCompactionFilter() {
+ option_config_ = GetParam();
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ if (option_config_ == kDefault || option_config_ == kUniversalCompaction ||
+ option_config_ == kUniversalCompactionMultiLevel) {
+ options.create_if_missing = true;
+ }
+ if (option_config_ == kLevelSubcompactions ||
+ option_config_ == kUniversalSubcompactions) {
+ assert(options.max_subcompactions > 1);
+ }
+ Reopen(options);
+ }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+INSTANTIATE_TEST_CASE_P(
+ CompactionFilterWithOption, DBTestCompactionFilterWithCompactParam,
+ ::testing::Values(DBTestBase::OptionConfig::kDefault,
+ DBTestBase::OptionConfig::kUniversalCompaction,
+ DBTestBase::OptionConfig::kUniversalCompactionMultiLevel,
+ DBTestBase::OptionConfig::kLevelSubcompactions,
+ DBTestBase::OptionConfig::kUniversalSubcompactions));
+#else
+// Run fewer cases in non-full valgrind to save time.
+INSTANTIATE_TEST_CASE_P(CompactionFilterWithOption,
+ DBTestCompactionFilterWithCompactParam,
+ ::testing::Values(DBTestBase::OptionConfig::kDefault));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
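+// Keeps every key/value pair; only counts how many entries it was asked to
+// filter.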
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
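+// Unconditionally drops every key/value pair and merge operand it sees.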
+class DeleteFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ return true;
+ }
+
+ bool FilterMergeOperand(int /*level*/, const Slice& /*key*/,
+ const Slice& /*operand*/) const override {
+ return true;
+ }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
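+// Drops keys whose numeric value is in (5, 105]; ignores snapshots.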
+class DeleteISFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ cfilter_count++;
+ int i = std::stoi(key.ToString());
+ if (i > 5 && i <= 105) {
+ return true;
+ }
+ return false;
+ }
+
+ bool IgnoreSnapshots() const override { return true; }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
+// Skip x if floor(x/10) is even, use range skips. Requires that keys are
+// zero-padded to length 10.
+class SkipEvenFilter : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+ const Slice& /*existing_value*/, std::string* /*new_value*/,
+ std::string* skip_until) const override {
+ cfilter_count++;
+ int i = std::stoi(key.ToString());
+ if (i / 10 % 2 == 0) {
+ char key_str[100];
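+      // Skip ahead to the first key of the next decade, e.g. 23 -> 30.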
+ snprintf(key_str, sizeof(key_str), "%010d", i / 10 * 10 + 10);
+ *skip_until = key_str;
+ ++cfilter_skips;
+ return Decision::kRemoveAndSkipUntil;
+ }
+ return Decision::kKeep;
+ }
+
+ bool IgnoreSnapshots() const override { return true; }
+
+ const char* Name() const override { return "DeleteFilter"; }
+};
+
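+// Drops an entry iff its value equals *filtered_value_.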
+class ConditionalFilter : public CompactionFilter {
+ public:
+ explicit ConditionalFilter(const std::string* filtered_value)
+ : filtered_value_(filtered_value) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& value,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return value.ToString() == *filtered_value_;
+ }
+
+ const char* Name() const override { return "ConditionalFilter"; }
+
+ private:
+ const std::string* filtered_value_;
+};
+
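+// Keeps every key but rewrites its value to NEW_VALUE.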
+class ChangeFilter : public CompactionFilter {
+ public:
+ explicit ChangeFilter() {}
+
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* new_value, bool* value_changed) const override {
+ assert(new_value != nullptr);
+ *new_value = NEW_VALUE;
+ *value_changed = true;
+ return false;
+ }
+
+ const char* Name() const override { return "ChangeFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false,
+ bool check_context_cf_id = false)
+ : check_context_(check_context),
+ check_context_cf_id_(check_context_cf_id),
+ compaction_filter_created_(false) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ if (check_context_cf_id_) {
+ EXPECT_EQ(expect_cf_id_.load(), context.column_family_id);
+ }
+ compaction_filter_created_ = true;
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ bool compaction_filter_created() const { return compaction_filter_created_; }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ bool check_context_cf_id_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+ std::atomic<uint32_t> expect_cf_id_;
+ bool compaction_filter_created_;
+};
+
+// This filter factory is configured with a `TableFileCreationReason`. Only
+// table files created for that reason will undergo filtering. This
+// configurability makes it useful for testing the filtering of non-compaction
+// table files, as in "CompactionFilterFlush" and "CompactionFilterRecovery".
+class DeleteFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit DeleteFilterFactory(TableFileCreationReason reason)
+ : reason_(reason) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ EXPECT_EQ(reason_, context.reason);
+ if (context.reason == TableFileCreationReason::kCompaction &&
+ !context.is_manual_compaction) {
+ // Table files created by automatic compaction do not undergo filtering.
+ // Presumably some tests rely on this.
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ return std::unique_ptr<CompactionFilter>(new DeleteFilter());
+ }
+
+ bool ShouldFilterTableFileCreation(
+ TableFileCreationReason reason) const override {
+ return reason_ == reason;
+ }
+
+ const char* Name() const override { return "DeleteFilterFactory"; }
+
+ private:
+ const TableFileCreationReason reason_;
+};
+
+// Delete Filter Factory which ignores snapshots
+class DeleteISFilterFactory : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (context.is_manual_compaction) {
+ return std::unique_ptr<CompactionFilter>(new DeleteISFilter());
+ } else {
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ }
+
+ const char* Name() const override { return "DeleteFilterFactory"; }
+};
+
+class SkipEvenFilterFactory : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (context.is_manual_compaction) {
+ return std::unique_ptr<CompactionFilter>(new SkipEvenFilter());
+ } else {
+ return std::unique_ptr<CompactionFilter>(nullptr);
+ }
+ }
+
+ const char* Name() const override { return "SkipEvenFilterFactory"; }
+};
+
+class ConditionalFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit ConditionalFilterFactory(const Slice& filtered_value)
+ : filtered_value_(filtered_value.ToString()) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(
+ new ConditionalFilter(&filtered_value_));
+ }
+
+ const char* Name() const override { return "ConditionalFilterFactory"; }
+
+ private:
+ std::string filtered_value_;
+};
+
+class ChangeFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit ChangeFilterFactory() {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new ChangeFilter());
+ }
+
+ const char* Name() const override { return "ChangeFilterFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTestCompactionFilter, CompactionFilter) {
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ options.num_levels = 3;
+ options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write 100K keys, these are written to a few files in L0.
+ const std::string value(10, 'x');
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+
+  // Push all files to the highest level L2. Verify that
+  // the compaction at each level invokes the filter for
+  // all the keys in that level.
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 100000);
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 100000);
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+ cfilter_count = 0;
+
+  // All the files are in the lowest level.
+  // Verify that every record now has sequence number zero: no snapshot is
+  // held, so the sequence numbers of all keys in the bottommost level can
+  // be zeroed out.
+ int count = 0;
+ int total = 0;
+ Arena arena;
+ {
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+ read_options, &arena, kMaxSequenceNumber, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+ total++;
+ if (ikey.sequence != 0) {
+ count++;
+ }
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+ }
+ ASSERT_EQ(total, 100000);
+ ASSERT_EQ(count, 0);
+
+ // overwrite all the 100K keys once again.
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+
+ // push all files to the highest level L2. This
+ // means that all keys should pass at least once
+ // via the compaction filter
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 100000);
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 100000);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_NE(NumTableFilesAtLevel(2, 1), 0);
+
+ // create a new database with the compaction
+ // filter in such a way that it deletes all keys
+ options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(
+ TableFileCreationReason::kCompaction);
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // write all the keys once again.
+ for (int i = 0; i < 100000; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_NE(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 1), 0);
+
+ // Push all files to the highest level L2. This
+ // triggers the compaction filter to delete all keys,
+ // verify that at the end of the compaction process,
+ // nothing is left.
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 100000);
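+  // The first pass (L0 -> L1) already removed every key, so the second pass
+  // over L1 finds nothing and the filter is never invoked.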
+ cfilter_count = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(cfilter_count, 0);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+
+ {
+ // Scan the entire database to ensure that nothing is left
+ std::unique_ptr<Iterator> iter(
+ db_->NewIterator(ReadOptions(), handles_[1]));
+ iter->SeekToFirst();
+ count = 0;
+ while (iter->Valid()) {
+ count++;
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(count, 0);
+ }
+
+  // The sequence number of any remaining record would not be zeroed out,
+  // even at the bottommost level, because it would be at the tip. However,
+  // the delete filter removed every key, so nothing should remain.
+ count = 0;
+ {
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(
+ read_options, &arena, kMaxSequenceNumber, handles_[1]));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+ ASSERT_NE(ikey.sequence, (unsigned)0);
+ count++;
+ iter->Next();
+ }
+ ASSERT_EQ(count, 0);
+ }
+}
+
+// Tests the edge case where compaction does not produce any output -- all
+// entries are deleted. The compaction should create a bunch of 'DeleteFile'
+// entries in the VersionEdit, but no 'AddFile' entries.
+TEST_F(DBTestCompactionFilter, CompactionFilterDeletesAll) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<DeleteFilterFactory>(
+ TableFileCreationReason::kCompaction);
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // put some data
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // this will produce empty file (delete compaction filter)
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(0U, CountLiveFiles());
+
+ Reopen(options);
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ ASSERT_OK(itr->status());
+ // empty db
+ ASSERT_TRUE(!itr->Valid());
+
+ delete itr;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, CompactionFilterFlush) {
+  // Tests a `CompactionFilterFactory` that filters when a table file is
+  // created by flush.
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<DeleteFilterFactory>(TableFileCreationReason::kFlush);
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen(options);
+
+ // Puts and Merges are purged in flush.
+ ASSERT_OK(Put("a", "v"));
+ ASSERT_OK(Merge("b", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("NOT_FOUND", Get("a"));
+ ASSERT_EQ("NOT_FOUND", Get("b"));
+
+ // However, Puts and Merges are preserved by recovery.
+ ASSERT_OK(Put("a", "v"));
+ ASSERT_OK(Merge("b", "v"));
+ Reopen(options);
+ ASSERT_EQ("v", Get("a"));
+ ASSERT_EQ("v", Get("b"));
+
+ // Likewise, compaction does not apply filtering.
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("v", Get("a"));
+ ASSERT_EQ("v", Get("b"));
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterRecovery) {
+  // Tests a `CompactionFilterFactory` that filters when a table file is
+  // created by recovery.
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<DeleteFilterFactory>(TableFileCreationReason::kRecovery);
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen(options);
+
+ // Puts and Merges are purged in recovery.
+ ASSERT_OK(Put("a", "v"));
+ ASSERT_OK(Merge("b", "v"));
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("a"));
+ ASSERT_EQ("NOT_FOUND", Get("b"));
+
+ // However, Puts and Merges are preserved by flush.
+ ASSERT_OK(Put("a", "v"));
+ ASSERT_OK(Merge("b", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v", Get("a"));
+ ASSERT_EQ("v", Get("b"));
+
+ // Likewise, compaction does not apply filtering.
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("v", Get("a"));
+ ASSERT_EQ("v", Get("b"));
+}
+
+TEST_P(DBTestCompactionFilterWithCompactParam,
+ CompactionFilterWithValueChange) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.compaction_filter_factory = std::make_shared<ChangeFilterFactory>();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+  // Write 100K+1 keys; these are written to a few files
+  // in L0. We do this so that the current snapshot points
+  // to the 100001st key. The compaction filter is not invoked
+  // on keys that are visible via a snapshot because we
+  // cannot delete them anyway.
+ const std::string value(10, 'x');
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+
+ // push all files to lower levels
+ ASSERT_OK(Flush(1));
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ } else {
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ }
+
+ // re-write all data again
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ ASSERT_OK(Put(1, key, value));
+ }
+
+ // push all files to lower levels. This should
+ // invoke the compaction filter for all 100000 keys.
+ ASSERT_OK(Flush(1));
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ } else {
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ }
+
+ // verify that all keys now have the new value that
+ // was set by the compaction process.
+ for (int i = 0; i < 100001; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%010d", i);
+ std::string newvalue = Get(1, key);
+ ASSERT_EQ(newvalue.compare(NEW_VALUE), 0);
+ }
+}
+
+TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) {
+ std::string one, two, three, four;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+ PutFixed64(&four, 4);
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ options.num_levels = 3;
+  // Filter out keys whose value is 2.
+ options.compaction_filter_factory =
+ std::make_shared<ConditionalFilterFactory>(two);
+ DestroyAndReopen(options);
+
+  // In the same compaction, a value-type entry would be deleted based on the
+  // compaction filter, but there is also a merge-type entry for the key, so
+  // the compaction filter result is ignored.
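+  // With the UInt64AddOperator, Put("foo", 2) followed by Merge("foo", 1)
+  // reads back as 3 (2 + 1), both before and after the compaction below.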
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", one));
+ ASSERT_OK(Flush());
+ std::string newvalue = Get("foo");
+ ASSERT_EQ(newvalue, three);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ newvalue = Get("foo");
+ ASSERT_EQ(newvalue, three);
+
+  // A value-type key can be deleted based on the compaction filter, leaving
+  // only merge keys.
+ ASSERT_OK(db_->Put(WriteOptions(), "bar", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ newvalue = Get("bar");
+ ASSERT_EQ("NOT_FOUND", newvalue);
+ ASSERT_OK(db_->Merge(WriteOptions(), "bar", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ newvalue = Get("bar");
+  ASSERT_EQ(newvalue, two);
+
+ // Compaction filter never applies to merge keys.
+ ASSERT_OK(db_->Put(WriteOptions(), "foobar", one));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two));
+ ASSERT_OK(Flush());
+ newvalue = Get("foobar");
+ ASSERT_EQ(newvalue, three);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ newvalue = Get("foobar");
+ ASSERT_EQ(newvalue, three);
+
+  // In the same compaction, both the value-type and the merge-type entries of
+  // the key match the filter condition, and a merge is stacked on top of the
+  // value. For both entries, the compaction filter result is ignored.
+ ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two));
+ ASSERT_OK(Flush());
+ newvalue = Get("barfoo");
+ ASSERT_EQ(newvalue, four);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ newvalue = Get("barfoo");
+ ASSERT_EQ(newvalue, four);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
+ KeepFilterFactory* filter = new KeepFilterFactory(true, true);
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_filter_factory.reset(filter);
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 8;
+ Reopen(options);
+ int num_keys_per_file = 400;
+ for (int j = 0; j < 3; j++) {
+ // Write several keys.
+ const std::string value(10, 'x');
+ for (int i = 0; i < num_keys_per_file; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%08d%02d", i, j);
+ ASSERT_OK(Put(key, value));
+ }
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ // Make sure next file is much smaller so automatic compaction will not
+ // be triggered.
+ num_keys_per_file /= 2;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
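+  // The three flushed files contain 400, 200, and 100 keys respectively, so
+  // a full manual compaction should run the filter over 700 keys in total.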
+ // Force a manual compaction
+ cfilter_count = 0;
+ filter->expect_manual_compaction_.store(true);
+ filter->expect_full_compaction_.store(true);
+ filter->expect_cf_id_.store(0);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(cfilter_count, 700);
+ ASSERT_EQ(NumSortedRuns(0), 1);
+ ASSERT_TRUE(filter->compaction_filter_created());
+
+ // Verify total number of keys is correct after manual compaction.
+ {
+ int count = 0;
+ int total = 0;
+ Arena arena;
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter(dbfull()->NewInternalIterator(read_options, &arena,
+ kMaxSequenceNumber));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+ total++;
+ if (ikey.sequence != 0) {
+ count++;
+ }
+ iter->Next();
+ }
+ ASSERT_EQ(total, 700);
+ ASSERT_EQ(count, 0);
+ }
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) {
+ KeepFilterFactory* filter = new KeepFilterFactory(false, true);
+ filter->expect_cf_id_.store(1);
+
+ Options options = CurrentOptions();
+ options.compaction_filter_factory.reset(filter);
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 2;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ int num_keys_per_file = 400;
+ for (int j = 0; j < 3; j++) {
+ // Write several keys.
+ const std::string value(10, 'x');
+ for (int i = 0; i < num_keys_per_file; i++) {
+ char key[100];
+ snprintf(key, sizeof(key), "B%08d%02d", i, j);
+ ASSERT_OK(Put(1, key, value));
+ }
+ ASSERT_OK(Flush(1));
+ // Make sure next file is much smaller so automatic compaction will not
+ // be triggered.
+ num_keys_per_file /= 2;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_TRUE(filter->compaction_filter_created());
+}
+
+#ifndef ROCKSDB_LITE
+// Compaction filters apply to all records, regardless of snapshots.
+TEST_F(DBTestCompactionFilter, CompactionFilterIgnoreSnapshot) {
+ std::string five = std::to_string(5);
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<DeleteISFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Put some data.
+ const Snapshot* snapshot = nullptr;
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush());
+
+ if (table == 0) {
+ snapshot = db_->GetSnapshot();
+ }
+ }
+ assert(snapshot != nullptr);
+
+ cfilter_count = 0;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // The filter should be invoked on all 40 records.
+ ASSERT_EQ(40, cfilter_count);
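+  // Per the checks below, only keys 6-9 and 100-105 are actually dropped;
+  // the remaining 30 keys survive the compaction.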
+
+ {
+    // Scan the entire database as of the snapshot to see which of its
+    // keys survived the snapshot-ignoring filter.
+ ReadOptions read_options;
+ read_options.snapshot = snapshot;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ int count = 0;
+ while (iter->Valid()) {
+ count++;
+ iter->Next();
+ }
+ ASSERT_EQ(count, 6);
+ read_options.snapshot = nullptr;
+ std::unique_ptr<Iterator> iter1(db_->NewIterator(read_options));
+ ASSERT_OK(iter1->status());
+ iter1->SeekToFirst();
+ count = 0;
+ while (iter1->Valid()) {
+ count++;
+ iter1->Next();
+ }
+ // We have deleted 10 keys from 40 using the compaction filter
+ // Keys 6-9 before the snapshot and 100-105 after the snapshot
+ ASSERT_EQ(count, 30);
+ }
+
+ // Release the snapshot and compact again -> now all records should be
+ // removed.
+ db_->ReleaseSnapshot(snapshot);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTestCompactionFilter, SkipUntil) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+  // Write four batches of keys; each batch is flushed to its own L0 file.
+ for (int table = 0; table < 4; ++table) {
+ // Key ranges in tables are [0, 38], [106, 149], [212, 260], [318, 371].
+ for (int i = table * 6; i < 39 + table * 11; ++i) {
+ char key[100];
+ snprintf(key, sizeof(key), "%010d", table * 100 + i);
+ ASSERT_OK(Put(key, std::to_string(table * 1000 + i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ cfilter_skips = 0;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Number of skips in tables: 2, 3, 3, 3.
+ ASSERT_EQ(11, cfilter_skips);
+
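+  // Keys whose decade (k / 10) is even should have been dropped by the
+  // filter; all other keys should still be present with their original values.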
+ for (int table = 0; table < 4; ++table) {
+ for (int i = table * 6; i < 39 + table * 11; ++i) {
+ int k = table * 100 + i;
+ char key[100];
+ snprintf(key, sizeof(key), "%010d", table * 100 + i);
+ auto expected = std::to_string(table * 1000 + i);
+ std::string val;
+ Status s = db_->Get(ReadOptions(), key, &val);
+ if (k / 10 % 2 == 0) {
+ ASSERT_TRUE(s.IsNotFound());
+ } else {
+ ASSERT_OK(s);
+ ASSERT_EQ(expected, val);
+ }
+ }
+ }
+}
+
+TEST_F(DBTestCompactionFilter, SkipUntilWithBloomFilter) {
+ BlockBasedTableOptions table_options;
+ table_options.whole_key_filtering = false;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(100, false));
+
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewCappedPrefixTransform(9));
+ options.compaction_filter_factory = std::make_shared<SkipEvenFilterFactory>();
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("0000000010", "v10"));
+ ASSERT_OK(Put("0000000020", "v20")); // skipped
+ ASSERT_OK(Put("0000000050", "v50"));
+ ASSERT_OK(Flush());
+
+ cfilter_skips = 0;
+ EXPECT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ EXPECT_EQ(1, cfilter_skips);
+
+ Status s;
+ std::string val;
+
+ s = db_->Get(ReadOptions(), "0000000010", &val);
+ ASSERT_OK(s);
+ EXPECT_EQ("v10", val);
+
+ s = db_->Get(ReadOptions(), "0000000020", &val);
+ EXPECT_TRUE(s.IsNotFound());
+
+ s = db_->Get(ReadOptions(), "0000000050", &val);
+ ASSERT_OK(s);
+ EXPECT_EQ("v50", val);
+}
+
+class TestNotSupportedFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return true;
+ }
+
+ const char* Name() const override { return "NotSupported"; }
+ bool IgnoreSnapshots() const override { return false; }
+};
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalse) {
+ Options options = CurrentOptions();
+ options.compaction_filter = new TestNotSupportedFilter();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a", "v10"));
+ ASSERT_OK(Put("z", "v20"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("a", "v10"));
+ ASSERT_OK(Put("z", "v20"));
+ ASSERT_OK(Flush());
+
+  // Compaction should fail because IgnoreSnapshots() = false
+ EXPECT_TRUE(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsNotSupported());
+
+ delete options.compaction_filter;
+}
+
+class TestNotSupportedFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit TestNotSupportedFilterFactory(TableFileCreationReason reason)
+ : reason_(reason) {}
+
+ bool ShouldFilterTableFileCreation(
+ TableFileCreationReason reason) const override {
+ return reason_ == reason;
+ }
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /* context */) override {
+ return std::unique_ptr<CompactionFilter>(new TestNotSupportedFilter());
+ }
+
+ const char* Name() const override { return "TestNotSupportedFilterFactory"; }
+
+ private:
+ const TableFileCreationReason reason_;
+};
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseDuringFlush) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<TestNotSupportedFilterFactory>(
+ TableFileCreationReason::kFlush);
+ Reopen(options);
+
+ ASSERT_OK(Put("a", "v10"));
+ ASSERT_TRUE(Flush().IsNotSupported());
+}
+
+TEST_F(DBTestCompactionFilter, IgnoreSnapshotsFalseRecovery) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<TestNotSupportedFilterFactory>(
+ TableFileCreationReason::kRecovery);
+ Reopen(options);
+
+ ASSERT_OK(Put("a", "v10"));
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTestCompactionFilter, DropKeyWithSingleDelete) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("a", "v0"));
+ ASSERT_OK(Put("b", "v0"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ ASSERT_OK(SingleDelete("b"));
+ ASSERT_OK(Flush());
+
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = options.num_levels - 1;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+
+ db_->ReleaseSnapshot(snapshot);
+ Close();
+
+ class DeleteFilterV2 : public CompactionFilter {
+ public:
+ Decision FilterV2(int /*level*/, const Slice& key, ValueType /*value_type*/,
+ const Slice& /*existing_value*/,
+ std::string* /*new_value*/,
+ std::string* /*skip_until*/) const override {
+ if (key.starts_with("b")) {
+ return Decision::kPurge;
+ }
+ return Decision::kRemove;
+ }
+
+ const char* Name() const override { return "DeleteFilterV2"; }
+ } delete_filter_v2;
+
+ options.compaction_filter = &delete_filter_v2;
+ options.level0_file_num_compaction_trigger = 2;
+ Reopen(options);
+
+ ASSERT_OK(Put("b", "v1"));
+ ASSERT_OK(Put("x", "v1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("r", "v1"));
+ ASSERT_OK(Put("z", "v1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ Close();
+
+ options.compaction_filter = nullptr;
+ Reopen(options);
+ ASSERT_OK(SingleDelete("b"));
+ ASSERT_OK(Flush());
+ {
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_compaction_test.cc b/src/rocksdb/db/db_compaction_test.cc
new file mode 100644
index 000000000..ba9c50b9a
--- /dev/null
+++ b/src/rocksdb/db/db_compaction_test.cc
@@ -0,0 +1,8227 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <tuple>
+
+#include "db/blob/blob_index.h"
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/concurrent_task_limiter.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/utilities/convenience.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/concurrent_task_limiter_impl.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// SYNC_POINT is not supported in released Windows mode.
+#if !defined(ROCKSDB_LITE)
+
+class CompactionStatsCollector : public EventListener {
+ public:
+ CompactionStatsCollector()
+ : compaction_completed_(
+ static_cast<int>(CompactionReason::kNumOfReasons)) {
+ for (auto& v : compaction_completed_) {
+ v.store(0);
+ }
+ }
+
+ ~CompactionStatsCollector() override {}
+
+ void OnCompactionCompleted(DB* /* db */,
+ const CompactionJobInfo& info) override {
+ int k = static_cast<int>(info.compaction_reason);
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ assert(k >= 0 && k < num_of_reasons);
+ compaction_completed_[k]++;
+ }
+
+ void OnExternalFileIngested(
+ DB* /* db */, const ExternalFileIngestionInfo& /* info */) override {
+ int k = static_cast<int>(CompactionReason::kExternalSstIngestion);
+ compaction_completed_[k]++;
+ }
+
+ void OnFlushCompleted(DB* /* db */, const FlushJobInfo& /* info */) override {
+ int k = static_cast<int>(CompactionReason::kFlush);
+ compaction_completed_[k]++;
+ }
+
+ int NumberOfCompactions(CompactionReason reason) const {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ int k = static_cast<int>(reason);
+ assert(k >= 0 && k < num_of_reasons);
+ return compaction_completed_.at(k).load();
+ }
+
+ private:
+ std::vector<std::atomic<int>> compaction_completed_;
+};
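+
+// A wiring sketch (assumption: registered like the other EventListeners in
+// this file, via Options::listeners):
+//   auto collector = std::make_shared<CompactionStatsCollector>();
+//   options.listeners.emplace_back(collector);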
+
+class DBCompactionTest : public DBTestBase {
+ public:
+ DBCompactionTest()
+ : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {}
+
+ protected:
+ /*
+ * Verifies compaction stats of cfd are valid.
+ *
+ * For each level of cfd, its compaction stats are valid if
+ * 1) sum(stat.counts) == stat.count, and
+ * 2) stat.counts[i] == collector.NumberOfCompactions(i)
+ */
+ void VerifyCompactionStats(ColumnFamilyData& cfd,
+ const CompactionStatsCollector& collector) {
+#ifndef NDEBUG
+ InternalStats* internal_stats_ptr = cfd.internal_stats();
+ ASSERT_NE(internal_stats_ptr, nullptr);
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ const int num_of_reasons =
+ static_cast<int>(CompactionReason::kNumOfReasons);
+ std::vector<int> counts(num_of_reasons, 0);
+ // Count the number of compactions caused by each CompactionReason across
+ // all levels.
+ for (const auto& stat : comp_stats) {
+ int sum = 0;
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] += stat.counts[i];
+ sum += stat.counts[i];
+ }
+ ASSERT_EQ(sum, stat.count);
+ }
+ // Verify InternalStats bookkeeping matches that of
+ // CompactionStatsCollector, assuming that all compactions complete.
+ for (int i = 0; i < num_of_reasons; i++) {
+ ASSERT_EQ(collector.NumberOfCompactions(static_cast<CompactionReason>(i)),
+ counts[i]);
+ }
+#endif /* NDEBUG */
+ }
+};
+
+class DBCompactionTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ DBCompactionTestWithParam()
+ : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
+ max_subcompactions_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t max_subcompactions_;
+ bool exclusive_manual_compaction_;
+};
+
+class DBCompactionTestWithBottommostParam
+ : public DBTestBase,
+ public testing::WithParamInterface<BottommostLevelCompaction> {
+ public:
+ DBCompactionTestWithBottommostParam()
+ : DBTestBase("db_compaction_test", /*env_do_fsync=*/true) {
+ bottommost_level_compaction_ = GetParam();
+ }
+
+ BottommostLevelCompaction bottommost_level_compaction_;
+};
+
+class DBCompactionDirectIOTest : public DBCompactionTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBCompactionDirectIOTest() : DBCompactionTest() {}
+};
+
+// Param = true : target level is non-empty
+// Param = false: level between target level and source level
+// is not empty.
+class ChangeLevelConflictsWithAuto
+ : public DBCompactionTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ ChangeLevelConflictsWithAuto() : DBCompactionTest() {}
+};
+
+// Param = true: grab the compaction pressure token (enable
+// parallel compactions)
+// Param = false: do not grab the token (no parallel compactions)
+class RoundRobinSubcompactionsAgainstPressureToken
+ : public DBCompactionTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ RoundRobinSubcompactionsAgainstPressureToken() {
+ grab_pressure_token_ = GetParam();
+ }
+ bool grab_pressure_token_;
+};
+
+class RoundRobinSubcompactionsAgainstResources
+ : public DBCompactionTest,
+ public ::testing::WithParamInterface<std::tuple<int, int>> {
+ public:
+ RoundRobinSubcompactionsAgainstResources() {
+ total_low_pri_threads_ = std::get<0>(GetParam());
+ max_compaction_limits_ = std::get<1>(GetParam());
+ }
+ int total_low_pri_threads_;
+ int max_compaction_limits_;
+};
+
+namespace {
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+
+ void ClearFlushedFiles() { flushed_files_.clear(); }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+
+class SstStatsCollector : public EventListener {
+ public:
+ SstStatsCollector() : num_ssts_creation_started_(0) {}
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& /* info */) override {
+ ++num_ssts_creation_started_;
+ }
+
+ int num_ssts_creation_started() { return num_ssts_creation_started_; }
+
+ private:
+ std::atomic<int> num_ssts_creation_started_;
+};
+
+static const int kCDTValueSize = 1000;
+static const int kCDTKeysPerBuffer = 4;
+static const int kCDTNumLevels = 8;
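+// Returns options tuned so that deletions quickly trigger compactions: a tiny
+// write buffer (kCDTKeysPerBuffer keys per memtable), a level-0 trigger of a
+// single file, and small per-level size targets.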
+Options DeletionTriggerOptions(Options options) {
+ options.compression = kNoCompression;
+ options.write_buffer_size = kCDTKeysPerBuffer * (kCDTValueSize + 24);
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.num_levels = kCDTNumLevels;
+ options.level0_file_num_compaction_trigger = 1;
+ options.target_file_size_base = options.write_buffer_size * 2;
+ options.target_file_size_multiplier = 2;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * options.target_file_size_multiplier;
+ options.max_bytes_for_level_multiplier = 2;
+ options.disable_auto_compactions = false;
+ options.compaction_options_universal.max_size_amplification_percent = 100;
+ return options;
+}
+
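+// Returns true iff the key ranges of files `a` and `b` overlap. For example,
+// ranges ["a", "c"] and ["b", "d"] overlap, while ["a", "b"] and ["c", "d"]
+// do not.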
+bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
+ const SstFileMetaData& b) {
+ if (c->CompareWithoutTimestamp(a.smallestkey, b.smallestkey) >= 0) {
+ if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+ // b.smallestkey <= a.smallestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+ // a.smallestkey < b.smallestkey <= a.largestkey
+ return true;
+ }
+ if (c->CompareWithoutTimestamp(a.largestkey, b.largestkey) <= 0) {
+ if (c->CompareWithoutTimestamp(a.largestkey, b.smallestkey) >= 0) {
+ // b.smallestkey <= a.largestkey <= b.largestkey
+ return true;
+ }
+ } else if (c->CompareWithoutTimestamp(a.smallestkey, b.largestkey) <= 0) {
+ // a.smallestkey <= b.largestkey < a.largestkey
+ return true;
+ }
+ return false;
+}
+
+// Identifies all files between level "min_level" and "max_level"
+// that have an overlapping key range with "input_file_meta".
+void GetOverlappingFileNumbersForLevelCompaction(
+ const ColumnFamilyMetaData& cf_meta, const Comparator* comparator,
+ int min_level, int max_level, const SstFileMetaData* input_file_meta,
+ std::set<std::string>* overlapping_file_names) {
+ std::set<const SstFileMetaData*> overlapping_files;
+ overlapping_files.insert(input_file_meta);
+ for (int m = min_level; m <= max_level; ++m) {
+ for (auto& file : cf_meta.levels[m].files) {
+ for (auto* included_file : overlapping_files) {
+ if (HaveOverlappingKeyRanges(comparator, *included_file, file)) {
+ overlapping_files.insert(&file);
+ overlapping_file_names->insert(file.name);
+ break;
+ }
+ }
+ }
+ }
+}
+
+void VerifyCompactionResult(
+ const ColumnFamilyMetaData& cf_meta,
+ const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+ for (auto& level : cf_meta.levels) {
+ for (auto& file : level.files) {
+ assert(overlapping_file_numbers.find(file.name) ==
+ overlapping_file_numbers.end());
+ }
+ }
+#endif
+}
+
+const SstFileMetaData* PickFileRandomly(const ColumnFamilyMetaData& cf_meta,
+ Random* rand, int* level = nullptr) {
+ auto file_id = rand->Uniform(static_cast<int>(cf_meta.file_count)) + 1;
+ for (auto& level_meta : cf_meta.levels) {
+ if (file_id <= level_meta.files.size()) {
+ if (level != nullptr) {
+ *level = level_meta.level;
+ }
+ auto result = rand->Uniform(file_id);
+ return &(level_meta.files[result]);
+ }
+ file_id -= static_cast<uint32_t>(level_meta.files.size());
+ }
+ assert(false);
+ return nullptr;
+}
+} // anonymous namespace
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// All the TEST_P tests run once with subcompactions disabled (i.e.
+// options.max_subcompactions = 1) and once with them enabled.
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTrigger) {
+ for (int tid = 0; tid < 3; ++tid) {
+ uint64_t db_size[2];
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+
+ if (tid == 1) {
+      // the following only disables the stats update in DB::Open()
+      // and should not affect the result of this test.
+ options.skip_stats_update_on_db_open = true;
+ } else if (tid == 2) {
+ // third pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kTestSize = kCDTKeysPerBuffer * 1024;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(rnd.RandomString(kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+
+ if (options.compaction_style == kCompactionStyleUniversal) {
+ // Claim: in universal compaction none of the original data will remain
+ // once compactions settle.
+ //
+ // Proof: The compensated size of the file containing the most tombstones
+ // is enough on its own to trigger size amp compaction. Size amp
+ // compaction is a full compaction, so all tombstones meet the obsolete
+ // keys they cover.
+ ASSERT_EQ(0, db_size[1]);
+ } else {
+ // Claim: in level compaction at most `db_size[0] / 2` of the original
+ // data will remain once compactions settle.
+ //
+ // Proof: Assume the original data is all in the bottom level. If it were
+ // not, it would meet its tombstone sooner. The original data size is
+ // large enough to require fanout to bottom level to be greater than
+ // `max_bytes_for_level_multiplier == 2`. In the level just above,
+ // tombstones must cover less than `db_size[0] / 4` bytes since fanout >=
+ // 2 and file size is compensated by doubling the size of values we expect
+ // are covered (`kDeletionWeightOnCompaction == 2`). The tombstones in
+ // levels above must cover less than `db_size[0] / 8` bytes of original
+ // data, `db_size[0] / 16`, and so on.
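+      // Summing those bounds gives a geometric series:
+      //   db_size[0] / 4 + db_size[0] / 8 + db_size[0] / 16 + ...
+      // which is strictly less than db_size[0] / 2, hence the assertion
+      // below.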
+ ASSERT_GT(db_size[0] / 2, db_size[1]);
+ }
+ }
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(DBCompactionTest, SkipStatsUpdateTest) {
+  // This test verifies that UpdateAccumulatedStats is not called when
+  // options.skip_stats_update_on_db_open = true.
+  // The test will need to be updated if the internal behavior changes.
+
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(rnd.RandomString(kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+
+ ASSERT_OK(Flush());
+
+ Close();
+
+ int update_acc_stats_called = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionStorageInfo::UpdateAccumulatedStats",
+ [&](void* /* arg */) { ++update_acc_stats_called; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Reopen the DB with stats-update disabled
+ options.skip_stats_update_on_db_open = true;
+ options.max_open_files = 20;
+ Reopen(options);
+
+ ASSERT_EQ(update_acc_stats_called, 0);
+
+ // Repeat the reopen process, but this time we enable
+ // stats-update.
+ options.skip_stats_update_on_db_open = false;
+ Reopen(options);
+
+ ASSERT_GT(update_acc_stats_called, 0);
+
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.max_open_files = 20;
+ options.level0_file_num_compaction_trigger = 3;
+ // Avoid many shards with small max_open_files, where as little as
+ // two table insertions could lead to an LRU eviction, depending on
+ // hash values.
+ options.table_cache_numshardbits = 2;
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ int num_table_cache_lookup = 0;
+ int num_new_table_reader = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0", [&](void* arg) {
+ assert(arg != nullptr);
+ bool no_io = *(reinterpret_cast<bool*>(arg));
+ if (!no_io) {
+ // filter out cases for table properties queries.
+ num_table_cache_lookup++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::GetTableReader:0",
+ [&](void* /*arg*/) { num_new_table_reader++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int k = 0; k < options.level0_file_num_compaction_trigger; ++k) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ ASSERT_OK(Put(Key(10 - k), "bar"));
+ if (k < options.level0_file_num_compaction_trigger - 1) {
+ num_table_cache_lookup = 0;
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+      // Preloading the iterator issues one table cache lookup and creates
+      // a new table reader, if not preloaded.
+ int old_num_table_cache_lookup = num_table_cache_lookup;
+ ASSERT_GE(num_table_cache_lookup, 1);
+ ASSERT_EQ(num_new_table_reader, 1);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(k), Get(Key(k)));
+      // Get() finds the reader in the table cache; no new table reader is
+      // created.
+ ASSERT_EQ(old_num_table_cache_lookup + num_table_cache_lookup, 2);
+ ASSERT_EQ(num_new_table_reader, 0);
+ }
+ }
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Preloading iterator issues one table cache lookup and creates
+ // a new table reader. One file is created for flush and one for compaction.
+ // Compaction inputs make no table cache look-up for data/range deletion
+ // iterators
+ // May preload table cache too.
+ ASSERT_GE(num_table_cache_lookup, 2);
+ int old_num_table_cache_lookup2 = num_table_cache_lookup;
+
+  // New table readers are created for:
+  // (1) verifying flush results, and
+  // (2) verifying compaction results.
+  // (3) No new TableReaders are created for compaction inputs.
+ ASSERT_EQ(num_new_table_reader, 2);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(1), Get(Key(1)));
+ ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5);
+ ASSERT_EQ(num_new_table_reader, 0);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // Only verifying compaction outputs issues one table cache lookup
+  // (for both the data block and the range deletion block).
+  // It may preload the table cache too.
+ ASSERT_GE(num_table_cache_lookup, 1);
+ old_num_table_cache_lookup2 = num_table_cache_lookup;
+ // One for verifying compaction results.
+ // No new iterator created for compaction.
+ ASSERT_EQ(num_new_table_reader, 1);
+
+ num_table_cache_lookup = 0;
+ num_new_table_reader = 0;
+ ASSERT_EQ(Key(1), Get(Key(1)));
+ ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3);
+ ASSERT_EQ(num_new_table_reader, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionDeletionTriggerReopen) {
+ for (int tid = 0; tid < 2; ++tid) {
+ uint64_t db_size[3];
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.max_subcompactions = max_subcompactions_;
+
+ if (tid == 1) {
+ // second pass with universal compaction
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ }
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // round 1 --- insert key/value pairs.
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(rnd.RandomString(kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+ Close();
+
+ // round 2 --- disable auto-compactions and issue deletions.
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+ Close();
+ // as auto_compaction is off, we shouldn't see any reduction in db size.
+ ASSERT_LE(db_size[0], db_size[1]);
+
+ // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ // insert relatively small amount of data to trigger auto compaction.
+ for (int k = 0; k < kTestSize / 10; ++k) {
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2]));
+ // this time we're expecting significant drop in size.
+ //
+ // See "CompactionDeletionTrigger" test for proof that at most
+ // `db_size[0] / 2` of the original data remains. In addition to that, this
+ // test inserts `db_size[0] / 10` to push the tombstones into SST files and
+ // then through automatic compactions. So in total `3 * db_size[0] / 5` of
+ // the original data may remain.
+ ASSERT_GT(3 * db_size[0] / 5, db_size[2]);
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeBottomPri) {
+ ASSERT_OK(Put(Key(50), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(100), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(200), ""));
+ ASSERT_OK(Flush());
+
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,0,3", FilesPerLevel(0));
+
+ ASSERT_OK(Put(Key(1), ""));
+ ASSERT_OK(Put(Key(199), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(2), ""));
+ ASSERT_OK(Put(Key(199), ""));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("2,0,3", FilesPerLevel(0));
+
+  // Now we have 2 L0 files and 3 L2 files, and a manual compaction will
+  // be triggered.
+  // Two compaction jobs will run: one compacts the 2 L0 files in the low-pri
+  // pool and one compacts into L2 in the bottom-pri pool.
+ int low_pri_count = 0;
+ int bottom_pri_count = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "ThreadPoolImpl::Impl::BGThread:BeforeRun", [&](void* arg) {
+ Env::Priority* pri = reinterpret_cast<Env::Priority*>(arg);
+ // First time is low pri pool in the test case.
+ if (low_pri_count == 0 && bottom_pri_count == 0) {
+ ASSERT_EQ(Env::Priority::LOW, *pri);
+ }
+ if (*pri == Env::Priority::LOW) {
+ low_pri_count++;
+ } else {
+ bottom_pri_count++;
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(1, low_pri_count);
+ ASSERT_EQ(1, bottom_pri_count);
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+  // Recompacting the bottommost level uses the bottom pool.
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ(1, low_pri_count);
+ ASSERT_EQ(2, bottom_pri_count);
+
+ env_->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ // Low pri pool is used if bottom pool has size 0.
+ ASSERT_EQ(2, low_pri_count);
+ ASSERT_EQ(2, bottom_pri_count);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, DisableStatsUpdateReopen) {
+ uint64_t db_size[3];
+ for (int test = 0; test < 2; ++test) {
+ Options options = DeletionTriggerOptions(CurrentOptions());
+ options.skip_stats_update_on_db_open = (test == 0);
+
+ env_->random_read_counter_.Reset();
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // round 1 --- insert key/value pairs.
+ const int kTestSize = kCDTKeysPerBuffer * 512;
+ std::vector<std::string> values;
+ for (int k = 0; k < kTestSize; ++k) {
+ values.push_back(rnd.RandomString(kCDTValueSize));
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // L1 and L2 can fit deletions iff size compensation does not take effect,
+ // i.e., when `skip_stats_update_on_db_open == true`. Move any remaining
+ // files at or above L2 down to L3 to ensure obsolete data does not
+ // accidentally meet its tombstone above L3. This makes the final size more
+ // deterministic and easy to see whether size compensation for deletions
+ // took effect.
+ MoveFilesToLevel(3 /* level */);
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[0]));
+ Close();
+
+ // round 2 --- disable auto-compactions and issue deletions.
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+
+ env_->random_read_counter_.Reset();
+ Reopen(options);
+
+ for (int k = 0; k < kTestSize; ++k) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[1]));
+ Close();
+ // as auto_compaction is off, we shouldn't see any reduction in db size.
+ ASSERT_LE(db_size[0], db_size[1]);
+
+ // round 3 --- reopen db with auto_compaction on and see if
+    // deletion compensation still works.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Size(Key(0), Key(kTestSize - 1), &db_size[2]));
+
+ if (options.skip_stats_update_on_db_open) {
+      // If updating stats on DB::Open is disabled, we don't expect
+      // the deletion entries to take effect.
+ //
+ // The deletions are small enough to fit in L1 and L2, and obsolete keys
+ // were moved to L3+, so none of the original data should have been
+ // dropped.
+ ASSERT_LE(db_size[0], db_size[2]);
+ } else {
+ // Otherwise, we should see a significant drop in db size.
+ //
+ // See "CompactionDeletionTrigger" test for proof that at most
+ // `db_size[0] / 2` of the original data remains.
+ ASSERT_GT(db_size[0] / 2, db_size[2]);
+ }
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionTrigger) {
+ const int kNumKeysPerFile = 100;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_subcompactions = max_subcompactions_;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ values.push_back(rnd.RandomString(990));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(1, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), num + 1);
+ }
+
+ // generate one more file in level-0, and should trigger level-0 compaction
+ std::vector<std::string> values;
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ values.push_back(rnd.RandomString(990));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(1, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1);
+}
+
+TEST_F(DBCompactionTest, BGCompactionsAllowed) {
+  // Create several column families. Trigger compaction in all of them
+  // and verify that the number of compactions scheduled stays within the
+  // allowed limit.
+ const int kNumKeysPerFile = 100;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 3;
+ // Should speed up compaction when there are 4 files.
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 20;
+ options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large
+ options.max_background_compactions = 3;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+
+ // Block all threads in thread pool.
+ const size_t kTotalTasks = 4;
+ env_->SetBackgroundThreads(4, Env::LOW);
+ test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i], Env::Priority::LOW);
+ sleeping_tasks[i].WaitUntilSleeping();
+ }
+
+ CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+ Random rnd(301);
+ for (int cf = 0; cf < 4; cf++) {
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+ }
+ }
+
+  // Now all column families qualify for compaction but only one should be
+  // scheduled, because no column family hits the speed-up condition.
+ ASSERT_EQ(1u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+  // Create two more files for one column family, which triggers the speed-up
+  // condition; three compactions will be scheduled.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(2, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(2, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+ ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+ NumTableFilesAtLevel(0, 2));
+ }
+ ASSERT_EQ(3U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+ // Unblock all threads to unblock all compactions.
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Verify number of compactions allowed will come back to 1.
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i], Env::Priority::LOW);
+ sleeping_tasks[i].WaitUntilSleeping();
+ }
+ for (int cf = 0; cf < 4; cf++) {
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1);
+ }
+ }
+
+  // Now all column families qualify for compaction but only one should be
+  // scheduled, because no column family hits the speed-up condition.
+ ASSERT_EQ(1U, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(rnd.RandomString(100000));
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // Reopening moves updates to level-0
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow trivial move */));
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+ for (int i = 0; i < 80; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+}
+
+TEST_F(DBCompactionTest, MinorCompactionsHappen) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const int N = 500;
+
+ int starting_num_tables = TotalTableFiles(1);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + std::string(1000, 'v')));
+ }
+ int ending_num_tables = TotalTableFiles(1);
+ ASSERT_GT(ending_num_tables, starting_num_tables);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+ }
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i)));
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile1) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ ASSERT_OK(Put("4", "A"));
+ ASSERT_OK(Put("3", "A"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ ASSERT_OK(Put("2", "A"));
+ ASSERT_OK(Delete("3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ // move both files down to l1
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put("2", "B"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, UserKeyCrossFile2) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ ASSERT_OK(Put("4", "A"));
+ ASSERT_OK(Put("3", "A"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ ASSERT_OK(Put("2", "A"));
+ ASSERT_OK(SingleDelete("3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ // move both files down to l1
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put("2", "B"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("NOT_FOUND", Get("3"));
+}
+
+TEST_F(DBCompactionTest, CompactionSstPartitioner) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+ std::shared_ptr<SstPartitionerFactory> factory(
+ NewSstPartitionerFixedPrefixFactory(4));
+ options.sst_partitioner_factory = factory;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ ASSERT_OK(Put("aaaa1", "A"));
+ ASSERT_OK(Put("bbbb1", "B"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ ASSERT_OK(Put("aaaa1", "A2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ // move both files down to l1
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ std::vector<LiveFileMetaData> files;
+ dbfull()->GetLiveFilesMetaData(&files);
+ ASSERT_EQ(2, files.size());
+ ASSERT_EQ("A2", Get("aaaa1"));
+ ASSERT_EQ("B", Get("bbbb1"));
+}
+
+TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 1;
+ std::shared_ptr<SstPartitionerFactory> factory(
+ NewSstPartitionerFixedPrefixFactory(4));
+ options.sst_partitioner_factory = factory;
+
+ DestroyAndReopen(options);
+
+ // create first file and flush to l0
+ ASSERT_OK(Put("aaaa1", "A"));
+ ASSERT_OK(Put("bbbb1", "B"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ std::vector<LiveFileMetaData> files;
+ dbfull()->GetLiveFilesMetaData(&files);
+ ASSERT_EQ(2, files.size());
+ ASSERT_EQ("A", Get("aaaa1"));
+ ASSERT_EQ("B", Get("bbbb1"));
+}
+
+TEST_F(DBCompactionTest, ZeroSeqIdCompaction) {
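+  // Compactions can rewrite a key with sequence number 0 once no snapshot
+  // needs its older versions (see the snapshot release below); this exercises
+  // that path with the duplicate user key "3" spanning file boundaries.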
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ // compaction options
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ compact_opt.output_file_size_limit = 4096;
+ const size_t key_len =
+ static_cast<size_t>(compact_opt.output_file_size_limit) / 5;
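+  // Each value is ~1/5 of the 4KB output_file_size_limit, so every
+  // compaction output file holds only a few keys.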
+
+ DestroyAndReopen(options);
+
+ std::vector<const Snapshot*> snaps;
+
+ // create first file and flush to l0
+ for (auto& key : {"1", "2", "3", "3", "3", "3"}) {
+ ASSERT_OK(Put(key, std::string(key_len, 'A')));
+ snaps.push_back(dbfull()->GetSnapshot());
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ // create second file and flush to l0
+ for (auto& key : {"3", "4", "5", "6", "7", "8"}) {
+ ASSERT_OK(Put(key, std::string(key_len, 'A')));
+ snaps.push_back(dbfull()->GetSnapshot());
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ // move both files down to l1
+ ASSERT_OK(
+ dbfull()->CompactFiles(compact_opt, collector->GetFlushedFiles(), 1));
+
+ // release snap so that first instance of key(3) can have seqId=0
+ for (auto snap : snaps) {
+ dbfull()->ReleaseSnapshot(snap);
+ }
+
+  // create 3 files in l0 to trigger compaction
+ for (int i = 0; i < options.level0_file_num_compaction_trigger; i++) {
+ ASSERT_OK(Put("2", std::string(1, 'A')));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Put("", ""));
+}
+
+TEST_F(DBCompactionTest, ManualCompactionUnknownOutputSize) {
+ // github issue #2249
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+
+ // create two files in l1 that we can compact
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < options.level0_file_num_compaction_trigger; j++) {
+ ASSERT_OK(Put(std::to_string(2 * i), std::string(1, 'A')));
+ ASSERT_OK(Put(std::to_string(2 * i + 1), std::string(1, 'A')));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}}));
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_EQ(2, cf_meta.levels[1].files.size());
+ std::vector<std::string> input_filenames;
+ for (const auto& sst_file : cf_meta.levels[1].files) {
+ input_filenames.push_back(sst_file.name);
+ }
+
+ // note CompactionOptions::output_file_size_limit is unset.
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ ASSERT_OK(dbfull()->CompactFiles(compact_opt, input_filenames, 1));
+}
+
+// Check that writes done during a memtable compaction are recovered
+// if the database is shut down during the memtable compaction.
+TEST_F(DBCompactionTest, RecoverDuringMemtableCompaction) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger a long memtable compaction and reopen the database during it
+ ASSERT_OK(Put(1, "foo", "v1")); // Goes to 1st log file
+ ASSERT_OK(Put(1, "big1", std::string(10000000, 'x'))); // Fills memtable
+ ASSERT_OK(Put(1, "big2", std::string(1000, 'y'))); // Triggers compaction
+ ASSERT_OK(Put(1, "bar", "v2")); // Goes to new log file
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1"));
+ ASSERT_EQ(std::string(1000, 'y'), Get(1, "big2"));
+ } while (ChangeOptions());
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveOneFile) {
+ int32_t trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ int32_t num_keys = 80;
+ int32_t value_size = 100 * 1024; // 100 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ for (int i = 0; i < num_keys; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+
+ // Reopening moves updates to L0
+ Reopen(options);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 1); // 1 file in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // 0 files in L1
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ LiveFileMetaData level0_file = metadata[0]; // L0 file meta
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+ // Compaction will initiate a trivial move from L0 to L1
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+  // File moved from L0 to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0); // 0 files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // 1 file in L1
+
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ ASSERT_EQ(metadata[0].name /* level1_file.name */, level0_file.name);
+ ASSERT_EQ(metadata[0].size /* level1_file.size */, level0_file.size);
+
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+
+ ASSERT_EQ(trivial_move, 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveNonOverlappingFiles) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+  // non-overlapping ranges
+ std::vector<std::pair<int32_t, int32_t>> ranges = {
+ {100, 199}, {300, 399}, {0, 99}, {200, 299},
+ {600, 699}, {400, 499}, {500, 550}, {551, 599},
+ };
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ int32_t level0_files = NumTableFilesAtLevel(0, 0);
+ ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+
+ // Since data is non-overlapping we expect compaction to initiate
+ // a trivial move
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ // We expect that all the files were trivially moved from L0 to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0) /* level1_files */, level0_files);
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ trivial_move = 0;
+ non_trivial_move = 0;
+ values.clear();
+ DestroyAndReopen(options);
+ // Same ranges as above but overlapping
+ ranges = {
+ {100, 199},
+ {300, 399},
+ {0, 99},
+ {200, 299},
+ {600, 699},
+ {400, 499},
+      {500, 560},  // this range overlaps with the next one
+ {551, 599},
+ };
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+ ASSERT_EQ(trivial_move, 0);
+ ASSERT_EQ(non_trivial_move, 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveTargetLevel) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.num_levels = 7;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 300]
+ for (int32_t i = 0; i <= 300; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [600 => 700]
+ for (int32_t i = 600; i <= 700; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 6;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L6
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int32_t i = 0; i <= 300; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ for (int32_t i = 600; i <= 700; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, PartialOverlappingL0) {
+ class SubCompactionEventListener : public EventListener {
+ public:
+ void OnSubcompactionCompleted(const SubcompactionJobInfo&) override {
+ sub_compaction_finished_++;
+ }
+ std::atomic<int> sub_compaction_finished_{0};
+ };
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ SubCompactionEventListener* listener = new SubCompactionEventListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+  // For subcompactions to trigger, the output level needs to be non-empty.
+ ASSERT_OK(Put("key", ""));
+ ASSERT_OK(Put("kez", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key", ""));
+ ASSERT_OK(Put("kez", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+  // Ranges that overlap only slightly, so the files won't be trivially
+  // moved, but each subcompaction range will contain only a subset of files.
+ std::vector<std::pair<int32_t, int32_t>> ranges = {
+ {100, 199}, {198, 399}, {397, 600}, {598, 800}, {799, 900}, {895, 999},
+ };
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ int32_t level0_files = NumTableFilesAtLevel(0, 0);
+ ASSERT_EQ(level0_files, ranges.size()); // Multiple files in L0
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 1); // One file in L1
+
+ listener->sub_compaction_finished_ = 0;
+ ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (max_subcompactions_ > 3) {
+    // RocksDB might not generate the exact number of subcompactions.
+    // Here we only validate that more than two subcompactions happened.
+ ASSERT_GT(listener->sub_compaction_finished_.load(), 2);
+ }
+
+ // We expect that all the files were compacted to L1
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1, 0), 1);
+
+ for (size_t i = 0; i < ranges.size(); i++) {
+ for (int32_t j = ranges[i].first; j <= ranges[i].second; j++) {
+ ASSERT_EQ(Get(Key(j)), values[j]);
+ }
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompactionPartial) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ bool first = true;
+ // Purpose of dependencies:
+ // 4 -> 1: ensure the order of two non-trivial compactions
+ // 5 -> 2 and 5 -> 3: ensure we do a check before two non-trivial compactions
+ // are installed
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompaction::ManualPartial:4", "DBCompaction::ManualPartial:1"},
+ {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:2"},
+ {"DBCompaction::ManualPartial:5", "DBCompaction::ManualPartial:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (first) {
+ first = false;
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:4");
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:3");
+ } else { // second non-trivial compaction
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:2");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.num_levels = 7;
+ options.max_subcompactions = max_subcompactions_;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+ options.target_file_size_base = 1 << 23; // 8 MB
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 6;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ // Trivial move the two non-overlapping files to level 6
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L6
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // 1 file in L0
+ ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr, false));
+ ASSERT_OK(dbfull()->TEST_CompactRange(4, nullptr, nullptr, nullptr, false));
+ // 2 files in L6, 1 file in L5
+ ASSERT_EQ("0,0,0,0,0,1,2", FilesPerLevel(0));
+
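+  // 6 trivial moves total: 1 from the earlier change-level compaction plus 5
+  // from the level-by-level TEST_CompactRange calls above.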
+ ASSERT_EQ(trivial_move, 6);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ compact_options.change_level = false;
+ compact_options.exclusive_manual_compaction = false;
+ std::string begin_string = Key(0);
+ std::string end_string = Key(199);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ // First non-trivial compaction is triggered
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ });
+
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:1");
+  // file 4 [300 => 400]
+ for (int32_t i = 300; i <= 400; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // file 5 [400 => 500]
+ for (int32_t i = 400; i <= 500; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // file 6 [500 => 600]
+ for (int32_t i = 500; i <= 600; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ // Second non-trivial compaction is triggered
+ ASSERT_OK(Flush());
+
+ // Before two non-trivial compactions are installed, there are 3 files in L0
+ ASSERT_EQ("3,0,0,0,0,1,2", FilesPerLevel(0));
+ TEST_SYNC_POINT("DBCompaction::ManualPartial:5");
+
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // After two non-trivial compactions are installed, there is 1 file in L6, and
+ // 1 file in L1
+ ASSERT_EQ("0,1,0,0,0,0,1", FilesPerLevel(0));
+ threads.join();
+
+ for (int32_t i = 0; i < 600; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+// Disabled because the test is flaky.
+TEST_F(DBCompactionTest, DISABLED_ManualPartialFill) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ bool first = true;
+ bool second = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompaction::PartialFill:4", "DBCompaction::PartialFill:1"},
+ {"DBCompaction::PartialFill:2", "DBCompaction::PartialFill:3"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun", [&](void* /*arg*/) {
+ if (first) {
+ TEST_SYNC_POINT("DBCompaction::PartialFill:4");
+ first = false;
+ TEST_SYNC_POINT("DBCompaction::PartialFill:3");
+ } else if (second) {
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+
+ DestroyAndReopen(options);
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L2
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L2, 1 in L0
+ ASSERT_EQ("1,0,2", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr, false));
+ // 2 files in L2, 1 in L1
+ ASSERT_EQ("0,1,2", FilesPerLevel(0));
+
+ ASSERT_EQ(trivial_move, 2);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::port::Thread threads([&] {
+ compact_options.change_level = false;
+ compact_options.exclusive_manual_compaction = false;
+ std::string begin_string = Key(0);
+ std::string end_string = Key(199);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ });
+
+ TEST_SYNC_POINT("DBCompaction::PartialFill:1");
+  // Batch 4: many files covering keys [300 => 4300)
+ for (int32_t i = 0; i <= 5; i++) {
+ for (int32_t j = 300; j < 4300; j++) {
+ if (j == 2300) {
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ }
+
+ // Verify level sizes
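+  // The allowed size for level i is 4 * max_bytes_for_level_base *
+  // multiplier^(i-1); the 4x factor leaves some slack for data that has not
+  // been compacted down yet.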
+ uint64_t target_size = 4 * options.max_bytes_for_level_base;
+ for (int32_t i = 1; i < options.num_levels; i++) {
+ ASSERT_LE(SizeAtLevel(i), target_size);
+ target_size = static_cast<uint64_t>(target_size *
+ options.max_bytes_for_level_multiplier);
+ }
+
+ TEST_SYNC_POINT("DBCompaction::PartialFill:2");
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ threads.join();
+
+ for (int32_t i = 0; i < 4300; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+}
+
+TEST_F(DBCompactionTest, ManualCompactionWithUnorderedWrite) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+ "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL"},
+ {"DBImpl::WaitForPendingWrites:BeforeBlock",
+ "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+
+ Options options = CurrentOptions();
+ options.unordered_write = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("bar", "v1"));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer([&]() { ASSERT_OK(Put("foo", "v2")); });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ManualCompactionWithUnorderedWrite:WaitWriteWAL");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ writer.join();
+ ASSERT_EQ(Get("foo"), "v2");
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Reopen(options);
+ ASSERT_EQ(Get("foo"), "v2");
+}
+
+TEST_F(DBCompactionTest, DeleteFileRange) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ // Add 2 non-overlapping files
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file 1 [0 => 100]
+ for (int32_t i = 0; i < 100; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // file 2 [100 => 300]
+ for (int32_t i = 100; i < 300; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ // 2 files in L0
+ ASSERT_EQ("2", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ // 2 files in L2
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ // file 3 [ 0 => 200]
+ for (int32_t i = 0; i < 200; i++) {
+ values[i] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+  // Batch 4: many files covering keys [300 => 4300)
+ for (int32_t i = 0; i <= 5; i++) {
+ for (int32_t j = 300; j < 4300; j++) {
+ if (j == 2300) {
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Verify level sizes
+ uint64_t target_size = 4 * options.max_bytes_for_level_base;
+ for (int32_t i = 1; i < options.num_levels; i++) {
+ ASSERT_LE(SizeAtLevel(i), target_size);
+ target_size = static_cast<uint64_t>(target_size *
+ options.max_bytes_for_level_multiplier);
+ }
+
+ const size_t old_num_files = CountFiles();
+ std::string begin_string = Key(1000);
+ std::string end_string = Key(2000);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+
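+  // DeleteFilesInRange only drops SST files whose key range lies entirely
+  // inside [begin, end]; keys near the boundaries may survive, so both
+  // NotFound and OK are accepted below.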
+ int32_t deleted_count = 0;
+ for (int32_t i = 0; i < 4300; i++) {
+ if (i < 1000 || i > 2000) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ } else {
+ ReadOptions roptions;
+ std::string result;
+ Status s = db_->Get(roptions, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound() || s.ok());
+ if (s.IsNotFound()) {
+ deleted_count++;
+ }
+ }
+ }
+ ASSERT_GT(deleted_count, 0);
+ begin_string = Key(5000);
+ end_string = Key(6000);
+ Slice begin1(begin_string);
+ Slice end1(end_string);
+  // Try deleting files in a range which contains no keys
+ ASSERT_OK(
+ DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin1, &end1));
+
+ // Push data from level 0 to level 1 to force all data to be deleted
+ // Note that we don't delete level 0 files
+ compact_options.change_level = true;
+ compact_options.target_level = 1;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_OK(
+ DeleteFilesInRange(db_, db_->DefaultColumnFamily(), nullptr, nullptr));
+
+ int32_t deleted_count2 = 0;
+ for (int32_t i = 0; i < 4300; i++) {
+ ReadOptions roptions;
+ std::string result;
+ ASSERT_TRUE(db_->Get(roptions, Key(i), &result).IsNotFound());
+ deleted_count2++;
+ }
+ ASSERT_GT(deleted_count2, deleted_count);
+ const size_t new_num_files = CountFiles();
+ ASSERT_GT(old_num_files, new_num_files);
+}
+
+TEST_F(DBCompactionTest, DeleteFilesInRanges) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1024 * 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 4;
+ options.max_background_compactions = 3;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+
+ // file [0 => 100), [100 => 200), ... [900, 1000)
+ for (auto i = 0; i < 10; i++) {
+ for (auto j = 0; j < 100; j++) {
+ auto k = i * 100 + j;
+ values[k] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("10", FilesPerLevel(0));
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,10", FilesPerLevel(0));
+
+ // file [0 => 100), [200 => 300), ... [800, 900)
+ for (auto i = 0; i < 10; i += 2) {
+ for (auto j = 0; j < 100; j++) {
+ auto k = i * 100 + j;
+ ASSERT_OK(Put(Key(k), values[k]));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("5,0,10", FilesPerLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_EQ("0,5,10", FilesPerLevel(0));
+
+ // Delete files in range [0, 299] (inclusive)
+ {
+ auto begin_str1 = Key(0), end_str1 = Key(100);
+ auto begin_str2 = Key(100), end_str2 = Key(200);
+ auto begin_str3 = Key(200), end_str3 = Key(299);
+ Slice begin1(begin_str1), end1(end_str1);
+ Slice begin2(begin_str2), end2(end_str2);
+ Slice begin3(begin_str3), end3(end_str3);
+ std::vector<RangePtr> ranges;
+ ranges.push_back(RangePtr(&begin1, &end1));
+ ranges.push_back(RangePtr(&begin2, &end2));
+ ranges.push_back(RangePtr(&begin3, &end3));
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+ ranges.data(), ranges.size()));
+ ASSERT_EQ("0,3,7", FilesPerLevel(0));
+
+ // Keys [0, 300) should not exist.
+ for (auto i = 0; i < 300; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ for (auto i = 300; i < 1000; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ }
+
+ // Delete files in range [600, 999) (exclusive)
+ {
+ auto begin_str1 = Key(600), end_str1 = Key(800);
+ auto begin_str2 = Key(700), end_str2 = Key(900);
+ auto begin_str3 = Key(800), end_str3 = Key(999);
+ Slice begin1(begin_str1), end1(end_str1);
+ Slice begin2(begin_str2), end2(end_str2);
+ Slice begin3(begin_str3), end3(end_str3);
+ std::vector<RangePtr> ranges;
+ ranges.push_back(RangePtr(&begin1, &end1));
+ ranges.push_back(RangePtr(&begin2, &end2));
+ ranges.push_back(RangePtr(&begin3, &end3));
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(),
+ ranges.data(), ranges.size(), false));
+ ASSERT_EQ("0,1,4", FilesPerLevel(0));
+
+ // Keys [600, 900) should not exist.
+ for (auto i = 600; i < 900; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ for (auto i = 300; i < 600; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ for (auto i = 900; i < 1000; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+ }
+
+ // Delete all files.
+ {
+ RangePtr range;
+ ASSERT_OK(DeleteFilesInRanges(db_, db_->DefaultColumnFamily(), &range, 1));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ for (auto i = 0; i < 1000; i++) {
+ ReadOptions ropts;
+ std::string result;
+ auto s = db_->Get(ropts, Key(i), &result);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, DeleteFileRangeFileEndpointsOverlapBug) {
+ // regression test for #2833: groups of files whose user-keys overlap at the
+ // endpoints could be split by `DeleteFilesInRange`. This caused old data to
+ // reappear, either because a new version of the key was removed, or a range
+  // deletion was partially dropped. It could also cause the non-overlapping
+  // invariant to be violated if the files dropped by DeleteFilesInRange were
+  // a subset of the files that a range deletion spans.
+ const int kNumL0Files = 2;
+ const int kValSize = 8 << 10; // 8KB
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.target_file_size_base = 1 << 10; // 1KB
+ DestroyAndReopen(options);
+
+ // The snapshot prevents key 1 from having its old version dropped. The low
+ // `target_file_size_base` ensures two keys will be in each output file.
+ const Snapshot* snapshot = nullptr;
+ Random rnd(301);
+ // The value indicates which flush the key belonged to, which is enough
+ // for us to determine the keys' relative ages. After L0 flushes finish,
+ // files look like:
+ //
+ // File 0: 0 -> vals[0], 1 -> vals[0]
+ // File 1: 1 -> vals[1], 2 -> vals[1]
+ //
+ // Then L0->L1 compaction happens, which outputs keys as follows:
+ //
+ // File 0: 0 -> vals[0], 1 -> vals[1]
+ // File 1: 1 -> vals[0], 2 -> vals[1]
+ //
+ // DeleteFilesInRange shouldn't be allowed to drop just file 0, as that
+ // would cause `1 -> vals[0]` (an older key) to reappear.
+ std::string vals[kNumL0Files];
+ for (int i = 0; i < kNumL0Files; ++i) {
+ vals[i] = rnd.RandomString(kValSize);
+ ASSERT_OK(Put(Key(i), vals[i]));
+ ASSERT_OK(Put(Key(i + 1), vals[i]));
+ ASSERT_OK(Flush());
+ if (i == 0) {
+ snapshot = db_->GetSnapshot();
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Verify `DeleteFilesInRange` can't drop only file 0, which would cause
+ // "1 -> vals[0]" to reappear.
+ std::string begin_str = Key(0), end_str = Key(1);
+ Slice begin = begin_str, end = end_str;
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+ ASSERT_EQ(vals[1], Get(Key(1)));
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBCompactionTestWithParam, TrivialMoveToLastLevelWithFiles) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ // File with keys [ 0 => 99 ]
+ for (int i = 0; i < 100; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 3;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // File with keys [ 100 => 199 ]
+ for (int i = 100; i < 200; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 4);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int i = 0; i < 200; i++) {
+ ASSERT_EQ(Get(Key(i)), values[i]);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionThirdPath) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
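+  // The special memtable reports itself full after
+  // KNumKeysByGenerateNewFile - 1 entries, so each GenerateNewFile() call
+  // flushes roughly one new SST.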
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+  // The first three 110KB files do not go to the second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+  // Another 110KB triggers a compaction to a 400K file, filling up the first path
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 1)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,1", FilesPerLevel(0));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 2)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,2", FilesPerLevel(0));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 3)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,3", FilesPerLevel(0));
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 5)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,5", FilesPerLevel(0));
+ ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 6)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,6", FilesPerLevel(0));
+ ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 7)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,7", FilesPerLevel(0));
+ ASSERT_EQ(7, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+ ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+  // Each batch always gets compacted into 1 Level1 file,
+  // leaving 0 or 1 Level 0 files.
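+  // key_idx is reset before each batch, so all files cover the same key
+  // range and each L0->L1 compaction rewrites into a single L1 file.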
+ for (int num = 0; num < 3; num++) {
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1", FilesPerLevel(0));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBCompactionTestWithParam, LevelCompactionCFPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+
+ std::vector<Options> option_vector;
+ option_vector.emplace_back(options);
+ ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+ // Configure CF1 specific paths.
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 500 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 4 * 1024 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt1);
+ CreateColumnFamilies({"one"}, option_vector[1]);
+
+ // Configure CF2 specific paths.
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 500 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 4 * 1024 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt2);
+ CreateColumnFamilies({"two"}, option_vector[2]);
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ Random rnd(301);
+ int key_idx = 0;
+ int key_idx1 = 0;
+ int key_idx2 = 0;
+
+ auto generate_file = [&]() {
+ GenerateNewFile(0, &rnd, &key_idx);
+ GenerateNewFile(1, &rnd, &key_idx1);
+ GenerateNewFile(2, &rnd, &key_idx2);
+ };
+
+ auto check_sstfilecount = [&](int path_id, int expected) {
+ ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+ };
+
+ auto check_filesperlevel = [&](const std::string& expected) {
+ ASSERT_EQ(expected, FilesPerLevel(0));
+ ASSERT_EQ(expected, FilesPerLevel(1));
+ ASSERT_EQ(expected, FilesPerLevel(2));
+ };
+
+ auto check_getvalues = [&]() {
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(0, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx1; i++) {
+ auto v = Get(1, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx2; i++) {
+ auto v = Get(2, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+ };
+
+  // Check that the default column family uses db_paths,
+  // and column family "one" uses cf_paths.
+
+  // The compactions from level0 output their sst files to level1.
+  // The first path cannot hold level1's data (400KB + 400KB > 500KB),
+  // so every compaction moves an sst file to the second path. Please
+  // refer to LevelCompactionBuilder::GetPathId.
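+  // (Roughly: GetPathId walks the configured paths in order and picks the
+  // first one whose target size still has room for the level's data, falling
+  // back to the last path otherwise.)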
+ for (int num = 0; num < 3; num++) {
+ generate_file();
+ }
+ check_sstfilecount(0, 1);
+ check_sstfilecount(1, 2);
+
+ generate_file();
+ check_sstfilecount(1, 3);
+
+ // (1, 4)
+ generate_file();
+ check_filesperlevel("1,4");
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ // (1, 4, 1)
+ generate_file();
+ check_filesperlevel("1,4,1");
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ // (1, 4, 2)
+ generate_file();
+ check_filesperlevel("1,4,2");
+ check_sstfilecount(2, 2);
+ check_sstfilecount(1, 4);
+ check_sstfilecount(0, 1);
+
+ check_getvalues();
+
+ { // Also verify GetLiveFilesStorageInfo with db_paths / cf_paths
+ std::vector<LiveFileStorageInfo> new_infos;
+ LiveFilesStorageInfoOptions lfsio;
+ lfsio.wal_size_for_flush = UINT64_MAX; // no flush
+ ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsio, &new_infos));
+ std::unordered_map<std::string, int> live_sst_by_dir;
+ for (auto& info : new_infos) {
+ if (info.file_type == kTableFile) {
+ live_sst_by_dir[info.directory]++;
+ // Verify file on disk (no directory confusion)
+ uint64_t size;
+ ASSERT_OK(env_->GetFileSize(
+ info.directory + "/" + info.relative_filename, &size));
+ ASSERT_EQ(info.size, size);
+ }
+ }
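+    // 3 configured paths per entity x 3 entities (default CF via db_paths,
+    // CFs "one" and "two" via cf_paths) = 9 distinct directories.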
+ ASSERT_EQ(3U * 3U, live_sst_by_dir.size());
+ for (auto& paths : {options.db_paths, cf_opt1.cf_paths, cf_opt2.cf_paths}) {
+ ASSERT_EQ(1, live_sst_by_dir[paths[0].path]);
+ ASSERT_EQ(4, live_sst_by_dir[paths[1].path]);
+ ASSERT_EQ(2, live_sst_by_dir[paths[2].path]);
+ }
+ }
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ check_getvalues();
+
+ Destroy(options, true);
+}
+
+TEST_P(DBCompactionTestWithParam, ConvertCompactionStyle) {
+ Random rnd(301);
+ int max_key_level_insert = 200;
+ int max_key_universal_insert = 600;
+
+ // Stage 1: generate a db with level compaction
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_bytes_for_level_base = 500 << 10; // 500KB
+ options.max_bytes_for_level_multiplier = 1;
+ options.target_file_size_base = 200 << 10; // 200KB
+ options.target_file_size_multiplier = 1;
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i <= max_key_level_insert; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(TotalTableFiles(1, 4), 1);
+ int non_level0_num_files = 0;
+ for (int i = 1; i < options.num_levels; i++) {
+ non_level0_num_files += NumTableFilesAtLevel(i, 1);
+ }
+ ASSERT_GT(non_level0_num_files, 0);
+
+ // Stage 2: reopen with universal compaction - should fail
+ options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options = CurrentOptions(options);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ // Stage 3: compact into a single file and move the file to level 0
+ options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = INT_MAX;
+ options.target_file_size_multiplier = 1;
+ options.max_bytes_for_level_base = INT_MAX;
+ options.max_bytes_for_level_multiplier = 1;
+ options.num_levels = 4;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 0;
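+  // With everything in a single L0 file, the universal-compaction reopen in
+  // stage 4 succeeds (unlike stage 2, where data below L0 made it fail).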
+  // cannot use kForceOptimized because the compaction here is expected
+  // to generate exactly one output file
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForce;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(
+ dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+
+ // Only 1 file in L0
+ ASSERT_EQ("1", FilesPerLevel(1));
+
+ // Stage 4: re-open in universal compaction style and do some db operations
+ options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 4;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 3;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ options.num_levels = 1;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) {
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+ }
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ for (int i = 1; i < options.num_levels; i++) {
+ ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0);
+ }
+
+ // verify keys inserted in both level compaction style and universal
+ // compaction style
+ std::string keys_in_db;
+ Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+ ASSERT_OK(iter->status());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ keys_in_db.append(iter->key().ToString());
+ keys_in_db.push_back(',');
+ }
+ delete iter;
+
+ std::string expected_keys;
+ for (int i = 0; i <= max_key_universal_insert; i++) {
+ expected_keys.append(Key(i));
+ expected_keys.push_back(',');
+ }
+
+ ASSERT_EQ(keys_in_db, expected_keys);
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_a) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "b", "v"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "b"));
+ ASSERT_OK(Delete(1, "a"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "a"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "v"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("(a->v)", Contents(1));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ASSERT_EQ("(a->v)", Contents(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "", ""));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "e"));
+ ASSERT_OK(Put(1, "", ""));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "c", "cv"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "", ""));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "", ""));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "d", "dv"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "", ""));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Delete(1, "d"));
+ ASSERT_OK(Delete(1, "b"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("(->)(c->cv)", Contents(1));
+ env_->SleepForMicroseconds(1000000); // Wait for compaction to finish
+ ASSERT_EQ("(->)(c->cv)", Contents(1));
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBCompactionTest, ManualAutoRace) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction", "DBCompactionTest::ManualAutoRace:1"},
+ {"DBImpl::RunManualCompaction:WaitScheduled",
+ "BackgroundCallCompaction:0"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(1, "foo", ""));
+ ASSERT_OK(Put(1, "bar", ""));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", ""));
+ ASSERT_OK(Put(1, "bar", ""));
+ // Generate four files in CF 0, which should trigger an auto compaction
+ ASSERT_OK(Put("foo", ""));
+ ASSERT_OK(Put("bar", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", ""));
+ ASSERT_OK(Put("bar", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", ""));
+ ASSERT_OK(Put("bar", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", ""));
+ ASSERT_OK(Put("bar", ""));
+ ASSERT_OK(Flush());
+
+  // The auto compaction is scheduled but blocked until here
+ TEST_SYNC_POINT("DBCompactionTest::ManualAutoRace:1");
+  // The auto compaction will wait until the manual compaction is registered
+  // before processing, so that it will be cancelled.
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ ASSERT_OK(dbfull()->CompactRange(cro, handles_[1], nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+
+ // Eventually the cancelled compaction will be rescheduled and executed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, ManualCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = max_subcompactions_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p", "q");
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ // Populate a different range
+ MakeTables(3, "c", "e", 1);
+ ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f");
+ ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+ // Compact all
+ MakeTables(1, "a", "z", 1);
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+ uint64_t prev_block_cache_add =
+ options.statistics->getTickerCount(BLOCK_CACHE_ADD);
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, handles_[1], nullptr, nullptr));
+ // Verify manual compaction doesn't fill block cache
+ ASSERT_EQ(prev_block_cache_add,
+ options.statistics->getTickerCount(BLOCK_CACHE_ADD));
+
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ if (iter == 0) {
+ options = CurrentOptions();
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, ManualLevelCompactionOutputPathId) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
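+ // Flushes go to db_paths[0] by default; the manual compactions below direct
+ // their output to db_paths[1] via target path id 1.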
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put(1, "p", "begin"));
+ ASSERT_OK(Put(1, "q", "end"));
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_EQ("3", FilesPerLevel(1));
+ ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("3", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("3", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p", "q", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Populate a different range
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put(1, "c", "begin"));
+ ASSERT_OK(Put(1, "e", "end"));
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_EQ("3,1", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,2", FilesPerLevel(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Compact all
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("1,2", FilesPerLevel(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(
+ db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("0,1", FilesPerLevel(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ if (iter == 0) {
+ DestroyAndReopen(options);
+ options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760);
+ options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760);
+ options.max_background_flushes = 1;
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, FilesDeletedAfterCompaction) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v2"));
+ Compact(1, "a", "z");
+ const size_t num_files = CountLiveFiles();
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(1, "foo", "v2"));
+ Compact(1, "a", "z");
+ }
+ ASSERT_EQ(CountLiveFiles(), num_files);
+ } while (ChangeCompactOptions());
+}
+
+// Check level compaction with CompactFiles()
+TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.level0_stop_writes_trigger = 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_OK(Put(1, std::to_string(key), rnd.RandomString(kTestValueSize)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ int output_level = static_cast<int>(cf_meta.levels.size()) - 1;
+ for (int file_picked = 5; file_picked > 0; --file_picked) {
+ std::set<std::string> overlapping_file_names;
+ std::vector<std::string> compaction_input_file_names;
+ for (int f = 0; f < file_picked; ++f) {
+ int level = 0;
+ auto file_meta = PickFileRandomly(cf_meta, &rnd, &level);
+ compaction_input_file_names.push_back(file_meta->name);
+ GetOverlappingFileNumbersForLevelCompaction(
+ cf_meta, options.comparator, level, output_level, file_meta,
+ &overlapping_file_names);
+ }
+
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
+ compaction_input_file_names,
+ output_level));
+
+ // Make sure all overlapping files do not exist after compaction
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ VerifyCompactionResult(cf_meta, overlapping_file_names);
+ }
+
+ // make sure all key-values are still there.
+ for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_NE(Get(1, std::to_string(key)), "NOT_FOUND");
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, PartialCompactionFailure) {
+ Options options;
+ const int kKeySize = 16;
+ const int kKvSize = 1000;
+ const int kKeysPerBuffer = 100;
+ const int kNumL1Files = 5;
+ options.create_if_missing = true;
+ options.write_buffer_size = kKeysPerBuffer * kKvSize;
+ options.max_write_buffer_number = 2;
+ options.target_file_size_base =
+ options.write_buffer_size * (options.max_write_buffer_number - 1);
+ options.level0_file_num_compaction_trigger = kNumL1Files;
+ options.max_bytes_for_level_base =
+ options.level0_file_num_compaction_trigger *
+ options.target_file_size_base;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options.max_subcompactions = max_subcompactions_;
+
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ // stop the compaction thread until we simulate the file creation failure.
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ const int kNumInsertedKeys = options.level0_file_num_compaction_trigger *
+ (options.max_write_buffer_number - 1) *
+ kKeysPerBuffer;
+
+ Random rnd(301);
+ std::vector<std::string> keys;
+ std::vector<std::string> values;
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ keys.emplace_back(rnd.RandomString(kKeySize));
+ values.emplace_back(rnd.RandomString(kKvSize - kKeySize));
+ ASSERT_OK(Put(Slice(keys[k]), Slice(values[k])));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+ // Make sure the number of L0 files can trigger compaction.
+ ASSERT_GE(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+
+ auto previous_num_level0_files = NumTableFilesAtLevel(0);
+
+ // Fail the first file creation.
+ env_->non_writable_count_ = 1;
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ // Expect compaction to fail here as one file will fail its
+ // creation.
+ ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok());
+
+ // Verify L0 -> L1 compaction does fail.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ // Verify all L0 files are still there.
+ ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files);
+
+ // All key-values must exist after compaction fails.
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ ASSERT_EQ(values[k], Get(keys[k]));
+ }
+
+ env_->non_writable_count_ = 0;
+
+ // Make sure RocksDB will not get into corrupted state.
+ Reopen(options);
+
+ // Verify again after reopen.
+ for (int k = 0; k < kNumInsertedKeys; ++k) {
+ ASSERT_EQ(values[k], Get(keys[k]));
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, DeleteMovedFileAfterCompaction) {
+ // iter 1 -- delete_obsolete_files_period_micros == 0
+ for (int iter = 0; iter < 2; ++iter) {
+ // This test triggers move compaction and verifies that the file is not
+ // deleted when it's part of move compaction
+ Options options = CurrentOptions();
+ options.env = env_;
+ if (iter == 1) {
+ options.delete_obsolete_files_period_micros = 0;
+ }
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute L0->L1
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ // block compactions
+ test::SleepingBackgroundTask sleeping_task;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::LOW);
+
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ Reopen(options);
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ // let compactions go
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+
+ // this should execute L1->L2 (move)
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ auto moved_file_name = metadata[0].name;
+
+ // Create two more 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j + 100), rnd.RandomString(10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->L2 (merge with previous file)
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ // iterator is holding the file
+ ASSERT_OK(env_->FileExists(dbname_ + moved_file_name));
+
+ listener->SetExpectedFileName(dbname_ + moved_file_name);
+ ASSERT_OK(iterator->status());
+ iterator.reset();
+
+ // this file should have been compacted away
+ ASSERT_NOK(env_->FileExists(dbname_ + moved_file_name));
+ listener->VerifyMatchedCount(1);
+ }
+}
+
+TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) {
+ if (!Zlib_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_subcompactions = max_subcompactions_;
+ // First two levels have no compression, so that a trivial move between
+ // them will be allowed. Level 2 has Zlib compression so that a trivial
+ // move to level 3 will not be allowed
+ options.compression_per_level = {kNoCompression, kNoCompression,
+ kZlibCompression};
+ int matches = 0, didnt_match = 0, trivial_move = 0, non_trivial = 0;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::InputCompressionMatchesOutput:Matches",
+ [&](void* /*arg*/) { matches++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Compaction::InputCompressionMatchesOutput:DidntMatch",
+ [&](void* /*arg*/) { didnt_match++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are going to level 0
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB triggers a compaction to 400K file to fill up level 0
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(4, GetSstFileCount(dbname_));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4", FilesPerLevel(0));
+
+ // (1, 4, 1)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,1", FilesPerLevel(0));
+
+ // (1, 4, 2)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,2", FilesPerLevel(0));
+
+ // (1, 4, 3)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,3", FilesPerLevel(0));
+
+ // (1, 4, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,4", FilesPerLevel(0));
+
+ // (1, 4, 5)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,5", FilesPerLevel(0));
+
+ // (1, 4, 6)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,6", FilesPerLevel(0));
+
+ // (1, 4, 7)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,7", FilesPerLevel(0));
+
+ // (1, 4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,4,8", FilesPerLevel(0));
+
+ ASSERT_EQ(matches, 12);
+ // Currently, the test relies on the number of calls to
+ // InputCompressionMatchesOutput() per compaction.
+ const int kCallsToInputCompressionMatch = 2;
+ ASSERT_EQ(didnt_match, 8 * kCallsToInputCompressionMatch);
+ ASSERT_EQ(trivial_move, 12);
+ ASSERT_EQ(non_trivial, 8);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) {
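+ // A soft_pending_compaction_bytes_limit of 0, or one larger than the hard
+ // limit, is expected to be sanitized to hard_pending_compaction_bytes_limit.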
+ Options options = CurrentOptions();
+ options.max_background_compactions = 5;
+ options.soft_pending_compaction_bytes_limit = 0;
+ options.hard_pending_compaction_bytes_limit = 100;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_EQ(100, db_->GetOptions().soft_pending_compaction_bytes_limit);
+
+ options.max_background_compactions = 3;
+ options.soft_pending_compaction_bytes_limit = 200;
+ options.hard_pending_compaction_bytes_limit = 150;
+ DestroyAndReopen(options);
+ ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit);
+}
+
+// This tests for a bug that could cause two level0 compactions running
+// concurrently
+// TODO(aekmekji): Make sure that the reason this fails when run with
+// max_subcompactions > 1 is not a correctness issue but just inherent to
+// running parallel L0-L1 compactions
+TEST_F(DBCompactionTest, SuggestCompactRangeNoTwoLevel0Compactions) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 2;
+
+ DestroyAndReopen(options);
+
+ // fill up the DB
+ Random rnd(301);
+ for (int num = 0; num < 10; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
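+ // Hold the first compaction job between Run():Start and Run():End while the
+ // test piles up more L0 files; no second L0 compaction should start while
+ // the first one is still running.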
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactionJob::Run():Start",
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1"},
+ {"DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2",
+ "CompactionJob::Run():End"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // trigger L0 compaction
+ for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+ num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(Flush());
+ }
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:1");
+
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+ for (int num = 0; num < options.level0_file_num_compaction_trigger + 1;
+ num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ ASSERT_OK(Flush());
+ }
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::SuggestCompactRangeNoTwoLevel0Compactions:2");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
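+// ShortKey() produces keys of the form "keyNNNN" (7 characters), which fits
+// within the 8-byte limit asserted by ShortKeyComparator below.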
+static std::string ShortKey(int i) {
+ assert(i < 10000);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%04d", i);
+ return std::string(buf);
+}
+
+TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // The key size is guaranteed to be <= 8
+ class ShortKeyComparator : public Comparator {
+ int Compare(const ROCKSDB_NAMESPACE::Slice& a,
+ const ROCKSDB_NAMESPACE::Slice& b) const override {
+ assert(a.size() <= 8);
+ assert(b.size() <= 8);
+ return BytewiseComparator()->Compare(a, b);
+ }
+ const char* Name() const override { return "ShortKeyComparator"; }
+ void FindShortestSeparator(
+ std::string* start,
+ const ROCKSDB_NAMESPACE::Slice& limit) const override {
+ return BytewiseComparator()->FindShortestSeparator(start, limit);
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ return BytewiseComparator()->FindShortSuccessor(key);
+ }
+ } short_key_cmp;
+ Options options = CurrentOptions();
+ options.target_file_size_base = 100000000;
+ options.write_buffer_size = 100000000;
+ options.max_subcompactions = max_subcompactions_;
+ options.comparator = &short_key_cmp;
+ DestroyAndReopen(options);
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::vector<std::string> values;
+ // File with keys [ 0 => 99 ]
+ for (int i = 0; i < 100; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 (trivial move) then move L1 files to L3
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 3;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 1);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ // File with keys [ 100 => 199 ]
+ for (int i = 100; i < 200; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ // then compact the bottommost level L3=>L3 (non-trivial move)
+ compact_options = CompactRangeOptions();
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 4);
+ ASSERT_EQ(non_trivial_move, 1);
+
+ // File with keys [ 200 => 299 ]
+ for (int i = 200; i < 300; i++) {
+ values.push_back(rnd.RandomString(value_size));
+ ASSERT_OK(Put(ShortKey(i), values[i]));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("1,0,0,1", FilesPerLevel(0));
+ trivial_move = 0;
+ non_trivial_move = 0;
+ compact_options = CompactRangeOptions();
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kSkip;
+ // Compaction will do L0=>L1 L1=>L2 L2=>L3 (3 trivial moves)
+ // and will skip bottommost level compaction
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,2", FilesPerLevel(0));
+ ASSERT_EQ(trivial_move, 3);
+ ASSERT_EQ(non_trivial_move, 0);
+
+ for (int i = 0; i < 300; i++) {
+ ASSERT_EQ(Get(ShortKey(i)), values[i]);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0Compaction) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.write_buffer_size = 2 << 20; // 2MB
+
+ BlockBasedTableOptions table_options;
+ table_options.block_cache = NewLRUCache(64 << 20); // 64MB
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
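+ // Index/filter blocks of L0 files are pinned so the test can later verify
+ // that the file produced by intra-L0 compaction does not get the same
+ // treatment.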
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(rnd.RandomString(kValueSize));
+
+ // The L0->L1 must be picked before we begin flushing files to trigger
+ // intra-L0 compaction, and must not finish until after an intra-L0
+ // compaction has been picked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTest::IntraL0Compaction:L0ToL1Ready"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // index: 0 1 2 3 4 5 6 7 8 9
+ // size: 1MB 1MB 1MB 1MB 1MB 2MB 1MB 1MB 1MB 1MB
+ // score: 1.5 1.3 1.5 2.0 inf
+ //
+ // Files 0-4 will be included in an L0->L1 compaction.
+ //
+ // L0->L0 will be triggered since the sync points guarantee compaction to base
+ // level is still blocked when files 5-9 trigger another compaction.
+ //
+ // Files 6-9 are the longest span of available files for which
+ // work-per-deleted-file decreases (see "score" row above).
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(0), "")); // prevents trivial move
+ if (i == 5) {
+ TEST_SYNC_POINT("DBCompactionTest::IntraL0Compaction:L0ToL1Ready");
+ ASSERT_OK(Put(Key(i + 1), value + value));
+ } else {
+ ASSERT_OK(Put(Key(i + 1), value));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
+ // L0 has the 2MB file (not compacted) and 4MB file (output of L0->L0)
+ ASSERT_EQ(2, level_to_files[0].size());
+ ASSERT_GT(level_to_files[1].size(), 0);
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_GE(level_to_files[0][i].fd.file_size, 1 << 21);
+ }
+
+ // The index/filter in the file produced by intra-L0 should not be pinned.
+ // That means clearing unref'd entries in block cache and re-accessing the
+ // file produced by intra-L0 should bump the index block miss count.
+ uint64_t prev_index_misses =
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ table_options.block_cache->EraseUnRefEntries();
+ ASSERT_EQ("", Get(Key(0)));
+ ASSERT_EQ(prev_index_misses + 1,
+ TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+}
+
+TEST_P(DBCompactionTestWithParam, IntraL0CompactionDoesNotObsoleteDeletions) {
+ // regression test for issue #2722: L0->L0 compaction can resurrect deleted
+ // keys from older L0 files if L1+ files' key-ranges do not include the key.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(rnd.RandomString(kValueSize));
+
+ // The L0->L1 must be picked before we begin flushing files to trigger
+ // intra-L0 compaction, and must not finish until after an intra-L0
+ // compaction has been picked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:"
+ "L0ToL1Ready"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // index: 0 1 2 3 4 5 6 7 8 9
+ // size: 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB 1MB
+ // score: 1.25 1.33 1.5 2.0 inf
+ //
+ // Files 0-4 will be included in an L0->L1 compaction.
+ //
+ // L0->L0 will be triggered since the sync points guarantee compaction to base
+ // level is still blocked when files 5-9 trigger another compaction. All files
+ // 5-9 are included in the L0->L0 due to work-per-deleted-file decreasing.
+ //
+ // Put a key-value in files 0-4. Delete that key in files 5-9. Verify the
+ // L0->L0 preserves the deletion such that the key remains deleted.
+ for (int i = 0; i < 10; ++i) {
+ // key 0 serves both to prevent trivial move and as the key we want to
+ // verify is not resurrected by L0->L0 compaction.
+ if (i < 5) {
+ ASSERT_OK(Put(Key(0), ""));
+ } else {
+ ASSERT_OK(Delete(Key(0)));
+ }
+ if (i == 5) {
+ TEST_SYNC_POINT(
+ "DBCompactionTest::IntraL0CompactionDoesNotObsoleteDeletions:"
+ "L0ToL1Ready");
+ }
+ ASSERT_OK(Put(Key(i + 1), value));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GE(level_to_files.size(), 2); // at least L0 and L1
+ // L0 has a single output file from L0->L0
+ ASSERT_EQ(1, level_to_files[0].size());
+ ASSERT_GT(level_to_files[1].size(), 0);
+ ASSERT_GE(level_to_files[0][0].fd.file_size, 1 << 22);
+
+ ReadOptions roptions;
+ std::string result;
+ ASSERT_TRUE(db_->Get(roptions, Key(0), &result).IsNotFound());
+}
+
+TEST_P(DBCompactionTestWithParam, FullCompactionInBottomPriThreadPool) {
+ const int kNumFilesTrigger = 3;
+ Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
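+ // With a thread available in the BOTTOM pool, the full compaction (whose
+ // output is the bottommost level) is expected to run there via
+ // DBImpl::BGWorkBottomCompaction.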
+ for (bool use_universal_compaction : {false, true}) {
+ Options options = CurrentOptions();
+ if (use_universal_compaction) {
+ options.compaction_style = kCompactionStyleUniversal;
+ } else {
+ options.compaction_style = kCompactionStyleLevel;
+ options.level_compaction_dynamic_level_bytes = true;
+ }
+ options.num_levels = 4;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+
+ int num_bottom_pri_compactions = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkBottomCompaction",
+ [&](void* /*arg*/) { ++num_bottom_pri_compactions; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ ASSERT_EQ(NumSortedRuns(), num);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(1, num_bottom_pri_compactions);
+
+ // Verify that size amplification did occur
+ ASSERT_EQ(NumSortedRuns(), 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+}
+
+TEST_F(DBCompactionTest, OptimizedDeletionObsoleting) {
+ // Deletions can be dropped when compacted to non-last level if they fall
+ // outside the lower-level files' key-ranges.
+ const int kNumL0Files = 4;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ // Put keys 1 and 3 in separate L1 and L2 files, so keys 0, 2, and 4+ fall
+ // outside these levels' key-ranges.
+ for (int level = 2; level >= 1; --level) {
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(Put(Key(2 * i + 1), "val"));
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(level);
+ ASSERT_EQ(2, NumTableFilesAtLevel(level));
+ }
+
+ // Delete keys in range [1, 4]. These L0 files will be compacted with L1:
+ // - Tombstones for keys 2 and 4 can be dropped early.
+ // - Tombstones for keys 1 and 3 must be kept due to L2 files' key-ranges.
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move
+ ASSERT_OK(Delete(Key(i + 1)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i + 1), &value).IsNotFound());
+ }
+ ASSERT_EQ(2, options.statistics->getTickerCount(
+ COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE));
+ ASSERT_EQ(2,
+ options.statistics->getTickerCount(COMPACTION_KEY_DROP_OBSOLETE));
+}
+
+TEST_F(DBCompactionTest, CompactFilesPendingL0Bug) {
+ // https://www.facebook.com/groups/rocksdb.dev/permalink/1389452781153232/
+ // CompactFiles() had a bug where it failed to pick a compaction when an L0
+ // compaction existed, but marked it as scheduled anyway. It'd never be
+ // unmarked as scheduled, so future compactions or DB close could hang.
+ const int kNumL0Files = 5;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files - 1;
+ options.max_background_compactions = 2;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTest::CompactFilesPendingL0Bug:Picked"},
+ {"DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
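+ // Holding a compaction pressure token lets the DB schedule more than one
+ // background compaction at a time.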
+ auto schedule_multi_compaction_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ // Files 0-3 will be included in an L0->L1 compaction.
+ //
+ // File 4 will be included in a call to CompactFiles() while the first
+ // compaction is running.
+ for (int i = 0; i < kNumL0Files - 1; ++i) {
+ ASSERT_OK(Put(Key(0), "val")); // sentinel to prevent trivial move
+ ASSERT_OK(Put(Key(i + 1), "val"));
+ ASSERT_OK(Flush());
+ }
+ TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:Picked");
+ // file 4 flushed after 0-3 picked
+ ASSERT_OK(Put(Key(kNumL0Files), "val"));
+ ASSERT_OK(Flush());
+
+ // Previously, DB close would hang forever because this situation caused the
+ // scheduled compaction count to never decrement to zero.
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_EQ(kNumL0Files, cf_meta.levels[0].files.size());
+ std::vector<std::string> input_filenames;
+ input_filenames.push_back(cf_meta.levels[0].files.front().name);
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+ 0 /* output_level */));
+ TEST_SYNC_POINT("DBCompactionTest::CompactFilesPendingL0Bug:ManualCompacted");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactFilesOverlapInL0Bug) {
+ // Regression test for bug of not pulling in L0 files that overlap the user-
+ // specified input files in time- and key-ranges.
+ ASSERT_OK(Put(Key(0), "old_val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(0), "new_val"));
+ ASSERT_OK(Flush());
+
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(dbfull()->DefaultColumnFamily(), &cf_meta);
+ ASSERT_GE(cf_meta.levels.size(), 2);
+ ASSERT_EQ(2, cf_meta.levels[0].files.size());
+
+ // Compacting {new L0 file, L1 file} should pull in the old L0 file since it
+ // overlaps in key-range and time-range.
+ std::vector<std::string> input_filenames;
+ input_filenames.push_back(cf_meta.levels[0].files.front().name);
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), input_filenames,
+ 1 /* output_level */));
+ ASSERT_EQ("new_val", Get(Key(0)));
+}
+
+TEST_F(DBCompactionTest, DeleteFilesInRangeConflictWithCompaction) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ const Snapshot* snapshot = nullptr;
+ const int kMaxKey = 10;
+
+ for (int i = 0; i < kMaxKey; i++) {
+ ASSERT_OK(Put(Key(i), Key(i)));
+ ASSERT_OK(Delete(Key(i)));
+ if (!snapshot) {
+ snapshot = db_->GetSnapshot();
+ }
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Put(Key(kMaxKey), Key(kMaxKey)));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Test that DeleteFilesInRange() deletes the files already picked for
+ // compaction
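+ // The compaction is held until DeleteFilesInRange() starts writing its
+ // manifest update, and that manifest write cannot complete until the
+ // compaction has finished, forcing the two operations to overlap.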
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"VersionSet::LogAndApply:WriteManifestStart",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction:Finish",
+ "VersionSet::LogAndApply:WriteManifestDone"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Release the snapshot, which marks the bottommost file for compaction
+ db_->ReleaseSnapshot(snapshot);
+ std::string begin_string = Key(0);
+ std::string end_string = Key(kMaxKey + 1);
+ Slice begin(begin_string);
+ Slice end(end_string);
+ ASSERT_OK(DeleteFilesInRange(db_, db_->DefaultColumnFamily(), &begin, &end));
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactBottomLevelFilesWithDeletions) {
+ // bottom-level files may contain deletions due to snapshots protecting the
+ // deleted keys. Once the snapshot is released, we should see files with many
+ // such deletions undergo single-file compactions.
+ const int kNumKeysPerFile = 1024;
+ const int kNumLevelFiles = 4;
+ const int kValueSize = 128;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumLevelFiles;
+ // inflate it a bit to account for key/metadata overhead
+ options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+ CreateAndReopenWithCF({"one"}, options);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ if (i == kNumLevelFiles - 1) {
+ snapshot = db_->GetSnapshot();
+ // delete every other key after grabbing a snapshot, so these deletions
+ // and the keys they cover can't be dropped until after the snapshot is
+ // released.
+ for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+ ASSERT_OK(Delete(Key(j)));
+ }
+ }
+ ASSERT_OK(Flush());
+ if (i < kNumLevelFiles - 1) {
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+ std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+ db_->GetLiveFilesMetaData(&pre_release_metadata);
+ // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
+ // files does not need to be preserved in case of a future snapshot.
+ ASSERT_OK(Put(Key(0), "val"));
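+ // While the snapshot blocks the bottommost files from being marked for
+ // compaction, the mark threshold holds a real sequence number; after
+ // ReleaseSnapshot it goes back to kMaxSequenceNumber.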
+ ASSERT_NE(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+ // release snapshot and wait for compactions to finish. Single-file
+ // compactions should be triggered, which reduce the size of each bottom-level
+ // file without changing file count.
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(kMaxSequenceNumber, dbfull()->bottommost_files_mark_threshold_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() ==
+ CompactionReason::kBottommostFiles);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ db_->GetLiveFilesMetaData(&post_release_metadata);
+ ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+
+ for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+ const auto& pre_file = pre_release_metadata[i];
+ const auto& post_file = post_release_metadata[i];
+ ASSERT_EQ(1, pre_file.level);
+ ASSERT_EQ(1, post_file.level);
+ // each file is smaller than it was before as it was rewritten without
+ // deletion markers/deleted keys.
+ ASSERT_LT(post_file.size, pre_file.size);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, NoCompactBottomLevelFilesWithDeletions) {
+ // bottom-level files may contain deletions due to snapshots protecting the
+ // deleted keys. Once the snapshot is released, we should see files with many
+ // such deletions undergo single-file compactions. But when auto compactions
+ // are disabled, such compactions shouldn't be triggered, as they may cause
+ // too many background jobs.
+ const int kNumKeysPerFile = 1024;
+ const int kNumLevelFiles = 4;
+ const int kValueSize = 128;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumLevelFiles;
+ // inflate it a bit to account for key/metadata overhead
+ options.target_file_size_base = 120 * kNumKeysPerFile * kValueSize / 100;
+ Reopen(options);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ if (i == kNumLevelFiles - 1) {
+ snapshot = db_->GetSnapshot();
+ // delete every other key after grabbing a snapshot, so these deletions
+ // and the keys they cover can't be dropped until after the snapshot is
+ // released.
+ for (int j = 0; j < kNumLevelFiles * kNumKeysPerFile; j += 2) {
+ ASSERT_OK(Delete(Key(j)));
+ }
+ }
+ ASSERT_OK(Flush());
+ if (i < kNumLevelFiles - 1) {
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr));
+ ASSERT_EQ(kNumLevelFiles, NumTableFilesAtLevel(1));
+
+ std::vector<LiveFileMetaData> pre_release_metadata, post_release_metadata;
+ db_->GetLiveFilesMetaData(&pre_release_metadata);
+ // just need to bump seqnum so ReleaseSnapshot knows the newest key in the SST
+ // files does not need to be preserved in case of a future snapshot.
+ ASSERT_OK(Put(Key(0), "val"));
+
+ // release snapshot and no compaction should be triggered.
+ std::atomic<int> num_compactions{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void* /*arg*/) { num_compactions.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, num_compactions);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ db_->GetLiveFilesMetaData(&post_release_metadata);
+ ASSERT_EQ(pre_release_metadata.size(), post_release_metadata.size());
+ for (size_t i = 0; i < pre_release_metadata.size(); ++i) {
+ const auto& pre_file = pre_release_metadata[i];
+ const auto& post_file = post_release_metadata[i];
+ ASSERT_EQ(1, pre_file.level);
+ ASSERT_EQ(1, post_file.level);
+ // each file is the same size as before, still containing deletion
+ // markers/deleted keys.
+ ASSERT_EQ(post_file.size, pre_file.size);
+ }
+}
+
+TEST_F(DBCompactionTest, RoundRobinTtlCompactionNormal) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 20;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ options.compaction_pri = kRoundRobin;
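+ // With kRoundRobin compaction priority, TTL compactions are picked starting
+ // from each level's round-robin cursor and are reported as
+ // CompactionReason::kRoundRobinTtl.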
+ env_->now_cpu_count_.store(0);
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // Add a small number of extra seconds to each wait time to make sure the
+ // file is expired
+ int small_seconds = 1;
+
+ std::atomic_int ttl_compactions{0};
+ std::atomic_int round_robin_ttl_compactions{0};
+ std::atomic_int other_compactions{0};
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ } else if (compaction_reason == CompactionReason::kRoundRobinTtl) {
+ round_robin_ttl_compactions++;
+ } else {
+ other_compactions++;
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+
+ // Set up the files from the lower levels to the upper levels; each file is
+ // 1 hour older than the next one.
+ // create 10 files on the last level (L6)
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(60 * 60); // generate 1 file per hour
+ }
+ MoveFilesToLevel(6);
+
+ // create 5 files on L5
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 200; j++) {
+ ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(60 * 60);
+ }
+ MoveFilesToLevel(5);
+
+ // create 3 files on L4
+ for (int i = 0; i < 3; i++) {
+ for (int j = 0; j < 300; j++) {
+ ASSERT_OK(Put(Key(i * 300 + j), "value" + std::to_string(i * 300 + j)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(60 * 60);
+ }
+ MoveFilesToLevel(4);
+
+ // The LSM tree should be like:
+ // L4: [0, 299], [300, 599], [600, 899]
+ // L5: [0, 199] [200, 399]...............[800, 999]
+ // L6: [0,99][100,199][200,299][300,399]...............[800,899][900,999]
+ ASSERT_EQ("0,0,0,0,3,5,10", FilesPerLevel());
+
+ // make sure the first L5 file is expired
+ env_->MockSleepForSeconds(16 * 60 * 60 + small_seconds++);
+
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(4), "value" + std::to_string(1)));
+ ASSERT_OK(Put(Key(5), "value" + std::to_string(1)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify there's a RoundRobin TTL compaction
+ ASSERT_EQ(1, round_robin_ttl_compactions);
+ round_robin_ttl_compactions = 0;
+
+ // expire 2 more files
+ env_->MockSleepForSeconds(2 * 60 * 60 + small_seconds++);
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(4), "value" + std::to_string(2)));
+ ASSERT_OK(Put(Key(5), "value" + std::to_string(2)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(2, round_robin_ttl_compactions);
+ round_robin_ttl_compactions = 0;
+
+ // expire 4 more files, 2 out of 3 files on L4 are expired
+ env_->MockSleepForSeconds(4 * 60 * 60 + small_seconds++);
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(6), "value" + std::to_string(3)));
+ ASSERT_OK(Put(Key(7), "value" + std::to_string(3)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(1, NumTableFilesAtLevel(4));
+ ASSERT_EQ(0, NumTableFilesAtLevel(5));
+
+ ASSERT_GT(round_robin_ttl_compactions, 0);
+ round_robin_ttl_compactions = 0;
+
+ // Make the first L0 file expired, which triggers a normal TTL compaction
+ // instead of a round-robin TTL compaction; it will also include an extra
+ // file from L0 because of overlap
+ ASSERT_EQ(0, ttl_compactions);
+ env_->MockSleepForSeconds(19 * 60 * 60 + small_seconds++);
+
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+ ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // The L0 -> L1 compaction is a normal TTL compaction; the L1 -> next-level
+ // compactions are round-robin TTL compactions.
+ ASSERT_GT(ttl_compactions, 0);
+ ttl_compactions = 0;
+ ASSERT_GT(round_robin_ttl_compactions, 0);
+ round_robin_ttl_compactions = 0;
+
+ // All files are expired, so only the last level has data
+ env_->MockSleepForSeconds(24 * 60 * 60);
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+ ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+ ASSERT_GT(ttl_compactions, 0);
+ ttl_compactions = 0;
+ ASSERT_GT(round_robin_ttl_compactions, 0);
+ round_robin_ttl_compactions = 0;
+
+ ASSERT_EQ(0, other_compactions);
+}
+
+TEST_F(DBCompactionTest, RoundRobinTtlCompactionUnsortedTime) {
+ // This tests the case where the round-robin compaction cursor is not
+ // pointing to the oldest file; round-robin compaction should still compact
+ // the files after the cursor until all expired files are compacted.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 20;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ options.compaction_pri = kRoundRobin;
+ env_->now_cpu_count_.store(0);
+ env_->SetMockSleep();
+ options.env = env_;
+
+ std::atomic_int ttl_compactions{0};
+ std::atomic_int round_robin_ttl_compactions{0};
+ std::atomic_int other_compactions{0};
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ } else if (compaction_reason == CompactionReason::kRoundRobinTtl) {
+ round_robin_ttl_compactions++;
+ } else {
+ other_compactions++;
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+
+ // create 10 files on the last level (L6)
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j), "value" + std::to_string(i * 100 + j)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(60 * 60); // generate 1 file per hour
+ }
+ MoveFilesToLevel(6);
+
+ // create 5 files on L5
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 200; j++) {
+ ASSERT_OK(Put(Key(i * 200 + j), "value" + std::to_string(i * 200 + j)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(60 * 60); // 1 hour
+ }
+ MoveFilesToLevel(5);
+
+ // The LSM tree should be like:
+ // L5: [0, 199] [200, 399] [400,599] [600,799] [800, 999]
+ // L6: [0,99][100,199][200,299][300,399]....................[800,899][900,999]
+ ASSERT_EQ("0,0,0,0,0,5,10", FilesPerLevel());
+
+ // point the compaction cursor to the 4th file on L5
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+ VersionStorageInfo* storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+ const InternalKey split_cursor = InternalKey(Key(600), 100000, kTypeValue);
+ storage_info->AddCursorForOneLevel(5, split_cursor);
+
+ // Make the first file on L5 expired; there should be 3 TTL compactions:
+ // the 4th file, then the 5th, then the 1st.
+ env_->MockSleepForSeconds(19 * 60 * 60 + 1);
+ // trigger TTL compaction
+ ASSERT_OK(Put(Key(6), "value" + std::to_string(4)));
+ ASSERT_OK(Put(Key(7), "value" + std::to_string(4)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(2, NumTableFilesAtLevel(5));
+
+ ASSERT_EQ(3, round_robin_ttl_compactions);
+ ASSERT_EQ(0, ttl_compactions);
+ ASSERT_EQ(0, other_compactions);
+}
+
+TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 1024;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ options.max_open_files = -1;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ MoveFilesToLevel(3);
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+ // Delete previously written keys.
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("2,0,0,2", FilesPerLevel());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ env_->MockSleepForSeconds(36 * 60 * 60); // 36 hours
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ // Just do a simple write + flush so that the TTL-expired files get
+ // compacted.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Flush());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // All non-L0 files are deleted, as they contained only deleted data.
+ ASSERT_EQ("1", FilesPerLevel());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // Test dynamically changing ttl.
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ MoveFilesToLevel(3);
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+
+ // Delete previously written keys.
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Delete(Key(i * kNumKeysPerFile + j)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("2,0,0,2", FilesPerLevel());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,2,0,2", FilesPerLevel());
+
+ // Move time forward by 12 hours, and make sure that compaction still doesn't
+ // trigger as ttl is set to 24 hours.
+ env_->MockSleepForSeconds(12 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("1,2,0,2", FilesPerLevel());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->compaction_reason() == CompactionReason::kTtl);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Dynamically change ttl to 10 hours.
+ // This should trigger a ttl compaction, as 12 hours have already passed.
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "36000"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // All non-L0 files are deleted, as they contained only deleted data.
+ ASSERT_EQ("1", FilesPerLevel());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) {
+ env_->SetMockSleep();
+ const int kValueSize = 100;
+
+ for (bool if_restart : {false, true}) {
+ for (bool if_open_all_files : {false, true}) {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.ttl = 24 * 60 * 60; // 24 hours
+ if (if_open_all_files) {
+ options.max_open_files = -1;
+ } else {
+ options.max_open_files = 20;
+ }
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 2;
+ });
+ // In the case where all files are opened and the DB is restarted, force the
+ // oldest ancestor time in the manifest file to be 0 to simulate reading
+ // from an old version.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:VarintOldestAncesterTime", [&](void* arg) {
+ if (if_restart && if_open_all_files) {
+ std::string* encoded_fieled = static_cast<std::string*>(arg);
+ *encoded_fieled = "";
+ PutVarint64(encoded_fieled, 0);
+ }
+ });
+
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ int ttl_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Add two L6 files with key ranges: [1 .. 100], [101 .. 200].
+ Random rnd(301);
+ for (int i = 1; i <= 100; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ // Get the first file's creation time. This will be the oldest file in the
+ // DB. Compactions involving this file's descendants should keep getting
+ // this time.
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ uint64_t oldest_time = level_to_files[0][0].oldest_ancester_time;
+ // Add 1 hour and do another flush.
+ env_->MockSleepForSeconds(1 * 60 * 60);
+ for (int i = 101; i <= 200; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(6);
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+
+ env_->MockSleepForSeconds(1 * 60 * 60);
+ // Add two L4 files with key ranges: [1 .. 50], [51 .. 150].
+ for (int i = 1; i <= 50; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ env_->MockSleepForSeconds(1 * 60 * 60);
+ for (int i = 51; i <= 150; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_EQ("0,0,0,0,2,0,2", FilesPerLevel());
+
+ env_->MockSleepForSeconds(1 * 60 * 60);
+ // Add one L1 file with key range: [26, 75].
+ for (int i = 26; i <= 75; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,1,0,0,2,0,2", FilesPerLevel());
+
+ // LSM tree:
+ // L1: [26 .. 75]
+ // L4: [1 .. 50][51 ..... 150]
+ // L6: [1 ........ 100][101 .... 200]
+ //
+ // On TTL expiry, a TTL compaction should be initiated on the L1 file, and
+ // the compactions should keep going until the key range hits the bottom
+ // level. In other words, the compaction on this data range "cascades" until
+ // reaching the bottom level.
+ //
+ // Order of events on TTL expiry:
+ // 1. The L1 file falls to L3 via 2 trivial moves initiated by the TTL
+ //    compaction.
+ // 2. A TTL compaction happens between the L3 and L4 files. Output file in L4.
+ // 3. The new output file from L4 falls to L5 via 1 trivial move initiated
+ //    by the TTL compaction.
+ // 4. A TTL compaction happens between the L5 and L6 files. Output in L6.
+
+ // Add 25 hours and do a write
+ env_->MockSleepForSeconds(25 * 60 * 60);
+
+ ASSERT_OK(Put(Key(1), "1"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_EQ(5, ttl_compactions);
+
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_EQ(oldest_time, level_to_files[6][0].oldest_ancester_time);
+
+ env_->MockSleepForSeconds(25 * 60 * 60);
+ ASSERT_OK(Put(Key(2), "1"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("1,0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GE(ttl_compactions, 6);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompaction) {
+ env_->SetMockSleep();
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ for (bool if_restart : {false, true}) {
+ for (bool if_open_all_files : {false, true}) {
+ Options options = CurrentOptions();
+ options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ if (if_open_all_files) {
+ options.max_open_files = -1; // needed for periodic compaction
+ } else {
+ options.max_open_files = 20;
+ }
+ // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 0;
+ });
+ // When all files are opened and the DB is restarted, force the file
+ // creation time in the manifest file to 0 to simulate reading from an old
+ // version.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:VarintFileCreationTime", [&](void* arg) {
+ if (if_restart && if_open_all_files) {
+ std::string* encoded_fieled = static_cast<std::string*>(arg);
+ *encoded_fieled = "";
+ PutVarint64(encoded_fieled, 0);
+ }
+ });
+
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ // Add 50 hours and do a write
+ env_->MockSleepForSeconds(50 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Assert that the files stay in the same level
+ ASSERT_EQ("3", FilesPerLevel());
+ // The two old files go through the periodic compaction process
+ ASSERT_EQ(2, periodic_compactions);
+
+ MoveFilesToLevel(1);
+ ASSERT_EQ("0,3", FilesPerLevel());
+
+ // Add another 50 hours and do another write
+ env_->MockSleepForSeconds(50 * 60 * 60);
+ ASSERT_OK(Put("b", "2"));
+ if (if_restart) {
+ Reopen(options);
+ } else {
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("1,3", FilesPerLevel());
+ // The three old files now go through the periodic compaction process:
+ // 2 + 3 = 5 in total.
+ ASSERT_EQ(5, periodic_compactions);
+
+ // Add another 50 hours and do another write
+ env_->MockSleepForSeconds(50 * 60 * 60);
+ ASSERT_OK(Put("c", "3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("2,3", FilesPerLevel());
+ // The four old files now go through the periodic compaction process:
+ // 5 + 4 = 9 in total.
+ ASSERT_EQ(9, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithOldDB) {
+ // This test makes sure that periodic compactions work with a DB where the
+ // file_creation_time of some files is 0. After compaction, the new files
+ // are created with a valid file_creation_time.
+
+ const int kNumKeysPerFile = 32;
+ const int kNumFiles = 4;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ bool set_file_creation_time_to_zero = true;
+ bool set_creation_time_to_zero = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+ TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+ if (set_file_creation_time_to_zero) {
+ props->file_creation_time = 0;
+ }
+ if (set_creation_time_to_zero) {
+ props->creation_time = 0;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ // Move the first two files to L2.
+ if (i == 1) {
+ MoveFilesToLevel(2);
+ set_creation_time_to_zero = false;
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("2,0,2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ Close();
+
+ set_file_creation_time_to_zero = false;
+ // Forward the clock by 2 days.
+ env_->MockSleepForSeconds(2 * 24 * 60 * 60);
+ options.periodic_compaction_seconds = 1 * 24 * 60 * 60; // 1 day
+
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("2,0,2", FilesPerLevel());
+ // Make sure that all files go through periodic compaction.
+ ASSERT_EQ(kNumFiles, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicAndTtlCompaction) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ options.ttl = 10 * 60 * 60; // 10 hours
+ options.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ options.max_open_files = -1; // needed for both periodic and ttl compactions
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ int periodic_compactions = 0;
+ int ttl_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ } else if (compaction_reason == CompactionReason::kTtl) {
+ ttl_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ MoveFilesToLevel(3);
+
+ ASSERT_EQ("0,0,0,2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+ ASSERT_EQ(0, ttl_compactions);
+
+ // Add some time greater than periodic_compaction_seconds.
+ env_->MockSleepForSeconds(50 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Files in the bottom level go through periodic compactions.
+ ASSERT_EQ("1,0,0,2", FilesPerLevel());
+ ASSERT_EQ(2, periodic_compactions);
+ ASSERT_EQ(0, ttl_compactions);
+
+ // Add a little more time than ttl
+ env_->MockSleepForSeconds(11 * 60 * 60);
+ ASSERT_OK(Put("b", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Notice that the previous file in level 1 falls down to the bottom level
+ // due to ttl compactions, one level at a time.
+ // And bottom level files don't get picked up for ttl compactions.
+ ASSERT_EQ("1,0,0,3", FilesPerLevel());
+ ASSERT_EQ(2, periodic_compactions);
+ ASSERT_EQ(3, ttl_compactions);
+
+ // Add some time greater than periodic_compaction_seconds.
+ env_->MockSleepForSeconds(50 * 60 * 60);
+ ASSERT_OK(Put("c", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Previous L0 file falls one level at a time to bottom level due to ttl.
+ // And all 4 bottom files go through periodic compactions.
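+ // Running totals expected below: periodic 2 + 4 = 6, TTL 3 + 3 = 6.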
+ ASSERT_EQ("1,0,0,4", FilesPerLevel());
+ ASSERT_EQ(6, periodic_compactions);
+ ASSERT_EQ(6, ttl_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, LevelTtlBooster) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 3;
+ const int kValueSize = 1000;
+
+ Options options = CurrentOptions();
+ options.ttl = 10 * 60 * 60; // 10 hours
+ options.periodic_compaction_seconds = 480 * 60 * 60; // very long
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_bytes_for_level_base = 5 * uint64_t{kNumKeysPerFile * kValueSize};
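+ // = 5 * 32 * 1000 = 160000 bytes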
+ options.max_open_files = -1; // needed for both periodic and ttl compactions
+ options.compaction_pri = CompactionPri::kMinOverlappingRatio;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ MoveFilesToLevel(2);
+
+ ASSERT_EQ("0,0,3", FilesPerLevel());
+
+ // Create some files for L1
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(2 * j + i), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ ASSERT_EQ("0,1,3", FilesPerLevel());
+
+ // Make the new L0 files qualify for TTL boosting, and generate one more to
+ // trigger an L1 -> L2 compaction. The old files will be picked even though
+ // their priority would be lower without boosting.
+ env_->MockSleepForSeconds(8 * 60 * 60);
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(kNumKeysPerFile * 2 + 2 * j + i),
+ rnd.RandomString(kValueSize * 2)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // Force files to be compacted to L1
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "1"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1,2", FilesPerLevel());
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}}));
+
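+ // kNumKeysPerFile * 4 * kValueSize = 32 * 4 * 1000 = 128000 bytes, roughly
+ // the newer, larger-valued data expected to remain in L1 once the boosted
+ // old file has been compacted away.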
+ ASSERT_GT(SizeAtLevel(1), kNumKeysPerFile * 4 * kValueSize);
+}
+
+TEST_F(DBCompactionTest, LevelPeriodicCompactionWithCompactionFilters) {
+ class TestCompactionFilter : public CompactionFilter {
+ const char* Name() const override { return "TestCompactionFilter"; }
+ };
+ class TestCompactionFilterFactory : public CompactionFilterFactory {
+ const char* Name() const override { return "TestCompactionFilterFactory"; }
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new TestCompactionFilter());
+ }
+ };
+
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ TestCompactionFilter test_compaction_filter;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ enum CompactionFilterType {
+ kUseCompactionFilter,
+ kUseCompactionFilterFactory
+ };
+
+ for (CompactionFilterType comp_filter_type :
+ {kUseCompactionFilter, kUseCompactionFilterFactory}) {
+ // Assert that periodic compactions are not enabled.
+ ASSERT_EQ(std::numeric_limits<uint64_t>::max() - 1,
+ options.periodic_compaction_seconds);
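+ // (std::numeric_limits<uint64_t>::max() - 1 is the default, i.e. periodic
+ // compaction is effectively disabled until sanitized below.)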
+
+ if (comp_filter_type == kUseCompactionFilter) {
+ options.compaction_filter = &test_compaction_filter;
+ options.compaction_filter_factory.reset();
+ } else if (comp_filter_type == kUseCompactionFilterFactory) {
+ options.compaction_filter = nullptr;
+ options.compaction_filter_factory.reset(
+ new TestCompactionFilterFactory());
+ }
+ DestroyAndReopen(options);
+
+ // periodic_compaction_seconds should be set to the sanitized value when
+ // a compaction filter or a compaction filter factory is used.
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
+ int periodic_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ auto compaction_reason = compaction->compaction_reason();
+ if (compaction_reason == CompactionReason::kPeriodicCompaction) {
+ periodic_compactions++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_EQ(0, periodic_compactions);
+
+ // Add 31 days and do a write
+ env_->MockSleepForSeconds(31 * 24 * 60 * 60);
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Assert that the files stay in the same level
+ ASSERT_EQ("3", FilesPerLevel());
+ // The two old files go through the periodic compaction process
+ ASSERT_EQ(2, periodic_compactions);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByL0FileCount) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+ // compaction only triggers a flush once it is sure a stall won't be triggered
+ // by the L0 file count going too high.
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ // i == 0: verifies normal case where stall is avoided by delay
+ // i == 1: verifies no delay in edge case where stall trigger is same as
+ // compaction trigger, so stall can't be avoided
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ if (i == 0) {
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ } else {
+ options.level0_file_num_compaction_trigger = kNumL0FilesLimit;
+ }
+ Reopen(options);
+
+ if (i == 0) {
+ // ensure the auto compaction doesn't finish until manual compaction has
+ // had a chance to be delayed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "CompactionJob::Run():End"}});
+ } else {
+ // ensure the auto-compaction doesn't finish until manual compaction has
+ // continued without delay.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:StallWaitDone",
+ "CompactionJob::Run():End"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put(Key(k), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ });
+
+ manual_compaction_thread.join();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeDelayedByImmMemTableCount) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, manual
+ // compaction only triggers a flush once it is sure a stall won't be triggered
+ // by the immutable memtable count going too high.
+ const int kNumImmMemTableLimit = 8;
+ // i == 0: verifies normal case where stall is avoided by delay
+ // i == 1: verifies no delay in edge case where stall trigger is same as flush
+ // trigger, so stall can't be avoided
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ // The delay limit is one less than the stop limit. This test focuses on
+ // avoiding the delay limit, but this option sets the stop limit, so add one.
+ options.max_write_buffer_number = kNumImmMemTableLimit + 1;
+ if (i == 1) {
+ options.min_write_buffer_number_to_merge = kNumImmMemTableLimit;
+ }
+ Reopen(options);
+
+ if (i == 0) {
+ // ensure the flush doesn't finish until manual compaction has had a
+ // chance to be delayed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "FlushJob::WriteLevel0Table"}});
+ } else {
+ // ensure the flush doesn't finish until manual compaction has continued
+ // without delay.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:StallWaitDone",
+ "FlushJob::WriteLevel0Table"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumImmMemTableLimit - 1; ++j) {
+ ASSERT_OK(Put(Key(0), rnd.RandomString(1024)));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ flush_opts.allow_write_stall = true;
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ }
+
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ });
+
+ manual_compaction_thread.join();
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeShutdownWhileDelayed) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`, delay
+ // does not hang if CF is dropped or DB is closed
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ // i == 0: DB::DropColumnFamily() on CompactRange's target CF unblocks it
+ // i == 1: DB::CancelAllBackgroundWork() unblocks CompactRange. This is to
+ // simulate what happens during Close as we can't call Close (it
+ // blocks on the auto-compaction, making a cycle).
+ for (int i = 0; i < 2; ++i) {
+ CreateAndReopenWithCF({"one"}, options);
+ // The calls to close CF/DB wait until the manual compaction stalls.
+ // The auto-compaction waits until the manual compaction finishes to ensure
+ // the signal comes from closing CF/DB, not from compaction making progress.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown"},
+ {"DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual",
+ "CompactionJob::Run():End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int j = 0; j < kNumL0FilesLimit - 1; ++j) {
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put(1, Key(k), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush(1));
+ }
+ auto manual_compaction_thread = port::Thread([this, i]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ if (i == 0) {
+ ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
+ .IsColumnFamilyDropped());
+ } else {
+ ASSERT_TRUE(db_->CompactRange(cro, handles_[1], nullptr, nullptr)
+ .IsShutdownInProgress());
+ }
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PreShutdown");
+ if (i == 0) {
+ ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+ } else {
+ dbfull()->CancelAllBackgroundWork(false /* wait */);
+ }
+ manual_compaction_thread.join();
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeShutdownWhileDelayed:PostManual");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBCompactionTest, CompactRangeSkipFlushAfterDelay) {
+ // Verify that, when `CompactRangeOptions::allow_write_stall == false`,
+ // CompactRange skips its flush if the delay is long enough that the memtables
+ // existing at the beginning of the call have already been flushed.
+ const int kNumL0FilesTrigger = 4;
+ const int kNumL0FilesLimit = 8;
+ Options options = CurrentOptions();
+ options.level0_slowdown_writes_trigger = kNumL0FilesLimit;
+ options.level0_file_num_compaction_trigger = kNumL0FilesTrigger;
+ Reopen(options);
+
+ Random rnd(301);
+ // The manual flush includes the memtable that was active when CompactRange
+ // began. So it unblocks CompactRange and precludes its flush. Throughout the
+ // test, stall conditions are upheld via high L0 file count.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait",
+ "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush"},
+ {"DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush",
+ "DBImpl::FlushMemTable:StallWaitDone"},
+ {"DBImpl::FlushMemTable:StallWaitDone", "CompactionJob::Run():End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // used for the delayable flushes
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ for (int i = 0; i < kNumL0FilesLimit - 1; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ }
+ auto manual_compaction_thread = port::Thread([this]() {
+ CompactRangeOptions cro;
+ cro.allow_write_stall = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("DBCompactionTest::CompactRangeSkipFlushAfterDelay:PreFlush");
+ ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024)));
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ ASSERT_OK(Put(std::to_string(0), rnd.RandomString(1024)));
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactRangeSkipFlushAfterDelay:PostFlush");
+ manual_compaction_thread.join();
+
+ // If CompactRange's flush was skipped, the final Put above will still be
+ // in the active memtable.
+ std::string num_keys_in_memtable;
+ ASSERT_TRUE(db_->GetProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &num_keys_in_memtable));
+ ASSERT_EQ(std::to_string(1), num_keys_in_memtable);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBCompactionTest, CompactRangeFlushOverlappingMemtable) {
+ // Verify memtable only gets flushed if it contains data overlapping the range
+ // provided to `CompactRange`. Tests all kinds of overlap/non-overlap.
+ const int kNumEndpointKeys = 5;
+ std::string keys[kNumEndpointKeys] = {"a", "b", "c", "d", "e"};
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // One extra iteration for nullptr, which means left side of interval is
+ // unbounded.
+ for (int i = 0; i <= kNumEndpointKeys; ++i) {
+ Slice begin;
+ Slice* begin_ptr;
+ if (i == 0) {
+ begin_ptr = nullptr;
+ } else {
+ begin = keys[i - 1];
+ begin_ptr = &begin;
+ }
+ // Start at `i` so right endpoint comes after left endpoint. One extra
+ // iteration for nullptr, which means right side of interval is unbounded.
+ for (int j = std::max(0, i - 1); j <= kNumEndpointKeys; ++j) {
+ Slice end;
+ Slice* end_ptr;
+ if (j == kNumEndpointKeys) {
+ end_ptr = nullptr;
+ } else {
+ end = keys[j];
+ end_ptr = &end;
+ }
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Put("d", "val"));
+ CompactRangeOptions compact_range_opts;
+ ASSERT_OK(db_->CompactRange(compact_range_opts, begin_ptr, end_ptr));
+
+ uint64_t get_prop_tmp, num_memtable_entries = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesImmMemTables,
+ &get_prop_tmp));
+ num_memtable_entries += get_prop_tmp;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &get_prop_tmp));
+ num_memtable_entries += get_prop_tmp;
+ if (begin_ptr == nullptr || end_ptr == nullptr ||
+ (i <= 4 && j >= 1 && (begin != "c" || end != "c"))) {
+ // In this case `CompactRange`'s range overlapped in some way with the
+ // memtable's range, so flush should've happened. Then "b" and "d" won't
+ // be in the memtable.
+ ASSERT_EQ(0, num_memtable_entries);
+ } else {
+ ASSERT_EQ(2, num_memtable_entries);
+ // flush anyway to prepare for the next iteration
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ }
+ }
+}
+
+TEST_F(DBCompactionTest, CompactionStatsTest) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ CompactionStatsCollector* collector = new CompactionStatsCollector();
+ options.listeners.emplace_back(collector);
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+
+ VerifyCompactionStats(*cfd, *collector);
+}
+
+TEST_F(DBCompactionTest, SubcompactionEvent) {
+ class SubCompactionEventListener : public EventListener {
+ public:
+ void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+ InstrumentedMutexLock l(&mutex_);
+ ASSERT_EQ(running_compactions_.find(ci.job_id),
+ running_compactions_.end());
+ running_compactions_.emplace(ci.job_id, std::unordered_set<int>());
+ }
+
+ void OnCompactionCompleted(DB* /*db*/,
+ const CompactionJobInfo& ci) override {
+ InstrumentedMutexLock l(&mutex_);
+ auto it = running_compactions_.find(ci.job_id);
+ ASSERT_NE(it, running_compactions_.end());
+ ASSERT_EQ(it->second.size(), 0);
+ running_compactions_.erase(it);
+ }
+
+ void OnSubcompactionBegin(const SubcompactionJobInfo& si) override {
+ InstrumentedMutexLock l(&mutex_);
+ auto it = running_compactions_.find(si.job_id);
+ ASSERT_NE(it, running_compactions_.end());
+ auto r = it->second.insert(si.subcompaction_job_id);
+ ASSERT_TRUE(r.second); // each subcompaction_job_id should be different
+ total_subcompaction_cnt_++;
+ }
+
+ void OnSubcompactionCompleted(const SubcompactionJobInfo& si) override {
+ InstrumentedMutexLock l(&mutex_);
+ auto it = running_compactions_.find(si.job_id);
+ ASSERT_NE(it, running_compactions_.end());
+ auto r = it->second.erase(si.subcompaction_job_id);
+ ASSERT_EQ(r, 1);
+ }
+
+ size_t GetRunningCompactionCount() {
+ InstrumentedMutexLock l(&mutex_);
+ return running_compactions_.size();
+ }
+
+ size_t GetTotalSubcompactionCount() {
+ InstrumentedMutexLock l(&mutex_);
+ return total_subcompaction_cnt_;
+ }
+
+ private:
+ InstrumentedMutex mutex_;
+ std::unordered_map<int, std::unordered_set<int>> running_compactions_;
+ size_t total_subcompaction_cnt_ = 0;
+ };
+
+ Options options = CurrentOptions();
+ options.target_file_size_base = 1024;
+ options.level0_file_num_compaction_trigger = 10;
+ auto* listener = new SubCompactionEventListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+ // generate 4 files @ L2
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 10 + j;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+
+ // generate 2 files @ L1 which overlaps with L2 files
+ for (int i = 0; i < 2; i++) {
+ for (int j = 0; j < 10; j++) {
+ int key_id = i * 20 + j * 2;
+ ASSERT_OK(Put(Key(key_id), "value" + std::to_string(key_id)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+ ASSERT_EQ(FilesPerLevel(), "0,2,4");
+
+ CompactRangeOptions comp_opts;
+ comp_opts.max_subcompactions = 4;
+ Status s = dbfull()->CompactRange(comp_opts, nullptr, nullptr);
+ ASSERT_OK(s);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // make sure there's no running compaction
+ ASSERT_EQ(listener->GetRunningCompactionCount(), 0);
+ // and that subcompactions were triggered
+ ASSERT_GT(listener->GetTotalSubcompactionCount(), 0);
+}
+
+TEST_F(DBCompactionTest, CompactFilesOutputRangeConflict) {
+ // LSM setup:
+ // L1: [ba bz]
+ // L2: [a b] [c d]
+ // L3: [a b] [c d]
+ //
+ // Thread 1: Thread 2:
+ // Begin compacting all L2->L3
+ // Compact [ba bz] L1->L3
+ // End compacting all L2->L3
+ //
+ // The compaction operation in thread 2 should be disallowed because the range
+ // overlaps with the compaction in thread 1, which also covers that range in
+ // L3.
+ Options options = CurrentOptions();
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+ Reopen(options);
+
+ for (int level = 3; level >= 2; --level) {
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("c", "val"));
+ ASSERT_OK(Put("d", "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(level);
+ }
+ ASSERT_OK(Put("ba", "val"));
+ ASSERT_OK(Put("bz", "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0",
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin"},
+ {"DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End",
+ "CompactFilesImpl:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ auto bg_thread = port::Thread([&]() {
+ // Thread 1
+ std::vector<std::string> filenames = collector->GetFlushedFiles();
+ filenames.pop_back();
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), filenames,
+ 3 /* output_level */));
+ });
+
+ // Thread 2
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2Begin");
+ std::string filename = collector->GetFlushedFiles().back();
+ ASSERT_FALSE(
+ db_->CompactFiles(CompactionOptions(), {filename}, 3 /* output_level */)
+ .ok());
+ TEST_SYNC_POINT(
+ "DBCompactionTest::CompactFilesOutputRangeConflict:Thread2End");
+
+ bg_thread.join();
+}
+
+TEST_F(DBCompactionTest, CompactionHasEmptyOutput) {
+ Options options = CurrentOptions();
+ SstStatsCollector* collector = new SstStatsCollector();
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(collector);
+ Reopen(options);
+
+ // Make sure the L0 files overlap to prevent trivial move.
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("a"));
+ ASSERT_OK(Delete("b"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ // Expect one file creation to start for each flush, and zero for compaction
+ // since no keys are written.
+ ASSERT_EQ(2, collector->num_ssts_creation_started());
+}
+
+TEST_F(DBCompactionTest, CompactionLimiter) {
+ const int kNumKeysPerFile = 10;
+ const int kMaxBackgroundThreads = 64;
+
+ struct CompactionLimiter {
+ std::string name;
+ int limit_tasks;
+ int max_tasks;
+ int tasks;
+ std::shared_ptr<ConcurrentTaskLimiter> limiter;
+ };
+
+ std::vector<CompactionLimiter> limiter_settings;
+ limiter_settings.push_back({"limiter_1", 1, 0, 0, nullptr});
+ limiter_settings.push_back({"limiter_2", 2, 0, 0, nullptr});
+ limiter_settings.push_back({"limiter_3", 3, 0, 0, nullptr});
+
+ for (auto& ls : limiter_settings) {
+ ls.limiter.reset(NewConcurrentTaskLimiter(ls.name, ls.limit_tasks));
+ }
+
+ std::shared_ptr<ConcurrentTaskLimiter> unique_limiter(
+ NewConcurrentTaskLimiter("unique_limiter", -1));
+
+ const char* cf_names[] = {"default", "0", "1", "2", "3", "4", "5", "6", "7",
+ "8", "9", "a", "b", "c", "d", "e", "f"};
+ const unsigned int cf_count = sizeof cf_names / sizeof cf_names[0];
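+ // cf_count == 17: "default" plus 16 additional column families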
+
+ std::unordered_map<std::string, CompactionLimiter*> cf_to_limiter;
+
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 * 1024; // 110KB
+ options.arena_block_size = 4096;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 64;
+ options.level0_stop_writes_trigger = 64;
+ options.max_background_jobs = kMaxBackgroundThreads; // Enough threads
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ options.max_write_buffer_number = 10; // Enough memtables
+ DestroyAndReopen(options);
+
+ std::vector<Options> option_vector;
+ option_vector.reserve(cf_count);
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ColumnFamilyOptions cf_opt(options);
+ if (cf == 0) {
+ // "Default" CF does't use compaction limiter
+ cf_opt.compaction_thread_limiter = nullptr;
+ } else if (cf == 1) {
+ // "1" CF uses bypass compaction limiter
+ unique_limiter->SetMaxOutstandingTask(-1);
+ cf_opt.compaction_thread_limiter = unique_limiter;
+ } else {
+ // Assign limiter by mod
+ auto& ls = limiter_settings[cf % 3];
+ cf_opt.compaction_thread_limiter = ls.limiter;
+ cf_to_limiter[cf_names[cf]] = &ls;
+ }
+ option_vector.emplace_back(DBOptions(options), cf_opt);
+ }
+
+ for (unsigned int cf = 1; cf < cf_count; cf++) {
+ CreateColumnFamilies({cf_names[cf]}, option_vector[cf]);
+ }
+
+ ReopenWithColumnFamilies(
+ std::vector<std::string>(cf_names, cf_names + cf_count), option_vector);
+
+ port::Mutex mutex;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:BeforeCompaction", [&](void* arg) {
+ const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+ auto iter = cf_to_limiter.find(cf_name);
+ if (iter != cf_to_limiter.end()) {
+ MutexLock l(&mutex);
+ ASSERT_GE(iter->second->limit_tasks, ++iter->second->tasks);
+ iter->second->max_tasks =
+ std::max(iter->second->max_tasks, iter->second->limit_tasks);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:AfterCompaction", [&](void* arg) {
+ const auto& cf_name = static_cast<ColumnFamilyData*>(arg)->GetName();
+ auto iter = cf_to_limiter.find(cf_name);
+ if (iter != cf_to_limiter.end()) {
+ MutexLock l(&mutex);
+ ASSERT_GE(--iter->second->tasks, 0);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Size the flush and compaction thread pools.
+ const size_t kTotalFlushTasks = kMaxBackgroundThreads / 4;
+ const size_t kTotalCompactTasks = kMaxBackgroundThreads - kTotalFlushTasks;
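+ // With kMaxBackgroundThreads == 64: 16 flush threads (HIGH) and 48
+ // compaction threads (LOW).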
+ env_->SetBackgroundThreads((int)kTotalFlushTasks, Env::HIGH);
+ env_->SetBackgroundThreads((int)kTotalCompactTasks, Env::LOW);
+
+ test::SleepingBackgroundTask sleeping_compact_tasks[kTotalCompactTasks];
+
+ // Block all compaction threads in thread pool.
+ for (size_t i = 0; i < kTotalCompactTasks; i++) {
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_compact_tasks[i], Env::LOW);
+ sleeping_compact_tasks[i].WaitUntilSleeping();
+ }
+
+ int keyIndex = 0;
+
+ for (int n = 0; n < options.level0_file_num_compaction_trigger; n++) {
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf, Key(keyIndex++), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf, "", ""));
+ }
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ }
+ }
+
+ // Enough L0 files to trigger compaction
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ASSERT_EQ(NumTableFilesAtLevel(0, cf),
+ options.level0_file_num_compaction_trigger);
+ }
+
+ // Create more files for one column family, which triggers the speed-up
+ // condition; all compactions will be scheduled.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(0, Key(i), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(0, "", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1,
+ NumTableFilesAtLevel(0, 0));
+ }
+
+ // All CFs are pending compaction
+ ASSERT_EQ(cf_count, env_->GetThreadPoolQueueLen(Env::LOW));
+
+ // Unblock all compaction threads
+ for (size_t i = 0; i < kTotalCompactTasks; i++) {
+ sleeping_compact_tasks[i].WakeUp();
+ sleeping_compact_tasks[i].WaitUntilDone();
+ }
+
+ for (unsigned int cf = 0; cf < cf_count; cf++) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Max outstanding compaction tasks reached the limit
+ for (auto& ls : limiter_settings) {
+ ASSERT_EQ(ls.limit_tasks, ls.max_tasks);
+ ASSERT_EQ(0, ls.limiter->GetOutstandingTask());
+ }
+
+ // test manual compaction under a fully throttled limiter
+ int cf_test = 1;
+ unique_limiter->SetMaxOutstandingTask(0);
+
+ // flush one more file to cf 1
+ for (int i = 0; i < kNumKeysPerFile; i++) {
+ ASSERT_OK(Put(cf_test, Key(keyIndex++), ""));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put(cf_test, "", ""));
+
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf_test]));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0, cf_test));
+
+ Compact(cf_test, Key(0), Key(keyIndex));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam,
+ ::testing::Values(std::make_tuple(1, true),
+ std::make_tuple(1, false),
+ std::make_tuple(4, true),
+ std::make_tuple(4, false)));
+
+TEST_P(DBCompactionDirectIOTest, DirectIO) {
+ Options options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.use_direct_io_for_flush_and_compaction = GetParam();
+ options.env = MockEnv::Create(Env::Default());
+ Reopen(options);
+ bool readahead = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::OpenCompactionOutputFile", [&](void* arg) {
+ bool* use_direct_writes = static_cast<bool*>(arg);
+ ASSERT_EQ(*use_direct_writes,
+ options.use_direct_io_for_flush_and_compaction);
+ });
+ if (options.use_direct_io_for_flush_and_compaction) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions:direct_io", [&](void* /*arg*/) { readahead = true; });
+ }
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+ Compact(1, "p", "q");
+ ASSERT_EQ(readahead, options.use_direct_reads);
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+ Destroy(options);
+ delete options.env;
+}
+
+INSTANTIATE_TEST_CASE_P(DBCompactionDirectIOTest, DBCompactionDirectIOTest,
+ testing::Bool());
+
+class CompactionPriTest : public DBTestBase,
+ public testing::WithParamInterface<uint32_t> {
+ public:
+ CompactionPriTest()
+ : DBTestBase("compaction_pri_test", /*env_do_fsync=*/true) {
+ compaction_pri_ = GetParam();
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t compaction_pri_;
+};
+
+TEST_P(CompactionPriTest, Test) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 16 * 1024;
+ options.compaction_pri = static_cast<CompactionPri>(compaction_pri_);
+ options.hard_pending_compaction_bytes_limit = 256 * 1024;
+ options.max_bytes_for_level_base = 64 * 1024;
+ options.max_bytes_for_level_multiplier = 4;
+ options.compression = kNoCompression;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ const int kNKeys = 5000;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
+
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_OK(Put(Key(keys[i]), rnd.RandomString(102)));
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ CompactionPriTest, CompactionPriTest,
+ ::testing::Values(CompactionPri::kByCompensatedSize,
+ CompactionPri::kOldestLargestSeqFirst,
+ CompactionPri::kOldestSmallestSeqFirst,
+ CompactionPri::kMinOverlappingRatio,
+ CompactionPri::kRoundRobin));
+
+TEST_F(DBCompactionTest, PersistRoundRobinCompactCursor) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 16 * 1024;
+ options.max_bytes_for_level_base = 128 * 1024;
+ options.target_file_size_base = 64 * 1024;
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_pri = CompactionPri::kRoundRobin;
+ options.max_bytes_for_level_multiplier = 4;
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // 30 Files in L0 to trigger compactions between L1 and L2
+ for (int i = 0; i < 30; i++) {
+ for (int j = 0; j < 16; j++) {
+ ASSERT_OK(Put(rnd.RandomString(24), rnd.RandomString(1000)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const std::vector<InternalKey> compact_cursors =
+ storage_info->GetCompactCursors();
+
+ Reopen(options);
+
+ VersionSet* const reopened_versions = dbfull()->GetVersionSet();
+ assert(reopened_versions);
+
+ ColumnFamilyData* const reopened_cfd =
+ reopened_versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(reopened_cfd, nullptr);
+
+ Version* const reopened_current = reopened_cfd->current();
+ ASSERT_NE(reopened_current, nullptr);
+
+ const VersionStorageInfo* const reopened_storage_info =
+ reopened_current->storage_info();
+ ASSERT_NE(reopened_storage_info, nullptr);
+
+ const std::vector<InternalKey> reopened_compact_cursors =
+ reopened_storage_info->GetCompactCursors();
+ const auto icmp = reopened_storage_info->InternalComparator();
+ ASSERT_EQ(compact_cursors.size(), reopened_compact_cursors.size());
+ for (size_t i = 0; i < compact_cursors.size(); i++) {
+ if (compact_cursors[i].Valid()) {
+ ASSERT_EQ(0,
+ icmp->Compare(compact_cursors[i], reopened_compact_cursors[i]));
+ } else {
+ ASSERT_TRUE(!reopened_compact_cursors[i].Valid());
+ }
+ }
+}
+
+TEST_P(RoundRobinSubcompactionsAgainstPressureToken, PressureTokenTest) {
+ const int kKeysPerBuffer = 100;
+ Options options = CurrentOptions();
+ options.num_levels = 4;
+ options.max_bytes_for_level_multiplier = 2;
+ options.level0_file_num_compaction_trigger = 4;
+ options.target_file_size_base = kKeysPerBuffer * 1024;
+ options.compaction_pri = CompactionPri::kRoundRobin;
+ options.max_bytes_for_level_base = 8 * kKeysPerBuffer * 1024;
+ options.disable_auto_compactions = true;
+ // Set up 7 threads but limit subcompactions so that
+ // RoundRobin requires extra compactions from reserved threads
+ options.max_subcompactions = 1;
+ options.max_background_compactions = 7;
+ options.max_compaction_bytes = 100000000;
+ DestroyAndReopen(options);
+ env_->SetBackgroundThreads(7, Env::LOW);
+
+ Random rnd(301);
+ const std::vector<int> files_per_level = {0, 15, 25};
+ for (int lvl = 2; lvl > 0; lvl--) {
+ for (int i = 0; i < files_per_level[lvl]; i++) {
+ for (int j = 0; j < kKeysPerBuffer; j++) {
+ // Add (lvl-1) to ensure a nearly equivalent number of files
+ // in L2 are overlapped with the files selected to compact from
+ // L1
+ ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+ rnd.RandomString(1010)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(lvl);
+ ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+ }
+ // 15 files in L1; 25 files in L2
+
+ // This variable makes sure the following callback is called
+ // and the assertions in it are indeed executed.
+ bool num_planned_subcompactions_verified = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+ uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+ if (grab_pressure_token_) {
+ // 7 files are selected for round-robin under auto
+ // compaction. The number of planned subcompactions is restricted by
+ // the limited max_background_compactions
+ ASSERT_EQ(num_planned_subcompactions, 7);
+ } else {
+ ASSERT_EQ(num_planned_subcompactions, 1);
+ }
+ num_planned_subcompactions_verified = true;
+ });
+
+ // The following 3 dependencies have to be added to ensure the auto
+ // compaction and the pressure token are correctly enabled. Same for
+ // RoundRobinSubcompactionsUsingResources and
+ // DBCompactionTest.RoundRobinSubcompactionsShrinkResources
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RoundRobinSubcompactionsAgainstPressureToken:0",
+ "BackgroundCallCompaction:0"},
+ {"CompactionJob::AcquireSubcompactionResources:0",
+ "RoundRobinSubcompactionsAgainstPressureToken:1"},
+ {"RoundRobinSubcompactionsAgainstPressureToken:2",
+ "CompactionJob::AcquireSubcompactionResources:1"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:0");
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:1");
+ std::unique_ptr<WriteControllerToken> pressure_token;
+ if (grab_pressure_token_) {
+ pressure_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+ }
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstPressureToken:2");
+
+ ASSERT_OK(dbfull()->WaitForCompact());
+ ASSERT_TRUE(num_planned_subcompactions_verified);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstPressureToken,
+ RoundRobinSubcompactionsAgainstPressureToken,
+ testing::Bool());
+
+TEST_P(RoundRobinSubcompactionsAgainstResources, SubcompactionsUsingResources) {
+ const int kKeysPerBuffer = 200;
+ Options options = CurrentOptions();
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.target_file_size_base = kKeysPerBuffer * 1024;
+ options.compaction_pri = CompactionPri::kRoundRobin;
+ options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
+ options.disable_auto_compactions = true;
+ options.max_subcompactions = 1;
+ options.max_background_compactions = max_compaction_limits_;
+ // Set a large number for max_compaction_bytes so that one round-robin
+ // compaction is enough to make post-compaction L1 size less than
+ // the maximum size (this test assumes only one round-robin compaction
+ // is triggered by kLevelMaxLevelSize)
+ options.max_compaction_bytes = 100000000;
+
+ DestroyAndReopen(options);
+ env_->SetBackgroundThreads(total_low_pri_threads_, Env::LOW);
+
+ Random rnd(301);
+ const std::vector<int> files_per_level = {0, 40, 100};
+ for (int lvl = 2; lvl > 0; lvl--) {
+ for (int i = 0; i < files_per_level[lvl]; i++) {
+ for (int j = 0; j < kKeysPerBuffer; j++) {
+ // Add (lvl-1) to ensure a nearly equivalent number of files
+ // in L2 are overlapped with the files selected to compact from
+ // L1
+ ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+ rnd.RandomString(1010)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(lvl);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+ }
+
+ // 40 files in L1; 100 files in L2
+ // This variable makes sure the following callback is called
+ // and the assertions in it are indeed executed.
+ bool num_planned_subcompactions_verified = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+ uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+ // More than 10 files are selected for round-robin under auto
+ // compaction. The number of planned subcompactions is restricted to
+ // the minimum of the available threads and the compaction limit.
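+ // e.g. for the (total_low_pri_threads_, max_compaction_limits_) = (10, 5)
+ // parameterization: 1 + (min(10, 5) - 1) = 5 planned subcompactions.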
+ ASSERT_EQ(num_planned_subcompactions - options.max_subcompactions,
+ std::min(total_low_pri_threads_, max_compaction_limits_) - 1);
+ num_planned_subcompactions_verified = true;
+ });
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RoundRobinSubcompactionsAgainstResources:0",
+ "BackgroundCallCompaction:0"},
+ {"CompactionJob::AcquireSubcompactionResources:0",
+ "RoundRobinSubcompactionsAgainstResources:1"},
+ {"RoundRobinSubcompactionsAgainstResources:2",
+ "CompactionJob::AcquireSubcompactionResources:1"},
+ {"CompactionJob::ReleaseSubcompactionResources:0",
+ "RoundRobinSubcompactionsAgainstResources:3"},
+ {"RoundRobinSubcompactionsAgainstResources:4",
+ "CompactionJob::ReleaseSubcompactionResources:1"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->WaitForCompact());
+ ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:0");
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:1");
+ auto pressure_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:2");
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:3");
+ // We can now reserve all threads except the one being used
+ ASSERT_EQ(total_low_pri_threads_ - 1,
+ env_->ReserveThreads(total_low_pri_threads_, Env::Priority::LOW));
+ ASSERT_EQ(
+ total_low_pri_threads_ - 1,
+ env_->ReleaseThreads(total_low_pri_threads_ - 1, Env::Priority::LOW));
+ TEST_SYNC_POINT("RoundRobinSubcompactionsAgainstResources:4");
+ ASSERT_OK(dbfull()->WaitForCompact());
+ ASSERT_TRUE(num_planned_subcompactions_verified);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+INSTANTIATE_TEST_CASE_P(RoundRobinSubcompactionsAgainstResources,
+ RoundRobinSubcompactionsAgainstResources,
+ ::testing::Values(std::make_tuple(1, 5),
+ std::make_tuple(5, 1),
+ std::make_tuple(10, 5),
+ std::make_tuple(5, 10),
+ std::make_tuple(10, 10)));
+
+TEST_P(DBCompactionTestWithParam, RoundRobinWithoutAdditionalResources) {
+ const int kKeysPerBuffer = 200;
+ Options options = CurrentOptions();
+ options.num_levels = 4;
+ options.level0_file_num_compaction_trigger = 3;
+ options.target_file_size_base = kKeysPerBuffer * 1024;
+ options.compaction_pri = CompactionPri::kRoundRobin;
+ options.max_bytes_for_level_base = 30 * kKeysPerBuffer * 1024;
+ options.disable_auto_compactions = true;
+ options.max_subcompactions = max_subcompactions_;
+ options.max_background_compactions = 1;
+ options.max_compaction_bytes = 100000000;
+ // Similar experiment setup as above, except that max_subcompactions is
+ // given by max_subcompactions_ (1 or 4), and we fix the additional
+ // resources at (1, 1), so no extra resources can be used.
+ DestroyAndReopen(options);
+ env_->SetBackgroundThreads(1, Env::LOW);
+
+ Random rnd(301);
+ const std::vector<int> files_per_level = {0, 33, 100};
+ for (int lvl = 2; lvl > 0; lvl--) {
+ for (int i = 0; i < files_per_level[lvl]; i++) {
+ for (int j = 0; j < kKeysPerBuffer; j++) {
+ // Add (lvl-1) to ensure a nearly equivalent number of files
+ // in L2 are overlapped with the files selected to compact from
+ // L1
+ ASSERT_OK(Put(Key(2 * i * kKeysPerBuffer + 2 * j + (lvl - 1)),
+ rnd.RandomString(1010)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(lvl);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(files_per_level[lvl], NumTableFilesAtLevel(lvl, 0));
+ }
+
+ // 33 files in L1; 100 files in L2
+ // This variable makes sure the following callback is called
+ // and the assertions in it are indeed executed.
+ bool num_planned_subcompactions_verified = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::GenSubcompactionBoundaries:0", [&](void* arg) {
+ uint64_t num_planned_subcompactions = *(static_cast<uint64_t*>(arg));
+ // At most 4 files are selected for round-robin under auto
+ // compaction. The number of planned subcompactions is restricted by
+ // max_subcompactions since no extra resources can be used
+ ASSERT_EQ(num_planned_subcompactions, options.max_subcompactions);
+ num_planned_subcompactions_verified = true;
+ });
+ // No need to set up a dependency for the pressure token since
+ // AcquireSubcompactionResources may not be called, and it cannot reserve
+ // any additional resources anyway
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBCompactionTest::RoundRobinWithoutAdditionalResources:0",
+ "BackgroundCallCompaction:0"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->WaitForCompact());
+ ASSERT_OK(dbfull()->EnableAutoCompaction({dbfull()->DefaultColumnFamily()}));
+ TEST_SYNC_POINT("DBCompactionTest::RoundRobinWithoutAdditionalResources:0");
+
+ ASSERT_OK(dbfull()->WaitForCompact());
+ ASSERT_TRUE(num_planned_subcompactions_verified);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBCompactionTest, RoundRobinCutOutputAtCompactCursor) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.compression = kNoCompression;
+ options.write_buffer_size = 4 * 1024;
+ options.max_bytes_for_level_base = 64 * 1024;
+ options.max_bytes_for_level_multiplier = 4;
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_pri = CompactionPri::kRoundRobin;
+
+ DestroyAndReopen(options);
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ VersionStorageInfo* storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const InternalKey split_cursor = InternalKey(Key(600), 100, kTypeValue);
+ storage_info->AddCursorForOneLevel(2, split_cursor);
+
+ Random rnd(301);
+
+ for (int i = 0; i < 50; i++) {
+ for (int j = 0; j < 50; j++) {
+ ASSERT_OK(Put(Key(j * 2 + i * 100), rnd.RandomString(102)));
+ }
+ }
+ // Add more overlapping files (to avoid trivial moves) so that a compaction
+ // outputs files in L2. Note that a trivial move does not trigger a
+ // compaction, in which case the cursor is not necessarily a file boundary.
+ for (int i = 0; i < 50; i++) {
+ for (int j = 0; j < 50; j++) {
+ ASSERT_OK(Put(Key(j * 2 + 1 + i * 100), rnd.RandomString(1014)));
+ }
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ const auto icmp = cfd->current()->storage_info()->InternalComparator();
+ // Files in level 2 should be split by the cursor
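+ // (i.e. no output file straddles the cursor: each file either starts at or
+ // after the cursor, or ends strictly before it)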
+ for (const auto& file : level_to_files[2]) {
+ ASSERT_TRUE(
+ icmp->Compare(file.smallest.Encode(), split_cursor.Encode()) >= 0 ||
+ icmp->Compare(file.largest.Encode(), split_cursor.Encode()) < 0);
+ }
+}
+
+class NoopMergeOperator : public MergeOperator {
+ public:
+ NoopMergeOperator() {}
+
+ bool FullMergeV2(const MergeOperationInput& /*merge_in*/,
+ MergeOperationOutput* merge_out) const override {
+ std::string val("bar");
+ merge_out->new_value = val;
+ return true;
+ }
+
+ const char* Name() const override { return "Noop"; }
+};
+
+TEST_F(DBCompactionTest, PartialManualCompaction) {
+ Options opts = CurrentOptions();
+ opts.num_levels = 3;
+ opts.level0_file_num_compaction_trigger = 10;
+ opts.compression = kNoCompression;
+ opts.merge_operator.reset(new NoopMergeOperator());
+ opts.target_file_size_base = 10240;
+ DestroyAndReopen(opts);
+
+ Random rnd(301);
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(Merge("foo", rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ MoveFilesToLevel(2);
+
+ std::string prop;
+ EXPECT_TRUE(dbfull()->GetProperty(DB::Properties::kLiveSstFilesSize, &prop));
+ uint64_t max_compaction_bytes = atoi(prop.c_str()) / 2;
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"max_compaction_bytes", std::to_string(max_compaction_bytes)}}));
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+}
+
+TEST_F(DBCompactionTest, ManualCompactionFailsInReadOnlyMode) {
+  // Regression test for a bug where manual compaction hangs forever when the
+  // DB is in read-only mode. Verify it now at least returns, despite failing.
+ const int kNumL0Files = 4;
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(env_));
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.env = mock_env.get();
+ DestroyAndReopen(opts);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // Make sure files are overlapping in key-range to prevent trivial move.
+ ASSERT_OK(Put("key1", rnd.RandomString(1024)));
+ ASSERT_OK(Put("key2", rnd.RandomString(1024)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(kNumL0Files, NumTableFilesAtLevel(0));
+
+ // Enter read-only mode by failing a write.
+ mock_env->SetFilesystemActive(false);
+  // Make sure this key is outside `CompactRange`'s range so that it doesn't
+  // fail early trying to flush the memtable.
+ ASSERT_NOK(Put("key3", rnd.RandomString(1024)));
+
+ // In the bug scenario, the first manual compaction would fail and forget to
+ // unregister itself, causing the second one to hang forever due to conflict
+ // with a non-running compaction.
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ Slice begin_key("key1");
+ Slice end_key("key2");
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin_key, &end_key));
+
+ // Close before mock_env destruct.
+ Close();
+}
+
+// ManualCompactionBottomLevelOptimized tests the bottommost-level manual
+// compaction optimization, which skips recompacting files created by an
+// Ln-1 to Ln compaction.
+TEST_F(DBCompactionTest, ManualCompactionBottomLevelOptimized) {
+ Options opts = CurrentOptions();
+ opts.num_levels = 3;
+ opts.level0_file_num_compaction_trigger = 5;
+ opts.compression = kNoCompression;
+ opts.merge_operator.reset(new NoopMergeOperator());
+ opts.target_file_size_base = 1024;
+ opts.max_bytes_for_level_multiplier = 2;
+ opts.disable_auto_compactions = true;
+ DestroyAndReopen(opts);
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ InternalStats* internal_stats_ptr = cfd->internal_stats();
+ ASSERT_NE(internal_stats_ptr, nullptr);
+
+ Random rnd(301);
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ MoveFilesToLevel(2);
+
+ for (auto i = 0; i < 8; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("bar" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ int num = comp_stats[2].num_input_files_in_output_level;
+ ASSERT_EQ(num, 0);
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+ const std::vector<InternalStats::CompactionStats>& comp_stats2 =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ num = comp_stats2[2].num_input_files_in_output_level;
+ ASSERT_EQ(num, 0);
+}
+
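+// Verify how max_compaction_bytes and target_file_size_base control the number
+// of compactions a full-range manual compaction is split into.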
+TEST_F(DBCompactionTest, ManualCompactionMax) {
+ uint64_t l1_avg_size = 0, l2_avg_size = 0;
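+  // Helper: fill L2 with ~100 files and L1 with ~10 files, then record the
+  // average compensated file size on each level.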
+ auto generate_sst_func = [&]() {
+ Random rnd(301);
+ for (auto i = 0; i < 100; i++) {
+ for (auto j = 0; j < 10; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+
+ for (auto i = 0; i < 10; i++) {
+ for (auto j = 0; j < 10; j++) {
+ ASSERT_OK(Put(Key(i * 100 + j * 10), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+
+ uint64_t total = 0;
+ for (const auto& file : level_to_files[1]) {
+ total += file.compensated_file_size;
+ }
+ l1_avg_size = total / level_to_files[1].size();
+
+ total = 0;
+ for (const auto& file : level_to_files[2]) {
+ total += file.compensated_file_size;
+ }
+ l2_avg_size = total / level_to_files[2].size();
+ };
+
+ std::atomic_int num_compactions(0);
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkCompaction", [&](void* /*arg*/) { ++num_compactions; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+
+  // With the default max_compaction_bytes (1.6G), the manual compaction should
+  // cover all files in a single compaction.
+ DestroyAndReopen(opts);
+ generate_sst_func();
+ num_compactions.store(0);
+ CompactRangeOptions cro;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_TRUE(num_compactions.load() == 1);
+
+  // Split the compaction into 5 compactions.
+ int num_split = 5;
+ DestroyAndReopen(opts);
+ generate_sst_func();
+ uint64_t total_size = (l1_avg_size * 10) + (l2_avg_size * 100);
+ opts.max_compaction_bytes = total_size / num_split;
+ opts.target_file_size_base = total_size / num_split;
+ Reopen(opts);
+ num_compactions.store(0);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_TRUE(num_compactions.load() == num_split);
+
+  // With a very small max_compaction_bytes, compaction should still make
+  // progress.
+ opts.max_compaction_bytes = l1_avg_size / 2;
+ opts.target_file_size_base = l1_avg_size / 2;
+ DestroyAndReopen(opts);
+ generate_sst_func();
+ num_compactions.store(0);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_TRUE(num_compactions.load() > 10);
+
+ // dynamically set the option
+ num_split = 2;
+ opts.max_compaction_bytes = 0;
+ DestroyAndReopen(opts);
+ generate_sst_func();
+ total_size = (l1_avg_size * 10) + (l2_avg_size * 100);
+ Status s = db_->SetOptions(
+ {{"max_compaction_bytes", std::to_string(total_size / num_split)},
+ {"target_file_size_base", std::to_string(total_size / num_split)}});
+ ASSERT_OK(s);
+
+ num_compactions.store(0);
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_TRUE(num_compactions.load() == num_split);
+}
+
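+// A compaction interrupted by shutdown should return either OK or
+// ShutdownInProgress and must not leave a background error behind.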
+TEST_F(DBCompactionTest, CompactionDuringShutdown) {
+ Options opts = CurrentOptions();
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.disable_auto_compactions = true;
+ DestroyAndReopen(opts);
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ InternalStats* internal_stats_ptr = cfd->internal_stats();
+ ASSERT_NE(internal_stats_ptr, nullptr);
+
+ Random rnd(301);
+ for (auto i = 0; i < 2; ++i) {
+ for (auto j = 0; j < 10; ++j) {
+ ASSERT_OK(
+ Put("foo" + std::to_string(i * 10 + j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun",
+ [&](void* /*arg*/) { dbfull()->shutting_down_.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ Status s = dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_TRUE(s.ok() || s.IsShutdownInProgress());
+ ASSERT_OK(dbfull()->error_handler_.GetBGError());
+}
+
+// FixFileIngestionCompactionDeadlock tests and verifies that compaction and
+// file ingestion do not cause deadlock in the event of write stall triggered
+// by number of L0 files reaching level0_stop_writes_trigger.
+TEST_P(DBCompactionTestWithParam, FixFileIngestionCompactionDeadlock) {
+ const int kNumKeysPerFile = 100;
+ // Generate SST files.
+ Options options = CurrentOptions();
+
+  // Generate an external SST file containing a single key, Key(99).
+ std::string sst_files_dir = dbname_ + "/sst_files/";
+ ASSERT_OK(DestroyDir(env_, sst_files_dir));
+ ASSERT_OK(env_->CreateDir(sst_files_dir));
+ SstFileWriter sst_writer(EnvOptions(), options);
+ const std::string sst_file_path = sst_files_dir + "test.sst";
+ ASSERT_OK(sst_writer.Open(sst_file_path));
+ ASSERT_OK(sst_writer.Put(Key(kNumKeysPerFile - 1), "value"));
+ ASSERT_OK(sst_writer.Finish());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
+ "BackgroundCallCompaction:0"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.level0_file_num_compaction_trigger =
+ options.level0_stop_writes_trigger;
+ options.max_subcompactions = max_subcompactions_;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Generate level0_stop_writes_trigger L0 files to trigger write stop
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ for (int j = 0; j != kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(990)));
+ }
+ if (i > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(NumTableFilesAtLevel(0 /*level*/, 0 /*cf*/), i);
+ }
+ }
+  // When we reach this point, there will be level0_stop_writes_trigger L0
+  // files and one extra key (99) in memory, which overlaps with the external
+  // SST file. A write stall is triggered, and it can be cleared only after
+  // compaction reduces the number of L0 files.
+
+  // Compaction will also be triggered since we have reached the threshold for
+  // auto compaction. Note that the compaction may begin after the following
+  // file ingestion thread starts, and it waits for the ingestion to finish.
+
+  // Thread to ingest a file with a key range overlapping the current
+  // memtable. Consequently, ingestion will trigger a flush. The flush MUST
+  // proceed without waiting for the write stall condition to clear, otherwise
+  // a deadlock can happen.
+ port::Thread ingestion_thr([&]() {
+ IngestExternalFileOptions ifo;
+ Status s = db_->IngestExternalFile({sst_file_path}, ifo);
+ ASSERT_OK(s);
+ });
+
+  // Wait for the ingestion to finish, then for compaction to clear the stall.
+ ingestion_thr.join();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Close();
+}
+
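+// Swap two FileMetaData entries at the CheckConsistency sync point so the
+// second flush hits a corruption, after which further writes must fail.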
+TEST_F(DBCompactionTest, ConsistencyFailTest) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistency0", [&](void* arg) {
+ auto p =
+ reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
+        // Swap the two FileMetaData entries so that we hit an error in the
+        // CheckConsistency function.
+ FileMetaData* temp = *(p->first);
+ *(p->first) = *(p->second);
+ *(p->second) = temp;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int k = 0; k < 2; ++k) {
+ ASSERT_OK(Put("foo", "bar"));
+ Status s = Flush();
+ if (k < 1) {
+ ASSERT_OK(s);
+ } else {
+ ASSERT_TRUE(s.IsCorruption());
+ }
+ }
+
+ ASSERT_NOK(Put("foo", "bar"));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBCompactionTest, ConsistencyFailTest2) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ options.target_file_size_base = 1000;
+ options.level0_file_num_compaction_trigger = 2;
+ BlockBasedTableOptions bbto;
+ bbto.block_size = 400; // small block size
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistency1", [&](void* arg) {
+ auto p =
+ reinterpret_cast<std::pair<FileMetaData**, FileMetaData**>*>(arg);
+        // Swap the two FileMetaData entries so that we hit an error in the
+        // CheckConsistency function.
+ FileMetaData* temp = *(p->first);
+ *(p->first) = *(p->second);
+ *(p->second) = temp;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ std::string value = rnd.RandomString(1000);
+
+ ASSERT_OK(Put("foo1", value));
+ ASSERT_OK(Put("z", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo2", value));
+ ASSERT_OK(Put("z", ""));
+ Status s = Flush();
+ ASSERT_TRUE(s.ok() || s.IsCorruption());
+
+  // This probably returns non-OK, but we rely on the next Put() to determine
+  // that the DB is frozen.
+ ASSERT_NOK(dbfull()->TEST_WaitForCompact());
+ ASSERT_NOK(Put("foo", "bar"));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
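+// Helper: write a single key/value to an external SST file and ingest it into
+// `db`.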
+void IngestOneKeyValue(DBImpl* db, const std::string& key,
+ const std::string& value, const Options& options) {
+ ExternalSstFileInfo info;
+ std::string f = test::PerThreadDBPath("sst_file" + key);
+ EnvOptions env;
+ ROCKSDB_NAMESPACE::SstFileWriter writer(env, options);
+ auto s = writer.Open(f);
+ ASSERT_OK(s);
+ // ASSERT_OK(writer.Put(Key(), ""));
+ ASSERT_OK(writer.Put(key, value));
+
+ ASSERT_OK(writer.Finish(&info));
+ IngestExternalFileOptions ingest_opt;
+
+ ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt));
+}
+
+TEST_P(DBCompactionTestWithParam,
+ FlushAfterIntraL0CompactionCheckConsistencyFail) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::atomic<int> pick_intra_l0_count(0);
+ std::string value(rnd.RandomString(kValueSize));
+
+  // The L0->L1 compaction must be picked before we begin ingesting files to
+  // trigger intra-L0 compaction, and it must not finish until after an
+  // intra-L0 compaction has been picked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTestWithParam::"
+ "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FindIntraL0Compaction",
+ [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Write keys 0..9 with empty values to prevent trivial move.
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(i), ""));
+ }
+ ASSERT_OK(Flush());
+ Compact("", Key(99));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+  // Flush 5 L0 SSTs.
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(Put(Key(i + 1), value));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(5, NumTableFilesAtLevel(0));
+
+  // Put one key so that the smallest sequence number in this memtable is less
+  // than that of the SSTs that will be ingested in the next step.
+ ASSERT_OK(Put(Key(0), "a"));
+
+ ASSERT_EQ(5, NumTableFilesAtLevel(0));
+ TEST_SYNC_POINT(
+ "DBCompactionTestWithParam::"
+ "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready");
+
+  // Ingest 5 L0 SSTs. These files will trigger PickIntraL0Compaction.
+ for (int i = 5; i < 10; i++) {
+ ASSERT_EQ(i, NumTableFilesAtLevel(0));
+ IngestOneKeyValue(dbfull(), Key(i), value, options);
+ }
+
+  // Put one key so that the largest sequence number in this memtable is
+  // bigger than that of the ingested SSTs.
+ ASSERT_OK(Put(Key(2), "b"));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ std::vector<std::vector<FileMetaData>> level_to_files;
+ dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(),
+ &level_to_files);
+ ASSERT_GT(level_to_files[0].size(), 0);
+ ASSERT_GT(pick_intra_l0_count.load(), 0);
+
+ ASSERT_OK(Flush());
+}
+
+TEST_P(DBCompactionTestWithParam,
+ IntraL0CompactionAfterFlushCheckConsistencyFail) {
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = 5;
+ options.max_background_compactions = 2;
+ options.max_subcompactions = max_subcompactions_;
+ options.write_buffer_size = 2 << 20;
+ options.max_write_buffer_number = 6;
+ DestroyAndReopen(options);
+
+ const size_t kValueSize = 1 << 20;
+ Random rnd(301);
+ std::string value(rnd.RandomString(kValueSize));
+ std::string value2(rnd.RandomString(kValueSize));
+ std::string bigvalue = value + value;
+
+  // Write keys 0..9 with empty values to prevent trivial move.
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_OK(Put(Key(i), ""));
+ }
+ ASSERT_OK(Flush());
+ Compact("", Key(99));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ std::atomic<int> pick_intra_l0_count(0);
+  // The L0->L1 compaction must be picked before we begin ingesting files to
+  // trigger intra-L0 compaction, and it must not finish until after an
+  // intra-L0 compaction has been picked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"LevelCompactionPicker::PickCompaction:Return",
+ "DBCompactionTestWithParam::"
+ "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"},
+ {"LevelCompactionPicker::PickCompactionBySize:0",
+ "CompactionJob::Run():Start"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FindIntraL0Compaction",
+ [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Make 6 L0 SSTs, alternating between ingestion and flush.
+ for (int i = 0; i < 6; ++i) {
+ if (i % 2 == 0) {
+ IngestOneKeyValue(dbfull(), Key(i), value, options);
+ } else {
+ ASSERT_OK(Put(Key(i), value));
+ ASSERT_OK(Flush());
+ }
+ }
+
+ ASSERT_EQ(6, NumTableFilesAtLevel(0));
+
+  // Block the HIGH-priority background thread so the flush job cannot run yet.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ test::SleepingBackgroundTask sleeping_tasks;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks,
+ Env::Priority::HIGH);
+ sleeping_tasks.WaitUntilSleeping();
+
+  // Put enough data to make the memtable request a flush.
+ for (int i = 0; i < 6; ++i) {
+ ASSERT_OK(Put(Key(i), bigvalue));
+ }
+
+ ASSERT_EQ(6, NumTableFilesAtLevel(0));
+ TEST_SYNC_POINT(
+ "DBCompactionTestWithParam::"
+ "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready");
+  // Ingest files to trigger intra-L0 compaction.
+ for (int i = 6; i < 10; ++i) {
+ ASSERT_EQ(i, NumTableFilesAtLevel(0));
+ IngestOneKeyValue(dbfull(), Key(i), value2, options);
+ }
+
+ // Wake up flush job
+ sleeping_tasks.WakeUp();
+ sleeping_tasks.WaitUntilDone();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ uint64_t error_count = 0;
+ db_->GetIntProperty("rocksdb.background-errors", &error_count);
+ ASSERT_EQ(error_count, 0);
+ ASSERT_GT(pick_intra_l0_count.load(), 0);
+ for (int i = 0; i < 6; ++i) {
+ ASSERT_EQ(bigvalue, Get(Key(i)));
+ }
+ for (int i = 6; i < 10; ++i) {
+ ASSERT_EQ(value2, Get(Key(i)));
+ }
+}
+
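+// With non-overlapping L0 files, only kForce/kForceOptimized should trigger a
+// real compaction into a single L1 file; other settings result in trivial
+// moves that keep one file per input.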
+TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) {
+ constexpr int kSstNum = 10;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+  // Generate some SST files on level 0 with sequential, non-overlapping keys.
+ for (int i = 0; i < kSstNum; i++) {
+ for (int j = 1; j < UCHAR_MAX; j++) {
+ auto key = std::string(kSstNum, '\0');
+ key[kSstNum - i] += static_cast<char>(j);
+ ASSERT_OK(Put(key, std::string(i % 1000, 'A')));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ ASSERT_EQ(std::to_string(kSstNum), FilesPerLevel(0));
+
+ auto cro = CompactRangeOptions();
+ cro.bottommost_level_compaction = bottommost_level_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ if (bottommost_level_compaction_ == BottommostLevelCompaction::kForce ||
+ bottommost_level_compaction_ ==
+ BottommostLevelCompaction::kForceOptimized) {
+    // A real compaction compacts all SST files from level 0 into one file on
+    // level 1.
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ } else {
+ // Just trivial move from level 0 -> 1
+ ASSERT_EQ("0," + std::to_string(kSstNum), FilesPerLevel(0));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DBCompactionTestWithBottommostParam, DBCompactionTestWithBottommostParam,
+ ::testing::Values(BottommostLevelCompaction::kSkip,
+ BottommostLevelCompaction::kIfHaveCompactionFilter,
+ BottommostLevelCompaction::kForce,
+ BottommostLevelCompaction::kForceOptimized));
+
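+// Changing max_subcompactions via SetDBOptions should take effect for
+// subsequently picked level compactions.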
+TEST_F(DBCompactionTest, UpdateLevelSubCompactionTest) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = 10;
+ options.target_file_size_base = 1 << 10; // 1KB
+ DestroyAndReopen(options);
+
+ bool has_compaction = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->max_subcompactions() == 10);
+ has_compaction = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 10);
+ // Trigger compaction
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(has_compaction);
+
+ has_compaction = false;
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}}));
+ ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->max_subcompactions() == 2);
+ has_compaction = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Trigger compaction
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(has_compaction);
+}
+
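+// Same as above, but for universal compaction.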
+TEST_F(DBCompactionTest, UpdateUniversalSubCompactionTest) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = 10;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.target_file_size_base = 1 << 10; // 1KB
+ DestroyAndReopen(options);
+
+ bool has_compaction = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->max_subcompactions() == 10);
+ has_compaction = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Trigger compaction
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(has_compaction);
+ has_compaction = false;
+
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_subcompactions", "2"}}));
+ ASSERT_TRUE(dbfull()->GetDBOptions().max_subcompactions == 2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(compaction->max_subcompactions() == 2);
+ has_compaction = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Trigger compaction
+ for (int i = 0; i < 32; i++) {
+ for (int j = 0; j < 5000; j++) {
+ ASSERT_OK(Put(std::to_string(j), std::string(1, 'A')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(has_compaction);
+}
+
+TEST_P(ChangeLevelConflictsWithAuto, TestConflict) {
+  // A `CompactRange()` may race with an automatic compaction; we need to make
+  // sure it doesn't corrupt the data.
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  // Run a refit to level 1 while another thread is writing to the same level.
+ SyncPoint::GetInstance()->LoadDependency({
+      // These dependencies ensure the background writes and automatic
+      // compaction happen between the two refit checkpoints.
+ {
+ "DBImpl::CompactRange:BeforeRefit:1",
+ "AutoCompactionFinished1",
+ },
+ {
+ "AutoCompactionFinished2",
+ "DBImpl::CompactRange:BeforeRefit:2",
+ },
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::thread auto_comp([&] {
+ TEST_SYNC_POINT("AutoCompactionFinished1");
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("bar", "v3"));
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ TEST_SYNC_POINT("AutoCompactionFinished2");
+ });
+
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = GetParam() ? 1 : 0;
+ // This should return non-OK, but it's more important for the test to
+ // make sure that the DB is not corrupted.
+ ASSERT_NOK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ auto_comp.join();
+ // Refitting didn't happen.
+ SyncPoint::GetInstance()->DisableProcessing();
+
+  // Write something to the DB to make sure the consistency check didn't fail
+  // and the DB is still readable.
+}
+
+INSTANTIATE_TEST_CASE_P(ChangeLevelConflictsWithAuto,
+ ChangeLevelConflictsWithAuto, testing::Bool());
+
+TEST_F(DBCompactionTest, ChangeLevelCompactRangeConflictsWithManual) {
+ // A `CompactRange()` with `change_level == true` needs to execute its final
+ // step, `ReFitLevel()`, in isolation. Previously there was a bug where
+ // refitting could target the same level as an ongoing manual compaction,
+ // leading to overlapping files in that level.
+ //
+ // This test ensures that case is not possible by verifying any manual
+ // compaction issued during the `ReFitLevel()` phase fails with
+ // `Status::Incomplete`.
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ Reopen(options);
+
+ // Setup an LSM with three levels populated.
+ Random rnd(301);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ GenerateNewFile(&rnd, &key_idx);
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1,1,2", FilesPerLevel(0));
+
+ // The background thread will refit L2->L1 while the
+ // foreground thread will try to simultaneously compact L0->L1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ // The first two dependencies ensure the foreground creates an L0 file
+ // between the background compaction's L0->L1 and its L1->L2.
+ {
+ "DBImpl::RunManualCompaction()::1",
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+ "PutFG",
+ },
+ {
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+ "FlushedFG",
+ "DBImpl::RunManualCompaction()::2",
+ },
+ // The next two dependencies ensure the foreground invokes
+ // `CompactRange()` while the background is refitting. The
+ // foreground's `CompactRange()` is guaranteed to attempt an L0->L1
+ // as we set it up with an empty memtable and a new L0 file.
+ {
+ "DBImpl::CompactRange:PreRefitLevel",
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+ "CompactFG",
+ },
+ {
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+ "CompactedFG",
+ "DBImpl::CompactRange:PostRefitLevel",
+ },
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:PutFG");
+ // Make sure we have something new to compact in the foreground.
+ // Note key 1 is carefully chosen as it ensures the file we create here
+ // overlaps with one of the files being refitted L2->L1 in the background.
+ // If we chose key 0, the file created here would not overlap.
+ ASSERT_OK(Put(Key(1), "val"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:FlushedFG");
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:CompactFG");
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsIncomplete());
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ChangeLevelCompactRangeConflictsWithManual:"
+ "CompactedFG");
+ refit_level_thread.join();
+}
+
+TEST_F(DBCompactionTest, ChangeLevelErrorPathTest) {
+  // This test ensures that RefitLevel() error paths clear internal flags and
+  // that subsequent valid RefitLevel() calls succeed.
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ Reopen(options);
+
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ // Setup an LSM with three levels populated.
+ Random rnd(301);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ("1", FilesPerLevel(0));
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,0,2", FilesPerLevel(0));
+
+ auto start_idx = key_idx;
+ GenerateNewFile(&rnd, &key_idx);
+ GenerateNewFile(&rnd, &key_idx);
+ auto end_idx = key_idx - 1;
+ ASSERT_EQ("1,1,2", FilesPerLevel(0));
+
+  // The next two CompactRange() calls exercise error paths within RefitLevel()
+  // before a valid RefitLevel() call is triggered.
+
+ // Trigger a refit to L1 first
+ {
+ std::string begin_string = Key(start_idx);
+ std::string end_string = Key(end_idx);
+ Slice begin(begin_string);
+ Slice end(end_string);
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(dbfull()->CompactRange(cro, &begin, &end));
+ }
+ ASSERT_EQ("0,3,2", FilesPerLevel(0));
+
+ // Try a refit from L2->L1 - this should fail and exercise error paths in
+ // RefitLevel()
+ {
+    // Select a key range that matches the bottommost level (L2).
+ std::string begin_string = Key(0);
+ std::string end_string = Key(start_idx - 1);
+ Slice begin(begin_string);
+ Slice end(end_string);
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_NOK(dbfull()->CompactRange(cro, &begin, &end));
+ }
+ ASSERT_EQ("0,3,2", FilesPerLevel(0));
+
+  // Try a valid refit request to ensure the path still works.
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,5", FilesPerLevel(0));
+}
+
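+// Write three versions of two keys with blob files disabled, then enable blob
+// files and compact. The result should be one L1 table file referencing one
+// blob file that holds the latest values, and the compaction stats should
+// reflect the blob bytes written.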
+TEST_F(DBCompactionTest, CompactionWithBlob) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char second_key[] = "second_key";
+ constexpr char first_value[] = "first_value";
+ constexpr char second_value[] = "second_value";
+ constexpr char third_value[] = "third_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+ ASSERT_OK(Put(second_key, first_value));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(first_key, second_value));
+ ASSERT_OK(Put(second_key, second_value));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(first_key, third_value));
+ ASSERT_OK(Put(second_key, third_value));
+ ASSERT_OK(Flush());
+
+ options.enable_blob_files = true;
+
+ Reopen(options);
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ ASSERT_EQ(Get(first_key), third_value);
+ ASSERT_EQ(Get(second_key), third_value);
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const auto& l1_files = storage_info->LevelFiles(1);
+ ASSERT_EQ(l1_files.size(), 1);
+
+ const FileMetaData* const table_file = l1_files[0];
+ ASSERT_NE(table_file, nullptr);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 1);
+
+ const auto& blob_file = blob_files.front();
+ ASSERT_NE(blob_file, nullptr);
+
+ ASSERT_EQ(table_file->smallest.user_key(), first_key);
+ ASSERT_EQ(table_file->largest.user_key(), second_key);
+ ASSERT_EQ(table_file->oldest_blob_file_number,
+ blob_file->GetBlobFileNumber());
+
+ ASSERT_EQ(blob_file->GetTotalBlobCount(), 2);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ ASSERT_NE(internal_stats, nullptr);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written, table_file->fd.GetFileSize());
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob,
+ blob_file->GetTotalBlobBytes());
+ ASSERT_EQ(compaction_stats[1].num_output_files, 1);
+ ASSERT_EQ(compaction_stats[1].num_output_files_blob, 1);
+}
+
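+// Parameterized over the sync point at which a blob file write error is
+// injected: while adding a blob record vs. while appending the footer.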
+class DBCompactionTestBlobError
+ : public DBCompactionTest,
+ public testing::WithParamInterface<std::string> {
+ public:
+ DBCompactionTestBlobError() : sync_point_(GetParam()) {}
+
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobError, DBCompactionTestBlobError,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileBuilder::WriteBlobToFile:AddRecord",
+ "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBCompactionTestBlobError, CompactionError) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char second_key[] = "second_key";
+ constexpr char first_value[] = "first_value";
+ constexpr char second_value[] = "second_value";
+ constexpr char third_value[] = "third_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+ ASSERT_OK(Put(second_key, first_value));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(first_key, second_value));
+ ASSERT_OK(Put(second_key, second_value));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(first_key, third_value));
+ ASSERT_OK(Put(second_key, third_value));
+ ASSERT_OK(Flush());
+
+ options.enable_blob_files = true;
+
+ Reopen(options);
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+ Status* const s = static_cast<Status*>(arg);
+ assert(s);
+
+ (*s) = Status::IOError(sync_point_);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_TRUE(db_->CompactRange(CompactRangeOptions(), begin, end).IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const auto& l1_files = storage_info->LevelFiles(1);
+ ASSERT_TRUE(l1_files.empty());
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_TRUE(blob_files.empty());
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ ASSERT_NE(internal_stats, nullptr);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") {
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+ ASSERT_EQ(compaction_stats[1].num_output_files, 0);
+ ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0);
+ } else {
+ // SST file writing succeeded; blob file writing failed (during Finish)
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_GT(compaction_stats[1].bytes_written, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+ ASSERT_EQ(compaction_stats[1].num_output_files, 1);
+ ASSERT_EQ(compaction_stats[1].num_output_files_blob, 0);
+ }
+}
+
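+// Parameterized over the blob GC age cutoff and whether blob files remain
+// enabled during the compaction.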
+class DBCompactionTestBlobGC
+ : public DBCompactionTest,
+ public testing::WithParamInterface<std::tuple<double, bool>> {
+ public:
+ DBCompactionTestBlobGC()
+ : blob_gc_age_cutoff_(std::get<0>(GetParam())),
+ updated_enable_blob_files_(std::get<1>(GetParam())) {}
+
+ double blob_gc_age_cutoff_;
+ bool updated_enable_blob_files_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBCompactionTestBlobGC, DBCompactionTestBlobGC,
+ ::testing::Combine(::testing::Values(0.0, 0.5, 1.0),
+ ::testing::Bool()));
+
+TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGCOverrides) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.blob_file_size = 32; // one blob per file
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0;
+
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 128; i += 2) {
+ ASSERT_OK(Put("key" + std::to_string(i), "value" + std::to_string(i)));
+ ASSERT_OK(
+ Put("key" + std::to_string(i + 1), "value" + std::to_string(i + 1)));
+ ASSERT_OK(Flush());
+ }
+
+ std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
+ ASSERT_EQ(original_blob_files.size(), 128);
+
+ // Note: turning off enable_blob_files before the compaction results in
+ // garbage collected values getting inlined.
+ ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
+
+ CompactRangeOptions cro;
+ cro.blob_garbage_collection_policy = BlobGarbageCollectionPolicy::kForce;
+ cro.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_;
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // Check that the GC stats are correct
+ {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ ASSERT_GE(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+ }
+
+ const size_t cutoff_index = static_cast<size_t>(
+ cro.blob_garbage_collection_age_cutoff * original_blob_files.size());
+ const size_t expected_num_files = original_blob_files.size() - cutoff_index;
+
+ const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
+
+ ASSERT_EQ(new_blob_files.size(), expected_num_files);
+
+  // Original blob files below the cutoff should be gone; original blob files
+  // at or above the cutoff should still be there.
+ for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
+ ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
+ }
+
+ for (size_t i = 0; i < 128; ++i) {
+ ASSERT_EQ(Get("key" + std::to_string(i)), "value" + std::to_string(i));
+ }
+}
+
+TEST_P(DBCompactionTestBlobGC, CompactionWithBlobGC) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.blob_file_size = 32; // one blob per file
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = blob_gc_age_cutoff_;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+ ASSERT_OK(Put(second_key, second_value));
+ ASSERT_OK(Flush());
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ constexpr char fourth_key[] = "fourth_key";
+ constexpr char fourth_value[] = "fourth_value";
+
+ ASSERT_OK(Put(third_key, third_value));
+ ASSERT_OK(Put(fourth_key, fourth_value));
+ ASSERT_OK(Flush());
+
+ const std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
+
+ ASSERT_EQ(original_blob_files.size(), 4);
+
+ const size_t cutoff_index = static_cast<size_t>(
+ options.blob_garbage_collection_age_cutoff * original_blob_files.size());
+
+ // Note: turning off enable_blob_files before the compaction results in
+ // garbage collected values getting inlined.
+ size_t expected_number_of_files = original_blob_files.size();
+
+ if (!updated_enable_blob_files_) {
+ ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
+
+ expected_number_of_files -= cutoff_index;
+ }
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+
+ ASSERT_EQ(Get(first_key), first_value);
+ ASSERT_EQ(Get(second_key), second_value);
+ ASSERT_EQ(Get(third_key), third_value);
+ ASSERT_EQ(Get(fourth_key), fourth_value);
+
+ const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
+
+ ASSERT_EQ(new_blob_files.size(), expected_number_of_files);
+
+  // Original blob files below the cutoff should be gone; original blob files
+  // at or above the cutoff should still be there.
+ for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
+ ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
+ }
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_GE(compaction_stats.size(), 2);
+
+ if (blob_gc_age_cutoff_ > 0.0) {
+ ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
+
+ if (updated_enable_blob_files_) {
+ // GC relocated some blobs to new blob files
+ ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob,
+ compaction_stats[1].bytes_written_blob);
+ } else {
+ // GC moved some blobs back to the LSM, no new blob files
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+ }
+ } else {
+ ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
+ ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
+ }
+}
+
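+// Tampering with a blob index during GC should surface as a Corruption status
+// from the compaction.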
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_CorruptIndex) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+ ASSERT_OK(Put(second_key, second_value));
+
+ ASSERT_OK(Flush());
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ ASSERT_OK(Put(third_key, third_value));
+
+ constexpr char fourth_key[] = "fourth_key";
+ constexpr char fourth_value[] = "fourth_value";
+ ASSERT_OK(Put(fourth_key, fourth_value));
+
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::GarbageCollectBlobIfNeeded::TamperWithBlobIndex",
+ [](void* arg) {
+ Slice* const blob_index = static_cast<Slice*>(arg);
+ assert(blob_index);
+ assert(!blob_index->empty());
+ blob_index->remove_prefix(1);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_TRUE(
+ db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
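+// A faked inlined TTL blob index encountered during GC should cause the
+// compaction to fail with Corruption.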
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_InlinedTTLIndex) {
+ constexpr uint64_t min_blob_size = 10;
+
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+ ASSERT_OK(Put(second_key, second_value));
+
+ ASSERT_OK(Flush());
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ ASSERT_OK(Put(third_key, third_value));
+
+ constexpr char fourth_key[] = "fourth_key";
+ constexpr char blob[] = "short";
+  static_assert(sizeof(blob) - 1 < min_blob_size,
+                "Blob too long to be inlined");
+
+ // Fake an inlined TTL blob index.
+ std::string blob_index;
+
+ constexpr uint64_t expiration = 1234567890;
+
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, blob);
+
+ WriteBatch batch;
+ ASSERT_OK(
+ WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_TRUE(
+ db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+}
+
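+// A blob index referencing a nonexistent blob file should cause the GC
+// compaction to fail with Corruption.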
+TEST_F(DBCompactionTest, CompactionWithBlobGCError_IndexWithInvalidFileNumber) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 1.0;
+
+ Reopen(options);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ ASSERT_OK(Put(first_key, first_value));
+
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+ ASSERT_OK(Put(second_key, second_value));
+
+ ASSERT_OK(Flush());
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ ASSERT_OK(Put(third_key, third_value));
+
+ constexpr char fourth_key[] = "fourth_key";
+
+ // Fake a blob index referencing a non-existent blob file.
+ std::string blob_index;
+
+ constexpr uint64_t blob_file_number = 1000;
+ constexpr uint64_t offset = 1234;
+ constexpr uint64_t size = 5678;
+
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+
+ WriteBatch batch;
+ ASSERT_OK(
+ WriteBatchInternal::PutBlobIndex(&batch, 0, fourth_key, blob_index));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_TRUE(
+ db_->CompactRange(CompactRangeOptions(), begin, end).IsCorruption());
+}
+
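+// Checksum handoff for table files: a checksum mismatch or a corrupted write
+// during compaction is treated as an unrecoverable error, while a file system
+// without handoff support is simply ignored.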
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoff1) {
+ if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires a non-mem, non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ options.env = fault_fs_env.get();
+ options.create_if_missing = true;
+ options.checksum_handoff_file_types.Add(FileType::kTableFile);
+ Status s;
+ Reopen(options);
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+ Destroy(options);
+ Reopen(options);
+
+  // The hash does not match (the checksum type is switched to kxxHash in the
+  // BackgroundCallCompaction callback below), so the compaction write fails.
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+ Reopen(options);
+
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+
+  // Each write will be simulated as corrupted.
+  // Since the file system returns IOStatus::Corruption, it is an
+  // unrecoverable error.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0",
+ [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoff2) {
+ if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires a non-mem, non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ options.env = fault_fs_env.get();
+ options.create_if_missing = true;
+ Status s;
+ Reopen(options);
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+ Destroy(options);
+ Reopen(options);
+
+  // checksum_handoff_file_types is not set, so checksum handoff will not be
+  // triggered.
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+ Reopen(options);
+
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+
+  // checksum_handoff_file_types is not set, so checksum handoff will not be
+  // triggered.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0",
+ [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest1) {
+ if (mem_env_ || encrypted_env_) {
+    ROCKSDB_GTEST_SKIP("Test requires a non-mem, non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ options.env = fault_fs_env.get();
+ options.create_if_missing = true;
+ options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+ Status s;
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ Reopen(options);
+
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+ Destroy(options);
+ Reopen(options);
+
+  // The hash does not match (the checksum type is switched to kxxHash in the
+  // BackgroundCallCompaction callback below), so the compaction write fails.
+  // Since the file system returns IOStatus::Corruption, it is mapped to a
+  // kFatalError.
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, CompactionWithChecksumHandoffManifest2) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 3;
+ options.env = fault_fs_env.get();
+ options.create_if_missing = true;
+ options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+ Status s;
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ Reopen(options);
+
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s, Status::OK());
+
+  // Each write will be simulated as corrupted.
+  // Since the file system returns IOStatus::Corruption, it is mapped to a
+  // kFatalError.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put(Key(0), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0",
+ [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(Key(1), "value3"));
+ s = Flush();
+ ASSERT_EQ(s, Status::OK());
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, FIFOWarm) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleFIFO;
+ options.num_levels = 1;
+ options.max_open_files = -1;
+ options.level0_file_num_compaction_trigger = 2;
+ options.create_if_missing = true;
+ CompactionOptionsFIFO fifo_options;
+ fifo_options.age_for_warm = 1000;
+ fifo_options.max_table_files_size = 100000000;
+ options.compaction_options_fifo = fifo_options;
+ env_->SetMockSleep();
+ Reopen(options);
+
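+  // Count how many newly created files are opened with the warm temperature,
+  // via the FileOptions.temperature sync point callback.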
+ int total_warm = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "NewWritableFile::FileOptions.temperature", [&](void* arg) {
+ Temperature temperature = *(static_cast<Temperature*>(arg));
+ if (temperature == Temperature::kWarm) {
+ total_warm++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Write files with mocked time advancing between them so FIFO compaction
+  // can move the older files to the warm tier based on age_for_warm.
+ ASSERT_OK(Put(Key(0), "value1"));
+ env_->MockSleepForSeconds(800);
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(0), "value1"));
+ env_->MockSleepForSeconds(800);
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(0), "value1"));
+ env_->MockSleepForSeconds(800);
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_OK(Put(Key(0), "value1"));
+ env_->MockSleepForSeconds(800);
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
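+  // The two newer files should still have the default (unknown) temperature,
+  // while the two older files should have been moved to warm after aging past
+  // age_for_warm.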
+ ColumnFamilyMetaData metadata;
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(4, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[1].temperature);
+ ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[2].temperature);
+ ASSERT_EQ(Temperature::kWarm, metadata.levels[0].files[3].temperature);
+ ASSERT_EQ(2, total_warm);
+
+ Destroy(options);
+}
+
+TEST_F(DBCompactionTest, DisableMultiManualCompaction) {
+ const int kNumL0Files = 10;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+  // Generate 2 levels of files to make sure the manual compaction is not
+  // skipped.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), "value"));
+ if (i % 2) {
+ ASSERT_OK(Flush());
+ }
+ }
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), "value"));
+ if (i % 2) {
+ ASSERT_OK(Flush());
+ }
+ }
+ MoveFilesToLevel(1);
+
+ // Block compaction queue
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ port::Thread compact_thread1([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ std::string begin_str = Key(0);
+ std::string end_str = Key(3);
+ Slice b = begin_str;
+ Slice e = end_str;
+ auto s = db_->CompactRange(cro, &b, &e);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+ port::Thread compact_thread2([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ std::string begin_str = Key(4);
+ std::string end_str = Key(7);
+ Slice b = begin_str;
+ Slice e = end_str;
+ auto s = db_->CompactRange(cro, &b, &e);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+  // Disabling manual compaction should cancel both manual compactions, and
+  // both compactions should return Incomplete.
+ db_->DisableManualCompaction();
+
+ compact_thread1.join();
+ compact_thread2.join();
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+}
+
+TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) {
+ const int kNumL0Files = 4;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+  // Generate files, but avoid triggering auto compaction.
+ for (int i = 0; i < kNumL0Files / 2; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+  // Make sure the manual compaction background job has started but has not
+  // yet set its status to in_progress, then cancel the manual compaction,
+  // which should not result in a segfault.
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction",
+ "DBCompactionTest::DisableJustStartedManualCompaction:"
+ "PreDisableManualCompaction"},
+ {"DBImpl::RunManualCompaction:Unscheduled",
+ "BackgroundCallCompaction:0"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread compact_thread([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ auto s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+ TEST_SYNC_POINT(
+ "DBCompactionTest::DisableJustStartedManualCompaction:"
+ "PreDisableManualCompaction");
+ db_->DisableManualCompaction();
+
+ compact_thread.join();
+}
+
+TEST_F(DBCompactionTest, DisableInProgressManualCompaction) {
+ const int kNumL0Files = 4;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCompaction:InProgress",
+ "DBCompactionTest::DisableInProgressManualCompaction:"
+ "PreDisableManualCompaction"},
+ {"DBImpl::RunManualCompaction:Unscheduled",
+ "CompactionJob::Run():Start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+  // Generate files, but avoid triggering auto compaction.
+ for (int i = 0; i < kNumL0Files / 2; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+ port::Thread compact_thread([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ auto s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::DisableInProgressManualCompaction:"
+ "PreDisableManualCompaction");
+ db_->DisableManualCompaction();
+
+ compact_thread.join();
+}
+
+TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) {
+ const int kNumL0Files = 4;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::RunManualCompaction:Scheduled",
+ "DBCompactionTest::DisableManualCompactionThreadQueueFull:"
+ "PreDisableManualCompaction"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+ // Block compaction queue
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+  // Generate files, but avoid triggering auto compaction.
+ for (int i = 0; i < kNumL0Files / 2; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+ port::Thread compact_thread([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ auto s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::DisableManualCompactionThreadQueueFull:"
+ "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction, which is scheduled after
+  // the manual compaction. We have to generate 4 more files because the
+  // existing files are already pending compaction.
+ for (int i = 0; i < kNumL0Files; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+ db_->DisableManualCompaction();
+
+ // CompactRange should return before the compaction has the chance to run
+ compact_thread.join();
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+}
+
+TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) {
+ const int kNumL0Files = 4;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::RunManualCompaction:Scheduled",
+ "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+ "PreDisableManualCompaction"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+ // Block compaction queue
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+  // Generate files, but avoid triggering auto compaction.
+ for (int i = 0; i < kNumL0Files / 2; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+ port::Thread compact_thread([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ auto s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+ "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction, which is scheduled after
+  // the manual compaction. We have to generate 4 more files because the
+  // existing files are already pending compaction.
+ for (int i = 0; i < kNumL0Files; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+ db_->DisableManualCompaction();
+
+ // CompactRange should return before the compaction has the chance to run
+ compact_thread.join();
+
+  // Try closing the DB while the manual compaction is canceled but still in
+  // the queue, and an auto-triggered compaction is also in the queue.
+ auto s = db_->Close();
+ ASSERT_OK(s);
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBCompactionTest, DBCloseWithManualCompaction) {
+ const int kNumL0Files = 4;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::RunManualCompaction:Scheduled",
+ "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+ "PreDisableManualCompaction"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+ // Block compaction queue
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+  // Generate files, but avoid triggering auto compaction.
+ for (int i = 0; i < kNumL0Files / 2; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+ port::Thread compact_thread([&]() {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ auto s = db_->CompactRange(cro, nullptr, nullptr);
+ ASSERT_TRUE(s.IsIncomplete());
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:"
+ "PreDisableManualCompaction");
+
+  // Generate more files to trigger auto compaction, which is scheduled after
+  // the manual compaction. We have to generate 4 more files because the
+  // existing files are already pending compaction.
+ for (int i = 0; i < kNumL0Files; i++) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(std::to_string(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0));
+
+  // Close the DB with the manual compaction and an auto-triggered compaction
+  // in the queue.
+ auto s = db_->Close();
+ ASSERT_OK(s);
+
+  // The manual compaction thread should return with Incomplete().
+ compact_thread.join();
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBCompactionTest,
+ DisableManualCompactionDoesNotWaitForDrainingAutomaticCompaction) {
+ // When `CompactRangeOptions::exclusive_manual_compaction == true`, we wait
+ // for automatic compactions to drain before starting the manual compaction.
+ // This test verifies `DisableManualCompaction()` can cancel such a compaction
+ // without waiting for the drain to complete.
+ const int kNumL0Files = 4;
+
+  // Enforce that the manual compaction enters the wait loop due to a pending
+  // automatic compaction.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction", "DBImpl::RunManualCompaction:NotScheduled"},
+ {"DBImpl::RunManualCompaction:WaitScheduled",
+ "BackgroundCallCompaction:0"}});
+  // The automatic compaction will cancel the waiting manual compaction.
+  // Completing this callback implies the cancellation did not wait for
+  // automatic compactions to finish.
+ bool callback_completed = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void* /*arg*/) {
+ db_->DisableManualCompaction();
+ callback_completed = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ Reopen(options);
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put(Key(1), "value1"));
+ ASSERT_OK(Put(Key(2), "value2"));
+ ASSERT_OK(Flush());
+ }
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = true;
+ ASSERT_TRUE(db_->CompactRange(cro, nullptr, nullptr).IsIncomplete());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(callback_completed);
+}
+
+TEST_F(DBCompactionTest, ChangeLevelConflictsWithManual) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ Reopen(options);
+
+ // Setup an LSM with L2 populated.
+ Random rnd(301);
+ ASSERT_OK(Put(Key(0), rnd.RandomString(990)));
+ ASSERT_OK(Put(Key(1), rnd.RandomString(990)));
+ {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ }
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+  // The background thread will refit L2->L1 while the foreground thread
+  // attempts to run a compaction on new data. The following dependencies
+  // ensure the background manual compaction's refitting phase disables manual
+  // compaction immediately before the foreground manual compaction can
+  // register itself. Manual compaction is kept disabled until the foreground
+  // manual compaction checks for the failure once.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ // Only do Put()s for foreground CompactRange() once the background
+ // CompactRange() has reached the refitting phase.
+ {
+ "DBImpl::CompactRange:BeforeRefit:1",
+ "DBCompactionTest::ChangeLevelConflictsWithManual:"
+ "PreForegroundCompactRange",
+ },
+ // Right before we register the manual compaction, proceed with
+ // the refitting phase so manual compactions are disabled. Stay in
+ // the refitting phase with manual compactions disabled until it is
+ // noticed.
+ {
+ "DBImpl::RunManualCompaction:0",
+ "DBImpl::CompactRange:BeforeRefit:2",
+ },
+ {
+ "DBImpl::CompactRange:PreRefitLevel",
+ "DBImpl::RunManualCompaction:1",
+ },
+ {
+ "DBImpl::RunManualCompaction:PausedAtStart",
+ "DBImpl::CompactRange:PostRefitLevel",
+ },
+      // If a compaction somehow were scheduled, let it run after re-enabling
+      // manual compactions. This dependency is not expected to be hit, but is
+      // here to speculatively surface future bugs.
+ {
+ "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled",
+ "BackgroundCallCompaction:0",
+ },
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread refit_level_thread([&] {
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::ChangeLevelConflictsWithManual:"
+ "PreForegroundCompactRange");
+ ASSERT_OK(Put(Key(0), rnd.RandomString(990)));
+ ASSERT_OK(Put(Key(1), rnd.RandomString(990)));
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsIncomplete());
+
+ refit_level_thread.join();
+}
+
+TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) {
+ // Flushes several files to trigger compaction while lock is released during
+ // a bottom-pri compaction. Verifies it does not get scheduled to thread pool
+ // because per-DB limit for compaction parallelism is one (default).
+ const int kNumL0Files = 4;
+ const int kNumLevels = 3;
+
+ env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // Set up the last level to be non-empty, since it's a bit unclear whether
+  // compaction to an empty level would be considered "bottommost".
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(kNumLevels - 1);
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkBottomCompaction",
+ "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+ "PreTriggerCompaction"},
+ {"DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+ "PostTriggerCompaction",
+ "BackgroundCallCompaction:0"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread compact_range_thread([&] {
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+ });
+
+ // Sleep in the low-pri thread so any newly scheduled compaction will be
+ // queued. Otherwise it might finish before we check its existence.
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ TEST_SYNC_POINT(
+ "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+ "PreTriggerCompaction");
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ }
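+  // Even though L0 has reached the compaction trigger, nothing should be
+  // queued in the LOW pool: the bottom-pri compaction already occupies the
+  // per-DB compaction concurrency limit of one.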
+ ASSERT_EQ(0u, env_->GetThreadPoolQueueLen(Env::Priority::LOW));
+ TEST_SYNC_POINT(
+ "DBCompactionTest::BottomPriCompactionCountsTowardConcurrencyLimit:"
+ "PostTriggerCompaction");
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ compact_range_thread.join();
+}
+
+TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) {
+ // allow_ingest_behind prevents seqnum zeroing, and could cause
+ // compaction loop with reason kBottommostFiles.
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.compaction_style = kCompactionStyleLevel;
+ options.allow_ingest_behind = true;
+ options.comparator = BytewiseComparator();
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ ASSERT_OK(db_->Put(write_opts, "infinite", "compaction loop"));
+ ASSERT_OK(db_->Put(write_opts, "infinite", "loop"));
+
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(db_->Put(write_opts, "bumpseqnum", ""));
+ ASSERT_OK(Flush());
+ auto snapshot = db_->GetSnapshot();
+ // Bump up oldest_snapshot_seqnum_ in VersionStorageInfo.
+ db_->ReleaseSnapshot(snapshot);
+ bool compacted = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* /* arg */) {
+ // There should not be a compaction.
+ compacted = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ // Wait for compaction to be scheduled.
+ env_->SleepForMicroseconds(2000000);
+ ASSERT_FALSE(compacted);
+ // The following assert can be used to check for compaction loop:
+ // it used to wait forever before the fix.
+ // ASSERT_OK(dbfull()->TEST_WaitForCompact(true /* wait_unscheduled */));
+}
+
+#endif // !defined(ROCKSDB_LITE)
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_dynamic_level_test.cc b/src/rocksdb/db/db_dynamic_level_test.cc
new file mode 100644
index 000000000..17fa67cb2
--- /dev/null
+++ b/src/rocksdb/db/db_dynamic_level_test.cc
@@ -0,0 +1,507 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// The introduction of SyncPoint effectively disabled building and running
+// this test in Release builds, which is a pity because it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/env.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBTestDynamicLevel : public DBTestBase {
+ public:
+ DBTestDynamicLevel()
+ : DBTestBase("db_dynamic_level_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase) {
+ if (!Snappy_Supported() || !LZ4_Supported()) {
+ return;
+ }
+ // Use InMemoryEnv, or it would be too slow.
+ std::unique_ptr<Env> env(NewMemEnv(env_));
+
+ const int kNKeys = 1000;
+ int keys[kNKeys];
+
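+  // The first 10% of the middle key range is deleted below; everything else
+  // written by the test must remain readable.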
+ auto verify_func = [&]() {
+ for (int i = 0; i < kNKeys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ ASSERT_NE("NOT_FOUND", Get(Key(kNKeys * 2 + i)));
+ if (i < kNKeys / 10) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+ } else {
+ ASSERT_NE("NOT_FOUND", Get(Key(kNKeys + keys[i])));
+ }
+ }
+ };
+
+ Random rnd(301);
+ for (int ordered_insert = 0; ordered_insert <= 1; ordered_insert++) {
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ if (ordered_insert == 0) {
+ RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
+ }
+ for (int max_background_compactions = 1; max_background_compactions < 4;
+ max_background_compactions += 2) {
+ Options options;
+ options.env = env.get();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = max_background_compactions;
+ options.num_levels = 5;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kLZ4Compression;
+ options.compression_per_level[2] = kSnappyCompression;
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNKeys; i++) {
+ int key = keys[i];
+ ASSERT_OK(Put(Key(kNKeys + key), rnd.RandomString(102)));
+ ASSERT_OK(Put(Key(key), rnd.RandomString(102)));
+ ASSERT_OK(Put(Key(kNKeys * 2 + key), rnd.RandomString(102)));
+ ASSERT_OK(Delete(Key(kNKeys + keys[i / 10])));
+ env_->SleepForMicroseconds(5000);
+ }
+
+ uint64_t int_prop;
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.background-errors", &int_prop));
+ ASSERT_EQ(0U, int_prop);
+
+ // Verify DB
+ for (int j = 0; j < 2; j++) {
+ verify_func();
+ if (j == 0) {
+ Reopen(options);
+ }
+ }
+
+ // Test compact range works
+ ASSERT_OK(
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // All data should be in the last level.
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ ASSERT_EQ(5U, cf_meta.levels.size());
+ for (int i = 0; i < 4; i++) {
+ ASSERT_EQ(0U, cf_meta.levels[i].files.size());
+ }
+ ASSERT_GT(cf_meta.levels[4U].files.size(), 0U);
+ verify_func();
+
+ Close();
+ }
+ }
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBase2) {
+ Random rnd(301);
+ int kMaxKey = 1000000;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.write_buffer_size = 20480;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 9999;
+ options.level0_stop_writes_trigger = 9999;
+ options.target_file_size_base = 9102;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 40960;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 2;
+ options.num_levels = 5;
+ options.max_compaction_bytes = 0; // Force not expanding in compactions
+ options.db_host_id = ""; // Setting this messes up the file size calculation
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+
+ uint64_t int_prop;
+ std::string str_prop;
+
+ // Initial base level is the last level
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+ // Put about 28K to L0
+ for (int i = 0; i < 70; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ rnd.RandomString(380)));
+ }
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+ // Insert extra about 28K to L0. After they are compacted to L4, the base
+ // level should be changed to L3.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ for (int i = 0; i < 70; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ rnd.RandomString(380)));
+ }
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+ ASSERT_EQ("0", str_prop);
+
+ // Write even more data while leaving the base level at L3.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ // Write about 40K more
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ rnd.RandomString(380)));
+ }
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+
+ // Fill up L0, and then run an (auto) L0->Lmax compaction to raise the base
+ // level to 2.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ // Write about 650K more.
+ // Each file is about 11KB, with 9KB of data.
+ for (int i = 0; i < 1300; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ rnd.RandomString(380)));
+ }
+
+ // Make sure that the compaction starts before the last bit of data is
+ // flushed, so that the base level isn't raised to L1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:0"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:0");
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(2U, int_prop);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Write more data until the base level changes to L1. There will be
+ // a manual compaction going on at the same time.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"CompactionJob::Run():Start", "DynamicLevelMaxBytesBase2:1"},
+ {"DynamicLevelMaxBytesBase2:2", "CompactionJob::Run():End"},
+ {"DynamicLevelMaxBytesBase2:compact_range_finish",
+ "FlushJob::WriteLevel0Table"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread([this] {
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_start");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:compact_range_finish");
+ });
+
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:1");
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))),
+ rnd.RandomString(380)));
+ }
+ TEST_SYNC_POINT("DynamicLevelMaxBytesBase2:2");
+
+ ASSERT_OK(Flush());
+
+ thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(1U, int_prop);
+}
+
+// Test specific cases in dynamic max bytes
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesCompactRange) {
+ Random rnd(301);
+ int kMaxKey = 1000000;
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 9999;
+ options.level0_stop_writes_trigger = 9999;
+ options.target_file_size_base = 2;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 1;
+ const int kNumLevels = 5;
+ options.num_levels = kNumLevels;
+ options.max_compaction_bytes = 1; // Force not expanding in compactions
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Compact against empty DB
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ uint64_t int_prop;
+ std::string str_prop;
+
+ // Initial base level is the last level
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(4U, int_prop);
+
+ // Put about 7K to L0
+ for (int i = 0; i < 140; i++) {
+ ASSERT_OK(
+ Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))), rnd.RandomString(80)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (NumTableFilesAtLevel(0) == 0) {
+ // Make sure level 0 is not empty
+ ASSERT_OK(
+ Put(Key(static_cast<int>(rnd.Uniform(kMaxKey))), rnd.RandomString(80)));
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level1", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level2", &str_prop));
+ ASSERT_EQ("0", str_prop);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
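+  // Track which output levels the manual CompactRange picks; it is expected
+  // to compact into both the base level (L3) and the last level (L4).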
+ std::set<int> output_levels;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionPicker::CompactRange:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ output_levels.insert(compaction->output_level());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(output_levels.size(), 2);
+ ASSERT_TRUE(output_levels.find(3) != output_levels.end());
+ ASSERT_TRUE(output_levels.find(4) != output_levels.end());
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level3", &str_prop));
+ ASSERT_EQ("0", str_prop);
+ // Base level is still level 3.
+ ASSERT_TRUE(db_->GetIntProperty("rocksdb.base-level", &int_prop));
+ ASSERT_EQ(3U, int_prop);
+}
+
+TEST_F(DBTestDynamicLevel, DynamicLevelMaxBytesBaseInc) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 2;
+ options.num_levels = 5;
+ options.max_compaction_bytes = 100000000;
+
+ DestroyAndReopen(options);
+
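+  // Count non-trivial compactions. With a large max_compaction_bytes, all of
+  // the compactions in this test are expected to be trivial moves.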
+ int non_trivial = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial",
+ [&](void* /*arg*/) { non_trivial++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ const int total_keys = 3000;
+ const int random_part_size = 100;
+ for (int i = 0; i < total_keys; i++) {
+ std::string value = rnd.RandomString(random_part_size);
+ PutFixed32(&value, static_cast<uint32_t>(i));
+ ASSERT_OK(Put(Key(i), value));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_EQ(non_trivial, 0);
+
+ for (int i = 0; i < total_keys; i++) {
+ std::string value = Get(Key(i));
+ ASSERT_EQ(DecodeFixed32(value.c_str() + random_part_size),
+ static_cast<uint32_t>(i));
+ }
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+}
+
+TEST_F(DBTestDynamicLevel, DISABLED_MigrateToDynamicLevelMaxBytesBase) {
+ Random rnd(301);
+ const int kMaxKey = 2000;
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 2048;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 8;
+ options.target_file_size_base = 2048;
+ options.level_compaction_dynamic_level_bytes = false;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.num_levels = 8;
+
+ DestroyAndReopen(options);
+
+ auto verify_func = [&](int num_keys, bool if_sleep) {
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_NE("NOT_FOUND", Get(Key(kMaxKey + i)));
+ if (i < num_keys / 10) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(i)));
+ } else {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+ if (if_sleep && i % 1000 == 0) {
+ // Without it, valgrind may choose not to give another
+ // thread a chance to run before finishing the function,
+ // causing the test to be extremely slow.
+ env_->SleepForMicroseconds(1);
+ }
+ }
+ };
+
+ int total_keys = 1000;
+ for (int i = 0; i < total_keys; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+ ASSERT_OK(Delete(Key(i / 10)));
+ }
+ verify_func(total_keys, false);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ options.level_compaction_dynamic_level_bytes = true;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ verify_func(total_keys, false);
+
+ std::atomic_bool compaction_finished;
+ compaction_finished = false;
+  // Issue a manual compaction in one thread and still verify the DB state
+  // in the main thread.
+ ROCKSDB_NAMESPACE::port::Thread t([&]() {
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = options.num_levels - 1;
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ compaction_finished.store(true);
+ });
+ do {
+ verify_func(total_keys, true);
+ } while (!compaction_finished.load());
+ t.join();
+
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+
+ int total_keys2 = 2000;
+ for (int i = total_keys; i < total_keys2; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+ ASSERT_OK(Delete(Key(i / 10)));
+ }
+
+ verify_func(total_keys2, false);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ verify_func(total_keys2, false);
+
+ // Base level is not level 1
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_encryption_test.cc b/src/rocksdb/db/db_encryption_test.cc
new file mode 100644
index 000000000..73e89d158
--- /dev/null
+++ b/src/rocksdb/db/db_encryption_test.cc
@@ -0,0 +1,130 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include <iostream>
+#include <string>
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBEncryptionTest : public DBTestBase {
+ public:
+ DBEncryptionTest()
+ : DBTestBase("db_encryption_test", /*env_do_fsync=*/true) {}
+ Env* GetTargetEnv() {
+ if (encrypted_env_ != nullptr) {
+ return (static_cast<EnvWrapper*>(encrypted_env_))->target();
+ } else {
+ return env_;
+ }
+ }
+};
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBEncryptionTest, CheckEncrypted) {
+ ASSERT_OK(Put("foo567", "v1.fetdq"));
+ ASSERT_OK(Put("bar123", "v2.dfgkjdfghsd"));
+ Close();
+
+ // Open all files and look for the values we've put in there.
+ // They should not be found if encrypted, otherwise
+ // they should be found.
+ std::vector<std::string> fileNames;
+ auto status = env_->GetChildren(dbname_, &fileNames);
+ ASSERT_OK(status);
+
+ Env* target = GetTargetEnv();
+ int hits = 0;
+ for (auto it = fileNames.begin(); it != fileNames.end(); ++it) {
+ if (*it == "LOCK") {
+ continue;
+ }
+ auto filePath = dbname_ + "/" + *it;
+ std::unique_ptr<SequentialFile> seqFile;
+ auto envOptions = EnvOptions(CurrentOptions());
+ status = target->NewSequentialFile(filePath, &seqFile, envOptions);
+ ASSERT_OK(status);
+
+ uint64_t fileSize;
+ status = target->GetFileSize(filePath, &fileSize);
+ ASSERT_OK(status);
+
+ std::string scratch;
+ scratch.reserve(fileSize);
+ Slice data;
+ status = seqFile->Read(fileSize, &data, (char*)scratch.data());
+ ASSERT_OK(status);
+
+ if (data.ToString().find("foo567") != std::string::npos) {
+ hits++;
+ // std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("v1.fetdq") != std::string::npos) {
+ hits++;
+ // std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("bar123") != std::string::npos) {
+ hits++;
+ // std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("v2.dfgkjdfghsd") != std::string::npos) {
+ hits++;
+ // std::cout << "Hit in " << filePath << "\n";
+ }
+ if (data.ToString().find("dfgk") != std::string::npos) {
+ hits++;
+ // std::cout << "Hit in " << filePath << "\n";
+ }
+ }
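+  // With an encrypted env none of the plaintext keys or values should appear
+  // on disk; without encryption we expect to find at least four of them.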
+ if (encrypted_env_) {
+ ASSERT_EQ(hits, 0);
+ } else {
+ ASSERT_GE(hits, 4);
+ }
+}
+
+TEST_F(DBEncryptionTest, ReadEmptyFile) {
+ auto defaultEnv = GetTargetEnv();
+
+  // Create an empty file to read back later.
+ auto envOptions = EnvOptions(CurrentOptions());
+ auto filePath = dbname_ + "/empty.empty";
+
+ Status status;
+ {
+ std::unique_ptr<WritableFile> writableFile;
+ status = defaultEnv->NewWritableFile(filePath, &writableFile, envOptions);
+ ASSERT_OK(status);
+ }
+
+ std::unique_ptr<SequentialFile> seqFile;
+ status = defaultEnv->NewSequentialFile(filePath, &seqFile, envOptions);
+ ASSERT_OK(status);
+
+ std::string scratch;
+ Slice data;
+  // Reading back 16 bytes from the empty file shouldn't trigger an assertion;
+  // it should just work and return an empty slice.
+ status = seqFile->Read(16, &data, (char*)scratch.data());
+ ASSERT_OK(status);
+
+ ASSERT_TRUE(data.empty());
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_filesnapshot.cc b/src/rocksdb/db/db_filesnapshot.cc
new file mode 100644
index 000000000..aa9bd738a
--- /dev/null
+++ b/src/rocksdb/db/db_filesnapshot.cc
@@ -0,0 +1,442 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/types.h"
+#include "test_util/sync_point.h"
+#include "util/file_checksum_helper.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status DBImpl::FlushForGetLiveFiles() {
+ mutex_.AssertHeld();
+
+ // flush all dirty data to disk.
+ Status status;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ status =
+ AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kGetLiveFiles);
+ if (status.IsColumnFamilyDropped()) {
+ status = Status::OK();
+ }
+ mutex_.Lock();
+ } else {
+ for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ mutex_.Unlock();
+ status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles);
+ TEST_SYNC_POINT("DBImpl::GetLiveFiles:1");
+ TEST_SYNC_POINT("DBImpl::GetLiveFiles:2");
+ mutex_.Lock();
+ if (!status.ok() && !status.IsColumnFamilyDropped()) {
+ break;
+ } else if (status.IsColumnFamilyDropped()) {
+ status = Status::OK();
+ }
+ }
+ }
+ return status;
+}
+
+Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size, bool flush_memtable) {
+ *manifest_file_size = 0;
+
+ mutex_.Lock();
+
+ if (flush_memtable) {
+ Status status = FlushForGetLiveFiles();
+ if (!status.ok()) {
+ mutex_.Unlock();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
+ status.ToString().c_str());
+ return status;
+ }
+ }
+
+ // Make a set of all of the live table and blob files
+ std::vector<uint64_t> live_table_files;
+ std::vector<uint64_t> live_blob_files;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfd->current()->AddLiveFiles(&live_table_files, &live_blob_files);
+ }
+
+ ret.clear();
+ ret.reserve(live_table_files.size() + live_blob_files.size() +
+ 3); // for CURRENT + MANIFEST + OPTIONS
+
+  // Create names of the live files. The names are not absolute
+  // paths; instead, they are relative to dbname_.
+ for (const auto& table_file_number : live_table_files) {
+ ret.emplace_back(MakeTableFileName("", table_file_number));
+ }
+
+ for (const auto& blob_file_number : live_blob_files) {
+ ret.emplace_back(BlobFileName("", blob_file_number));
+ }
+
+ ret.emplace_back(CurrentFileName(""));
+ ret.emplace_back(DescriptorFileName("", versions_->manifest_file_number()));
+ // The OPTIONS file number is zero in read-write mode when OPTIONS file
+ // writing failed and the DB was configured with
+ // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
+ // number is zero when no OPTIONS file exist at all. In those cases we do not
+ // record any OPTIONS file in the live file list.
+ if (versions_->options_file_number() != 0) {
+ ret.emplace_back(OptionsFileName("", versions_->options_file_number()));
+ }
+
+ // find length of manifest file while holding the mutex lock
+ *manifest_file_size = versions_->manifest_file_size();
+
+ mutex_.Unlock();
+ return Status::OK();
+}
+
+Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
+ // Record tracked WALs as a (minimum) cross-check for directory scan
+ std::vector<uint64_t> required_by_manifest;
+
+ // If caller disabled deletions, this function should return files that are
+ // guaranteed not to be deleted until deletions are re-enabled. We need to
+ // wait for pending purges to finish since WalManager doesn't know which
+ // files are going to be purged. Additional purges won't be scheduled as
+ // long as deletions are disabled (so the below loop must terminate).
+ // Also note that we disable deletions anyway to avoid the case where a
+ // file is deleted in the middle of the scan, causing IO error.
+ Status deletions_disabled = DisableFileDeletions();
+ {
+ InstrumentedMutexLock l(&mutex_);
+ while (pending_purge_obsolete_files_ > 0 || bg_purge_scheduled_ > 0) {
+ bg_cv_.Wait();
+ }
+
+ // Record tracked WALs as a (minimum) cross-check for directory scan
+ const auto& manifest_wals = versions_->GetWalSet().GetWals();
+ required_by_manifest.reserve(manifest_wals.size());
+ for (const auto& wal : manifest_wals) {
+ required_by_manifest.push_back(wal.first);
+ }
+ }
+
+ Status s = wal_manager_.GetSortedWalFiles(files);
+
+ // DisableFileDeletions / EnableFileDeletions not supported in read-only DB
+ if (deletions_disabled.ok()) {
+ Status s2 = EnableFileDeletions(/*force*/ false);
+ assert(s2.ok());
+ s2.PermitUncheckedError();
+ } else {
+ assert(deletions_disabled.IsNotSupported());
+ }
+
+ if (s.ok()) {
+ // Verify includes those required by manifest (one sorted list is superset
+ // of the other)
+ auto required = required_by_manifest.begin();
+ auto included = files.begin();
+
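+    // Walk both sorted lists in lockstep: every WAL number required by the
+    // manifest must also appear in the directory listing.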
+ while (required != required_by_manifest.end()) {
+ if (included == files.end() || *required < (*included)->LogNumber()) {
+ // FAIL - did not find
+ return Status::Corruption(
+ "WAL file " + std::to_string(*required) +
+ " required by manifest but not in directory list");
+ }
+ if (*required == (*included)->LogNumber()) {
+ ++required;
+ ++included;
+ } else {
+ assert(*required > (*included)->LogNumber());
+ ++included;
+ }
+ }
+ }
+
+ return s;
+}
+
+Status DBImpl::GetCurrentWalFile(std::unique_ptr<LogFile>* current_log_file) {
+ uint64_t current_logfile_number;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ current_logfile_number = logfile_number_;
+ }
+
+ return wal_manager_.GetLiveWalFile(current_logfile_number, current_log_file);
+}
+
+Status DBImpl::GetLiveFilesStorageInfo(
+ const LiveFilesStorageInfoOptions& opts,
+ std::vector<LiveFileStorageInfo>* files) {
+ // To avoid returning partial results, only move results to files on success.
+ assert(files);
+ files->clear();
+ std::vector<LiveFileStorageInfo> results;
+
+ // NOTE: This implementation was largely migrated from Checkpoint.
+
+ Status s;
+ VectorLogPtr live_wal_files;
+ bool flush_memtable = true;
+ if (!immutable_db_options_.allow_2pc) {
+ if (opts.wal_size_for_flush == std::numeric_limits<uint64_t>::max()) {
+ flush_memtable = false;
+ } else if (opts.wal_size_for_flush > 0) {
+ // If the outstanding log files are small, we skip the flush.
+ s = GetSortedWalFiles(live_wal_files);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Don't flush column families if total log size is smaller than
+ // log_size_for_flush. We copy the log files instead.
+ // We may be able to cover 2PC case too.
+ uint64_t total_wal_size = 0;
+ for (auto& wal : live_wal_files) {
+ total_wal_size += wal->SizeFileBytes();
+ }
+ if (total_wal_size < opts.wal_size_for_flush) {
+ flush_memtable = false;
+ }
+ live_wal_files.clear();
+ }
+ }
+
+ // This is a modified version of GetLiveFiles, to get access to more
+ // metadata.
+ mutex_.Lock();
+ if (flush_memtable) {
+ Status status = FlushForGetLiveFiles();
+ if (!status.ok()) {
+ mutex_.Unlock();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "Cannot Flush data %s\n",
+ status.ToString().c_str());
+ return status;
+ }
+ }
+
+ // Make a set of all of the live table and blob files
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ VersionStorageInfo& vsi = *cfd->current()->storage_info();
+ auto& cf_paths = cfd->ioptions()->cf_paths;
+
+ auto GetDir = [&](size_t path_id) {
+ // Matching TableFileName() behavior
+ if (path_id >= cf_paths.size()) {
+ assert(false);
+ return cf_paths.back().path;
+ } else {
+ return cf_paths[path_id].path;
+ }
+ };
+
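+    // Record every live table file on every level, including checksum info
+    // when requested.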
+ for (int level = 0; level < vsi.num_levels(); ++level) {
+ const auto& level_files = vsi.LevelFiles(level);
+ for (const auto& meta : level_files) {
+ assert(meta);
+
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+
+ info.relative_filename = MakeTableFileName(meta->fd.GetNumber());
+ info.directory = GetDir(meta->fd.GetPathId());
+ info.file_number = meta->fd.GetNumber();
+ info.file_type = kTableFile;
+ info.size = meta->fd.GetFileSize();
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = meta->file_checksum_func_name;
+ info.file_checksum = meta->file_checksum;
+ if (info.file_checksum_func_name.empty()) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+ info.temperature = meta->temperature;
+ }
+ }
+ const auto& blob_files = vsi.GetBlobFiles();
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+
+ info.relative_filename = BlobFileName(meta->GetBlobFileNumber());
+ info.directory = GetDir(/* path_id */ 0);
+ info.file_number = meta->GetBlobFileNumber();
+ info.file_type = kBlobFile;
+ info.size = meta->GetBlobFileSize();
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = meta->GetChecksumMethod();
+ info.file_checksum = meta->GetChecksumValue();
+ if (info.file_checksum_func_name.empty()) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+ // TODO?: info.temperature
+ }
+ }
+
+ // Capture some final info before releasing mutex
+ const uint64_t manifest_number = versions_->manifest_file_number();
+ const uint64_t manifest_size = versions_->manifest_file_size();
+ const uint64_t options_number = versions_->options_file_number();
+ const uint64_t options_size = versions_->options_file_size_;
+ const uint64_t min_log_num = MinLogNumberToKeep();
+
+ mutex_.Unlock();
+
+ std::string manifest_fname = DescriptorFileName(manifest_number);
+ { // MANIFEST
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+
+ info.relative_filename = manifest_fname;
+ info.directory = GetName();
+ info.file_number = manifest_number;
+ info.file_type = kDescriptorFile;
+ info.size = manifest_size;
+ info.trim_to_size = true;
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+
+ { // CURRENT
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+
+ info.relative_filename = kCurrentFileName;
+ info.directory = GetName();
+ info.file_type = kCurrentFile;
+ // CURRENT could be replaced so we have to record the contents as needed.
+ info.replacement_contents = manifest_fname + "\n";
+ info.size = manifest_fname.size() + 1;
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+
+ // The OPTIONS file number is zero in read-write mode when OPTIONS file
+ // writing failed and the DB was configured with
+ // `fail_if_options_file_error == false`. In read-only mode the OPTIONS file
+ // number is zero when no OPTIONS file exist at all. In those cases we do not
+ // record any OPTIONS file in the live file list.
+ if (options_number != 0) {
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+
+ info.relative_filename = OptionsFileName(options_number);
+ info.directory = GetName();
+ info.file_number = options_number;
+ info.file_type = kOptionsFile;
+ info.size = options_size;
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+
+ // Some legacy testing stuff TODO: carefully clean up obsolete parts
+ TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:FlushDone");
+
+ TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1");
+ TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2");
+
+ if (s.ok()) {
+ // To maximize the effectiveness of track_and_verify_wals_in_manifest,
+ // sync WAL when it is enabled.
+ s = FlushWAL(
+ immutable_db_options_.track_and_verify_wals_in_manifest /* sync */);
+ if (s.IsNotSupported()) { // read-only DB or similar
+ s = Status::OK();
+ }
+ }
+
+ TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive1");
+ TEST_SYNC_POINT("CheckpointImpl::CreateCustomCheckpoint:AfterGetLive2");
+
+ // If we have more than one column family, we also need to get WAL files.
+ if (s.ok()) {
+ s = GetSortedWalFiles(live_wal_files);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ size_t wal_size = live_wal_files.size();
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Number of log files %" ROCKSDB_PRIszt, live_wal_files.size());
+
+ // Link WAL files. Copy exact size of last one because it is the only one
+ // that has changes after the last flush.
+ auto wal_dir = immutable_db_options_.GetWalDir();
+ for (size_t i = 0; s.ok() && i < wal_size; ++i) {
+ if ((live_wal_files[i]->Type() == kAliveLogFile) &&
+ (!flush_memtable || live_wal_files[i]->LogNumber() >= min_log_num)) {
+ results.emplace_back();
+ LiveFileStorageInfo& info = results.back();
+ auto f = live_wal_files[i]->PathName();
+ assert(!f.empty() && f[0] == '/');
+ info.relative_filename = f.substr(1);
+ info.directory = wal_dir;
+ info.file_number = live_wal_files[i]->LogNumber();
+ info.file_type = kWalFile;
+ info.size = live_wal_files[i]->SizeFileBytes();
+      // Only the last WAL should need to be trimmed to size
+ info.trim_to_size = (i + 1 == wal_size);
+ if (opts.include_checksum_info) {
+ info.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ info.file_checksum = kUnknownFileChecksum;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ // Only move results to output on success.
+ *files = std::move(results);
+ }
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_flush_test.cc b/src/rocksdb/db/db_flush_test.cc
new file mode 100644
index 000000000..3b3f7e183
--- /dev/null
+++ b/src/rocksdb/db/db_flush_test.cc
@@ -0,0 +1,3084 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <atomic>
+#include <limits>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/mutexlock.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Value written by the compaction filter (defined further below) when it
+// updates kvs during flush/compaction.
+static std::string NEW_VALUE = "NewValue";
+
+class DBFlushTest : public DBTestBase {
+ public:
+ DBFlushTest() : DBTestBase("db_flush_test", /*env_do_fsync=*/true) {}
+};
+
+class DBFlushDirectIOTest : public DBFlushTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBFlushDirectIOTest() : DBFlushTest() {}
+};
+
+class DBAtomicFlushTest : public DBFlushTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBAtomicFlushTest() : DBFlushTest() {}
+};
+
+// We had an issue where, when two background threads tried to flush at the
+// same time, only one of them got committed. This test verifies the issue is
+// fixed.
+TEST_F(DBFlushTest, FlushWhileWritingManifest) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.max_background_flushes = 2;
+ options.env = env_;
+ Reopen(options);
+ FlushOptions no_wait;
+ no_wait.wait = false;
+ no_wait.allow_write_stall = true;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"VersionSet::LogAndApply:WriteManifest",
+ "DBFlushTest::FlushWhileWritingManifest:1"},
+ {"MemTableList::TryInstallMemtableFlushResults:InProgress",
+ "VersionSet::LogAndApply:WriteManifestDone"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("foo", "v"));
+ ASSERT_OK(dbfull()->Flush(no_wait));
+ TEST_SYNC_POINT("DBFlushTest::FlushWhileWritingManifest:1");
+ ASSERT_OK(Put("bar", "v"));
+ ASSERT_OK(dbfull()->Flush(no_wait));
+ // If the issue is hit we will wait here forever.
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(2, TotalTableFiles());
+#endif // ROCKSDB_LITE
+}
+
+// Disable this test temporarily on Travis as it fails intermittently.
+// Github issue: #4151
+TEST_F(DBFlushTest, SyncFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options;
+ options.disable_auto_compactions = true;
+ options.env = fault_injection_env.get();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBFlushTest::SyncFail:1", "DBImpl::SyncClosedLogs:Start"},
+ {"DBImpl::SyncClosedLogs:Failed", "DBFlushTest::SyncFail:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put("key", "value"));
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+ // Flush installs a new super-version. Get the ref count after that.
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:1");
+ TEST_SYNC_POINT("DBFlushTest::SyncFail:2");
+ fault_injection_env->SetFilesystemActive(true);
+ // Now the background job will do the flush; wait for it.
+  // It returns the IO error that happened during the flush.
+ ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("", FilesPerLevel()); // flush failed.
+#endif // ROCKSDB_LITE
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, SyncSkip) {
+ Options options = CurrentOptions();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBFlushTest::SyncSkip:1", "DBImpl::SyncClosedLogs:Skip"},
+ {"DBImpl::SyncClosedLogs:Skip", "DBFlushTest::SyncSkip:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+ ASSERT_OK(Put("key", "value"));
+
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+
+ TEST_SYNC_POINT("DBFlushTest::SyncSkip:1");
+ TEST_SYNC_POINT("DBFlushTest::SyncSkip:2");
+
+ // Now the background job will do the flush; wait for it.
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushInLowPriThreadPool) {
+ // Verify setting an empty high-pri (flush) thread pool causes flushes to be
+ // scheduled in the low-pri (compaction) thread pool.
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ Reopen(options);
+ env_->SetBackgroundThreads(0, Env::HIGH);
+
+ std::thread::id tid;
+ int num_flushes = 0, num_compactions = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkFlush", [&](void* /*arg*/) {
+ if (tid == std::thread::id()) {
+ tid = std::this_thread::get_id();
+ } else {
+ ASSERT_EQ(tid, std::this_thread::get_id());
+ }
+ ++num_flushes;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkCompaction", [&](void* /*arg*/) {
+ ASSERT_EQ(tid, std::this_thread::get_id());
+ ++num_compactions;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key", "val"));
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(4, num_flushes);
+ ASSERT_EQ(1, num_compactions);
+}
+
+// Test when flush job is submitted to low priority thread pool and when DB is
+// closed in the meanwhile, CloseHelper doesn't hang.
+TEST_F(DBFlushTest, CloseDBWhenFlushInLowPri) {
+ Options options = CurrentOptions();
+ options.max_background_flushes = 1;
+ options.max_total_wal_size = 8192;
+
+ DestroyAndReopen(options);
+ CreateColumnFamilies({"cf1", "cf2"}, options);
+
+ env_->SetBackgroundThreads(0, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ int num_flushes = 0;
+
+ SyncPoint::GetInstance()->SetCallBack("DBImpl::BGWorkFlush",
+ [&](void* /*arg*/) { ++num_flushes; });
+
+ int num_low_flush_unscheduled = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::UnscheduleLowFlushCallback", [&](void* /*arg*/) {
+ num_low_flush_unscheduled++;
+ // There should be one flush job in low pool that needs to be
+ // unscheduled
+ ASSERT_EQ(num_low_flush_unscheduled, 1);
+ });
+
+ int num_high_flush_unscheduled = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::UnscheduleHighFlushCallback", [&](void* /*arg*/) {
+ num_high_flush_unscheduled++;
+ // There should be no flush job in high pool
+ ASSERT_EQ(num_high_flush_unscheduled, 0);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(0, "key1", DummyString(8192)));
+  // Block the thread so that the flush cannot run and can be removed from the
+  // queue when Unschedule is called.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+
+ // Trigger flush and flush job will be scheduled to LOW priority thread.
+ ASSERT_OK(Put(0, "key2", DummyString(8192)));
+
+ // Close DB and flush job in low priority queue will be removed without
+ // running.
+ Close();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ ASSERT_EQ(0, num_flushes);
+
+ TryReopenWithColumnFamilies({"default", "cf1", "cf2"}, options);
+ ASSERT_OK(Put(0, "key3", DummyString(8192)));
+ ASSERT_OK(Flush(0));
+ ASSERT_EQ(1, num_flushes);
+}
+
+TEST_F(DBFlushTest, ManualFlushWithMinWriteBufferNumberToMerge) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ Reopen(options);
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkFlush",
+ "DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1"},
+ {"DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2",
+ "FlushJob::WriteLevel0Table"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key1", "value1"));
+
+ port::Thread t([&]() {
+    // The call waits for the flush to finish, i.e. with flush_options.wait = true.
+ ASSERT_OK(Flush());
+ });
+
+ // Wait for flush start.
+ TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:1");
+  // Insert a second memtable before the manual flush finishes. At the end of
+  // the manual flush job, it will check whether a further flush is needed, but
+  // it will not trigger a flush of the second memtable because
+  // min_write_buffer_number_to_merge has not been reached.
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ TEST_SYNC_POINT("DBFlushTest::ManualFlushWithMinWriteBufferNumberToMerge:2");
+
+  // Manual flush should return without waiting for the flush indefinitely.
+ t.join();
+}
+
+TEST_F(DBFlushTest, ScheduleOnlyOneBgThread) {
+ Options options = CurrentOptions();
+ Reopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ int called = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ auto unscheduled_flushes = *reinterpret_cast<int*>(arg);
+ ASSERT_EQ(0, unscheduled_flushes);
+ ++called;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("a", "foo"));
+ FlushOptions flush_opts;
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ ASSERT_EQ(1, called);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// The following 3 tests are designed for testing garbage statistics at flush
+// time.
+//
+// ======= General Information ======= (from GitHub Wiki).
+// There are three scenarios where memtable flush can be triggered:
+//
+// 1 - Memtable size exceeds ColumnFamilyOptions::write_buffer_size
+// after a write.
+// 2 - Total memtable size across all column families exceeds
+// DBOptions::db_write_buffer_size,
+// or DBOptions::write_buffer_manager signals a flush. In this scenario
+// the largest memtable will be flushed.
+// 3 - Total WAL file size exceeds DBOptions::max_total_wal_size.
+// In this scenario the memtable with the oldest data will be flushed,
+// in order to allow the WAL file with data from this memtable to be
+// purged.
+//
+// As a result, a memtable can be flushed before it is full. This is one
+// reason the generated SST file can be smaller than the corresponding
+// memtable. Compression is another factor that can make the SST file smaller
+// than the corresponding memtable, since data in the memtable is uncompressed.
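+
+// Illustrative sketch only (not used by any test in this file): the three
+// triggers above roughly map onto the options shown here. The values are
+// arbitrary examples chosen for this sketch, not recommendations.
+[[maybe_unused]] static Options FlushTriggerOptionsSketch() {
+  Options options;
+  // Trigger 1: per-memtable size limit (ColumnFamilyOptions::write_buffer_size).
+  options.write_buffer_size = 64 << 20;
+  // Trigger 2: total memtable budget across all column families
+  // (DBOptions::db_write_buffer_size); the largest memtable gets flushed.
+  options.db_write_buffer_size = 256 << 20;
+  // Trigger 3: total WAL size limit (DBOptions::max_total_wal_size); the
+  // memtable holding the oldest data gets flushed so its WAL can be purged.
+  options.max_total_wal_size = 512 << 20;
+  return options;
+}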
+
+TEST_F(DBFlushTest, StatisticsGarbageBasic) {
+ Options options = CurrentOptions();
+
+ // The following options are used to enforce several values that
+ // may already exist as default values to make this test resilient
+ // to default value updates in the future.
+ options.statistics = CreateDBStatistics();
+
+ // Record all statistics.
+ options.statistics->set_stats_level(StatsLevel::kAll);
+
+ // create the DB if it's not already present
+ options.create_if_missing = true;
+
+ // Useful for now as we are trying to compare uncompressed data savings on
+ // flush().
+ options.compression = kNoCompression;
+
+ // Prevent memtable in place updates. Should already be disabled
+ // (from Wiki:
+ // In place updates can be enabled by toggling on the bool
+ // inplace_update_support flag. However, this flag is by default set to
+ // false
+ // because this thread-safe in-place update support is not compatible
+ // with concurrent memtable writes. Note that the bool
+ // allow_concurrent_memtable_write is set to true by default )
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+ // Enforce size of a single MemTable to 64MB (64MB = 67108864 bytes).
+ options.write_buffer_size = 64 << 20;
+
+ ASSERT_OK(TryReopen(options));
+
+  // Put the same key-values multiple times.
+  // The encoded length of a db entry in the memtable is defined in
+  // db/memtable.cc (MemTable::Add) as:
+  //   encoded_len = VarintLength(internal_key_size)  // min # of bytes needed
+  //                                                  // to store internal_key_size
+  //               + internal_key_size  // actual key string (w/o terminating
+  //                                    // null char), plus 8 bytes for the
+  //                                    // fixed uint64 "seq number +
+  //                                    // insertion type"
+  //               + VarintLength(val_size)  // min # of bytes to store val_size
+  //               + val_size                // actual value string
+  // For example, in our situation, "key1" has size 4 and "value1" has size 6
+  // (the terminating null characters are not copied over to the memtable),
+  // and therefore encoded_len = 1 + (4+8) + 1 + 6 = 20 bytes per entry.
+  // However, in terms of raw data contained in the memtable, and written
+  // over to the SSTable, we only count internal_key_size and val_size,
+  // because this is the only raw chunk of bytes that contains everything
+  // necessary to reconstruct a user entry: sequence number, insertion type,
+  // key, and value.
+
+ // To test the relevance of our Memtable garbage statistics,
+ // namely MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
+ // we insert K-V pairs with 3 distinct keys (of length 4),
+ // and random values of arbitrary length RAND_VALUES_LENGTH,
+ // and we repeat this step NUM_REPEAT times total.
+ // At the end, we insert 3 final K-V pairs with the same 3 keys
+ // and known values (these will be the final values, of length 6).
+ // I chose NUM_REPEAT=2,000 such that no automatic flush is
+ // triggered (the number of bytes in the memtable is therefore
+ // well below any meaningful heuristic for a memtable of size 64MB).
+  // As a result, since each K-V pair is inserted as a payload
+  // of N meaningful bytes (sequence number, insertion type,
+  // key, and value = 8 + 4 + RAND_VALUES_LENGTH), and 3 such pairs are
+  // inserted per repeat, MEMTABLE_GARBAGE_BYTES_AT_FLUSH should be equal to
+  // 2,000 * 3 * N bytes and MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+  // MEMTABLE_GARBAGE_BYTES_AT_FLUSH + (3*(8 + 4 + 6)) bytes.
+  // For RAND_VALUES_LENGTH = 172 (arbitrary value), we expect:
+  // N = 8 + 4 + 172 = 184 bytes
+  // MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 2,000 * 3 * 184 = 1,104,000 bytes.
+  // MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 1,104,000 + 3*18 = 1,104,054 bytes.
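+  // The arithmetic above can be sanity-checked at compile time. The constants
+  // below are local to this illustrative check only (key size 4, value size
+  // 172, final value size 6) and are not used by the test itself.
+  {
+    constexpr uint64_t kEntryBytes = 8 + 4 + 172;  // seq+type, key, value
+    static_assert(2000 * 3 * kEntryBytes == 1104000,
+                  "expected memtable garbage bytes at flush");
+    static_assert(2000 * 3 * kEntryBytes + 3 * (8 + 4 + 6) == 1104054,
+                  "expected memtable payload bytes at flush");
+  }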
+
+ const size_t NUM_REPEAT = 2000;
+ const size_t RAND_VALUES_LENGTH = 172;
+ const std::string KEY1 = "key1";
+ const std::string KEY2 = "key2";
+ const std::string KEY3 = "key3";
+ const std::string VALUE1 = "value1";
+ const std::string VALUE2 = "value2";
+ const std::string VALUE3 = "value3";
+ uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+ uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+ Random rnd(301);
+  // Insertion of K-V pairs, multiple times.
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY1.size() + p_v1.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY2.size() + p_v2.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY3.size() + p_v3.size() + sizeof(uint64_t);
+ }
+
+ // The memtable data bytes includes the "garbage"
+ // bytes along with the useful payload.
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+ ASSERT_OK(Put(KEY1, VALUE1));
+ ASSERT_OK(Put(KEY2, VALUE2));
+ ASSERT_OK(Put(KEY3, VALUE3));
+
+ // Add useful payload to the memtable data bytes:
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+ KEY1.size() + VALUE1.size() + KEY2.size() + VALUE2.size() + KEY3.size() +
+ VALUE3.size() + 3 * sizeof(uint64_t);
+
+ // We assert that the last K-V pairs have been successfully inserted,
+ // and that the valid values are VALUE1, VALUE2, VALUE3.
+ PinnableSlice value;
+ ASSERT_OK(Get(KEY1, &value));
+ ASSERT_EQ(value.ToString(), VALUE1);
+ ASSERT_OK(Get(KEY2, &value));
+ ASSERT_EQ(value.ToString(), VALUE2);
+ ASSERT_OK(Get(KEY3, &value));
+ ASSERT_EQ(value.ToString(), VALUE3);
+
+ // Force flush to SST. Increments the statistics counter.
+ ASSERT_OK(Flush());
+
+ // Collect statistics.
+ uint64_t mem_data_bytes =
+ TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ uint64_t mem_garbage_bytes =
+ TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ Close();
+}
+
+TEST_F(DBFlushTest, StatisticsGarbageInsertAndDeletes) {
+ Options options = CurrentOptions();
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+ options.write_buffer_size = 67108864;
+
+ ASSERT_OK(TryReopen(options));
+
+ const size_t NUM_REPEAT = 2000;
+ const size_t RAND_VALUES_LENGTH = 37;
+ const std::string KEY1 = "key1";
+ const std::string KEY2 = "key2";
+ const std::string KEY3 = "key3";
+ const std::string KEY4 = "key4";
+ const std::string KEY5 = "key5";
+ const std::string KEY6 = "key6";
+
+ uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+ uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+ WriteBatch batch;
+
+ Random rnd(301);
+  // Insertion of K-V pairs, multiple times.
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY1.size() + p_v1.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY2.size() + p_v2.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY3.size() + p_v3.size() + sizeof(uint64_t);
+ ASSERT_OK(Delete(KEY1));
+ ASSERT_OK(Delete(KEY2));
+ ASSERT_OK(Delete(KEY3));
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+ }
+
+ // The memtable data bytes includes the "garbage"
+ // bytes along with the useful payload.
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  // Note: one set of deletes for KEY1, KEY2, KEY3 is written to the SSTable
+  // to propagate the delete operations to K-V pairs that could have been
+  // inserted into the database during past Flush operations.
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
+ KEY1.size() + KEY2.size() + KEY3.size() + 3 * sizeof(uint64_t);
+
+  // Additional useful payload.
+ ASSERT_OK(Delete(KEY4));
+ ASSERT_OK(Delete(KEY5));
+ ASSERT_OK(Delete(KEY6));
+
+  // Add useful payload to the memtable data bytes:
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+ KEY4.size() + KEY5.size() + KEY6.size() + 3 * sizeof(uint64_t);
+
+ // We assert that the K-V pairs have been successfully deleted.
+ PinnableSlice value;
+ ASSERT_NOK(Get(KEY1, &value));
+ ASSERT_NOK(Get(KEY2, &value));
+ ASSERT_NOK(Get(KEY3, &value));
+
+ // Force flush to SST. Increments the statistics counter.
+ ASSERT_OK(Flush());
+
+ // Collect statistics.
+ uint64_t mem_data_bytes =
+ TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ uint64_t mem_garbage_bytes =
+ TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ Close();
+}
+
+TEST_F(DBFlushTest, StatisticsGarbageRangeDeletes) {
+ Options options = CurrentOptions();
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+ options.write_buffer_size = 67108864;
+
+ ASSERT_OK(TryReopen(options));
+
+ const size_t NUM_REPEAT = 1000;
+ const size_t RAND_VALUES_LENGTH = 42;
+ const std::string KEY1 = "key1";
+ const std::string KEY2 = "key2";
+ const std::string KEY3 = "key3";
+ const std::string KEY4 = "key4";
+ const std::string KEY5 = "key5";
+ const std::string KEY6 = "key6";
+ const std::string VALUE3 = "value3";
+
+ uint64_t EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH = 0;
+ uint64_t EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH = 0;
+
+ Random rnd(301);
+  // Insertion of K-V pairs, multiple times.
+ // Also insert DeleteRange
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ std::string p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ std::string p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY1.size() + p_v1.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY2.size() + p_v2.size() + sizeof(uint64_t);
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ KEY3.size() + p_v3.size() + sizeof(uint64_t);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1,
+ KEY2));
+    // Note: DeleteRange has an exclusive upper bound, e.g. here [KEY2,KEY3)
+    // is deleted.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2,
+ KEY3));
+ // Delete ranges are stored as a regular K-V pair, with key=STARTKEY,
+ // value=ENDKEY.
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH +=
+ (KEY1.size() + KEY2.size() + sizeof(uint64_t)) +
+ (KEY2.size() + KEY3.size() + sizeof(uint64_t));
+ }
+
+ // The memtable data bytes includes the "garbage"
+ // bytes along with the useful payload.
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH =
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH;
+
+  // Note: one set of DeleteRanges for (KEY1, KEY2) and (KEY2, KEY3) is written
+  // to the SSTable to propagate the DeleteRange operations to K-V pairs that
+  // could have been inserted into the database during past Flush operations.
+ EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH -=
+ (KEY1.size() + KEY2.size() + sizeof(uint64_t)) +
+ (KEY2.size() + KEY3.size() + sizeof(uint64_t));
+
+ // Overwrite KEY3 with known value (VALUE3)
+ // Note that during the whole time KEY3 has never been deleted
+ // by the RangeDeletes.
+ ASSERT_OK(Put(KEY3, VALUE3));
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+ KEY3.size() + VALUE3.size() + sizeof(uint64_t);
+
+  // Additional useful payload.
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY4, KEY5));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY5, KEY6));
+
+ // Add useful payload to the memtable data bytes:
+ EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH +=
+ (KEY4.size() + KEY5.size() + sizeof(uint64_t)) +
+ (KEY5.size() + KEY6.size() + sizeof(uint64_t));
+
+ // We assert that the K-V pairs have been successfully deleted.
+ PinnableSlice value;
+ ASSERT_NOK(Get(KEY1, &value));
+ ASSERT_NOK(Get(KEY2, &value));
+ // And that KEY3's value is correct.
+ ASSERT_OK(Get(KEY3, &value));
+ ASSERT_EQ(value, VALUE3);
+
+ // Force flush to SST. Increments the statistics counter.
+ ASSERT_OK(Flush());
+
+ // Collect statistics.
+ uint64_t mem_data_bytes =
+ TestGetTickerCount(options, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ uint64_t mem_garbage_bytes =
+ TestGetTickerCount(options, MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ EXPECT_EQ(mem_data_bytes, EXPECTED_MEMTABLE_PAYLOAD_BYTES_AT_FLUSH);
+ EXPECT_EQ(mem_garbage_bytes, EXPECTED_MEMTABLE_GARBAGE_BYTES_AT_FLUSH);
+
+ Close();
+}
+
+#ifndef ROCKSDB_LITE
+// This simple Listener can only handle one flush at a time.
+class TestFlushListener : public EventListener {
+ public:
+ TestFlushListener(Env* env, DBFlushTest* test)
+ : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) {
+ db_closed = false;
+ }
+
+ ~TestFlushListener() override {
+ prev_fc_info_.status.PermitUncheckedError(); // Ignore the status
+ }
+
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ // remember the info for later checking the FlushJobInfo.
+ prev_fc_info_ = info;
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ ASSERT_GT(info.table_properties.data_size, 0U);
+ ASSERT_GT(info.table_properties.raw_key_size, 0U);
+ ASSERT_GT(info.table_properties.raw_value_size, 0U);
+ ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+ ASSERT_GT(info.table_properties.num_entries, 0U);
+ ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+ ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+ }
+
+ void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+ flushed_dbs_.push_back(db);
+ flushed_column_family_names_.push_back(info.cf_name);
+ if (info.triggered_writes_slowdown) {
+ slowdown_count++;
+ }
+ if (info.triggered_writes_stop) {
+ stop_count++;
+ }
+ // verify whether the previously created file matches the flushed file.
+ ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+ ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+ ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+ ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+ ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number);
+
+ // Note: the following chunk relies on the notification pertaining to the
+ // database pointed to by DBTestBase::db_, and is thus bypassed when
+ // that assumption does not hold (see the test case MultiDBMultiListeners
+ // below).
+ ASSERT_TRUE(test_);
+ if (db == test_->db_) {
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ test_->dbfull()->TEST_GetFilesMetaData(db->DefaultColumnFamily(),
+ &files_by_level);
+
+ ASSERT_FALSE(files_by_level.empty());
+ auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(),
+ [&](const FileMetaData& meta) {
+ return meta.fd.GetNumber() == info.file_number;
+ });
+ ASSERT_NE(it, files_by_level[0].end());
+ ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number);
+ }
+
+ ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+ ASSERT_GT(info.thread_id, 0U);
+ }
+
+ std::vector<std::string> flushed_column_family_names_;
+ std::vector<DB*> flushed_dbs_;
+ int slowdown_count;
+ int stop_count;
+ bool db_closing;
+ std::atomic_bool db_closed;
+ TableFileCreationInfo prev_fc_info_;
+
+ protected:
+ Env* env_;
+ DBFlushTest* test_;
+};
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBFlushTest, MemPurgeBasic) {
+ Options options = CurrentOptions();
+
+ // The following options are used to enforce several values that
+ // may already exist as default values to make this test resilient
+ // to default value updates in the future.
+ options.statistics = CreateDBStatistics();
+
+ // Record all statistics.
+ options.statistics->set_stats_level(StatsLevel::kAll);
+
+ // create the DB if it's not already present
+ options.create_if_missing = true;
+
+ // Useful for now as we are trying to compare uncompressed data savings on
+ // flush().
+ options.compression = kNoCompression;
+
+ // Prevent memtable in place updates. Should already be disabled
+ // (from Wiki:
+ // In place updates can be enabled by toggling on the bool
+ // inplace_update_support flag. However, this flag is by default set to
+ // false
+ // because this thread-safe in-place update support is not compatible
+ // with concurrent memtable writes. Note that the bool
+ // allow_concurrent_memtable_write is set to true by default )
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+#ifndef ROCKSDB_LITE
+ // Initially deactivate the MemPurge prototype.
+ options.experimental_mempurge_threshold = 0.0;
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+#else
+  // Directly activate the MemPurge prototype.
+  // (RocksDB lite does not support dynamic options.)
+ options.experimental_mempurge_threshold = 1.0;
+#endif // !ROCKSDB_LITE
+ ASSERT_OK(TryReopen(options));
+
+ // RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+ // Dynamically activate the MemPurge prototype without restarting the DB.
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ ASSERT_OK(db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "1.0"}}));
+#endif
+
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::string KEY1 = "IamKey1";
+ std::string KEY2 = "IamKey2";
+ std::string KEY3 = "IamKey3";
+ std::string KEY4 = "IamKey4";
+ std::string KEY5 = "IamKey5";
+ std::string KEY6 = "IamKey6";
+ std::string KEY7 = "IamKey7";
+ std::string KEY8 = "IamKey8";
+ std::string KEY9 = "IamKey9";
+ std::string RNDKEY1, RNDKEY2, RNDKEY3;
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ // Heavy overwrite workload,
+ // more than would fit in maximum allowed memtables.
+ Random rnd(719);
+ const size_t NUM_REPEAT = 100;
+ const size_t RAND_KEYS_LENGTH = 57;
+ const size_t RAND_VALUES_LENGTH = 10240;
+ std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9, p_rv1,
+ p_rv2, p_rv3;
+
+ // Insert a very first set of keys that will be
+ // mempurged at least once.
+ p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ ASSERT_OK(Put(KEY4, p_v4));
+ ASSERT_EQ(Get(KEY1), p_v1);
+ ASSERT_EQ(Get(KEY2), p_v2);
+ ASSERT_EQ(Get(KEY3), p_v3);
+ ASSERT_EQ(Get(KEY4), p_v4);
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v6 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v7 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v8 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v9 = rnd.RandomString(RAND_VALUES_LENGTH);
+
+ ASSERT_OK(Put(KEY5, p_v5));
+ ASSERT_OK(Put(KEY6, p_v6));
+ ASSERT_OK(Put(KEY7, p_v7));
+ ASSERT_OK(Put(KEY8, p_v8));
+ ASSERT_OK(Put(KEY9, p_v9));
+
+ ASSERT_EQ(Get(KEY1), p_v1);
+ ASSERT_EQ(Get(KEY2), p_v2);
+ ASSERT_EQ(Get(KEY3), p_v3);
+ ASSERT_EQ(Get(KEY4), p_v4);
+ ASSERT_EQ(Get(KEY5), p_v5);
+ ASSERT_EQ(Get(KEY6), p_v6);
+ ASSERT_EQ(Get(KEY7), p_v7);
+ ASSERT_EQ(Get(KEY8), p_v8);
+ ASSERT_EQ(Get(KEY9), p_v9);
+ }
+
+ // Check that there was at least one mempurge
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there were no SST files created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 0;
+
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+  // Insertion of K-V pairs, no overwrites.
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+    // Create random keys of RAND_KEYS_LENGTH bytes and value strings of
+    // RAND_VALUES_LENGTH bytes.
+ RNDKEY1 = rnd.RandomString(RAND_KEYS_LENGTH);
+ RNDKEY2 = rnd.RandomString(RAND_KEYS_LENGTH);
+ RNDKEY3 = rnd.RandomString(RAND_KEYS_LENGTH);
+ p_rv1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_rv2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_rv3 = rnd.RandomString(RAND_VALUES_LENGTH);
+
+ ASSERT_OK(Put(RNDKEY1, p_rv1));
+ ASSERT_OK(Put(RNDKEY2, p_rv2));
+ ASSERT_OK(Put(RNDKEY3, p_rv3));
+
+ ASSERT_EQ(Get(KEY1), p_v1);
+ ASSERT_EQ(Get(KEY2), p_v2);
+ ASSERT_EQ(Get(KEY3), p_v3);
+ ASSERT_EQ(Get(KEY4), p_v4);
+ ASSERT_EQ(Get(KEY5), p_v5);
+ ASSERT_EQ(Get(KEY6), p_v6);
+ ASSERT_EQ(Get(KEY7), p_v7);
+ ASSERT_EQ(Get(KEY8), p_v8);
+ ASSERT_EQ(Get(KEY9), p_v9);
+ ASSERT_EQ(Get(RNDKEY1), p_rv1);
+ ASSERT_EQ(Get(RNDKEY2), p_rv2);
+ ASSERT_EQ(Get(RNDKEY3), p_rv3);
+ }
+
+ // Assert that at least one flush to storage has been performed
+ EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+ // (which will consequently increase the number of mempurges recorded too).
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+
+ // Assert that there is no data corruption, even with
+ // a flush to storage.
+ ASSERT_EQ(Get(KEY1), p_v1);
+ ASSERT_EQ(Get(KEY2), p_v2);
+ ASSERT_EQ(Get(KEY3), p_v3);
+ ASSERT_EQ(Get(KEY4), p_v4);
+ ASSERT_EQ(Get(KEY5), p_v5);
+ ASSERT_EQ(Get(KEY6), p_v6);
+ ASSERT_EQ(Get(KEY7), p_v7);
+ ASSERT_EQ(Get(KEY8), p_v8);
+ ASSERT_EQ(Get(KEY9), p_v9);
+ ASSERT_EQ(Get(RNDKEY1), p_rv1);
+ ASSERT_EQ(Get(RNDKEY2), p_rv2);
+ ASSERT_EQ(Get(RNDKEY3), p_rv3);
+
+ Close();
+}
+
+// RocksDB lite does not support dynamic options
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, MemPurgeBasicToggle) {
+ Options options = CurrentOptions();
+
+ // The following options are used to enforce several values that
+ // may already exist as default values to make this test resilient
+ // to default value updates in the future.
+ options.statistics = CreateDBStatistics();
+
+ // Record all statistics.
+ options.statistics->set_stats_level(StatsLevel::kAll);
+
+ // create the DB if it's not already present
+ options.create_if_missing = true;
+
+ // Useful for now as we are trying to compare uncompressed data savings on
+ // flush().
+ options.compression = kNoCompression;
+
+ // Prevent memtable in place updates. Should already be disabled
+ // (from Wiki:
+ // In place updates can be enabled by toggling on the bool
+ // inplace_update_support flag. However, this flag is by default set to
+ // false
+ // because this thread-safe in-place update support is not compatible
+ // with concurrent memtable writes. Note that the bool
+ // allow_concurrent_memtable_write is set to true by default )
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+ // Initially deactivate the MemPurge prototype.
+ // (negative values are equivalent to 0.0).
+ options.experimental_mempurge_threshold = -25.3;
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+
+ ASSERT_OK(TryReopen(options));
+ // Dynamically activate the MemPurge prototype without restarting the DB.
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ // Values greater than 1.0 are equivalent to 1.0
+ ASSERT_OK(
+ db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "3.7898"}}));
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ const size_t KVSIZE = 3;
+ std::vector<std::string> KEYS(KVSIZE);
+ for (size_t k = 0; k < KVSIZE; k++) {
+ KEYS[k] = "IamKey" + std::to_string(k);
+ }
+
+ std::vector<std::string> RNDVALS(KVSIZE);
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ // Heavy overwrite workload,
+ // more than would fit in maximum allowed memtables.
+ Random rnd(719);
+ const size_t NUM_REPEAT = 100;
+ const size_t RAND_VALUES_LENGTH = 10240;
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ for (size_t j = 0; j < KEYS.size(); j++) {
+ RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+ ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+ }
+ for (size_t j = 0; j < KEYS.size(); j++) {
+ ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+ }
+ }
+
+ // Check that there was at least one mempurge
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there were no SST files created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 0;
+
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+ // Dynamically deactivate MemPurge.
+ ASSERT_OK(
+ db_->SetOptions(cfh, {{"experimental_mempurge_threshold", "-1023.0"}}));
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ for (size_t j = 0; j < KEYS.size(); j++) {
+ RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+ ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+ }
+ for (size_t j = 0; j < KEYS.size(); j++) {
+ ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+ }
+ }
+
+  // With MemPurge deactivated, the mempurge count should remain zero.
+ const uint32_t ZERO = 0;
+ // Assert that at least one flush to storage has been performed
+ EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+ // The mempurge count is expected to be set to 0 when the options are updated.
+ // We expect no mempurge at all.
+ EXPECT_EQ(mempurge_count.exchange(0), ZERO);
+
+ Close();
+}
+// End of MemPurgeBasicToggle, which is not supported with RocksDB LITE because
+// it relies on dynamically changing the experimental_mempurge_threshold option.
+#endif  // !ROCKSDB_LITE
+
+// At the moment, the MemPurge feature is deactivated when atomic_flush is
+// enabled. This is because the level of garbage between column families is
+// not guaranteed to be consistent, so one CF could hypothetically trigger a
+// MemPurge while another CF triggers a regular Flush.
+TEST_F(DBFlushTest, MemPurgeWithAtomicFlush) {
+ Options options = CurrentOptions();
+
+ // The following options are used to enforce several values that
+ // may already exist as default values to make this test resilient
+ // to default value updates in the future.
+ options.statistics = CreateDBStatistics();
+
+ // Record all statistics.
+ options.statistics->set_stats_level(StatsLevel::kAll);
+
+ // create the DB if it's not already present
+ options.create_if_missing = true;
+
+ // Useful for now as we are trying to compare uncompressed data savings on
+ // flush().
+ options.compression = kNoCompression;
+
+ // Prevent memtable in place updates. Should already be disabled
+ // (from Wiki:
+ // In place updates can be enabled by toggling on the bool
+ // inplace_update_support flag. However, this flag is by default set to
+ // false
+ // because this thread-safe in-place update support is not compatible
+ // with concurrent memtable writes. Note that the bool
+ // allow_concurrent_memtable_write is set to true by default )
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+  // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+ // Activate the MemPurge prototype.
+ options.experimental_mempurge_threshold = 153.245;
+ // Activate atomic_flush.
+ options.atomic_flush = true;
+
+ const std::vector<std::string> new_cf_names = {"pikachu", "eevie"};
+ CreateColumnFamilies(new_cf_names, options);
+
+ Close();
+
+  // 3 CFs: default will be filled with overwrites (would normally trigger
+  // mempurge), new_cf_names[0] will be filled with random values (would
+  // trigger flush), and new_cf_names[1] is not filled with anything.
+ ReopenWithColumnFamilies(
+ {kDefaultColumnFamilyName, new_cf_names[0], new_cf_names[1]}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(2, "bar", "baz"));
+
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ const size_t KVSIZE = 3;
+ std::vector<std::string> KEYS(KVSIZE);
+ for (size_t k = 0; k < KVSIZE; k++) {
+ KEYS[k] = "IamKey" + std::to_string(k);
+ }
+
+ std::string RNDKEY;
+ std::vector<std::string> RNDVALS(KVSIZE);
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ // Heavy overwrite workload,
+ // more than would fit in maximum allowed memtables.
+ Random rnd(106);
+ const size_t NUM_REPEAT = 100;
+ const size_t RAND_KEY_LENGTH = 128;
+ const size_t RAND_VALUES_LENGTH = 10240;
+
+  // Insertion of K-V pairs, multiple times (overwrites).
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ for (size_t j = 0; j < KEYS.size(); j++) {
+ RNDKEY = rnd.RandomString(RAND_KEY_LENGTH);
+ RNDVALS[j] = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEYS[j], RNDVALS[j]));
+ ASSERT_OK(Put(1, RNDKEY, RNDVALS[j]));
+ ASSERT_EQ(Get(KEYS[j]), RNDVALS[j]);
+ ASSERT_EQ(Get(1, RNDKEY), RNDVALS[j]);
+ }
+ }
+
+ // Check that there was no mempurge because atomic_flush option is true.
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 0;
+  // Check that there was at least one SST file created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 1;
+
+ EXPECT_EQ(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ EXPECT_GE(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+ Close();
+}
+
+TEST_F(DBFlushTest, MemPurgeDeleteAndDeleteRange) {
+ Options options = CurrentOptions();
+
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+#ifndef ROCKSDB_LITE
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+#endif // !ROCKSDB_LITE
+  // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+ // Activate the MemPurge prototype.
+ options.experimental_mempurge_threshold = 15.0;
+
+ ASSERT_OK(TryReopen(options));
+
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::string KEY1 = "ThisIsKey1";
+ std::string KEY2 = "ThisIsKey2";
+ std::string KEY3 = "ThisIsKey3";
+ std::string KEY4 = "ThisIsKey4";
+ std::string KEY5 = "ThisIsKey5";
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ Random rnd(117);
+ const size_t NUM_REPEAT = 100;
+ const size_t RAND_VALUES_LENGTH = 10240;
+
+ std::string key, value, p_v1, p_v2, p_v3, p_v3b, p_v4, p_v5;
+ int count = 0;
+ const int EXPECTED_COUNT_FORLOOP = 3;
+ const int EXPECTED_COUNT_END = 4;
+
+ ReadOptions ropt;
+ ropt.pin_data = true;
+ ropt.total_order_seek = true;
+ Iterator* iter = nullptr;
+
+  // Insertion of K-V pairs, multiple times.
+ // Also insert DeleteRange
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v3b = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ ASSERT_OK(Put(KEY4, p_v4));
+ ASSERT_OK(Put(KEY5, p_v5));
+ ASSERT_OK(Delete(KEY2));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY2,
+ KEY4));
+ ASSERT_OK(Put(KEY3, p_v3b));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), KEY1,
+ KEY3));
+ ASSERT_OK(Delete(KEY1));
+
+ ASSERT_EQ(Get(KEY1), NOT_FOUND);
+ ASSERT_EQ(Get(KEY2), NOT_FOUND);
+ ASSERT_EQ(Get(KEY3), p_v3b);
+ ASSERT_EQ(Get(KEY4), p_v4);
+ ASSERT_EQ(Get(KEY5), p_v5);
+
+ iter = db_->NewIterator(ropt);
+ iter->SeekToFirst();
+ count = 0;
+ for (; iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ key = (iter->key()).ToString(false);
+ value = (iter->value()).ToString(false);
+ if (key.compare(KEY3) == 0)
+ ASSERT_EQ(value, p_v3b);
+ else if (key.compare(KEY4) == 0)
+ ASSERT_EQ(value, p_v4);
+ else if (key.compare(KEY5) == 0)
+ ASSERT_EQ(value, p_v5);
+ else
+ ASSERT_EQ(value, NOT_FOUND);
+ count++;
+ }
+
+ // Expected count here is 3: KEY3, KEY4, KEY5.
+ ASSERT_EQ(count, EXPECTED_COUNT_FORLOOP);
+ if (iter) {
+ delete iter;
+ }
+ }
+
+ // Check that there was at least one mempurge
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there were no SST files created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 0;
+
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+ // Additional test for the iterator+memPurge.
+ ASSERT_OK(Put(KEY2, p_v2));
+ iter = db_->NewIterator(ropt);
+ iter->SeekToFirst();
+ ASSERT_OK(Put(KEY4, p_v4));
+ count = 0;
+ for (; iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ key = (iter->key()).ToString(false);
+ value = (iter->value()).ToString(false);
+ if (key.compare(KEY2) == 0)
+ ASSERT_EQ(value, p_v2);
+ else if (key.compare(KEY3) == 0)
+ ASSERT_EQ(value, p_v3b);
+ else if (key.compare(KEY4) == 0)
+ ASSERT_EQ(value, p_v4);
+ else if (key.compare(KEY5) == 0)
+ ASSERT_EQ(value, p_v5);
+ else
+ ASSERT_EQ(value, NOT_FOUND);
+ count++;
+ }
+
+ // Expected count here is 4: KEY2, KEY3, KEY4, KEY5.
+ ASSERT_EQ(count, EXPECTED_COUNT_END);
+ if (iter) delete iter;
+
+ Close();
+}
+
+// Create a CompactionFilter that will be invoked at flush time and will
+// update the value of a KV pair if the key string is "lower" than the
+// filtered_key_ string.
+class ConditionalUpdateFilter : public CompactionFilter {
+ public:
+ explicit ConditionalUpdateFilter(const std::string* filtered_key)
+ : filtered_key_(filtered_key) {}
+ bool Filter(int /*level*/, const Slice& key, const Slice& /*value*/,
+ std::string* new_value, bool* value_changed) const override {
+ // If key<filtered_key_, update the value of the KV-pair.
+ if (key.compare(*filtered_key_) < 0) {
+ assert(new_value != nullptr);
+ *new_value = NEW_VALUE;
+ *value_changed = true;
+ }
+ return false /*do not remove this KV-pair*/;
+ }
+
+ const char* Name() const override { return "ConditionalUpdateFilter"; }
+
+ private:
+ const std::string* filtered_key_;
+};
+
+class ConditionalUpdateFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit ConditionalUpdateFilterFactory(const Slice& filtered_key)
+ : filtered_key_(filtered_key.ToString()) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(
+ new ConditionalUpdateFilter(&filtered_key_));
+ }
+
+ const char* Name() const override { return "ConditionalUpdateFilterFactory"; }
+
+ bool ShouldFilterTableFileCreation(
+ TableFileCreationReason reason) const override {
+ // This compaction filter will be invoked
+ // at flush time (and therefore at MemPurge time).
+ return (reason == TableFileCreationReason::kFlush);
+ }
+
+ private:
+ std::string filtered_key_;
+};
+
+TEST_F(DBFlushTest, MemPurgeAndCompactionFilter) {
+ Options options = CurrentOptions();
+
+ std::string KEY1 = "ThisIsKey1";
+ std::string KEY2 = "ThisIsKey2";
+ std::string KEY3 = "ThisIsKey3";
+ std::string KEY4 = "ThisIsKey4";
+ std::string KEY5 = "ThisIsKey5";
+ std::string KEY6 = "ThisIsKey6";
+ std::string KEY7 = "ThisIsKey7";
+ std::string KEY8 = "ThisIsKey8";
+ std::string KEY9 = "ThisIsKey9";
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+#ifndef ROCKSDB_LITE
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+#endif // !ROCKSDB_LITE
+ // Create a ConditionalUpdate compaction filter
+ // that will update all the values of the KV pairs
+ // where the keys are "lower" than KEY4.
+ options.compaction_filter_factory =
+ std::make_shared<ConditionalUpdateFilterFactory>(KEY4);
+
+  // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+ // Activate the MemPurge prototype.
+ options.experimental_mempurge_threshold = 26.55;
+
+ ASSERT_OK(TryReopen(options));
+
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(53);
+ const size_t NUM_REPEAT = 1000;
+ const size_t RAND_VALUES_LENGTH = 10240;
+ std::string p_v1, p_v2, p_v3, p_v4, p_v5, p_v6, p_v7, p_v8, p_v9;
+
+ p_v1 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v2 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v3 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v4 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v5 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY1, p_v1));
+ ASSERT_OK(Put(KEY2, p_v2));
+ ASSERT_OK(Put(KEY3, p_v3));
+ ASSERT_OK(Put(KEY4, p_v4));
+ ASSERT_OK(Put(KEY5, p_v5));
+ ASSERT_OK(Delete(KEY1));
+
+  // Insertion of K-V pairs, multiple times.
+ for (size_t i = 0; i < NUM_REPEAT; i++) {
+ // Create value strings of arbitrary
+ // length RAND_VALUES_LENGTH bytes.
+ p_v6 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v7 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v8 = rnd.RandomString(RAND_VALUES_LENGTH);
+ p_v9 = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEY6, p_v6));
+ ASSERT_OK(Put(KEY7, p_v7));
+ ASSERT_OK(Put(KEY8, p_v8));
+ ASSERT_OK(Put(KEY9, p_v9));
+
+ ASSERT_OK(Delete(KEY7));
+ }
+
+ // Check that there was at least one mempurge
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+  // Check that there were no SST files created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 0;
+
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+
+ // Verify that the ConditionalUpdateCompactionFilter
+ // updated the values of KEY2 and KEY3, and not KEY4 and KEY5.
+ ASSERT_EQ(Get(KEY1), NOT_FOUND);
+ ASSERT_EQ(Get(KEY2), NEW_VALUE);
+ ASSERT_EQ(Get(KEY3), NEW_VALUE);
+ ASSERT_EQ(Get(KEY4), p_v4);
+ ASSERT_EQ(Get(KEY5), p_v5);
+}
+
+TEST_F(DBFlushTest, DISABLED_MemPurgeWALSupport) {
+ Options options = CurrentOptions();
+
+ options.statistics = CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+ // Enforce size of a single MemTable to 128KB.
+ options.write_buffer_size = 128 << 10;
+ // Activate the MemPurge prototype
+ // (values >1.0 are equivalent to 1.0).
+ options.experimental_mempurge_threshold = 2.5;
+
+ ASSERT_OK(TryReopen(options));
+
+ const size_t KVSIZE = 10;
+
+ do {
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "baz", "v5"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v1", Get(1, "foo"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ ASSERT_OK(Put(0, "bar", "v2"));
+ ASSERT_OK(Put(1, "bar", "v2"));
+ ASSERT_OK(Put(1, "foo", "v3"));
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<std::string> keys;
+ for (size_t k = 0; k < KVSIZE; k++) {
+ keys.push_back("IamKey" + std::to_string(k));
+ }
+
+ std::string RNDKEY, RNDVALUE;
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ // Heavy overwrite workload,
+ // more than would fit in maximum allowed memtables.
+ Random rnd(719);
+ const size_t NUM_REPEAT = 100;
+ const size_t RAND_KEY_LENGTH = 4096;
+ const size_t RAND_VALUES_LENGTH = 1024;
+ std::vector<std::string> values_default(KVSIZE), values_pikachu(KVSIZE);
+
+ // Insert a very first set of keys that will be
+ // mempurged at least once.
+ for (size_t k = 0; k < KVSIZE / 2; k++) {
+ values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ }
+
+ // Insert keys[0:KVSIZE/2] to
+ // both 'default' and 'pikachu' CFs.
+ for (size_t k = 0; k < KVSIZE / 2; k++) {
+ ASSERT_OK(Put(0, keys[k], values_default[k]));
+ ASSERT_OK(Put(1, keys[k], values_pikachu[k]));
+ }
+
+ // Check that the insertion was seamless.
+ for (size_t k = 0; k < KVSIZE / 2; k++) {
+ ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+ ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+ }
+
+    // Insertion of K-V pairs, multiple times (overwrites)
+ // into 'default' CF. Will trigger mempurge.
+ for (size_t j = 0; j < NUM_REPEAT; j++) {
+ // Create value strings of arbitrary length RAND_VALUES_LENGTH bytes.
+ for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+ values_default[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ }
+
+ // Insert K-V into default CF.
+ for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+ ASSERT_OK(Put(0, keys[k], values_default[k]));
+ }
+
+      // Check key validity for all keys in the default CF (and, just below,
+      // for the keys inserted so far into the pikachu CF).
+ for (size_t k = 0; k < KVSIZE; k++) {
+ ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+ }
+ // Note that at this point, only keys[0:KVSIZE/2]
+ // have been inserted into Pikachu.
+ for (size_t k = 0; k < KVSIZE / 2; k++) {
+ ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+ }
+ }
+
+    // Insertion of K-V pairs, multiple times (overwrites)
+ // into 'pikachu' CF. Will trigger mempurge.
+ // Check that we keep the older logs for 'default' imm().
+ for (size_t j = 0; j < NUM_REPEAT; j++) {
+ // Create value strings of length RAND_VALUES_LENGTH bytes.
+ for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+ values_pikachu[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ }
+
+ // Insert K-V into pikachu CF.
+ for (size_t k = KVSIZE / 2; k < KVSIZE; k++) {
+ ASSERT_OK(Put(1, keys[k], values_pikachu[k]));
+ }
+
+ // Check key validity, for all keys,
+ // both in default and pikachu.
+ for (size_t k = 0; k < KVSIZE; k++) {
+ ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+ ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+ }
+ }
+
+ // Check that there was at least one mempurge
+ const uint32_t EXPECTED_MIN_MEMPURGE_COUNT = 1;
+ // Check that no SST files were created during flush.
+ const uint32_t EXPECTED_SST_COUNT = 0;
+
+ EXPECT_GE(mempurge_count.exchange(0), EXPECTED_MIN_MEMPURGE_COUNT);
+ if (options.experimental_mempurge_threshold ==
+ std::numeric_limits<double>::max()) {
+ EXPECT_EQ(sst_count.exchange(0), EXPECTED_SST_COUNT);
+ }
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ // Check that there was no data corruption anywhere,
+ // neither in the 'default' nor in the 'pikachu' CF.
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v4"));
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ // Check keys in 'default' and 'pikachu'.
+ // keys[0:KVSIZE/2] were certainly contained
+ // in the imm() at Reopen/recovery time.
+ for (size_t k = 0; k < KVSIZE; k++) {
+ ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+ ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+ }
+ // Insertion of random K-V pairs to trigger
+ // a flush in the Pikachu CF.
+ for (size_t j = 0; j < NUM_REPEAT; j++) {
+ RNDKEY = rnd.RandomString(RAND_KEY_LENGTH);
+ RNDVALUE = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(1, RNDKEY, RNDVALUE));
+ }
+ // Assert that there was at least one flush to storage.
+ EXPECT_GT(sst_count.exchange(0), EXPECTED_SST_COUNT);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ // Since values in 'default' are held in the mutable mem()
+ // and in imm(), check that the flush in 'pikachu' did not
+ // affect these values.
+ for (size_t k = 0; k < KVSIZE; k++) {
+ ASSERT_EQ(Get(0, keys[k]), values_default[k]);
+ ASSERT_EQ(Get(1, keys[k]), values_pikachu[k]);
+ }
+ ASSERT_EQ(Get(1, RNDKEY), RNDVALUE);
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBFlushTest, MemPurgeCorrectLogNumberAndSSTFileCreation) {
+ // Before our bug fix, we noticed that when 2 memtables were
+ // being flushed (with one memtable being the output of a
+ // previous MemPurge and one memtable being a newly-sealed memtable),
+ // the SST file created was not properly added to the DB version
+ // (via the VersionEdit obj), leading to data loss (the SST file
+ // was later being purged as an obsolete file).
+ // Therefore, we reproduce this scenario to test our fix.
+ Options options = CurrentOptions();
+
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.inplace_update_support = false;
+ options.allow_concurrent_memtable_write = true;
+
+ // Enforce size of a single MemTable to 1MB (1MB = 1048576 bytes).
+ options.write_buffer_size = 1 << 20;
+ // Activate the MemPurge prototype.
+ options.experimental_mempurge_threshold = 1.0;
+
+ // Force more than one memtable to be present before a flush is triggered.
+ // For some reason this option does not seem to be enforced,
+ // so the checks below are designed to make sure that we
+ // are indeed exercising the intended scenario.
+ options.min_write_buffer_number_to_merge = 3;
+ options.max_write_buffer_number = 5;
+ options.max_write_buffer_size_to_maintain = 2 * (options.write_buffer_size);
+ options.disable_auto_compactions = true;
+ ASSERT_OK(TryReopen(options));
+
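+ // Use sync points to count successful mempurge events and SST files
+ // created at flush time; the counters are checked below.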
+ std::atomic<uint32_t> mempurge_count{0};
+ std::atomic<uint32_t> sst_count{0};
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:MemPurgeSuccessful",
+ [&](void* /*arg*/) { mempurge_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushJob:SSTFileCreated", [&](void* /*arg*/) { sst_count++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Dummy variable used for the following callback function.
+ uint64_t ZERO = 0;
+ // We will first execute mempurge operations exclusively.
+ // Therefore, when the first flush is triggered, we want to make
+ // sure there are at least 2 memtables being flushed: one output
+ // from a previous mempurge, and one newly sealed memtable.
+ // This is when we observed in the past that some SST files created
+ // were not properly added to the DB version (via the VersionEdit obj).
+ std::atomic<uint64_t> num_memtable_at_first_flush(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:num_memtables", [&](void* arg) {
+ uint64_t* mems_size = reinterpret_cast<uint64_t*>(arg);
+ // atomic_compare_exchange_strong updates the value of ZERO
+ // (the "expected" object) when the exchange fails, so we make sure
+ // ZERO is reset to zero before each call.
+ ZERO = 0;
+ std::atomic_compare_exchange_strong(&num_memtable_at_first_flush, &ZERO,
+ *mems_size);
+ });
+
+ const std::vector<std::string> KEYS = {
+ "ThisIsKey1", "ThisIsKey2", "ThisIsKey3", "ThisIsKey4", "ThisIsKey5",
+ "ThisIsKey6", "ThisIsKey7", "ThisIsKey8", "ThisIsKey9"};
+ const std::string NOT_FOUND = "NOT_FOUND";
+
+ Random rnd(117);
+ const uint64_t NUM_REPEAT_OVERWRITES = 100;
+ const uint64_t NUM_RAND_INSERTS = 500;
+ const uint64_t RAND_VALUES_LENGTH = 10240;
+
+ std::string key, value;
+ std::vector<std::string> values(9, "");
+
+ // Keys used to check that no SST file disappeared.
+ for (uint64_t k = 0; k < 5; k++) {
+ values[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEYS[k], values[k]));
+ }
+
+ // Insertion of K-V pairs, multiple times.
+ // Trigger at least one mempurge and no SST file creation.
+ for (size_t i = 0; i < NUM_REPEAT_OVERWRITES; i++) {
+ // Create value strings of length RAND_VALUES_LENGTH bytes.
+ for (uint64_t k = 5; k < values.size(); k++) {
+ values[k] = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(KEYS[k], values[k]));
+ }
+ // Check database consistency.
+ for (uint64_t k = 0; k < values.size(); k++) {
+ ASSERT_EQ(Get(KEYS[k]), values[k]);
+ }
+ }
+
+ // Check that there was at least one mempurge
+ uint32_t expected_min_mempurge_count = 1;
+ // Check that no SST files were created during flush.
+ uint32_t expected_sst_count = 0;
+ EXPECT_GE(mempurge_count.load(), expected_min_mempurge_count);
+ EXPECT_EQ(sst_count.load(), expected_sst_count);
+
+ // Trigger an SST file creation and no mempurge.
+ for (size_t i = 0; i < NUM_RAND_INSERTS; i++) {
+ key = rnd.RandomString(RAND_VALUES_LENGTH);
+ // Create value strings of length RAND_VALUES_LENGTH bytes.
+ value = rnd.RandomString(RAND_VALUES_LENGTH);
+ ASSERT_OK(Put(key, value));
+ // Check database consistency.
+ for (uint64_t k = 0; k < values.size(); k++) {
+ ASSERT_EQ(Get(KEYS[k]), values[k]);
+ }
+ ASSERT_EQ(Get(key), value);
+ }
+
+ // Check that at least one SST file was created during flush.
+ expected_sst_count = 1;
+ EXPECT_GE(sst_count.load(), expected_sst_count);
+
+ // Oddly enough, num_memtable_at_first_flush is not enforced to be
+ // equal to min_write_buffer_number_to_merge. Therefore, we assert that
+ // the first SST file creation was fed by at least two memtables: one
+ // output memtable from a previous mempurge, and one newly sealed
+ // memtable. This is the scenario where we observed that some SST files
+ // created were not properly added to the DB version before our bug fix.
+ ASSERT_GE(num_memtable_at_first_flush.load(), 2);
+
+ // Check that no data was lost after SST file creation.
+ for (uint64_t k = 0; k < values.size(); k++) {
+ ASSERT_EQ(Get(KEYS[k]), values[k]);
+ }
+ // Extra check of database consistency.
+ ASSERT_EQ(Get(key), value);
+
+ Close();
+}
+
+TEST_P(DBFlushDirectIOTest, DirectIO) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.max_background_flushes = 2;
+ options.use_direct_io_for_flush_and_compaction = GetParam();
+ options.env = MockEnv::Create(Env::Default());
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:create_file", [&](void* arg) {
+ bool* use_direct_writes = static_cast<bool*>(arg);
+ ASSERT_EQ(*use_direct_writes,
+ options.use_direct_io_for_flush_and_compaction);
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v"));
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+ Destroy(options);
+ delete options.env;
+}
+
+TEST_F(DBFlushTest, FlushError) {
+ Options options;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_injection_env.get();
+ Reopen(options);
+
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ fault_injection_env->SetFilesystemActive(false);
+ Status s = dbfull()->TEST_SwitchMemtable();
+ fault_injection_env->SetFilesystemActive(true);
+ Destroy(options);
+ ASSERT_NE(s, Status::OK());
+}
+
+TEST_F(DBFlushTest, ManualFlushFailsInReadOnlyMode) {
+ // Regression test for bug where manual flush hangs forever when the DB
+ // is in read-only mode. Verify it now at least returns, despite failing.
+ Options options;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ options.env = fault_injection_env.get();
+ options.max_write_buffer_number = 2;
+ Reopen(options);
+
+ // Trigger a first flush but don't let it run
+ ASSERT_OK(db_->PauseBackgroundWork());
+ ASSERT_OK(Put("key1", "value1"));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(db_->Flush(flush_opts));
+
+ // Write a key to the second memtable so we have something to flush later
+ // after the DB is in read-only mode.
+ ASSERT_OK(Put("key2", "value2"));
+
+ // Let the first flush continue, hit an error, and put the DB in read-only
+ // mode.
+ fault_injection_env->SetFilesystemActive(false);
+ ASSERT_OK(db_->ContinueBackgroundWork());
+ // We injected the error into the env, so the returned status is not OK.
+ ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable());
+#ifndef ROCKSDB_LITE
+ uint64_t num_bg_errors;
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBackgroundErrors, &num_bg_errors));
+ ASSERT_GT(num_bg_errors, 0);
+#endif // ROCKSDB_LITE
+
+ // In the bug scenario, triggering another flush would cause the second flush
+ // to hang forever. After the fix we expect it to return an error.
+ ASSERT_NOK(db_->Flush(FlushOptions()));
+
+ Close();
+}
+
+TEST_F(DBFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->DisableProcessing();
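+ // The dependencies below make the column family drop happen after the
+ // flush has been scheduled but before the background flush starts, so
+ // that waiting for the flush races with the CF drop.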
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:AfterScheduleFlush",
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+ {"DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BackgroundCallFlush:start",
+ "DBImpl::FlushMemTable:BeforeWaitForBgFlush"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(Put(1, "key", "value"));
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ port::Thread drop_cf_thr([&]() {
+ TEST_SYNC_POINT(
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_.resize(1);
+ TEST_SYNC_POINT(
+ "DBFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+ });
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ ASSERT_NOK(dbfull()->TEST_FlushMemTable(cfd, flush_opts));
+ drop_cf_thr.join();
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, FireOnFlushCompletedAfterCommittedResult) {
+ class TestListener : public EventListener {
+ public:
+ void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+ // There's only one key in each flush.
+ ASSERT_EQ(info.smallest_seqno, info.largest_seqno);
+ ASSERT_NE(0, info.smallest_seqno);
+ if (info.smallest_seqno == seq1) {
+ // First flush completed
+ ASSERT_FALSE(completed1);
+ completed1 = true;
+ CheckFlushResultCommitted(db, seq1);
+ } else {
+ // Second flush completed
+ ASSERT_FALSE(completed2);
+ completed2 = true;
+ ASSERT_EQ(info.smallest_seqno, seq2);
+ CheckFlushResultCommitted(db, seq2);
+ }
+ }
+
+ void CheckFlushResultCommitted(DB* db, SequenceNumber seq) {
+ DBImpl* db_impl = static_cast_with_check<DBImpl>(db);
+ InstrumentedMutex* mutex = db_impl->mutex();
+ mutex->Lock();
+ auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(
+ db->DefaultColumnFamily())
+ ->cfd();
+ ASSERT_LT(seq, cfd->imm()->current()->GetEarliestSequenceNumber());
+ mutex->Unlock();
+ }
+
+ std::atomic<SequenceNumber> seq1{0};
+ std::atomic<SequenceNumber> seq2{0};
+ std::atomic<bool> completed1{false};
+ std::atomic<bool> completed2{false};
+ };
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTableToOutputFile:AfterPickMemtables",
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst"},
+ {"DBImpl::FlushMemTableToOutputFile:Finish",
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitSecond"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table", [&listener](void* arg) {
+ // Wait for the second flush to finish, outside of the mutex.
+ auto* mems = reinterpret_cast<autovector<MemTable*>*>(arg);
+ if (mems->front()->GetEarliestSequenceNumber() == listener->seq1 - 1) {
+ TEST_SYNC_POINT(
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:"
+ "WaitSecond");
+ }
+ });
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ // Setting max_flush_jobs = max_background_jobs / 4 = 2.
+ options.max_background_jobs = 8;
+ // Allow 2 immutable memtables.
+ options.max_write_buffer_number = 3;
+ Reopen(options);
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("foo", "v"));
+ listener->seq1 = db_->GetLatestSequenceNumber();
+ // t1 will wait for the second flush complete before committing flush result.
+ auto t1 = port::Thread([&]() {
+ // flush_opts.wait = true
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ });
+ // Wait for first flush started.
+ TEST_SYNC_POINT(
+ "DBFlushTest::FireOnFlushCompletedAfterCommittedResult:WaitFirst");
+ // The second flush will exit early without committing its result. The
+ // work is delegated to the first flush.
+ ASSERT_OK(Put("bar", "v"));
+ listener->seq2 = db_->GetLatestSequenceNumber();
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(db_->Flush(flush_opts));
+ t1.join();
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+ ASSERT_TRUE(listener->completed1);
+ ASSERT_TRUE(listener->completed2);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBFlushTest, FlushWithBlob) {
+ constexpr uint64_t min_blob_size = 10;
+
+ Options options;
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ Reopen(options);
+
+ constexpr char short_value[] = "short";
+ static_assert(sizeof(short_value) - 1 < min_blob_size,
+ "short_value too long");
+
+ constexpr char long_value[] = "long_value";
+ static_assert(sizeof(long_value) - 1 >= min_blob_size,
+ "long_value too short");
+
+ ASSERT_OK(Put("key1", short_value));
+ ASSERT_OK(Put("key2", long_value));
+
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("key1"), short_value);
+ ASSERT_EQ(Get("key2"), long_value);
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const current = cfd->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ const auto& l0_files = storage_info->LevelFiles(0);
+ ASSERT_EQ(l0_files.size(), 1);
+
+ const FileMetaData* const table_file = l0_files[0];
+ assert(table_file);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 1);
+
+ const auto& blob_file = blob_files.front();
+ assert(blob_file);
+
+ ASSERT_EQ(table_file->smallest.user_key(), "key1");
+ ASSERT_EQ(table_file->largest.user_key(), "key2");
+ ASSERT_EQ(table_file->fd.smallest_seqno, 1);
+ ASSERT_EQ(table_file->fd.largest_seqno, 2);
+ ASSERT_EQ(table_file->oldest_blob_file_number,
+ blob_file->GetBlobFileNumber());
+
+ ASSERT_EQ(blob_file->GetTotalBlobCount(), 1);
+
+#ifndef ROCKSDB_LITE
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_FALSE(compaction_stats.empty());
+ ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize());
+ ASSERT_EQ(compaction_stats[0].bytes_written_blob,
+ blob_file->GetTotalBlobBytes());
+ ASSERT_EQ(compaction_stats[0].num_output_files, 1);
+ ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1);
+
+ const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue();
+ ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED],
+ compaction_stats[0].bytes_written +
+ compaction_stats[0].bytes_written_blob);
+#endif // ROCKSDB_LITE
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoff1) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_fs_env.get();
+ options.checksum_handoff_file_types.Add(FileType::kTableFile);
+ Reopen(options);
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+ // The hash does not match, so the write fails.
+ // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ // Since the file system returns IOStatus::Corruption, it is an
+ // unrecoverable error.
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ASSERT_OK(Put("key3", "value3"));
+ ASSERT_OK(Put("key4", "value4"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Status s = Flush();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+ Reopen(options);
+
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ ASSERT_OK(Put("key5", "value5"));
+ ASSERT_OK(Put("key6", "value6"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+ // Each write will be simulated as corrupted.
+ // Since the file system returns IOStatus::Corruption, it is an
+ // unrecoverable error.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs->IngestDataCorruptionBeforeWrite();
+ });
+ ASSERT_OK(Put("key7", "value7"));
+ ASSERT_OK(Put("key8", "value8"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoff2) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_fs_env.get();
+ Reopen(options);
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(Flush());
+
+ // The option is not set, so the checksum handoff will not be triggered.
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ASSERT_OK(Put("key3", "value3"));
+ ASSERT_OK(Put("key4", "value4"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+ Reopen(options);
+
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ ASSERT_OK(Put("key5", "value5"));
+ ASSERT_OK(Put("key6", "value6"));
+ ASSERT_OK(Flush());
+
+ // The option is not set, so the checksum handoff will not be triggered.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs->IngestDataCorruptionBeforeWrite();
+ });
+ ASSERT_OK(Put("key7", "value7"));
+ ASSERT_OK(Put("key8", "value8"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest1) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_fs_env.get();
+ options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ Reopen(options);
+
+ ASSERT_OK(Put("key1", "value1"));
+ ASSERT_OK(Put("key2", "value2"));
+ ASSERT_OK(Flush());
+
+ // The hash does not match, so the write fails.
+ // fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ // Since the file system returns IOStatus::Corruption, it is mapped to
+ // kFatalError error.
+ ASSERT_OK(Put("key3", "value3"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ });
+ ASSERT_OK(Put("key3", "value3"));
+ ASSERT_OK(Put("key4", "value4"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Status s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, FlushWithChecksumHandoffManifest2) {
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.disable_auto_compactions = true;
+ options.env = fault_fs_env.get();
+ options.checksum_handoff_file_types.Add(FileType::kDescriptorFile);
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+ Reopen(options);
+ // The file system does not support checksum handoff. The check
+ // will be ignored.
+ ASSERT_OK(Put("key5", "value5"));
+ ASSERT_OK(Put("key6", "value6"));
+ ASSERT_OK(Flush());
+
+ // Each write will be simulated as corrupted.
+ // Since the file system returns IOStatus::Corruption, it is mapped to
+ // kFatalError error.
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs->IngestDataCorruptionBeforeWrite(); });
+ ASSERT_OK(Put("key7", "value7"));
+ ASSERT_OK(Put("key8", "value8"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Status s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Destroy(options);
+}
+
+TEST_F(DBFlushTest, PickRightMemtables) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ options.create_if_missing = true;
+
+ const std::string test_cf_name = "test_cf";
+ options.max_write_buffer_number = 128;
+ CreateColumnFamilies({test_cf_name}, options);
+
+ Close();
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, test_cf_name}, options);
+
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "value"));
+
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "key", "value"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
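+ // During SyncClosedLogs, write another key to the second CF and seal a
+ // new memtable; the flush job must still pick only the first memtable
+ // (ID 1), which the second callback verifies.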
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SyncClosedLogs:BeforeReLock", [&](void* /*arg*/) {
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "what", "v"));
+ auto* cfhi =
+ static_cast_with_check<ColumnFamilyHandleImpl>(handles_[1]);
+ assert(cfhi);
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable(cfhi->cfd()));
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", [&](void* arg) {
+ auto* job = reinterpret_cast<FlushJob*>(arg);
+ assert(job);
+ const auto& mems = job->GetMemTables();
+ assert(mems.size() == 1);
+ assert(mems[0]);
+ ASSERT_EQ(1, mems[0]->GetID());
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class DBFlushTestBlobError : public DBFlushTest,
+ public testing::WithParamInterface<std::string> {
+ public:
+ DBFlushTestBlobError() : sync_point_(GetParam()) {}
+
+ std::string sync_point_;
+};
+
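+// The test parameter is the sync point at which an IO error is injected:
+// either while adding a blob record or while appending the blob file footer.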
+INSTANTIATE_TEST_CASE_P(DBFlushTestBlobError, DBFlushTestBlobError,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileBuilder::WriteBlobToFile:AddRecord",
+ "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBFlushTestBlobError, FlushError) {
+ Options options;
+ options.enable_blob_files = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "blob"));
+
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+ Status* const s = static_cast<Status*>(arg);
+ assert(s);
+
+ (*s) = Status::IOError(sync_point_);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_NOK(Flush());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const current = cfd->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ const auto& l0_files = storage_info->LevelFiles(0);
+ ASSERT_TRUE(l0_files.empty());
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_TRUE(blob_files.empty());
+
+ // Make sure the files generated by the failed job have been deleted
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& file : files) {
+ uint64_t number = 0;
+ FileType type = kTableFile;
+
+ if (!ParseFileName(file, &number, &type)) {
+ continue;
+ }
+
+ ASSERT_NE(type, kTableFile);
+ ASSERT_NE(type, kBlobFile);
+ }
+
+#ifndef ROCKSDB_LITE
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ assert(internal_stats);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_FALSE(compaction_stats.empty());
+
+ if (sync_point_ == "BlobFileBuilder::WriteBlobToFile:AddRecord") {
+ ASSERT_EQ(compaction_stats[0].bytes_written, 0);
+ ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0);
+ ASSERT_EQ(compaction_stats[0].num_output_files, 0);
+ ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0);
+ } else {
+ // SST file writing succeeded; blob file writing failed (during Finish)
+ ASSERT_GT(compaction_stats[0].bytes_written, 0);
+ ASSERT_EQ(compaction_stats[0].bytes_written_blob, 0);
+ ASSERT_EQ(compaction_stats[0].num_output_files, 1);
+ ASSERT_EQ(compaction_stats[0].num_output_files_blob, 0);
+ }
+
+ const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue();
+ ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED],
+ compaction_stats[0].bytes_written +
+ compaction_stats[0].bytes_written_blob);
+#endif // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) {
+ class SimpleTestFlushListener : public EventListener {
+ public:
+ explicit SimpleTestFlushListener(DBFlushTest* _test) : test_(_test) {}
+ ~SimpleTestFlushListener() override {}
+
+ void OnFlushBegin(DB* db, const FlushJobInfo& info) override {
+ ASSERT_EQ(static_cast<uint32_t>(0), info.cf_id);
+
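+ // Write a tombstone for "foo", take a snapshot that should see the
+ // tombstone, overwrite "foo", and switch the memtable so these updates
+ // end up in an immutable memtable while the flush is running.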
+ ASSERT_OK(db->Delete(WriteOptions(), "foo"));
+ snapshot_ = db->GetSnapshot();
+ ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+
+ auto* dbimpl = static_cast_with_check<DBImpl>(db);
+ assert(dbimpl);
+
+ ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
+ auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
+ assert(cfhi);
+ ASSERT_OK(dbimpl->TEST_SwitchMemtable(cfhi->cfd()));
+ }
+
+ DBFlushTest* test_ = nullptr;
+ const Snapshot* snapshot_ = nullptr;
+ };
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ auto* listener = new SimpleTestFlushListener(this);
+ options.listeners.emplace_back(listener);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0"));
+
+ ManagedSnapshot snapshot_guard(db_);
+
+ ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+ ASSERT_OK(db_->Flush(FlushOptions(), default_cf));
+
+ const Snapshot* snapshot = listener->snapshot_;
+ assert(snapshot);
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+
+ // Reading with the snapshot should not see "foo".
+ {
+ std::string value;
+ Status s = db_->Get(read_opts, "foo", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.allow_2pc = true;
+ options.atomic_flush = GetParam();
+ // 64MB so that memtable flush won't be triggered by the small writes.
+ options.write_buffer_size = (static_cast<size_t>(64) << 20);
+
+ // Destroy the DB to recreate as a TransactionDB.
+ Close();
+ Destroy(options, true);
+
+ // Create a TransactionDB.
+ TransactionDB* txn_db = nullptr;
+ TransactionDBOptions txn_db_opts;
+ txn_db_opts.write_policy = TxnDBWritePolicy::WRITE_COMMITTED;
+ ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
+ ASSERT_NE(txn_db, nullptr);
+ db_ = txn_db;
+
+ // Create two more column families in addition to the default CF.
+ std::vector<std::string> cfs = {"puppy", "kitty"};
+ CreateColumnFamilies(cfs, options);
+ ASSERT_EQ(handles_.size(), 2);
+ ASSERT_EQ(handles_[0]->GetName(), cfs[0]);
+ ASSERT_EQ(handles_[1]->GetName(), cfs[1]);
+ const size_t kNumCfToFlush = options.atomic_flush ? 2 : 1;
+
+ WriteOptions wopts;
+ TransactionOptions txn_opts;
+ // txn1 only prepares, but does not commit.
+ // The WAL containing the prepared but uncommitted data must be kept.
+ Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+ // txn2 not only prepares, but also commits.
+ Transaction* txn2 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+ ASSERT_NE(txn1, nullptr);
+ ASSERT_NE(txn2, nullptr);
+ for (size_t i = 0; i < kNumCfToFlush; i++) {
+ ASSERT_OK(txn1->Put(handles_[i], "k1", "v1"));
+ ASSERT_OK(txn2->Put(handles_[i], "k2", "v2"));
+ }
+ // A txn must be named before prepare.
+ ASSERT_OK(txn1->SetName("txn1"));
+ ASSERT_OK(txn2->SetName("txn2"));
+ // Prepare writes to WAL, but not to memtable. (WriteCommitted)
+ ASSERT_OK(txn1->Prepare());
+ ASSERT_OK(txn2->Prepare());
+ // Commit writes to memtable.
+ ASSERT_OK(txn2->Commit());
+ delete txn1;
+ delete txn2;
+
+ // There is still unflushed data in the memtables.
+ // But since the data is small enough to reside in the active memtable,
+ // there are no immutable memtables.
+ for (size_t i = 0; i < kNumCfToFlush; i++) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+ }
+
+ // Flush the memtables (atomically when atomic_flush is enabled);
+ // the min log with prepared data should be written to the MANIFEST.
+ std::vector<ColumnFamilyHandle*> cfs_to_flush(kNumCfToFlush);
+ for (size_t i = 0; i < kNumCfToFlush; i++) {
+ cfs_to_flush[i] = handles_[i];
+ }
+ ASSERT_OK(txn_db->Flush(FlushOptions(), cfs_to_flush));
+
+ // There is no remaining data in the memtables after the flush.
+ for (size_t i = 0; i < kNumCfToFlush; i++) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
+ }
+
+ // The recovered min log number with prepared data should be non-zero.
+ // In 2pc mode, MinLogNumberToKeep returns the
+ // VersionSet::min_log_number_to_keep recovered from the MANIFEST; if it is
+ // 0, it means the flush didn't write min_log_number_to_keep to the MANIFEST.
+ cfs.push_back(kDefaultColumnFamilyName);
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+ DBImpl* db_impl = reinterpret_cast<DBImpl*>(db_);
+ ASSERT_TRUE(db_impl->allow_2pc());
+ ASSERT_NE(db_impl->MinLogNumberToKeep(), 0);
+}
+#endif // ROCKSDB_LITE
+
+TEST_P(DBAtomicFlushTest, ManualAtomicFlush) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ options.write_buffer_size = (static_cast<size_t>(64) << 20);
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+ }
+
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+ }
+
+ std::vector<int> cf_ids;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ cf_ids.emplace_back(static_cast<int>(i));
+ }
+ ASSERT_OK(Flush(cf_ids));
+
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(cfh->cfd()->GetFlushReason(), FlushReason::kManualFlush);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+}
+
+TEST_P(DBAtomicFlushTest, PrecomputeMinLogNumberToKeepNon2PC) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ options.write_buffer_size = (static_cast<size_t>(64) << 20);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const size_t num_cfs = handles_.size();
+ ASSERT_EQ(num_cfs, 2);
+ WriteOptions wopts;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+ }
+
+ {
+ // Flush the default CF only.
+ std::vector<int> cf_ids{0};
+ ASSERT_OK(Flush(cf_ids));
+
+ autovector<ColumnFamilyData*> flushed_cfds;
+ autovector<autovector<VersionEdit*>> flush_edits;
+ auto flushed_cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[0]);
+ flushed_cfds.push_back(flushed_cfh->cfd());
+ flush_edits.push_back({});
+ auto unflushed_cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[1]);
+
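+ // With only the default CF flushed, the earliest log that must be kept
+ // is the one still referenced by the unflushed CF.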
+ ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(),
+ flushed_cfds, flush_edits),
+ unflushed_cfh->cfd()->GetLogNumber());
+ }
+
+ {
+ // Flush all CFs.
+ std::vector<int> cf_ids;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ cf_ids.emplace_back(static_cast<int>(i));
+ }
+ ASSERT_OK(Flush(cf_ids));
+ uint64_t log_num_after_flush = dbfull()->TEST_GetCurrentLogNumber();
+
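+ // Once every CF has been flushed, no CF references an older log anymore,
+ // so the computed min log number should equal the current log number.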
+ uint64_t min_log_number_to_keep = std::numeric_limits<uint64_t>::max();
+ autovector<ColumnFamilyData*> flushed_cfds;
+ autovector<autovector<VersionEdit*>> flush_edits;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ flushed_cfds.push_back(cfh->cfd());
+ flush_edits.push_back({});
+ min_log_number_to_keep =
+ std::min(min_log_number_to_keep, cfh->cfd()->GetLogNumber());
+ }
+ ASSERT_EQ(min_log_number_to_keep, log_num_after_flush);
+ ASSERT_EQ(PrecomputeMinLogNumberToKeepNon2PC(dbfull()->GetVersionSet(),
+ flushed_cfds, flush_edits),
+ min_log_number_to_keep);
+ }
+}
+
+TEST_P(DBAtomicFlushTest, AtomicFlushTriggeredByMemTableFull) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ // 4KB so that we can easily trigger auto flush.
+ options.write_buffer_size = 4096;
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:FlushFinish:0",
+ "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ASSERT_OK(Put(static_cast<int>(i) /*cf*/, "key", "value", wopts));
+ }
+ // Keep writing to one of the column families to trigger an auto flush.
+ for (int i = 0; i != 4000; ++i) {
+ ASSERT_OK(Put(static_cast<int>(num_cfs) - 1 /*cf*/,
+ "key" + std::to_string(i), "value" + std::to_string(i),
+ wopts));
+ }
+
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::AtomicFlushTriggeredByMemTableFull:BeforeCheck");
+ if (options.atomic_flush) {
+ for (size_t i = 0; i + 1 != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+ } else {
+ for (size_t i = 0; i + 1 != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(0, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_FALSE(cfh->cfd()->mem()->IsEmpty());
+ }
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, AtomicFlushRollbackSomeJobs) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1",
+ "DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1"},
+ {"DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2",
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ }
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
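+ // Let some flush jobs complete, then deactivate the filesystem so the
+ // remaining jobs fail; the completed jobs must be rolled back, leaving
+ // one unflushed immutable memtable per CF.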
+ TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:1");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT("DBAtomicFlushTest::AtomicFlushRollbackSomeJobs:2");
+ for (auto* cfh : handles_) {
+ // Returns the IO error that happened during the flush.
+ ASSERT_NOK(dbfull()->TEST_WaitForFlushMemTable(cfh));
+ }
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(handles_[i]);
+ ASSERT_EQ(1, cfh->cfd()->imm()->NumNotFlushed());
+ ASSERT_TRUE(cfh->cfd()->mem()->IsEmpty());
+ }
+ fault_injection_env->SetFilesystemActive(true);
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, FlushMultipleCFs_DropSomeBeforeRequestFlush) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ std::vector<int> cf_ids;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ cf_ids.push_back(cf_id);
+ }
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_TRUE(Flush(cf_ids).IsColumnFamilyDropped());
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest,
+ FlushMultipleCFs_DropSomeAfterScheduleFlushBeforeFlushJobRun) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+ "DBAtomicFlushTest::BeforeDropCF"},
+ {"DBAtomicFlushTest::AfterDropCF",
+ "DBImpl::BackgroundCallFlush:start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(3, num_cfs);
+ WriteOptions wopts;
+ wopts.disableWAL = true;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_OK(Put(cf_id, "key", "value", wopts));
+ }
+ port::Thread user_thread([&]() {
+ TEST_SYNC_POINT("DBAtomicFlushTest::BeforeDropCF");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ TEST_SYNC_POINT("DBAtomicFlushTest::AfterDropCF");
+ });
+ FlushOptions flush_opts;
+ flush_opts.wait = true;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+ user_thread.join();
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_EQ("value", Get(cf_id, "key"));
+ }
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "eevee"}, options);
+ num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ int cf_id = static_cast<int>(i);
+ ASSERT_EQ("value", Get(cf_id, "key"));
+ }
+ Destroy(options);
+}
+
+TEST_P(DBAtomicFlushTest, TriggerFlushAndClose) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ const int kNumKeysTriggerFlush = 4;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysTriggerFlush));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i != kNumKeysTriggerFlush; ++i) {
+ ASSERT_OK(Put(0, "key" + std::to_string(i), "value" + std::to_string(i)));
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put(0, "key", "value"));
+ Close();
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ("value", Get(0, "key"));
+}
+
+TEST_P(DBAtomicFlushTest, PickMemtablesRaceWithBackgroundFlush) {
+ bool atomic_flush = GetParam();
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ options.max_write_buffer_number = 4;
+ // Set min_write_buffer_number_to_merge to be greater than 1, so that
+ // a column family with one memtable in the imm will not cause IsFlushPending
+ // to return true when flush_requested_ is false.
+ options.min_write_buffer_number_to_merge = 2;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(dbfull()->PauseBackgroundWork());
+ ASSERT_OK(Put(0, "key00", "value00"));
+ ASSERT_OK(Put(1, "key10", "value10"));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_opts, handles_));
+ ASSERT_OK(Put(0, "key01", "value01"));
+ // Since max_write_buffer_number is 4, the following flush won't cause write
+ // stall.
+ ASSERT_OK(dbfull()->Flush(flush_opts));
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ ASSERT_OK(dbfull()->DestroyColumnFamilyHandle(handles_[1]));
+ handles_[1] = nullptr;
+ ASSERT_OK(dbfull()->ContinueBackgroundWork());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ delete handles_[0];
+ handles_.clear();
+}
+
+TEST_P(DBAtomicFlushTest, CFDropRaceWithWaitForFlushMemTables) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::AtomicFlushMemTables:AfterScheduleFlush",
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop"},
+ {"DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBImpl::BackgroundCallFlush:start",
+ "DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ(2, handles_.size());
+ ASSERT_OK(Put(0, "key", "value"));
+ ASSERT_OK(Put(1, "key", "value"));
+ auto* cfd_default =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily())
+ ->cfd();
+ auto* cfd_pikachu = static_cast<ColumnFamilyHandleImpl*>(handles_[1])->cfd();
+ port::Thread drop_cf_thr([&]() {
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:BeforeDrop");
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ delete handles_[1];
+ handles_.resize(1);
+ TEST_SYNC_POINT(
+ "DBAtomicFlushTest::CFDropRaceWithWaitForFlushMemTables:AfterFree");
+ });
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ ASSERT_OK(dbfull()->TEST_AtomicFlushMemTables({cfd_default, cfd_pikachu},
+ flush_opts));
+ drop_cf_thr.join();
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBAtomicFlushTest, RollbackAfterFailToInstallResults) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ auto fault_injection_env = std::make_shared<FaultInjectionTestEnv>(env_);
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ options.create_if_missing = true;
+ options.atomic_flush = atomic_flush;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ ASSERT_OK(Put(static_cast<int>(cf), "a", "value"));
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ [&](void* /*arg*/) { fault_injection_env->SetFilesystemActive(false); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ FlushOptions flush_opts;
+ Status s = db_->Flush(flush_opts, handles_);
+ ASSERT_NOK(s);
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+// In atomic flush, concurrent bg flush threads commit to the MANIFEST
+// serially, in the order of their picked memtables for each column family.
+// Only when a bg flush thread finds out that its memtables are the earliest
+// unflushed ones for all the included column families will this bg flush
+// thread continue to commit to MANIFEST.
+// This unit test uses sync points to coordinate the execution of two bg
+// threads executing the same sequence of functions. The interleaving is as
+// follows.
+// time   bg1                                bg2
+//  |     pick memtables to flush
+//  |     flush memtables cf1_m1, cf2_m1
+//  |     join MANIFEST write queue
+//  |                                        pick memtables to flush
+//  |                                        flush memtables cf1_(m1+1)
+//  |                                        join MANIFEST write queue
+//  |                                        wait to write MANIFEST
+//  |     write MANIFEST
+//  |     IO error
+//  |                                        detect IO error and stop waiting
+//  V
+TEST_P(DBAtomicFlushTest, BgThreadNoWaitAfterManifestError) {
+ bool atomic_flush = GetParam();
+ if (!atomic_flush) {
+ return;
+ }
+ auto fault_injection_env = std::make_shared<FaultInjectionTestEnv>(env_);
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = true;
+ options.env = fault_injection_env.get();
+ // Set a larger value than default so that RocksDB can schedule concurrent
+ // background flush threads.
+ options.max_background_jobs = 8;
+ options.max_write_buffer_number = 8;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ assert(2 == handles_.size());
+
+ WriteOptions write_opts;
+ write_opts.disableWAL = true;
+
+ ASSERT_OK(Put(0, "a", "v_0_a", write_opts));
+ ASSERT_OK(Put(1, "a", "v_1_a", write_opts));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"BgFlushThr2:WaitToCommit", "BgFlushThr1:BeforeWriteManifest"},
+ });
+
+ std::thread::id bg_flush_thr1, bg_flush_thr2;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCallFlush:start", [&](void*) {
+ if (bg_flush_thr1 == std::thread::id()) {
+ bg_flush_thr1 = std::this_thread::get_id();
+ } else if (bg_flush_thr2 == std::thread::id()) {
+ bg_flush_thr2 = std::this_thread::get_id();
+ }
+ });
+
+ int called = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", [&](void* arg) {
+ if (std::this_thread::get_id() == bg_flush_thr2) {
+ const auto* ptr = reinterpret_cast<std::pair<Status, bool>*>(arg);
+ assert(ptr);
+ if (0 == called) {
+ // When bg flush thread 2 reaches here for the first time.
+ ASSERT_OK(ptr->first);
+ ASSERT_TRUE(ptr->second);
+ } else if (1 == called) {
+ // When bg flush thread 2 reaches here for the second time.
+ ASSERT_TRUE(ptr->first.IsIOError());
+ ASSERT_FALSE(ptr->second);
+ }
+ ++called;
+ TEST_SYNC_POINT("BgFlushThr2:WaitToCommit");
+ }
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ [&](void*) {
+ if (std::this_thread::get_id() == bg_flush_thr1) {
+ TEST_SYNC_POINT("BgFlushThr1:BeforeWriteManifest");
+ }
+ });
+
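+ // While bg flush thread 1 is writing the MANIFEST, schedule another
+ // flush of the default CF so that bg flush thread 2 joins the MANIFEST
+ // write queue behind it.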
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ if (std::this_thread::get_id() != bg_flush_thr1) {
+ return;
+ }
+ ASSERT_OK(db_->Put(write_opts, "b", "v_1_b"));
+
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ std::vector<ColumnFamilyHandle*> cfhs(1, db_->DefaultColumnFamily());
+ ASSERT_OK(dbfull()->Flush(flush_opts, cfhs));
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) {
+ auto* ptr = reinterpret_cast<IOStatus*>(arg);
+ assert(ptr);
+ *ptr = IOStatus::IOError("Injected failure");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(dbfull()->Flush(FlushOptions(), handles_).IsIOError());
+
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBAtomicFlushTest, NoWaitWhenWritesStopped) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.atomic_flush = GetParam();
+ options.max_write_buffer_number = 2;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+
+ Reopen(options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::DelayWrite:Start",
+ "DBAtomicFlushTest::NoWaitWhenWritesStopped:0"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->PauseBackgroundWork());
+ for (int i = 0; i < options.max_write_buffer_number; ++i) {
+ ASSERT_OK(Put("k" + std::to_string(i), "v" + std::to_string(i)));
+ }
+ std::thread stalled_writer([&]() { ASSERT_OK(Put("k", "v")); });
+
+ TEST_SYNC_POINT("DBAtomicFlushTest::NoWaitWhenWritesStopped:0");
+
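+ // With writes stalled and background work paused, a non-waiting flush
+ // that allows write stall should return TryAgain instead of blocking.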
+ {
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ flush_opts.allow_write_stall = true;
+ ASSERT_TRUE(db_->Flush(flush_opts).IsTryAgain());
+ }
+
+ ASSERT_OK(dbfull()->ContinueBackgroundWork());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ stalled_writer.join();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBFlushDirectIOTest, DBFlushDirectIOTest,
+ testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(DBAtomicFlushTest, DBAtomicFlushTest, testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_impl/compacted_db_impl.cc b/src/rocksdb/db/db_impl/compacted_db_impl.cc
new file mode 100644
index 000000000..f18ee0d72
--- /dev/null
+++ b/src/rocksdb/db/db_impl/compacted_db_impl.cc
@@ -0,0 +1,257 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "db/db_impl/compacted_db_impl.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "table/get_context.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+extern void MarkKeyMayExist(void* arg);
+extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
+ const Slice& v, bool hit_and_return);
+
+CompactedDBImpl::CompactedDBImpl(const DBOptions& options,
+ const std::string& dbname)
+ : DBImpl(options, dbname, /*seq_per_batch*/ false, /*batch_per_txn*/ true,
+ /*read_only*/ true),
+ cfd_(nullptr),
+ version_(nullptr),
+ user_comparator_(nullptr) {}
+
+CompactedDBImpl::~CompactedDBImpl() {}
+
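+// Binary-search the sorted file list for the first file whose largest key
+// is not less than the lookup key.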
+size_t CompactedDBImpl::FindFile(const Slice& key) {
+ size_t right = files_.num_files - 1;
+ auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+ return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
+ };
+ return static_cast<size_t>(
+ std::lower_bound(files_.files, files_.files + right, key, cmp) -
+ files_.files);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+ const Slice& key, PinnableSlice* value) {
+ return Get(options, /*column_family*/ nullptr, key, value,
+ /*timestamp*/ nullptr);
+}
+
+Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
+ const Slice& key, PinnableSlice* value,
+ std::string* timestamp) {
+ assert(user_comparator_);
+ if (options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ DefaultColumnFamily(), *(options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ const Status s = FailIfCfHasTs(DefaultColumnFamily());
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Clear the timestamps for returning results so that we can distinguish
+ // between a tombstone and a key that has never been written.
+ if (timestamp) {
+ timestamp->clear();
+ }
+
+ GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
+ std::string* ts =
+ user_comparator_->timestamp_size() > 0 ? timestamp : nullptr;
+ LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
+ GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
+ GetContext::kNotFound, lkey.user_key(), value,
+ /*columns=*/nullptr, ts, nullptr, nullptr, true,
+ nullptr, nullptr, nullptr, nullptr, &read_cb);
+
+ const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
+ if (user_comparator_->CompareWithoutTimestamp(
+ key, /*a_has_ts=*/false,
+ ExtractUserKeyAndStripTimestamp(f.smallest_key,
+ user_comparator_->timestamp_size()),
+ /*b_has_ts=*/false) < 0) {
+ return Status::NotFound();
+ }
+ Status s = f.fd.table_reader->Get(options, lkey.internal_key(), &get_context,
+ nullptr);
+ if (!s.ok() && !s.IsNotFound()) {
+ return s;
+ }
+ if (get_context.State() == GetContext::kFound) {
+ return Status::OK();
+ }
+ return Status::NotFound();
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(
+ const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ return MultiGet(options, keys, values, /*timestamps*/ nullptr);
+}
+
+std::vector<Status> CompactedDBImpl::MultiGet(
+ const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys, std::vector<std::string>* values,
+ std::vector<std::string>* timestamps) {
+ assert(user_comparator_);
+ size_t num_keys = keys.size();
+
+ if (options.timestamp) {
+ Status s = FailIfTsMismatchCf(DefaultColumnFamily(), *(options.timestamp),
+ /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return std::vector<Status>(num_keys, s);
+ }
+ } else {
+ Status s = FailIfCfHasTs(DefaultColumnFamily());
+ if (!s.ok()) {
+ return std::vector<Status>(num_keys, s);
+ }
+ }
+
+ // Clear the timestamps for returning results so that we can distinguish
+ // between a tombstone and a key that has never been written
+ if (timestamps) {
+ for (auto& ts : *timestamps) {
+ ts.clear();
+ }
+ }
+
+ GetWithTimestampReadCallback read_cb(kMaxSequenceNumber);
+ autovector<TableReader*, 16> reader_list;
+ for (const auto& key : keys) {
+ LookupKey lkey(key, kMaxSequenceNumber, options.timestamp);
+ const FdWithKeyRange& f = files_.files[FindFile(lkey.user_key())];
+ if (user_comparator_->CompareWithoutTimestamp(
+ key, /*a_has_ts=*/false,
+ ExtractUserKeyAndStripTimestamp(f.smallest_key,
+ user_comparator_->timestamp_size()),
+ /*b_has_ts=*/false) < 0) {
+ reader_list.push_back(nullptr);
+ } else {
+ f.fd.table_reader->Prepare(lkey.internal_key());
+ reader_list.push_back(f.fd.table_reader);
+ }
+ }
+ std::vector<Status> statuses(num_keys, Status::NotFound());
+ values->resize(num_keys);
+ if (timestamps) {
+ timestamps->resize(num_keys);
+ }
+ int idx = 0;
+ for (auto* r : reader_list) {
+ if (r != nullptr) {
+ PinnableSlice pinnable_val;
+ std::string& value = (*values)[idx];
+ LookupKey lkey(keys[idx], kMaxSequenceNumber, options.timestamp);
+ std::string* timestamp = timestamps ? &(*timestamps)[idx] : nullptr;
+ GetContext get_context(
+ user_comparator_, nullptr, nullptr, nullptr, GetContext::kNotFound,
+ lkey.user_key(), &pinnable_val, /*columns=*/nullptr,
+ user_comparator_->timestamp_size() > 0 ? timestamp : nullptr, nullptr,
+ nullptr, true, nullptr, nullptr, nullptr, nullptr, &read_cb);
+ Status s = r->Get(options, lkey.internal_key(), &get_context, nullptr);
+ assert(static_cast<size_t>(idx) < statuses.size());
+ if (!s.ok() && !s.IsNotFound()) {
+ statuses[idx] = s;
+ } else {
+ value.assign(pinnable_val.data(), pinnable_val.size());
+ if (get_context.State() == GetContext::kFound) {
+ statuses[idx] = Status::OK();
+ }
+ }
+ }
+ ++idx;
+ }
+ return statuses;
+}
+
+Status CompactedDBImpl::Init(const Options& options) {
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ mutex_.Lock();
+ ColumnFamilyDescriptor cf(kDefaultColumnFamilyName,
+ ColumnFamilyOptions(options));
+ Status s = Recover({cf}, true /* read only */, false, true);
+ if (s.ok()) {
+ cfd_ = static_cast_with_check<ColumnFamilyHandleImpl>(DefaultColumnFamily())
+ ->cfd();
+ cfd_->InstallSuperVersion(&sv_context, &mutex_);
+ }
+ mutex_.Unlock();
+ sv_context.Clean();
+ if (!s.ok()) {
+ return s;
+ }
+ NewThreadStatusCfInfo(cfd_);
+ version_ = cfd_->GetSuperVersion()->current;
+ user_comparator_ = cfd_->user_comparator();
+ auto* vstorage = version_->storage_info();
+ if (vstorage->num_non_empty_levels() == 0) {
+ return Status::NotSupported("no file exists");
+ }
+ const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0);
+ // L0 should have at most one file
+ if (l0.num_files > 1) {
+ return Status::NotSupported("L0 contain more than 1 file");
+ }
+ if (l0.num_files == 1) {
+ if (vstorage->num_non_empty_levels() > 1) {
+ return Status::NotSupported("Both L0 and other level contain files");
+ }
+ files_ = l0;
+ return Status::OK();
+ }
+
+ for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) {
+ if (vstorage->LevelFilesBrief(i).num_files > 0) {
+ return Status::NotSupported("Other levels also contain files");
+ }
+ }
+
+ int level = vstorage->num_non_empty_levels() - 1;
+ if (vstorage->LevelFilesBrief(level).num_files > 0) {
+ files_ = vstorage->LevelFilesBrief(level);
+ return Status::OK();
+ }
+ return Status::NotSupported("no file exists");
+}
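Init() only accepts a DB whose live data sits entirely in a single L0 file or in a single non-empty bottommost level. Below is a minimal, illustrative sketch (not part of this patch) of how an application might prepare such a DB using only the public DB::CompactRange API, so that the checks above succeed.

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"
    #include "rocksdb/options.h"

    // Illustrative only: compact everything so CompactedDBImpl::Init() will
    // accept the DB afterwards.
    void PrepareFullyCompactedDb(const std::string& path) {
      rocksdb::Options options;
      options.create_if_missing = true;
      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, path, &db);
      assert(s.ok());
      // Full-range manual compaction; afterwards all live data should reside
      // in a single (bottommost) level.
      s = db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
      assert(s.ok());
      s = db->Close();
      assert(s.ok());
      delete db;
    }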
+
+Status CompactedDBImpl::Open(const Options& options, const std::string& dbname,
+ DB** dbptr) {
+ *dbptr = nullptr;
+
+ if (options.max_open_files != -1) {
+ return Status::InvalidArgument("require max_open_files = -1");
+ }
+ if (options.merge_operator.get() != nullptr) {
+ return Status::InvalidArgument("merge operator is not supported");
+ }
+ DBOptions db_options(options);
+ std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
+ Status s = db->Init(options);
+ if (s.ok()) {
+ s = db->StartPeriodicTaskScheduler();
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(db->immutable_db_options_.info_log,
+ "Opened the db as fully compacted mode");
+ LogFlush(db->immutable_db_options_.info_log);
+ *dbptr = db.release();
+ }
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
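CompactedDBImpl::Open() above rejects anything other than max_open_files == -1 and a DB without a merge operator before probing the LSM shape in Init(). The sketch below shows how an application could end up on this code path; to the best of my knowledge the usual entry point is DB::OpenForReadOnly(), which tries the compacted implementation first, but treat that calling chain as an assumption rather than something this diff shows.

    #include <cassert>
    #include <string>
    #include "rocksdb/db.h"

    void OpenFullyCompacted(const std::string& path) {
      rocksdb::Options options;
      options.max_open_files = -1;  // required by CompactedDBImpl::Open()
      // options.merge_operator must stay unset; merge operators are rejected.
      rocksdb::DB* db = nullptr;
      // Assumption: the read-only open path prefers the compacted
      // implementation when the DB is fully compacted.
      rocksdb::Status s = rocksdb::DB::OpenForReadOnly(options, path, &db);
      assert(s.ok());
      std::string value;
      s = db->Get(rocksdb::ReadOptions(), "some-key", &value);
      assert(s.ok() || s.IsNotFound());
      delete db;
    }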
diff --git a/src/rocksdb/db/db_impl/compacted_db_impl.h b/src/rocksdb/db/db_impl/compacted_db_impl.h
new file mode 100644
index 000000000..eb458b85d
--- /dev/null
+++ b/src/rocksdb/db/db_impl/compacted_db_impl.h
@@ -0,0 +1,154 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#ifndef ROCKSDB_LITE
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: Share common structure with DBImplSecondary and DBImplReadOnly
+class CompactedDBImpl : public DBImpl {
+ public:
+ CompactedDBImpl(const DBOptions& options, const std::string& dbname);
+ // No copying allowed
+ CompactedDBImpl(const CompactedDBImpl&) = delete;
+ void operator=(const CompactedDBImpl&) = delete;
+
+ ~CompactedDBImpl() override;
+
+ static Status Open(const Options& options, const std::string& dbname,
+ DB** dbptr);
+
+ // Implementations of the DB interface
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value,
+ std::string* timestamp) override;
+
+ using DB::MultiGet;
+ // Note that CompactedDBImpl::MultiGet is not the optimized version of
+ // MultiGet to use.
+ // TODO: optimize CompactedDBImpl::MultiGet, see DBImpl::MultiGet for details.
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options, const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+
+ std::vector<Status> MultiGet(const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>&,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values,
+ std::vector<std::string>* timestamps) override;
+
+ using DBImpl::Put;
+ virtual Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ using DBImpl::PutEntity;
+ Status PutEntity(const WriteOptions& /* options */,
+ ColumnFamilyHandle* /* column_family */,
+ const Slice& /* key */,
+ const WideColumns& /* columns */) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ using DBImpl::Merge;
+ virtual Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ using DBImpl::Delete;
+ virtual Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DBImpl::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/,
+ const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ virtual Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ virtual Status GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool /*flush_memtable*/) override {
+ return DBImpl::GetLiveFiles(ret, manifest_file_size,
+ false /* flush_memtable */);
+ }
+ using DBImpl::Flush;
+ virtual Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ virtual Status SyncWAL() override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not supported in compacted db mode.");
+ }
+
+ // FIXME: some missing overrides for more "write" functions
+ // Share with DBImplReadOnly?
+
+ protected:
+#ifndef ROCKSDB_LITE
+ Status FlushForGetLiveFiles() override {
+ // No-op for read-only DB
+ return Status::OK();
+ }
+#endif // !ROCKSDB_LITE
+
+ private:
+ friend class DB;
+ inline size_t FindFile(const Slice& key);
+ Status Init(const Options& options);
+
+ ColumnFamilyData* cfd_;
+ Version* version_;
+ const Comparator* user_comparator_;
+ LevelFilesBrief files_;
+};
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl.cc b/src/rocksdb/db/db_impl/db_impl.cc
new file mode 100644
index 000000000..a431111d4
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.cc
@@ -0,0 +1,5918 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_impl/db_impl.h"
+
+#include <stdint.h>
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <map>
+#include <set>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/builder.h"
+#include "db/compaction/compaction_job.h"
+#include "db/db_info_dumper.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/forward_iterator.h"
+#include "db/import_column_family_job.h"
+#include "db/job_context.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/malloc_stats.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/periodic_task_scheduler.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/table_cache.h"
+#include "db/table_properties_collector.h"
+#include "db/transaction_log_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "db/write_callback.h"
+#include "env/unique_id_gen.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/auto_roll_logger.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/in_memory_stats_history.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "options/cf_options.h"
+#include "options/options_helper.h"
+#include "options/options_parser.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/stats_history.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "rocksdb/version.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/block_based/block.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/get_context.h"
+#include "table/merging_iterator.h"
+#include "table/multiget_context.h"
+#include "table/sst_file_dumper.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "trace_replay/trace_replay.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/compression.h"
+#include "util/crc32c.h"
+#include "util/defer.h"
+#include "util/distributed_mutex.h"
+#include "util/hash_containers.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/trace/replayer_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::string kDefaultColumnFamilyName("default");
+const std::string kPersistentStatsColumnFamilyName(
+ "___rocksdb_stats_history___");
+void DumpRocksDBBuildVersion(Logger* log);
+
+CompressionType GetCompressionFlush(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options) {
+ // Compressing memtable flushes might not help unless the sequential load
+ // optimization is used for leveled compaction. Otherwise the CPU and
+ // latency overhead is not offset by saving much space.
+ if (ioptions.compaction_style == kCompactionStyleUniversal &&
+ mutable_cf_options.compaction_options_universal
+ .compression_size_percent >= 0) {
+ return kNoCompression;
+ }
+ if (mutable_cf_options.compression_per_level.empty()) {
+ return mutable_cf_options.compression;
+ } else {
+ // For leveled compress when min_level_to_compress != 0.
+ return mutable_cf_options.compression_per_level[0];
+ }
+}
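GetCompressionFlush() decides what compression a flush output gets: universal compaction with compression_size_percent >= 0 skips compression at flush time, otherwise the column family's compression (or the level-0 entry of compression_per_level) applies. A hedged configuration sketch of the two cases, with arbitrary example values:

    #include "rocksdb/options.h"

    // Case 1: flushes come out uncompressed; compression is applied later by
    // universal compaction based on compression_size_percent.
    rocksdb::Options MakeUniversalOptions() {
      rocksdb::Options options;
      options.compaction_style = rocksdb::kCompactionStyleUniversal;
      options.compaction_options_universal.compression_size_percent = 80;
      options.compression = rocksdb::kSnappyCompression;
      return options;  // GetCompressionFlush() would pick kNoCompression
    }

    // Case 2: leveled compaction with per-level compression; flushes use the
    // level-0 entry of compression_per_level.
    rocksdb::Options MakeLeveledOptions() {
      rocksdb::Options options;
      options.compression_per_level = {rocksdb::kNoCompression,
                                       rocksdb::kNoCompression,
                                       rocksdb::kZSTD};
      return options;  // GetCompressionFlush() would pick the L0 entry
    }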
+
+namespace {
+void DumpSupportInfo(Logger* logger) {
+ ROCKS_LOG_HEADER(logger, "Compression algorithms supported:");
+ for (auto& compression : OptionsHelper::compression_type_string_map) {
+ if (compression.second != kNoCompression &&
+ compression.second != kDisableCompressionOption) {
+ ROCKS_LOG_HEADER(logger, "\t%s supported: %d", compression.first.c_str(),
+ CompressionTypeSupported(compression.second));
+ }
+ }
+ ROCKS_LOG_HEADER(logger, "Fast CRC32 supported: %s",
+ crc32c::IsFastCrc32Supported().c_str());
+
+ ROCKS_LOG_HEADER(logger, "DMutex implementation: %s", DMutex::kName());
+}
+} // namespace
+
+DBImpl::DBImpl(const DBOptions& options, const std::string& dbname,
+ const bool seq_per_batch, const bool batch_per_txn,
+ bool read_only)
+ : dbname_(dbname),
+ own_info_log_(options.info_log == nullptr),
+ init_logger_creation_s_(),
+ initial_db_options_(SanitizeOptions(dbname, options, read_only,
+ &init_logger_creation_s_)),
+ env_(initial_db_options_.env),
+ io_tracer_(std::make_shared<IOTracer>()),
+ immutable_db_options_(initial_db_options_),
+ fs_(immutable_db_options_.fs, io_tracer_),
+ mutable_db_options_(initial_db_options_),
+ stats_(immutable_db_options_.stats),
+#ifdef COERCE_CONTEXT_SWITCH
+ mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS, &bg_cv_,
+ immutable_db_options_.use_adaptive_mutex),
+#else // COERCE_CONTEXT_SWITCH
+ mutex_(stats_, immutable_db_options_.clock, DB_MUTEX_WAIT_MICROS,
+ immutable_db_options_.use_adaptive_mutex),
+#endif // COERCE_CONTEXT_SWITCH
+ default_cf_handle_(nullptr),
+ error_handler_(this, immutable_db_options_, &mutex_),
+ event_logger_(immutable_db_options_.info_log.get()),
+ max_total_in_memory_state_(0),
+ file_options_(BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+ file_options_for_compaction_(fs_->OptimizeForCompactionTableWrite(
+ file_options_, immutable_db_options_)),
+ seq_per_batch_(seq_per_batch),
+ batch_per_txn_(batch_per_txn),
+ next_job_id_(1),
+ shutting_down_(false),
+ db_lock_(nullptr),
+ manual_compaction_paused_(false),
+ bg_cv_(&mutex_),
+ logfile_number_(0),
+ log_dir_synced_(false),
+ log_empty_(true),
+ persist_stats_cf_handle_(nullptr),
+ log_sync_cv_(&log_write_mutex_),
+ total_log_size_(0),
+ is_snapshot_supported_(true),
+ write_buffer_manager_(immutable_db_options_.write_buffer_manager.get()),
+ write_thread_(immutable_db_options_),
+ nonmem_write_thread_(immutable_db_options_),
+ write_controller_(mutable_db_options_.delayed_write_rate),
+ last_batch_group_size_(0),
+ unscheduled_flushes_(0),
+ unscheduled_compactions_(0),
+ bg_bottom_compaction_scheduled_(0),
+ bg_compaction_scheduled_(0),
+ num_running_compactions_(0),
+ bg_flush_scheduled_(0),
+ num_running_flushes_(0),
+ bg_purge_scheduled_(0),
+ disable_delete_obsolete_files_(0),
+ pending_purge_obsolete_files_(0),
+ delete_obsolete_files_last_run_(immutable_db_options_.clock->NowMicros()),
+ last_stats_dump_time_microsec_(0),
+ has_unpersisted_data_(false),
+ unable_to_release_oldest_log_(false),
+ num_running_ingest_file_(0),
+#ifndef ROCKSDB_LITE
+ wal_manager_(immutable_db_options_, file_options_, io_tracer_,
+ seq_per_batch),
+#endif // ROCKSDB_LITE
+ bg_work_paused_(0),
+ bg_compaction_paused_(0),
+ refitting_level_(false),
+ opened_successfully_(false),
+#ifndef ROCKSDB_LITE
+ periodic_task_scheduler_(),
+#endif // ROCKSDB_LITE
+ two_write_queues_(options.two_write_queues),
+ manual_wal_flush_(options.manual_wal_flush),
+ // last_sequence_ is always maintained by the main queue that also writes
+ // to the memtable. When two_write_queues_ is disabled last seq in
+ // memtable is the same as last seq published to the readers. When it is
+ // enabled but seq_per_batch_ is disabled, last seq in memtable still
+ // indicates last published seq since wal-only writes that go to the 2nd
+ // queue do not consume a sequence number. Otherwise writes performed by
+ // the 2nd queue could change what is visible to the readers. In these
+ // cases, last_seq_same_as_publish_seq_ == false and the 2nd queue maintains
+ // a separate variable to indicate the last published sequence.
+ last_seq_same_as_publish_seq_(
+ !(seq_per_batch && options.two_write_queues)),
+ // Since seq_per_batch_ is currently set only by WritePreparedTxn which
+ // requires a custom gc for compaction, we use that to set use_custom_gc_
+ // as well.
+ use_custom_gc_(seq_per_batch),
+ shutdown_initiated_(false),
+ own_sfm_(options.sst_file_manager == nullptr),
+ closed_(false),
+ atomic_flush_install_cv_(&mutex_),
+ blob_callback_(immutable_db_options_.sst_file_manager.get(), &mutex_,
+ &error_handler_, &event_logger_,
+ immutable_db_options_.listeners, dbname_) {
+ // !batch_per_txn_ implies seq_per_batch_ because it is only unset for
+ // WriteUnprepared, which should use seq_per_batch_.
+ assert(batch_per_txn_ || seq_per_batch_);
+
+ // Reserve ten files or so for other uses and give the rest to TableCache.
+ // Use a large capacity when max_open_files is set to "infinite" (-1).
+ const int table_cache_size = (mutable_db_options_.max_open_files == -1)
+ ? TableCache::kInfiniteCapacity
+ : mutable_db_options_.max_open_files - 10;
+ LRUCacheOptions co;
+ co.capacity = table_cache_size;
+ co.num_shard_bits = immutable_db_options_.table_cache_numshardbits;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ table_cache_ = NewLRUCache(co);
+ SetDbSessionId();
+ assert(!db_session_id_.empty());
+
+#ifndef ROCKSDB_LITE
+ periodic_task_functions_.emplace(PeriodicTaskType::kDumpStats,
+ [this]() { this->DumpStats(); });
+ periodic_task_functions_.emplace(PeriodicTaskType::kPersistStats,
+ [this]() { this->PersistStats(); });
+ periodic_task_functions_.emplace(PeriodicTaskType::kFlushInfoLog,
+ [this]() { this->FlushInfoLog(); });
+ periodic_task_functions_.emplace(
+ PeriodicTaskType::kRecordSeqnoTime,
+ [this]() { this->RecordSeqnoToTimeMapping(); });
+#endif // ROCKSDB_LITE
+
+ versions_.reset(new VersionSet(dbname_, &immutable_db_options_, file_options_,
+ table_cache_.get(), write_buffer_manager_,
+ &write_controller_, &block_cache_tracer_,
+ io_tracer_, db_id_, db_session_id_));
+ column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+
+ DumpRocksDBBuildVersion(immutable_db_options_.info_log.get());
+ DumpDBFileSummary(immutable_db_options_, dbname_, db_session_id_);
+ immutable_db_options_.Dump(immutable_db_options_.info_log.get());
+ mutable_db_options_.Dump(immutable_db_options_.info_log.get());
+ DumpSupportInfo(immutable_db_options_.info_log.get());
+
+ max_total_wal_size_.store(mutable_db_options_.max_total_wal_size,
+ std::memory_order_relaxed);
+ if (write_buffer_manager_) {
+ wbm_stall_.reset(new WBMStallInterface());
+ }
+}
+
+Status DBImpl::Resume() {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Resuming DB");
+
+ InstrumentedMutexLock db_mutex(&mutex_);
+
+ if (!error_handler_.IsDBStopped() && !error_handler_.IsBGWorkStopped()) {
+ // Nothing to do
+ return Status::OK();
+ }
+
+ if (error_handler_.IsRecoveryInProgress()) {
+ // Don't allow a mix of manual and automatic recovery
+ return Status::Busy();
+ }
+
+ mutex_.Unlock();
+ Status s = error_handler_.RecoverFromBGError(true);
+ mutex_.Lock();
+ return s;
+}
+
+// This function implements the guts of recovery from a background error. It
+// is eventually called for both manual as well as automatic recovery. It does
+// the following -
+// 1. Wait for currently scheduled background flush/compaction to exit, in
+// order to avoid inadvertently causing an error and thinking recovery failed
+// 2. Flush memtables if there's any data for all the CFs. This may result in
+// another error, which will be saved by error_handler_ and reported later
+// as the recovery status
+// 3. Find and delete any obsolete files
+// 4. Schedule compactions if needed for all the CFs. This is needed as the
+// flush in the prior step might have been a no-op for some CFs, which
+// means a new super version wouldn't have been installed
+Status DBImpl::ResumeImpl(DBRecoverContext context) {
+ mutex_.AssertHeld();
+ WaitForBackgroundWork();
+
+ Status s;
+ if (shutdown_initiated_) {
+ // Returning shutdown status to SFM during auto recovery will cause it
+ // to abort the recovery and allow the shutdown to progress
+ s = Status::ShutdownInProgress();
+ }
+
+ if (s.ok()) {
+ Status bg_error = error_handler_.GetBGError();
+ if (bg_error.severity() > Status::Severity::kHardError) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "DB resume requested but failed due to Fatal/Unrecoverable error");
+ s = bg_error;
+ }
+ }
+
+ // Make sure the IO Status stored in version set is set to OK.
+ bool file_deletion_disabled = !IsFileDeletionsEnabled();
+ if (s.ok()) {
+ IOStatus io_s = versions_->io_status();
+ if (io_s.IsIOError()) {
+ // If resuming from an IOError that resulted from a MANIFEST write, then
+ // assert that we must have already set the MANIFEST writer to nullptr
+ // during the clean-up phase of MANIFEST writing. We must have also
+ // disabled file deletions.
+ assert(!versions_->descriptor_log_);
+ assert(file_deletion_disabled);
+ // Since we are trying to recover from MANIFEST write error, we need to
+ // switch to a new MANIFEST anyway. The old MANIFEST can be corrupted.
+ // Therefore, force writing a dummy version edit because we do not know
+ // whether there are flush jobs with non-empty data to flush, triggering
+ // appends to MANIFEST.
+ VersionEdit edit;
+ auto cfh =
+ static_cast_with_check<ColumnFamilyHandleImpl>(default_cf_handle_);
+ assert(cfh);
+ ColumnFamilyData* cfd = cfh->cfd();
+ const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions();
+ s = versions_->LogAndApply(cfd, cf_opts, &edit, &mutex_,
+ directories_.GetDbDir());
+ if (!s.ok()) {
+ io_s = versions_->io_status();
+ if (!io_s.ok()) {
+ s = error_handler_.SetBGError(io_s,
+ BackgroundErrorReason::kManifestWrite);
+ }
+ }
+ }
+ }
+
+ // We cannot guarantee consistency of the WAL. So force flush Memtables of
+ // all the column families
+ if (s.ok()) {
+ FlushOptions flush_opts;
+ // We allow flush to stall write since we are trying to resume from error.
+ flush_opts.allow_write_stall = true;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason);
+ mutex_.Lock();
+ } else {
+ for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ InstrumentedMutexUnlock u(&mutex_);
+ s = FlushMemTable(cfd, flush_opts, context.flush_reason);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "DB resume requested but failed due to Flush failure [%s]",
+ s.ToString().c_str());
+ }
+ }
+
+ JobContext job_context(0);
+ FindObsoleteFiles(&job_context, true);
+ mutex_.Unlock();
+
+ job_context.manifest_file_number = 1;
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+
+ if (s.ok()) {
+ assert(versions_->io_status().ok());
+ // If we reach here, we should re-enable file deletions if it was disabled
+ // during previous error handling.
+ if (file_deletion_disabled) {
+ // Always return ok
+ s = EnableFileDeletions(/*force=*/true);
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "DB resume requested but could not enable file deletions [%s]",
+ s.ToString().c_str());
+ assert(false);
+ }
+ }
+ }
+
+ mutex_.Lock();
+ if (s.ok()) {
+ // This will notify and unblock threads waiting for error recovery to
+ // finish. Those previously waiting threads can now proceed, which may
+ // include closing the db.
+ s = error_handler_.ClearBGError();
+ } else {
+ // NOTE: this is needed to pass ASSERT_STATUS_CHECKED
+ // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
+ // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
+ error_handler_.GetRecoveryError().PermitUncheckedError();
+ }
+
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Failed to resume DB [%s]",
+ s.ToString().c_str());
+ }
+
+ // Check for shutdown again before scheduling further compactions,
+ // since we released and re-acquired the lock above
+ if (shutdown_initiated_) {
+ s = Status::ShutdownInProgress();
+ }
+ if (s.ok()) {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ SchedulePendingCompaction(cfd);
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ // Wake up any waiters - in this case, it could be the shutdown thread
+ bg_cv_.SignalAll();
+
+ // No need to check BGError again. If something happened, event listener would
+ // be notified and the operation causing it would have failed
+ return s;
+}
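The recovery flow above is what ultimately runs when an application calls DB::Resume() after a background error has stopped writes. A minimal, illustrative sketch of manual recovery from the application side; the listener class and its fields are assumptions made for the example, not part of this diff:

    #include <atomic>
    #include "rocksdb/db.h"
    #include "rocksdb/listener.h"

    // Hypothetical listener that remembers whether a background error occurred.
    class BgErrorFlagListener : public rocksdb::EventListener {
     public:
      void OnBackgroundError(rocksdb::BackgroundErrorReason /*reason*/,
                             rocksdb::Status* /*bg_error*/) override {
        saw_error_.store(true);
      }
      bool SawError() const { return saw_error_.load(); }

     private:
      std::atomic<bool> saw_error_{false};
    };

    // Later, e.g. after freeing disk space, attempt manual recovery:
    //   if (listener->SawError()) {
    //     rocksdb::Status s = db->Resume();  // drives ResumeImpl() above
    //   }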
+
+void DBImpl::WaitForBackgroundWork() {
+ // Wait for background work to finish
+ while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_) {
+ bg_cv_.Wait();
+ }
+}
+
+// Will lock the mutex_, will wait for completion if wait is true
+void DBImpl::CancelAllBackgroundWork(bool wait) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Shutdown: canceling all background work");
+
+#ifndef ROCKSDB_LITE
+ for (uint8_t task_type = 0;
+ task_type < static_cast<uint8_t>(PeriodicTaskType::kMax); task_type++) {
+ Status s = periodic_task_scheduler_.Unregister(
+ static_cast<PeriodicTaskType>(task_type));
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Failed to unregister periodic task %d, status: %s",
+ task_type, s.ToString().c_str());
+ }
+ }
+#endif // !ROCKSDB_LITE
+
+ InstrumentedMutexLock l(&mutex_);
+ if (!shutting_down_.load(std::memory_order_acquire) &&
+ has_unpersisted_data_.load(std::memory_order_relaxed) &&
+ !mutable_db_options_.avoid_flush_during_shutdown) {
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ Status s =
+ AtomicFlushMemTables(cfds, FlushOptions(), FlushReason::kShutDown);
+ s.PermitUncheckedError(); //**TODO: What to do on error?
+ mutex_.Lock();
+ } else {
+ for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+ if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) {
+ InstrumentedMutexUnlock u(&mutex_);
+ Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown);
+ s.PermitUncheckedError(); //**TODO: What to do on error?
+ }
+ }
+ }
+ }
+
+ shutting_down_.store(true, std::memory_order_release);
+ bg_cv_.SignalAll();
+ if (!wait) {
+ return;
+ }
+ WaitForBackgroundWork();
+}
+
+Status DBImpl::MaybeReleaseTimestampedSnapshotsAndCheck() {
+ size_t num_snapshots = 0;
+ ReleaseTimestampedSnapshotsOlderThan(std::numeric_limits<uint64_t>::max(),
+ &num_snapshots);
+
+ // If there is unreleased snapshot, fail the close call
+ if (num_snapshots > 0) {
+ return Status::Aborted("Cannot close DB with unreleased snapshot.");
+ }
+
+ return Status::OK();
+}
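From the application side this means DB::Close() can return an Aborted status while a timestamped snapshot is still unreleased. A hedged sketch of handling that case (the retry strategy is application-specific, not something this patch prescribes):

    #include "rocksdb/db.h"

    // Illustrative only: Close() fails with Aborted while a snapshot is still
    // unreleased, so release outstanding snapshots/iterators and retry.
    rocksdb::Status CloseCarefully(rocksdb::DB* db) {
      rocksdb::Status s = db->Close();
      if (s.IsAborted()) {
        // Application-specific: release remaining snapshots here, then retry.
        s = db->Close();
      }
      return s;
    }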
+
+Status DBImpl::CloseHelper() {
+ // Guarantee that there is no background error recovery in progress before
+ // continuing with the shutdown
+ mutex_.Lock();
+ shutdown_initiated_ = true;
+ error_handler_.CancelErrorRecovery();
+ while (error_handler_.IsRecoveryInProgress()) {
+ bg_cv_.Wait();
+ }
+ mutex_.Unlock();
+
+ // The check below is added because recovery_error_ is otherwise unchecked,
+ // which causes a crash in DBSSTTest.DBWithMaxSpaceAllowedWithBlobFiles when
+ // the space limit is reached.
+ error_handler_.GetRecoveryError().PermitUncheckedError();
+
+ // CancelAllBackgroundWork called with false means we just set the shutdown
+ // marker. After this we do a variant of the waiting and unschedule work
+ // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
+ CancelAllBackgroundWork(false);
+
+ // Cancel manual compaction if there's any
+ if (HasPendingManualCompaction()) {
+ DisableManualCompaction();
+ }
+ mutex_.Lock();
+ // Unschedule all tasks for this DB
+ for (uint8_t i = 0; i < static_cast<uint8_t>(TaskType::kCount); i++) {
+ env_->UnSchedule(GetTaskTag(i), Env::Priority::BOTTOM);
+ env_->UnSchedule(GetTaskTag(i), Env::Priority::LOW);
+ env_->UnSchedule(GetTaskTag(i), Env::Priority::HIGH);
+ }
+
+ Status ret = Status::OK();
+
+ // Wait for background work to finish
+ while (bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_ || bg_purge_scheduled_ ||
+ pending_purge_obsolete_files_ ||
+ error_handler_.IsRecoveryInProgress()) {
+ TEST_SYNC_POINT("DBImpl::~DBImpl:WaitJob");
+ bg_cv_.Wait();
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::CloseHelper:PendingPurgeFinished",
+ &files_grabbed_for_purge_);
+ EraseThreadStatusDbInfo();
+ flush_scheduler_.Clear();
+ trim_history_scheduler_.Clear();
+
+ while (!flush_queue_.empty()) {
+ const FlushRequest& flush_req = PopFirstFromFlushQueue();
+ for (const auto& iter : flush_req) {
+ iter.first->UnrefAndTryDelete();
+ }
+ }
+
+ while (!compaction_queue_.empty()) {
+ auto cfd = PopFirstFromCompactionQueue();
+ cfd->UnrefAndTryDelete();
+ }
+
+ if (default_cf_handle_ != nullptr || persist_stats_cf_handle_ != nullptr) {
+ // we need to delete handle outside of lock because it does its own locking
+ mutex_.Unlock();
+ if (default_cf_handle_) {
+ delete default_cf_handle_;
+ default_cf_handle_ = nullptr;
+ }
+ if (persist_stats_cf_handle_) {
+ delete persist_stats_cf_handle_;
+ persist_stats_cf_handle_ = nullptr;
+ }
+ mutex_.Lock();
+ }
+
+ // Clean up obsolete files due to SuperVersion release.
+ // (1) Need to delete obsolete files before closing because RepairDB()
+ // scans all existing files in the file system and builds manifest file.
+ // Keeping obsolete files confuses the repair process.
+ // (2) Need to check if we Open()/Recover() the DB successfully before
+ // deleting because if VersionSet recover fails (may be due to corrupted
+ // manifest file), it is not able to identify live files correctly. As a
+ // result, all "live" files can get deleted by accident. However, corrupted
+ // manifest is recoverable by RepairDB().
+ if (opened_successfully_) {
+ JobContext job_context(next_job_id_.fetch_add(1));
+ FindObsoleteFiles(&job_context, true);
+
+ mutex_.Unlock();
+ // manifest number starting from 2
+ job_context.manifest_file_number = 1;
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+ {
+ InstrumentedMutexLock lock(&log_write_mutex_);
+ for (auto l : logs_to_free_) {
+ delete l;
+ }
+ for (auto& log : logs_) {
+ uint64_t log_number = log.writer->get_log_number();
+ Status s = log.ClearWriter();
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Unable to Sync WAL file %s with error -- %s",
+ LogFileName(immutable_db_options_.GetWalDir(), log_number).c_str(),
+ s.ToString().c_str());
+ // Retain the first error
+ if (ret.ok()) {
+ ret = s;
+ }
+ }
+ }
+ logs_.clear();
+ }
+
+ // Table cache may have table handles holding blocks from the block cache.
+ // We need to release them before the block cache is destroyed. The block
+ // cache may be destroyed inside versions_.reset(), when column family data
+ // list is destroyed, so leaving handles in table cache after
+ // versions_.reset() may cause issues.
+ // Here we clean all unreferenced handles in table cache.
+ // Now we assume all user queries have finished, so only version set itself
+ // can possibly hold the blocks from block cache. After releasing unreferenced
+ // handles here, only handles held by the version set are left; those are
+ // released inside versions_.reset(). There, we need to make sure every
+ // time a handle is released, we erase it from the cache too. By doing that,
+ // we can guarantee that after versions_.reset(), table cache is empty
+ // so the cache can be safely destroyed.
+ table_cache_->EraseUnRefEntries();
+
+ for (auto& txn_entry : recovered_transactions_) {
+ delete txn_entry.second;
+ }
+
+ // versions need to be destroyed before table_cache since it can hold
+ // references to table_cache.
+ versions_.reset();
+ mutex_.Unlock();
+ if (db_lock_ != nullptr) {
+ // TODO: Check for unlock error
+ env_->UnlockFile(db_lock_).PermitUncheckedError();
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Shutdown complete");
+ LogFlush(immutable_db_options_.info_log);
+
+#ifndef ROCKSDB_LITE
+ // If the sst_file_manager was allocated by us during DB::Open(), call
+ // Close() on it before closing the info_log. Otherwise, a background thread
+ // in SstFileManagerImpl might try to log something
+ if (immutable_db_options_.sst_file_manager && own_sfm_) {
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ sfm->Close();
+ }
+#endif // ROCKSDB_LITE
+
+ if (immutable_db_options_.info_log && own_info_log_) {
+ Status s = immutable_db_options_.info_log->Close();
+ if (!s.ok() && !s.IsNotSupported() && ret.ok()) {
+ ret = s;
+ }
+ }
+
+ if (write_buffer_manager_ && wbm_stall_) {
+ write_buffer_manager_->RemoveDBFromQueue(wbm_stall_.get());
+ }
+
+ IOStatus io_s = directories_.Close(IOOptions(), nullptr /* dbg */);
+ if (!io_s.ok()) {
+ ret = io_s;
+ }
+ if (ret.IsAborted()) {
+ // Reserve the IsAborted() error for cases where the user did not release
+ // certain resources; they can release them and come back to retry. In this
+ // case, we wrap this error in something else.
+ return Status::Incomplete(ret.ToString());
+ }
+
+ return ret;
+}
+
+Status DBImpl::CloseImpl() { return CloseHelper(); }
+
+DBImpl::~DBImpl() {
+ // TODO: remove this.
+ init_logger_creation_s_.PermitUncheckedError();
+
+ InstrumentedMutexLock closing_lock_guard(&closing_mutex_);
+ if (closed_) {
+ return;
+ }
+
+ closed_ = true;
+
+ {
+ const Status s = MaybeReleaseTimestampedSnapshotsAndCheck();
+ s.PermitUncheckedError();
+ }
+
+ closing_status_ = CloseImpl();
+ closing_status_.PermitUncheckedError();
+}
+
+void DBImpl::MaybeIgnoreError(Status* s) const {
+ if (s->ok() || immutable_db_options_.paranoid_checks) {
+ // No change needed
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Ignoring error %s",
+ s->ToString().c_str());
+ *s = Status::OK();
+ }
+}
+
+const Status DBImpl::CreateArchivalDirectory() {
+ if (immutable_db_options_.WAL_ttl_seconds > 0 ||
+ immutable_db_options_.WAL_size_limit_MB > 0) {
+ std::string archivalPath =
+ ArchivalDirectory(immutable_db_options_.GetWalDir());
+ return env_->CreateDirIfMissing(archivalPath);
+ }
+ return Status::OK();
+}
+
+void DBImpl::PrintStatistics() {
+ auto dbstats = immutable_db_options_.stats;
+ if (dbstats) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "STATISTICS:\n %s",
+ dbstats->ToString().c_str());
+ }
+}
+
+Status DBImpl::StartPeriodicTaskScheduler() {
+#ifndef ROCKSDB_LITE
+
+#ifndef NDEBUG
+ // It is only used by tests to disable the scheduler
+ bool disable_scheduler = false;
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::StartPeriodicTaskScheduler:DisableScheduler",
+ &disable_scheduler);
+ if (disable_scheduler) {
+ return Status::OK();
+ }
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::StartPeriodicTaskScheduler:Init",
+ &periodic_task_scheduler_);
+ }
+
+#endif // !NDEBUG
+ if (mutable_db_options_.stats_dump_period_sec > 0) {
+ Status s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kDumpStats,
+ periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
+ mutable_db_options_.stats_dump_period_sec);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ if (mutable_db_options_.stats_persist_period_sec > 0) {
+ Status s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kPersistStats,
+ periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
+ mutable_db_options_.stats_persist_period_sec);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ Status s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kFlushInfoLog,
+ periodic_task_functions_.at(PeriodicTaskType::kFlushInfoLog));
+
+ return s;
+#else
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
+
+Status DBImpl::RegisterRecordSeqnoTimeWorker() {
+#ifndef ROCKSDB_LITE
+ uint64_t min_time_duration = std::numeric_limits<uint64_t>::max();
+ uint64_t max_time_duration = std::numeric_limits<uint64_t>::min();
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ // preserve time is the max of 2 options.
+ uint64_t preserve_time_duration =
+ std::max(cfd->ioptions()->preserve_internal_time_seconds,
+ cfd->ioptions()->preclude_last_level_data_seconds);
+ if (!cfd->IsDropped() && preserve_time_duration > 0) {
+ min_time_duration = std::min(preserve_time_duration, min_time_duration);
+ max_time_duration = std::max(preserve_time_duration, max_time_duration);
+ }
+ }
+ if (min_time_duration == std::numeric_limits<uint64_t>::max()) {
+ seqno_time_mapping_.Resize(0, 0);
+ } else {
+ seqno_time_mapping_.Resize(min_time_duration, max_time_duration);
+ }
+ }
+
+ uint64_t seqno_time_cadence = 0;
+ if (min_time_duration != std::numeric_limits<uint64_t>::max()) {
+ // round up to 1 when the time_duration is smaller than
+ // kMaxSeqnoTimePairsPerCF
+ seqno_time_cadence =
+ (min_time_duration + SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF - 1) /
+ SeqnoToTimeMapping::kMaxSeqnoTimePairsPerCF;
+ }
+
+ Status s;
+ if (seqno_time_cadence == 0) {
+ s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kRecordSeqnoTime);
+ } else {
+ s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kRecordSeqnoTime,
+ periodic_task_functions_.at(PeriodicTaskType::kRecordSeqnoTime),
+ seqno_time_cadence);
+ }
+
+ return s;
+#else
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
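As a worked example of the cadence formula above: if the shortest preserve duration across live CFs (the max of preserve_internal_time_seconds and preclude_last_level_data_seconds per CF) is 3600 seconds, and kMaxSeqnoTimePairsPerCF were 100 (the actual constant lives in SeqnoToTimeMapping and is not shown in this hunk, so treat the value as an assumption), the worker would be registered with a cadence of (3600 + 100 - 1) / 100 = 36 seconds, i.e. roughly kMaxSeqnoTimePairsPerCF samples over the shortest preserve window.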
+
+// estimate the total size of stats_history_
+size_t DBImpl::EstimateInMemoryStatsHistorySize() const {
+ size_t size_total =
+ sizeof(std::map<uint64_t, std::map<std::string, uint64_t>>);
+ if (stats_history_.size() == 0) return size_total;
+ size_t size_per_slice =
+ sizeof(uint64_t) + sizeof(std::map<std::string, uint64_t>);
+ // non-empty map, stats_history_.begin() guaranteed to exist
+ for (const auto& pairs : stats_history_.begin()->second) {
+ size_per_slice +=
+ pairs.first.capacity() + sizeof(pairs.first) + sizeof(pairs.second);
+ }
+ size_total = size_per_slice * stats_history_.size();
+ return size_total;
+}
+
+void DBImpl::PersistStats() {
+ TEST_SYNC_POINT("DBImpl::PersistStats:Entry");
+#ifndef ROCKSDB_LITE
+ if (shutdown_initiated_) {
+ return;
+ }
+ TEST_SYNC_POINT("DBImpl::PersistStats:StartRunning");
+ uint64_t now_seconds =
+ immutable_db_options_.clock->NowMicros() / kMicrosInSecond;
+
+ Statistics* statistics = immutable_db_options_.stats;
+ if (!statistics) {
+ return;
+ }
+ size_t stats_history_size_limit = 0;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ stats_history_size_limit = mutable_db_options_.stats_history_buffer_size;
+ }
+
+ std::map<std::string, uint64_t> stats_map;
+ if (!statistics->getTickerMap(&stats_map)) {
+ return;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- PERSISTING STATS -------");
+
+ if (immutable_db_options_.persist_stats_to_disk) {
+ WriteBatch batch;
+ Status s = Status::OK();
+ if (stats_slice_initialized_) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Reading %" ROCKSDB_PRIszt " stats from statistics\n",
+ stats_slice_.size());
+ for (const auto& stat : stats_map) {
+ if (s.ok()) {
+ char key[100];
+ int length =
+ EncodePersistentStatsKey(now_seconds, stat.first, 100, key);
+ // calculate the delta from last time
+ if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+ uint64_t delta = stat.second - stats_slice_[stat.first];
+ s = batch.Put(persist_stats_cf_handle_,
+ Slice(key, std::min(100, length)),
+ std::to_string(delta));
+ }
+ }
+ }
+ }
+ stats_slice_initialized_ = true;
+ std::swap(stats_slice_, stats_map);
+ if (s.ok()) {
+ WriteOptions wo;
+ wo.low_pri = true;
+ wo.no_slowdown = true;
+ wo.sync = false;
+ s = Write(wo, &batch);
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Writing to persistent stats CF failed -- %s",
+ s.ToString().c_str());
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Writing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to persistent stats CF succeeded",
+ stats_slice_.size(), now_seconds);
+ }
+ // TODO(Zhongyi): add purging for persisted data
+ } else {
+ InstrumentedMutexLock l(&stats_history_mutex_);
+ // calculate the delta from last time
+ if (stats_slice_initialized_) {
+ std::map<std::string, uint64_t> stats_delta;
+ for (const auto& stat : stats_map) {
+ if (stats_slice_.find(stat.first) != stats_slice_.end()) {
+ stats_delta[stat.first] = stat.second - stats_slice_[stat.first];
+ }
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64
+ " to in-memory stats history",
+ stats_slice_.size(), now_seconds);
+ stats_history_[now_seconds] = stats_delta;
+ }
+ stats_slice_initialized_ = true;
+ std::swap(stats_slice_, stats_map);
+ TEST_SYNC_POINT("DBImpl::PersistStats:StatsCopied");
+
+ // delete older stats snapshots to control memory consumption
+ size_t stats_history_size = EstimateInMemoryStatsHistorySize();
+ bool purge_needed = stats_history_size > stats_history_size_limit;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Pre-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
+ while (purge_needed && !stats_history_.empty()) {
+ stats_history_.erase(stats_history_.begin());
+ purge_needed =
+ EstimateInMemoryStatsHistorySize() > stats_history_size_limit;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[Post-GC] In-memory stats history size: %" ROCKSDB_PRIszt
+ " bytes, slice count: %" ROCKSDB_PRIszt,
+ stats_history_size, stats_history_.size());
+ }
+ TEST_SYNC_POINT("DBImpl::PersistStats:End");
+#endif // !ROCKSDB_LITE
+}
+
+bool DBImpl::FindStatsByTime(uint64_t start_time, uint64_t end_time,
+ uint64_t* new_time,
+ std::map<std::string, uint64_t>* stats_map) {
+ assert(new_time);
+ assert(stats_map);
+ if (!new_time || !stats_map) return false;
+ // lock when searching for start_time
+ {
+ InstrumentedMutexLock l(&stats_history_mutex_);
+ auto it = stats_history_.lower_bound(start_time);
+ if (it != stats_history_.end() && it->first < end_time) {
+ // make a copy for timestamp and stats_map
+ *new_time = it->first;
+ *stats_map = it->second;
+ return true;
+ } else {
+ return false;
+ }
+ }
+}
+
+Status DBImpl::GetStatsHistory(
+ uint64_t start_time, uint64_t end_time,
+ std::unique_ptr<StatsHistoryIterator>* stats_iterator) {
+ if (!stats_iterator) {
+ return Status::InvalidArgument("stats_iterator not preallocated.");
+ }
+ if (immutable_db_options_.persist_stats_to_disk) {
+ stats_iterator->reset(
+ new PersistentStatsHistoryIterator(start_time, end_time, this));
+ } else {
+ stats_iterator->reset(
+ new InMemoryStatsHistoryIterator(start_time, end_time, this));
+ }
+ return (*stats_iterator)->status();
+}
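PersistStats() above feeds either the persistent stats column family or the in-memory stats_history_ map, and GetStatsHistory() is the public way to read it back. A minimal usage sketch, assuming the public headers rocksdb/statistics.h and rocksdb/stats_history.h and that the iterator exposes Valid()/Next()/GetStatsTime()/GetStatsMap() as declared there; the option values are arbitrary examples:

    #include <iostream>
    #include <limits>
    #include <memory>
    #include "rocksdb/db.h"
    #include "rocksdb/statistics.h"
    #include "rocksdb/stats_history.h"

    void DumpRecentStats(rocksdb::DB* db) {
      // Requires a DB opened with options.statistics = CreateDBStatistics()
      // and a non-zero stats_persist_period_sec.
      std::unique_ptr<rocksdb::StatsHistoryIterator> it;
      rocksdb::Status s = db->GetStatsHistory(
          0 /* start_time */, std::numeric_limits<uint64_t>::max(), &it);
      if (!s.ok()) {
        return;
      }
      for (; it->Valid(); it->Next()) {
        std::cout << "t=" << it->GetStatsTime() << "\n";
        for (const auto& kv : it->GetStatsMap()) {
          std::cout << "  " << kv.first << " = " << kv.second << "\n";
        }
      }
    }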
+
+void DBImpl::DumpStats() {
+ TEST_SYNC_POINT("DBImpl::DumpStats:1");
+#ifndef ROCKSDB_LITE
+ std::string stats;
+ if (shutdown_initiated_) {
+ return;
+ }
+
+ // Also probe block cache(s) for problems, dump to info log
+ UnorderedSet<Cache*> probed_caches;
+ TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : versions_->GetRefedColumnFamilySet()) {
+ if (!cfd->initialized()) {
+ continue;
+ }
+
+ // Release DB mutex for gathering cache entry stats. Pass over all
+ // column families for this first so that other stats are dumped
+ // near-atomically.
+ InstrumentedMutexUnlock u(&mutex_);
+ cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false);
+
+ // Probe block cache for problems (if not already via another CF)
+ if (immutable_db_options_.info_log) {
+ auto* table_factory = cfd->ioptions()->table_factory.get();
+ assert(table_factory != nullptr);
+ Cache* cache =
+ table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
+ if (cache && probed_caches.insert(cache).second) {
+ cache->ReportProblems(immutable_db_options_.info_log);
+ }
+ }
+ }
+
+ const std::string* property = &DB::Properties::kDBStats;
+ const DBPropertyInfo* property_info = GetPropertyInfo(*property);
+ assert(property_info != nullptr);
+ assert(!property_info->need_out_of_mutex);
+ default_cf_internal_stats_->GetStringProperty(*property_info, *property,
+ &stats);
+
+ property = &InternalStats::kPeriodicCFStats;
+ property_info = GetPropertyInfo(*property);
+ assert(property_info != nullptr);
+ assert(!property_info->need_out_of_mutex);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->GetStringProperty(*property_info, *property,
+ &stats);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::DumpStats:2");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- DUMPING STATS -------");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+ if (immutable_db_options_.dump_malloc_stats) {
+ stats.clear();
+ DumpMallocStats(&stats);
+ if (!stats.empty()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "------- Malloc STATS -------");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s", stats.c_str());
+ }
+ }
+#endif // !ROCKSDB_LITE
+
+ PrintStatistics();
+}
+
+// Periodically flush info log out of application buffer at a low frequency.
+// This improves debuggability in case of RocksDB hanging since it ensures the
+// log messages leading up to the hang will eventually become visible in the
+// log.
+void DBImpl::FlushInfoLog() {
+ if (shutdown_initiated_) {
+ return;
+ }
+ TEST_SYNC_POINT("DBImpl::FlushInfoLog:StartRunning");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+Status DBImpl::TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+ int max_entries_to_print,
+ std::string* out_str) {
+ auto* cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ Version* version = super_version->current;
+
+ Status s =
+ version->TablesRangeTombstoneSummary(max_entries_to_print, out_str);
+
+ CleanupSuperVersion(super_version);
+ return s;
+}
+
+void DBImpl::ScheduleBgLogWriterClose(JobContext* job_context) {
+ mutex_.AssertHeld();
+ if (!job_context->logs_to_free.empty()) {
+ for (auto l : job_context->logs_to_free) {
+ AddToLogsToFreeQueue(l);
+ }
+ job_context->logs_to_free.clear();
+ }
+}
+
+FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const {
+ assert(cfd);
+ FSDirectory* ret_dir = cfd->GetDataDir(path_id);
+ if (ret_dir == nullptr) {
+ return directories_.GetDataDir(path_id);
+ }
+ return ret_dir;
+}
+
+Status DBImpl::SetOptions(
+ ColumnFamilyHandle* column_family,
+ const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+ (void)column_family;
+ (void)options_map;
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ if (options_map.empty()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "SetOptions() on column family [%s], empty input",
+ cfd->GetName().c_str());
+ return Status::InvalidArgument("empty input");
+ }
+
+ MutableCFOptions new_options;
+ Status s;
+ Status persist_options_status;
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ {
+ auto db_options = GetDBOptions();
+ InstrumentedMutexLock l(&mutex_);
+ s = cfd->SetOptions(db_options, options_map);
+ if (s.ok()) {
+ new_options = *cfd->GetLatestMutableCFOptions();
+ // Append new version to recompute compaction score.
+ VersionEdit dummy_edit;
+ s = versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ // Trigger possible flush/compactions. This has to be before we persist
+ // options to file, otherwise there will be a deadlock with writer
+ // thread.
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, new_options);
+
+ persist_options_status = WriteOptionsFile(
+ false /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ bg_cv_.SignalAll();
+ }
+ }
+ sv_context.Clean();
+
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "SetOptions() on column family [%s], inputs:", cfd->GetName().c_str());
+ for (const auto& o : options_map) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+ o.second.c_str());
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] SetOptions() succeeded", cfd->GetName().c_str());
+ new_options.Dump(immutable_db_options_.info_log.get());
+ if (!persist_options_status.ok()) {
+ // NOTE: WriteOptionsFile already logs on failure
+ s = persist_options_status;
+ }
+ } else {
+ persist_options_status.PermitUncheckedError(); // less important
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] SetOptions() failed",
+ cfd->GetName().c_str());
+ }
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+#endif // ROCKSDB_LITE
+}
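SetOptions() takes its new values as strings and, on success, installs a new SuperVersion and persists the OPTIONS file. A small illustrative call against the default column family; the option names and values are just examples of mutable CF options:

    #include "rocksdb/db.h"

    rocksdb::Status BumpWriteBuffer(rocksdb::DB* db) {
      // Mutable CF options are passed as strings; this targets the default CF.
      return db->SetOptions({{"write_buffer_size", "67108864"},
                             {"level0_file_num_compaction_trigger", "8"}});
    }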
+
+Status DBImpl::SetDBOptions(
+ const std::unordered_map<std::string, std::string>& options_map) {
+#ifdef ROCKSDB_LITE
+ (void)options_map;
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ if (options_map.empty()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "SetDBOptions(), empty input.");
+ return Status::InvalidArgument("empty input");
+ }
+
+ MutableDBOptions new_options;
+ Status s;
+ Status persist_options_status = Status::OK();
+ bool wal_changed = false;
+ WriteContext write_context;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ s = GetMutableDBOptionsFromStrings(mutable_db_options_, options_map,
+ &new_options);
+
+ if (new_options.bytes_per_sync == 0) {
+ new_options.bytes_per_sync = 1024 * 1024;
+ }
+
+ if (MutableDBOptionsAreEqual(mutable_db_options_, new_options)) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "SetDBOptions(), input option value is not changed, "
+ "skipping updating.");
+ persist_options_status.PermitUncheckedError();
+ return s;
+ }
+
+ DBOptions new_db_options =
+ BuildDBOptions(immutable_db_options_, new_options);
+ if (s.ok()) {
+ s = ValidateOptions(new_db_options);
+ }
+ if (s.ok()) {
+ for (auto c : *versions_->GetColumnFamilySet()) {
+ if (!c->IsDropped()) {
+ auto cf_options = c->GetLatestCFOptions();
+ s = ColumnFamilyData::ValidateOptions(new_db_options, cf_options);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ const BGJobLimits current_bg_job_limits =
+ GetBGJobLimits(mutable_db_options_.max_background_flushes,
+ mutable_db_options_.max_background_compactions,
+ mutable_db_options_.max_background_jobs,
+ /* parallelize_compactions */ true);
+ const BGJobLimits new_bg_job_limits = GetBGJobLimits(
+ new_options.max_background_flushes,
+ new_options.max_background_compactions,
+ new_options.max_background_jobs, /* parallelize_compactions */ true);
+
+ const bool max_flushes_increased =
+ new_bg_job_limits.max_flushes > current_bg_job_limits.max_flushes;
+ const bool max_compactions_increased =
+ new_bg_job_limits.max_compactions >
+ current_bg_job_limits.max_compactions;
+
+ if (max_flushes_increased || max_compactions_increased) {
+ if (max_flushes_increased) {
+ env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_flushes,
+ Env::Priority::HIGH);
+ }
+
+ if (max_compactions_increased) {
+ env_->IncBackgroundThreadsIfNeeded(new_bg_job_limits.max_compactions,
+ Env::Priority::LOW);
+ }
+
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ mutex_.Unlock();
+ if (new_options.stats_dump_period_sec == 0) {
+ s = periodic_task_scheduler_.Unregister(PeriodicTaskType::kDumpStats);
+ } else {
+ s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kDumpStats,
+ periodic_task_functions_.at(PeriodicTaskType::kDumpStats),
+ new_options.stats_dump_period_sec);
+ }
+ if (new_options.max_total_wal_size !=
+ mutable_db_options_.max_total_wal_size) {
+ max_total_wal_size_.store(new_options.max_total_wal_size,
+ std::memory_order_release);
+ }
+ if (s.ok()) {
+ if (new_options.stats_persist_period_sec == 0) {
+ s = periodic_task_scheduler_.Unregister(
+ PeriodicTaskType::kPersistStats);
+ } else {
+ s = periodic_task_scheduler_.Register(
+ PeriodicTaskType::kPersistStats,
+ periodic_task_functions_.at(PeriodicTaskType::kPersistStats),
+ new_options.stats_persist_period_sec);
+ }
+ }
+ mutex_.Lock();
+ if (!s.ok()) {
+ return s;
+ }
+
+ write_controller_.set_max_delayed_write_rate(
+ new_options.delayed_write_rate);
+ table_cache_.get()->SetCapacity(new_options.max_open_files == -1
+ ? TableCache::kInfiniteCapacity
+ : new_options.max_open_files - 10);
+ wal_changed = mutable_db_options_.wal_bytes_per_sync !=
+ new_options.wal_bytes_per_sync;
+ mutable_db_options_ = new_options;
+ file_options_for_compaction_ = FileOptions(new_db_options);
+ file_options_for_compaction_ = fs_->OptimizeForCompactionTableWrite(
+ file_options_for_compaction_, immutable_db_options_);
+ versions_->ChangeFileOptions(mutable_db_options_);
+ // TODO(xiez): clarify why apply optimize for read to write options
+ file_options_for_compaction_ = fs_->OptimizeForCompactionTableRead(
+ file_options_for_compaction_, immutable_db_options_);
+ file_options_for_compaction_.compaction_readahead_size =
+ mutable_db_options_.compaction_readahead_size;
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (total_log_size_ > GetMaxTotalWalSize() || wal_changed) {
+ Status purge_wal_status = SwitchWAL(&write_context);
+ if (!purge_wal_status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unable to purge WAL files in SetDBOptions() -- %s",
+ purge_wal_status.ToString().c_str());
+ }
+ }
+ persist_options_status = WriteOptionsFile(
+ false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+ write_thread_.ExitUnbatched(&w);
+ } else {
+ // To get here, we must have had invalid options and will not attempt to
+ // persist the options, which means the status is "OK/Uninitialized".
+ persist_options_status.PermitUncheckedError();
+ }
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions(), inputs:");
+ for (const auto& o : options_map) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s: %s\n", o.first.c_str(),
+ o.second.c_str());
+ }
+ if (s.ok()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "SetDBOptions() succeeded");
+ new_options.Dump(immutable_db_options_.info_log.get());
+ if (!persist_options_status.ok()) {
+ if (immutable_db_options_.fail_if_options_file_error) {
+ s = Status::IOError(
+ "SetDBOptions() succeeded, but unable to persist options",
+ persist_options_status.ToString());
+ }
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unable to persist options in SetDBOptions() -- %s",
+ persist_options_status.ToString().c_str());
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "SetDBOptions failed");
+ }
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+#endif // ROCKSDB_LITE
+}
+
+// return the same level if it cannot be moved
+int DBImpl::FindMinimumEmptyLevelFitting(
+ ColumnFamilyData* cfd, const MutableCFOptions& /*mutable_cf_options*/,
+ int level) {
+ mutex_.AssertHeld();
+ const auto* vstorage = cfd->current()->storage_info();
+ int minimum_level = level;
+ for (int i = level - 1; i > 0; --i) {
+ // stop if level i is not empty
+ if (vstorage->NumLevelFiles(i) > 0) break;
+ // stop if level i is too small (cannot fit the level files)
+ if (vstorage->MaxBytesForLevel(i) < vstorage->NumLevelBytes(level)) {
+ break;
+ }
+
+ minimum_level = i;
+ }
+ return minimum_level;
+}
+
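+// With DBOptions::manual_wal_flush == true, writes only reach the WAL
+// writer's in-memory buffer; a typical pattern (illustrative, not part of
+// this change) is to batch several writes and then call
+// db->FlushWAL(/*sync=*/true) to push the buffer to the file and fsync it in
+// one step. With sync == false only the buffered bytes are handed to the
+// file system, without a sync.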
+Status DBImpl::FlushWAL(bool sync) {
+ if (manual_wal_flush_) {
+ IOStatus io_s;
+ {
+ // We need to lock log_write_mutex_ since logs_ might change concurrently
+ InstrumentedMutexLock wl(&log_write_mutex_);
+ log::Writer* cur_log_writer = logs_.back().writer;
+ io_s = cur_log_writer->WriteBuffer();
+ }
+ if (!io_s.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+ io_s.ToString().c_str());
+ // In case there is a fs error we should set it globally to prevent
+ // future writes
+ IOStatusCheck(io_s);
+ // whether sync or not, we should abort the rest of the function upon error
+ return static_cast<Status>(io_s);
+ }
+ if (!sync) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=false");
+ return static_cast<Status>(io_s);
+ }
+ }
+ if (!sync) {
+ return Status::OK();
+ }
+ // sync = true
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "FlushWAL sync=true");
+ return SyncWAL();
+}
+
+bool DBImpl::WALBufferIsEmpty(bool lock) {
+ if (lock) {
+ log_write_mutex_.Lock();
+ }
+ log::Writer* cur_log_writer = logs_.back().writer;
+ auto res = cur_log_writer->BufferIsEmpty();
+ if (lock) {
+ log_write_mutex_.Unlock();
+ }
+ return res;
+}
+
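+// Syncs all WALs up to the current log number: the writers are collected
+// under log_write_mutex_, synced outside the mutex via SyncWithoutFlush(),
+// and the outcome is recorded with MarkLogsSynced()/MarkLogsNotSynced();
+// newly synced WALs may also be recorded in the MANIFEST through
+// ApplyWALToManifest().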
+Status DBImpl::SyncWAL() {
+ TEST_SYNC_POINT("DBImpl::SyncWAL:Begin");
+ autovector<log::Writer*, 1> logs_to_sync;
+ bool need_log_dir_sync;
+ uint64_t current_log_number;
+
+ {
+ InstrumentedMutexLock l(&log_write_mutex_);
+ assert(!logs_.empty());
+
+ // This SyncWAL() call only cares about logs up to this number.
+ current_log_number = logfile_number_;
+
+ while (logs_.front().number <= current_log_number &&
+ logs_.front().IsSyncing()) {
+ log_sync_cv_.Wait();
+ }
+ // First check that logs are safe to sync in background.
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number <= current_log_number; ++it) {
+ if (!it->writer->file()->writable_file()->IsSyncThreadSafe()) {
+ return Status::NotSupported(
+ "SyncWAL() is not supported for this implementation of WAL file",
+ immutable_db_options_.allow_mmap_writes
+ ? "try setting Options::allow_mmap_writes to false"
+ : Slice());
+ }
+ }
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number <= current_log_number; ++it) {
+ auto& log = *it;
+ log.PrepareForSync();
+ logs_to_sync.push_back(log.writer);
+ }
+
+ need_log_dir_sync = !log_dir_synced_;
+ }
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+ RecordTick(stats_, WAL_FILE_SYNCED);
+ Status status;
+ IOStatus io_s;
+ for (log::Writer* log : logs_to_sync) {
+ io_s = log->file()->SyncWithoutFlush(immutable_db_options_.use_fsync);
+ if (!io_s.ok()) {
+ status = io_s;
+ break;
+ }
+ }
+ if (!io_s.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL Sync error %s",
+ io_s.ToString().c_str());
+ // In case there is a fs error we should set it globally to prevent
+ // future writes
+ IOStatusCheck(io_s);
+ }
+ if (status.ok() && need_log_dir_sync) {
+ status = directories_.GetWalDir()->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1");
+ VersionEdit synced_wals;
+ {
+ InstrumentedMutexLock l(&log_write_mutex_);
+ if (status.ok()) {
+ MarkLogsSynced(current_log_number, need_log_dir_sync, &synced_wals);
+ } else {
+ MarkLogsNotSynced(current_log_number);
+ }
+ }
+ if (status.ok() && synced_wals.IsWalAddition()) {
+ InstrumentedMutexLock l(&mutex_);
+ status = ApplyWALToManifest(&synced_wals);
+ }
+
+ TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2");
+
+ return status;
+}
+
+Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) {
+ // synced_wals is not empty (checked by the caller), so record it in the
+ // MANIFEST.
+ mutex_.AssertHeld();
+ Status status = versions_->LogAndApplyToDefaultColumnFamily(
+ synced_wals, &mutex_, directories_.GetDbDir());
+ if (!status.ok() && versions_->io_status().IsIOError()) {
+ status = error_handler_.SetBGError(versions_->io_status(),
+ BackgroundErrorReason::kManifestWrite);
+ }
+ return status;
+}
+
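+// LockWAL() flushes the current WAL buffer and then keeps log_write_mutex_
+// held until the matching UnlockWAL() call, blocking concurrent WAL writes
+// in between.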
+Status DBImpl::LockWAL() {
+ log_write_mutex_.Lock();
+ auto cur_log_writer = logs_.back().writer;
+ IOStatus status = cur_log_writer->WriteBuffer();
+ if (!status.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "WAL flush error %s",
+ status.ToString().c_str());
+ // In case there is a fs error we should set it globally to prevent
+ // future writes
+ WriteStatusCheck(status);
+ }
+ return static_cast<Status>(status);
+}
+
+Status DBImpl::UnlockWAL() {
+ log_write_mutex_.Unlock();
+ return Status::OK();
+}
+
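+// Called under log_write_mutex_ after a sync attempt: inactive WALs with a
+// non-zero pre-sync size are reported via `synced_wals` when
+// track_and_verify_wals_in_manifest is set; fully synced inactive WALs are
+// removed from logs_ and their writers queued for deletion, while partially
+// synced and active WALs just have their syncing flag cleared.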
+void DBImpl::MarkLogsSynced(uint64_t up_to, bool synced_dir,
+ VersionEdit* synced_wals) {
+ log_write_mutex_.AssertHeld();
+ if (synced_dir && logfile_number_ == up_to) {
+ log_dir_synced_ = true;
+ }
+ for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;) {
+ auto& wal = *it;
+ assert(wal.IsSyncing());
+
+ if (wal.number < logs_.back().number) {
+ // Inactive WAL
+ if (immutable_db_options_.track_and_verify_wals_in_manifest &&
+ wal.GetPreSyncSize() > 0) {
+ synced_wals->AddWal(wal.number, WalMetadata(wal.GetPreSyncSize()));
+ }
+ if (wal.GetPreSyncSize() == wal.writer->file()->GetFlushedSize()) {
+ // Fully synced
+ logs_to_free_.push_back(wal.ReleaseWriter());
+ it = logs_.erase(it);
+ } else {
+ assert(wal.GetPreSyncSize() < wal.writer->file()->GetFlushedSize());
+ wal.FinishSync();
+ ++it;
+ }
+ } else {
+ assert(wal.number == logs_.back().number);
+ // Active WAL
+ wal.FinishSync();
+ ++it;
+ }
+ }
+ log_sync_cv_.SignalAll();
+}
+
+void DBImpl::MarkLogsNotSynced(uint64_t up_to) {
+ log_write_mutex_.AssertHeld();
+ for (auto it = logs_.begin(); it != logs_.end() && it->number <= up_to;
+ ++it) {
+ auto& wal = *it;
+ wal.FinishSync();
+ }
+ log_sync_cv_.SignalAll();
+}
+
+SequenceNumber DBImpl::GetLatestSequenceNumber() const {
+ return versions_->LastSequence();
+}
+
+void DBImpl::SetLastPublishedSequence(SequenceNumber seq) {
+ versions_->SetLastPublishedSequence(seq);
+}
+
+Status DBImpl::GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string* ts_low) {
+ if (ts_low == nullptr) {
+ return Status::InvalidArgument("ts_low is nullptr");
+ }
+ ColumnFamilyData* cfd = nullptr;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ assert(cfh != nullptr);
+ cfd = cfh->cfd();
+ }
+ assert(cfd != nullptr && cfd->user_comparator() != nullptr);
+ if (cfd->user_comparator()->timestamp_size() == 0) {
+ return Status::InvalidArgument(
+ "Timestamp is not enabled in this column family");
+ }
+ InstrumentedMutexLock l(&mutex_);
+ *ts_low = cfd->GetFullHistoryTsLow();
+ assert(cfd->user_comparator()->timestamp_size() == ts_low->size());
+ return Status::OK();
+}
+
+InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
+ Arena* arena,
+ SequenceNumber sequence,
+ ColumnFamilyHandle* column_family,
+ bool allow_unprepared_value) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ cfd = cfh->cfd();
+ }
+
+ mutex_.Lock();
+ SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+ mutex_.Unlock();
+ return NewInternalIterator(read_options, cfd, super_version, arena, sequence,
+ allow_unprepared_value);
+}
+
+void DBImpl::SchedulePurge() {
+ mutex_.AssertHeld();
+ assert(opened_successfully_);
+
+ // Purge operations are put into the high-priority queue
+ bg_purge_scheduled_++;
+ env_->Schedule(&DBImpl::BGWorkPurge, this, Env::Priority::HIGH, nullptr);
+}
+
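+// Runs on a HIGH-priority background thread: frees queued WAL writers and
+// SuperVersions, then deletes files queued in purge_files_, dropping mutex_
+// around each blocking operation.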
+void DBImpl::BackgroundCallPurge() {
+ mutex_.Lock();
+
+ while (!logs_to_free_queue_.empty()) {
+ assert(!logs_to_free_queue_.empty());
+ log::Writer* log_writer = *(logs_to_free_queue_.begin());
+ logs_to_free_queue_.pop_front();
+ mutex_.Unlock();
+ delete log_writer;
+ mutex_.Lock();
+ }
+ while (!superversions_to_free_queue_.empty()) {
+ assert(!superversions_to_free_queue_.empty());
+ SuperVersion* sv = superversions_to_free_queue_.front();
+ superversions_to_free_queue_.pop_front();
+ mutex_.Unlock();
+ delete sv;
+ mutex_.Lock();
+ }
+
+ assert(bg_purge_scheduled_ > 0);
+
+ // Can't use iterator to go over purge_files_ because inside the loop we're
+ // unlocking the mutex that protects purge_files_.
+ while (!purge_files_.empty()) {
+ auto it = purge_files_.begin();
+ // Need to make a copy of the PurgeFileInfo before unlocking the mutex.
+ PurgeFileInfo purge_file = it->second;
+
+ const std::string& fname = purge_file.fname;
+ const std::string& dir_to_sync = purge_file.dir_to_sync;
+ FileType type = purge_file.type;
+ uint64_t number = purge_file.number;
+ int job_id = purge_file.job_id;
+
+ purge_files_.erase(it);
+
+ mutex_.Unlock();
+ DeleteObsoleteFileImpl(job_id, fname, dir_to_sync, type, number);
+ mutex_.Lock();
+ }
+
+ bg_purge_scheduled_--;
+
+ bg_cv_.SignalAll();
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+ // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ mutex_.Unlock();
+}
+
+namespace {
+
+// A `SuperVersionHandle` holds a non-null `SuperVersion*` pointing at a
+// `SuperVersion` referenced once for this object. It also contains the state
+// needed to clean up the `SuperVersion` reference from outside of `DBImpl`
+// using `CleanupSuperVersionHandle()`.
+struct SuperVersionHandle {
+ // `_super_version` must be non-nullptr and `Ref()`'d once as long as the
+ // `SuperVersionHandle` may use it.
+ SuperVersionHandle(DBImpl* _db, InstrumentedMutex* _mu,
+ SuperVersion* _super_version, bool _background_purge)
+ : db(_db),
+ mu(_mu),
+ super_version(_super_version),
+ background_purge(_background_purge) {}
+
+ DBImpl* db;
+ InstrumentedMutex* mu;
+ SuperVersion* super_version;
+ bool background_purge;
+};
+
+static void CleanupSuperVersionHandle(void* arg1, void* /*arg2*/) {
+ SuperVersionHandle* sv_handle = reinterpret_cast<SuperVersionHandle*>(arg1);
+
+ if (sv_handle->super_version->Unref()) {
+ // Job id == 0 means that this is not our background process, but rather
+ // a user thread
+ JobContext job_context(0);
+
+ sv_handle->mu->Lock();
+ sv_handle->super_version->Cleanup();
+ sv_handle->db->FindObsoleteFiles(&job_context, false, true);
+ if (sv_handle->background_purge) {
+ sv_handle->db->ScheduleBgLogWriterClose(&job_context);
+ sv_handle->db->AddSuperVersionsToFreeQueue(sv_handle->super_version);
+ sv_handle->db->SchedulePurge();
+ }
+ sv_handle->mu->Unlock();
+
+ if (!sv_handle->background_purge) {
+ delete sv_handle->super_version;
+ }
+ if (job_context.HaveSomethingToDelete()) {
+ sv_handle->db->PurgeObsoleteFiles(job_context,
+ sv_handle->background_purge);
+ }
+ job_context.Clean();
+ }
+
+ delete sv_handle;
+}
+
+struct GetMergeOperandsState {
+ MergeContext merge_context;
+ PinnedIteratorsManager pinned_iters_mgr;
+ SuperVersionHandle* sv_handle;
+};
+
+static void CleanupGetMergeOperandsState(void* arg1, void* /*arg2*/) {
+ GetMergeOperandsState* state = static_cast<GetMergeOperandsState*>(arg1);
+ CleanupSuperVersionHandle(state->sv_handle /* arg1 */, nullptr /* arg2 */);
+ delete state;
+}
+
+} // namespace
+
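+// Builds the internal merging iterator over the mutable memtable, the
+// immutable memtables and the SST files of `super_version`, wiring in range
+// tombstone iterators unless read_options.ignore_range_deletions is set, and
+// registers a SuperVersionHandle cleanup so the SuperVersion reference is
+// dropped when the iterator is destroyed.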
+InternalIterator* DBImpl::NewInternalIterator(
+ const ReadOptions& read_options, ColumnFamilyData* cfd,
+ SuperVersion* super_version, Arena* arena, SequenceNumber sequence,
+ bool allow_unprepared_value, ArenaWrappedDBIter* db_iter) {
+ InternalIterator* internal_iter;
+ assert(arena != nullptr);
+ // Need to create internal iterator from the arena.
+ MergeIteratorBuilder merge_iter_builder(
+ &cfd->internal_comparator(), arena,
+ !read_options.total_order_seek &&
+ super_version->mutable_cf_options.prefix_extractor != nullptr,
+ read_options.iterate_upper_bound);
+ // Collect iterator for mutable memtable
+ auto mem_iter = super_version->mem->NewIterator(read_options, arena);
+ Status s;
+ if (!read_options.ignore_range_deletions) {
+ TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
+ auto range_del_iter = super_version->mem->NewRangeTombstoneIterator(
+ read_options, sequence, false /* immutable_memtable */);
+ if (range_del_iter == nullptr || range_del_iter->empty()) {
+ delete range_del_iter;
+ } else {
+ mem_tombstone_iter = new TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
+ &cfd->ioptions()->internal_comparator, nullptr /* smallest */,
+ nullptr /* largest */);
+ }
+ merge_iter_builder.AddPointAndTombstoneIterator(mem_iter,
+ mem_tombstone_iter);
+ } else {
+ merge_iter_builder.AddIterator(mem_iter);
+ }
+
+ // Collect all needed child iterators for immutable memtables
+ if (s.ok()) {
+ super_version->imm->AddIterators(read_options, &merge_iter_builder,
+ !read_options.ignore_range_deletions);
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::NewInternalIterator:StatusCallback", &s);
+ if (s.ok()) {
+ // Collect iterators for files in L0 - Ln
+ if (read_options.read_tier != kMemtableTier) {
+ super_version->current->AddIterators(read_options, file_options_,
+ &merge_iter_builder,
+ allow_unprepared_value);
+ }
+ internal_iter = merge_iter_builder.Finish(
+ read_options.ignore_range_deletions ? nullptr : db_iter);
+ SuperVersionHandle* cleanup = new SuperVersionHandle(
+ this, &mutex_, super_version,
+ read_options.background_purge_on_iterator_cleanup ||
+ immutable_db_options_.avoid_unnecessary_blocking_io);
+ internal_iter->RegisterCleanup(CleanupSuperVersionHandle, cleanup, nullptr);
+
+ return internal_iter;
+ } else {
+ CleanupSuperVersion(super_version);
+ }
+ return NewErrorInternalIterator<Slice>(s, arena);
+}
+
+ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
+ return default_cf_handle_;
+}
+
+ColumnFamilyHandle* DBImpl::PersistentStatsColumnFamily() const {
+ return persist_stats_cf_handle_;
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ return Get(read_options, column_family, key, value, /*timestamp=*/nullptr);
+}
+
+Status DBImpl::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, std::string* timestamp) {
+ assert(value != nullptr);
+ value->Reset();
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = value;
+ get_impl_options.timestamp = timestamp;
+ Status s = GetImpl(read_options, key, get_impl_options);
+ return s;
+}
+
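+// Illustrative caller-side usage (added note, not part of this change):
+//   PinnableWideColumns columns;
+//   Status s = db->GetEntity(ReadOptions(), cfh, "key", &columns);
+//   if (s.ok()) { /* columns now holds the entity's wide columns */ }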
+Status DBImpl::GetEntity(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableWideColumns* columns) {
+ if (!column_family) {
+ return Status::InvalidArgument(
+ "Cannot call GetEntity without a column family handle");
+ }
+
+ if (!columns) {
+ return Status::InvalidArgument(
+ "Cannot call GetEntity without a PinnableWideColumns object");
+ }
+
+ columns->Reset();
+
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.columns = columns;
+
+ return GetImpl(read_options, key, get_impl_options);
+}
+
+bool DBImpl::ShouldReferenceSuperVersion(const MergeContext& merge_context) {
+ // If both thresholds are reached, a function returning merge operands as
+ // `PinnableSlice`s should reference the `SuperVersion` to avoid large and/or
+ // numerous `memcpy()`s.
+ //
+ // The below constants enable the optimization conservatively. They are
+ // verified to not regress `GetMergeOperands()` latency in the following
+ // scenarios.
+ //
+ // - CPU: two socket Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz
+ // - `GetMergeOperands()` threads: 1 - 32
+ // - Entry size: 32 bytes - 4KB
+ // - Merges per key: 1 - 16K
+ // - LSM component: memtable
+ //
+ // TODO(ajkr): expand measurement to SST files.
+ static const size_t kNumBytesForSvRef = 32768;
+ static const size_t kLog2AvgBytesForSvRef = 8; // 256 bytes
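+ // Illustrative numbers (derived from the constants above, not measured):
+ // 128 operands of 512 bytes total 64 KiB and average 512 bytes, so the
+ // SuperVersion is referenced; 1024 operands of 32 bytes meet the 32 KiB
+ // total but average only 32 bytes, so they are copied instead.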
+
+ size_t num_bytes = 0;
+ for (const Slice& sl : merge_context.GetOperands()) {
+ num_bytes += sl.size();
+ }
+ return num_bytes >= kNumBytesForSvRef &&
+ (num_bytes >> kLog2AvgBytesForSvRef) >=
+ merge_context.GetOperands().size();
+}
+
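+// Shared implementation behind Get(), GetEntity() and GetMergeOperands():
+// looks the key up in the mutable memtable, then the immutable memtables,
+// and finally the SST files of the current version, under a referenced
+// SuperVersion and a consistent snapshot sequence number.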
+Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
+ GetImplOptions& get_impl_options) {
+ assert(get_impl_options.value != nullptr ||
+ get_impl_options.merge_operands != nullptr ||
+ get_impl_options.columns != nullptr);
+
+ assert(get_impl_options.column_family);
+
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(get_impl_options.column_family,
+ *(read_options.timestamp),
+ /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ const Status s = FailIfCfHasTs(get_impl_options.column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Clear the timestamps for returning results so that we can distinguish
+ // between a tombstone and a key that has never been written
+ if (get_impl_options.timestamp) {
+ get_impl_options.timestamp->clear();
+ }
+
+ GetWithTimestampReadCallback read_cb(0); // Will call Refresh
+
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+ StopWatch sw(immutable_db_options_.clock, stats_, DB_GET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(
+ get_impl_options.column_family);
+ auto cfd = cfh->cfd();
+
+ if (tracer_) {
+ // TODO: This mutex should be removed later, to improve performance when
+ // tracing is enabled.
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ // TODO: maybe handle the tracing status?
+ tracer_->Get(get_impl_options.column_family, key).PermitUncheckedError();
+ }
+ }
+
+ if (get_impl_options.get_merge_operands_options != nullptr) {
+ for (int i = 0; i < get_impl_options.get_merge_operands_options
+ ->expected_max_number_of_operands;
+ ++i) {
+ get_impl_options.merge_operands[i].Reset();
+ }
+ }
+
+ // Acquire SuperVersion
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+ TEST_SYNC_POINT("DBImpl::GetImpl:1");
+ TEST_SYNC_POINT("DBImpl::GetImpl:2");
+
+ SequenceNumber snapshot;
+ if (read_options.snapshot != nullptr) {
+ if (get_impl_options.callback) {
+ // Already calculated based on read_options.snapshot
+ snapshot = get_impl_options.callback->max_visible_seq();
+ } else {
+ snapshot =
+ reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+ }
+ } else {
+ // Note that the snapshot is assigned AFTER referencing the super
+ // version because otherwise a flush happening in between may compact away
+ // data for the snapshot, so the reader would see neither data that was
+ // visible to the snapshot before compaction nor the newer data inserted
+ // afterwards.
+ snapshot = GetLastPublishedSequence();
+ if (get_impl_options.callback) {
+ // The unprep_seqs are not published for write unprepared, so it could be
+ // that max_visible_seq is larger. Seek to the std::max of the two.
+ // However, we still want our callback to contain the actual snapshot so
+ // that it can do the correct visibility filtering.
+ get_impl_options.callback->Refresh(snapshot);
+
+ // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+ // max_visible_seq = max(max_visible_seq, snapshot)
+ //
+ // Currently, the commented out assert is broken by
+ // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+ // the regular transaction flow, then this special read callback would not
+ // be needed.
+ //
+ // assert(callback->max_visible_seq() >= snapshot);
+ snapshot = get_impl_options.callback->max_visible_seq();
+ }
+ }
+ // If timestamp is used, we use read callback to ensure <key,t,s> is returned
+ // only if t <= read_opts.timestamp and s <= snapshot.
+ // HACK: temporarily overwrite input struct field but restore
+ SaveAndRestore<ReadCallback*> restore_callback(&get_impl_options.callback);
+ const Comparator* ucmp = get_impl_options.column_family->GetComparator();
+ assert(ucmp);
+ if (ucmp->timestamp_size() > 0) {
+ assert(!get_impl_options
+ .callback); // timestamp with callback is not supported
+ read_cb.Refresh(snapshot);
+ get_impl_options.callback = &read_cb;
+ }
+ TEST_SYNC_POINT("DBImpl::GetImpl:3");
+ TEST_SYNC_POINT("DBImpl::GetImpl:4");
+
+ // Prepare to store a list of merge operations if merge occurs.
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+
+ Status s;
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ LookupKey lkey(key, snapshot, read_options.timestamp);
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ bool skip_memtable = (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ bool done = false;
+ std::string* timestamp =
+ ucmp->timestamp_size() > 0 ? get_impl_options.timestamp : nullptr;
+ if (!skip_memtable) {
+ // Get value associated with key
+ if (get_impl_options.get_value) {
+ if (sv->mem->Get(
+ lkey,
+ get_impl_options.value ? get_impl_options.value->GetSelf()
+ : nullptr,
+ get_impl_options.columns, timestamp, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options,
+ false /* immutable_memtable */, get_impl_options.callback,
+ get_impl_options.is_blob_index)) {
+ done = true;
+
+ if (get_impl_options.value) {
+ get_impl_options.value->PinSelf();
+ }
+
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ sv->imm->Get(lkey,
+ get_impl_options.value
+ ? get_impl_options.value->GetSelf()
+ : nullptr,
+ get_impl_options.columns, timestamp, &s,
+ &merge_context, &max_covering_tombstone_seq,
+ read_options, get_impl_options.callback,
+ get_impl_options.is_blob_index)) {
+ done = true;
+
+ if (get_impl_options.value) {
+ get_impl_options.value->PinSelf();
+ }
+
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ } else {
+ // Get the merge operands associated with the key. Merge operands should
+ // not be merged; the raw values should be returned to the user.
+ if (sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options,
+ false /* immutable_memtable */, nullptr, nullptr,
+ false)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ sv->imm->GetMergeOperands(lkey, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ read_options)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ }
+ if (!done && !s.ok() && !s.IsMergeInProgress()) {
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ return s;
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::GetImpl:PostMemTableGet:0");
+ TEST_SYNC_POINT("DBImpl::GetImpl:PostMemTableGet:1");
+ PinnedIteratorsManager pinned_iters_mgr;
+ if (!done) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ sv->current->Get(
+ read_options, lkey, get_impl_options.value, get_impl_options.columns,
+ timestamp, &s, &merge_context, &max_covering_tombstone_seq,
+ &pinned_iters_mgr,
+ get_impl_options.get_value ? get_impl_options.value_found : nullptr,
+ nullptr, nullptr,
+ get_impl_options.get_value ? get_impl_options.callback : nullptr,
+ get_impl_options.get_value ? get_impl_options.is_blob_index : nullptr,
+ get_impl_options.get_value);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+
+ {
+ PERF_TIMER_GUARD(get_post_process_time);
+
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = 0;
+ if (s.ok()) {
+ if (get_impl_options.get_value) {
+ if (get_impl_options.value) {
+ size = get_impl_options.value->size();
+ } else if (get_impl_options.columns) {
+ size = get_impl_options.columns->serialized_size();
+ }
+ } else {
+ // Return all merge operands for get_impl_options.key
+ *get_impl_options.number_of_operands =
+ static_cast<int>(merge_context.GetNumOperands());
+ if (*get_impl_options.number_of_operands >
+ get_impl_options.get_merge_operands_options
+ ->expected_max_number_of_operands) {
+ s = Status::Incomplete(
+ Status::SubCode::KMergeOperandsInsufficientCapacity);
+ } else {
+ // Each operand depends on one of the following resources: `sv`,
+ // `pinned_iters_mgr`, or `merge_context`. It would be crazy expensive
+ // to reference `sv` for each operand relying on it because `sv` is
+ // (un)ref'd in all threads using the DB. Furthermore, we do not track
+ // on which resource each operand depends.
+ //
+ // To solve this, we bundle the resources in a `GetMergeOperandsState`
+ // and manage them with a `SharedCleanablePtr` shared among the
+ // `PinnableSlice`s we return. This bundle includes one `sv` reference
+ // and ownership of the `merge_context` and `pinned_iters_mgr`
+ // objects.
+ bool ref_sv = ShouldReferenceSuperVersion(merge_context);
+ if (ref_sv) {
+ assert(!merge_context.GetOperands().empty());
+ SharedCleanablePtr shared_cleanable;
+ GetMergeOperandsState* state = nullptr;
+ state = new GetMergeOperandsState();
+ state->merge_context = std::move(merge_context);
+ state->pinned_iters_mgr = std::move(pinned_iters_mgr);
+
+ sv->Ref();
+
+ state->sv_handle = new SuperVersionHandle(
+ this, &mutex_, sv,
+ immutable_db_options_.avoid_unnecessary_blocking_io);
+
+ shared_cleanable.Allocate();
+ shared_cleanable->RegisterCleanup(CleanupGetMergeOperandsState,
+ state /* arg1 */,
+ nullptr /* arg2 */);
+ for (size_t i = 0; i < state->merge_context.GetOperands().size();
+ ++i) {
+ const Slice& sl = state->merge_context.GetOperands()[i];
+ size += sl.size();
+
+ get_impl_options.merge_operands->PinSlice(
+ sl, nullptr /* cleanable */);
+ if (i == state->merge_context.GetOperands().size() - 1) {
+ shared_cleanable.MoveAsCleanupTo(
+ get_impl_options.merge_operands);
+ } else {
+ shared_cleanable.RegisterCopyWith(
+ get_impl_options.merge_operands);
+ }
+ get_impl_options.merge_operands++;
+ }
+ } else {
+ for (const Slice& sl : merge_context.GetOperands()) {
+ size += sl.size();
+ get_impl_options.merge_operands->PinSelf(sl);
+ get_impl_options.merge_operands++;
+ }
+ }
+ }
+ }
+ RecordTick(stats_, BYTES_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ }
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+
+ RecordInHistogram(stats_, BYTES_PER_READ, size);
+ }
+ return s;
+}
+
+std::vector<Status> DBImpl::MultiGet(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values) {
+ return MultiGet(read_options, column_family, keys, values,
+ /*timestamps=*/nullptr);
+}
+
+std::vector<Status> DBImpl::MultiGet(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values,
+ std::vector<std::string>* timestamps) {
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+ StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ size_t num_keys = keys.size();
+ assert(column_family.size() == num_keys);
+ std::vector<Status> stat_list(num_keys);
+
+ bool should_fail = false;
+ for (size_t i = 0; i < num_keys; ++i) {
+ assert(column_family[i]);
+ if (read_options.timestamp) {
+ stat_list[i] = FailIfTsMismatchCf(
+ column_family[i], *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!stat_list[i].ok()) {
+ should_fail = true;
+ }
+ } else {
+ stat_list[i] = FailIfCfHasTs(column_family[i]);
+ if (!stat_list[i].ok()) {
+ should_fail = true;
+ }
+ }
+ }
+
+ if (should_fail) {
+ for (auto& s : stat_list) {
+ if (s.ok()) {
+ s = Status::Incomplete(
+ "DB not queried due to invalid argument(s) in the same MultiGet");
+ }
+ }
+ return stat_list;
+ }
+
+ if (tracer_) {
+ // TODO: This mutex should be removed later, to improve performance when
+ // tracing is enabled.
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ // TODO: maybe handle the tracing status?
+ tracer_->MultiGet(column_family, keys).PermitUncheckedError();
+ }
+ }
+
+ SequenceNumber consistent_seqnum;
+
+ UnorderedMap<uint32_t, MultiGetColumnFamilyData> multiget_cf_data(
+ column_family.size());
+ for (auto cf : column_family) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(cf);
+ auto cfd = cfh->cfd();
+ if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
+ multiget_cf_data.emplace(cfd->GetID(),
+ MultiGetColumnFamilyData(cfh, nullptr));
+ }
+ }
+
+ std::function<MultiGetColumnFamilyData*(
+ UnorderedMap<uint32_t, MultiGetColumnFamilyData>::iterator&)>
+ iter_deref_lambda =
+ [](UnorderedMap<uint32_t, MultiGetColumnFamilyData>::iterator&
+ cf_iter) { return &cf_iter->second; };
+
+ bool unref_only =
+ MultiCFSnapshot<UnorderedMap<uint32_t, MultiGetColumnFamilyData>>(
+ read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+
+ TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum1");
+ TEST_SYNC_POINT("DBImpl::MultiGet:AfterGetSeqNum2");
+
+ // Contains a list of merge operations if merge occurs.
+ MergeContext merge_context;
+
+ // Note: this always resizes the values array
+ values->resize(num_keys);
+ if (timestamps) {
+ timestamps->resize(num_keys);
+ }
+
+ // Keep track of bytes that we read for statistics-recording later
+ uint64_t bytes_read = 0;
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ // For each of the given keys, apply the entire "get" process as follows:
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ size_t num_found = 0;
+ size_t keys_read;
+ uint64_t curr_value_size = 0;
+
+ GetWithTimestampReadCallback timestamp_read_callback(0);
+ ReadCallback* read_callback = nullptr;
+ if (read_options.timestamp && read_options.timestamp->size() > 0) {
+ timestamp_read_callback.Refresh(consistent_seqnum);
+ read_callback = &timestamp_read_callback;
+ }
+
+ for (keys_read = 0; keys_read < num_keys; ++keys_read) {
+ merge_context.Clear();
+ Status& s = stat_list[keys_read];
+ std::string* value = &(*values)[keys_read];
+ std::string* timestamp = timestamps ? &(*timestamps)[keys_read] : nullptr;
+
+ LookupKey lkey(keys[keys_read], consistent_seqnum, read_options.timestamp);
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(
+ column_family[keys_read]);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
+ assert(mgd_iter != multiget_cf_data.end());
+ auto mgd = mgd_iter->second;
+ auto super_version = mgd.super_version;
+ bool skip_memtable =
+ (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ bool done = false;
+ if (!skip_memtable) {
+ if (super_version->mem->Get(
+ lkey, value, /*columns=*/nullptr, timestamp, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options,
+ false /* immutable_memtable */, read_callback)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if (super_version->imm->Get(lkey, value, /*columns=*/nullptr,
+ timestamp, &s, &merge_context,
+ &max_covering_tombstone_seq,
+ read_options, read_callback)) {
+ done = true;
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ }
+ if (!done) {
+ PinnableSlice pinnable_val;
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ PinnedIteratorsManager pinned_iters_mgr;
+ super_version->current->Get(read_options, lkey, &pinnable_val,
+ /*columns=*/nullptr, timestamp, &s,
+ &merge_context, &max_covering_tombstone_seq,
+ &pinned_iters_mgr, /*value_found=*/nullptr,
+ /*key_exists=*/nullptr,
+ /*seq=*/nullptr, read_callback);
+ value->assign(pinnable_val.data(), pinnable_val.size());
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+
+ if (s.ok()) {
+ bytes_read += value->size();
+ num_found++;
+ curr_value_size += value->size();
+ if (curr_value_size > read_options.value_size_soft_limit) {
+ while (++keys_read < num_keys) {
+ stat_list[keys_read] = Status::Aborted();
+ }
+ break;
+ }
+ }
+ if (read_options.deadline.count() &&
+ immutable_db_options_.clock->NowMicros() >
+ static_cast<uint64_t>(read_options.deadline.count())) {
+ break;
+ }
+ }
+
+ if (keys_read < num_keys) {
+ // The only reason to break out of the loop is when the deadline is
+ // exceeded
+ assert(immutable_db_options_.clock->NowMicros() >
+ static_cast<uint64_t>(read_options.deadline.count()));
+ for (++keys_read; keys_read < num_keys; ++keys_read) {
+ stat_list[keys_read] = Status::TimedOut();
+ }
+ }
+
+ // Post processing (decrement reference counts and record statistics)
+ PERF_TIMER_GUARD(get_post_process_time);
+ autovector<SuperVersion*> superversions_to_delete;
+
+ for (auto mgd_iter : multiget_cf_data) {
+ auto mgd = mgd_iter.second;
+ if (!unref_only) {
+ ReturnAndCleanupSuperVersion(mgd.cfd, mgd.super_version);
+ } else {
+ mgd.cfd->GetSuperVersion()->Unref();
+ }
+ }
+ RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+ RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+ PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+ PERF_TIMER_STOP(get_post_process_time);
+
+ return stat_list;
+}
+
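+// Acquires a SuperVersion reference and a consistent sequence number for
+// every column family in cf_list. Returns true iff the references were taken
+// directly under the DB mutex on the final retry; in that case the caller
+// must release them with SuperVersion::Unref() instead of
+// ReturnAndCleanupSuperVersion().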
+template <class T>
+bool DBImpl::MultiCFSnapshot(
+ const ReadOptions& read_options, ReadCallback* callback,
+ std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+ iter_deref_func,
+ T* cf_list, SequenceNumber* snapshot) {
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ bool last_try = false;
+ if (cf_list->size() == 1) {
+ // Fast path for a single column family. We can simply get the thread local
+ // super version
+ auto cf_iter = cf_list->begin();
+ auto node = iter_deref_func(cf_iter);
+ node->super_version = GetAndRefSuperVersion(node->cfd);
+ if (read_options.snapshot != nullptr) {
+ // Note: In WritePrepared txns this is not necessary but not harmful
+ // either. Because prep_seq > snapshot => commit_seq > snapshot so if
+ // a snapshot is specified we should be fine with skipping seq numbers
+ // that are greater than that.
+ //
+ // In WriteUnprepared, we cannot set snapshot in the lookup key because we
+ // may skip uncommitted data that should be visible to the transaction for
+ // reading own writes.
+ *snapshot =
+ static_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
+ if (callback) {
+ *snapshot = std::max(*snapshot, callback->max_visible_seq());
+ }
+ } else {
+ // Since we get and reference the super version before getting
+ // the snapshot number, without a mutex protection, it is possible
+ // that a memtable switch happened in the middle and not all the
+ // data for this snapshot is available. But it will contain all
+ // the data available in the super version we have, which is also
+ // a valid snapshot to read from.
+ // We shouldn't get snapshot before finding and referencing the super
+ // version because a flush happening in between may compact away data for
+ // the snapshot, but the snapshot is earlier than the data overwriting it,
+ // so users may see wrong results.
+ *snapshot = GetLastPublishedSequence();
+ }
+ } else {
+ // If we end up with the same issue of the memtable getting sealed during 2
+ // consecutive retries, it means the write rate is very high. In that case
+ // it's probably ok to take the mutex on the 3rd try so we can succeed for
+ // sure
+ constexpr int num_retries = 3;
+ for (int i = 0; i < num_retries; ++i) {
+ last_try = (i == num_retries - 1);
+ bool retry = false;
+
+ if (i > 0) {
+ for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+ ++cf_iter) {
+ auto node = iter_deref_func(cf_iter);
+ SuperVersion* super_version = node->super_version;
+ ColumnFamilyData* cfd = node->cfd;
+ if (super_version != nullptr) {
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ }
+ node->super_version = nullptr;
+ }
+ }
+ if (read_options.snapshot == nullptr) {
+ if (last_try) {
+ TEST_SYNC_POINT("DBImpl::MultiGet::LastTry");
+ // We're close to max number of retries. For the last retry,
+ // acquire the lock so we're sure to succeed
+ mutex_.Lock();
+ }
+ *snapshot = GetLastPublishedSequence();
+ } else {
+ *snapshot =
+ static_cast_with_check<const SnapshotImpl>(read_options.snapshot)
+ ->number_;
+ }
+ for (auto cf_iter = cf_list->begin(); cf_iter != cf_list->end();
+ ++cf_iter) {
+ auto node = iter_deref_func(cf_iter);
+ if (!last_try) {
+ node->super_version = GetAndRefSuperVersion(node->cfd);
+ } else {
+ node->super_version = node->cfd->GetSuperVersion()->Ref();
+ }
+ TEST_SYNC_POINT("DBImpl::MultiGet::AfterRefSV");
+ if (read_options.snapshot != nullptr || last_try) {
+ // If user passed a snapshot, then we don't care if a memtable is
+ // sealed or compaction happens because the snapshot would ensure
+ // that older key versions are kept around. If this is the last
+ // retry, then we have the lock so nothing bad can happen
+ continue;
+ }
+ // We could get the earliest sequence number for the whole list of
+ // memtables, which will include immutable memtables as well, but that
+ // might be tricky to maintain in case we decide, in future, to do
+ // memtable compaction.
+ if (!last_try) {
+ SequenceNumber seq =
+ node->super_version->mem->GetEarliestSequenceNumber();
+ if (seq > *snapshot) {
+ retry = true;
+ break;
+ }
+ }
+ }
+ if (!retry) {
+ if (last_try) {
+ mutex_.Unlock();
+ }
+ break;
+ }
+ }
+ }
+
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ return last_try;
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input) {
+ return MultiGet(read_options, num_keys, column_families, keys, values,
+ /*timestamps=*/nullptr, statuses, sorted_input);
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, std::string* timestamps,
+ Status* statuses, const bool sorted_input) {
+ if (num_keys == 0) {
+ return;
+ }
+
+ bool should_fail = false;
+ for (size_t i = 0; i < num_keys; ++i) {
+ ColumnFamilyHandle* cfh = column_families[i];
+ assert(cfh);
+ if (read_options.timestamp) {
+ statuses[i] = FailIfTsMismatchCf(cfh, *(read_options.timestamp),
+ /*ts_for_read=*/true);
+ if (!statuses[i].ok()) {
+ should_fail = true;
+ }
+ } else {
+ statuses[i] = FailIfCfHasTs(cfh);
+ if (!statuses[i].ok()) {
+ should_fail = true;
+ }
+ }
+ }
+ if (should_fail) {
+ for (size_t i = 0; i < num_keys; ++i) {
+ if (statuses[i].ok()) {
+ statuses[i] = Status::Incomplete(
+ "DB not queried due to invalid argument(s) in the same MultiGet");
+ }
+ }
+ return;
+ }
+
+ if (tracer_) {
+ // TODO: This mutex should be removed later, to improve performance when
+ // tracing is enabled.
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ // TODO: maybe handle the tracing status?
+ tracer_->MultiGet(num_keys, column_families, keys).PermitUncheckedError();
+ }
+ }
+
+ autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+ sorted_keys.resize(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ values[i].Reset();
+ key_context.emplace_back(column_families[i], keys[i], &values[i],
+ timestamps ? &timestamps[i] : nullptr,
+ &statuses[i]);
+ }
+ for (size_t i = 0; i < num_keys; ++i) {
+ sorted_keys[i] = &key_context[i];
+ }
+ PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+
+ autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>
+ multiget_cf_data;
+ size_t cf_start = 0;
+ ColumnFamilyHandle* cf = sorted_keys[0]->column_family;
+
+ for (size_t i = 0; i < num_keys; ++i) {
+ KeyContext* key_ctx = sorted_keys[i];
+ if (key_ctx->column_family != cf) {
+ multiget_cf_data.emplace_back(cf, cf_start, i - cf_start, nullptr);
+ cf_start = i;
+ cf = key_ctx->column_family;
+ }
+ }
+
+ multiget_cf_data.emplace_back(cf, cf_start, num_keys - cf_start, nullptr);
+
+ std::function<MultiGetColumnFamilyData*(
+ autovector<MultiGetColumnFamilyData,
+ MultiGetContext::MAX_BATCH_SIZE>::iterator&)>
+ iter_deref_lambda =
+ [](autovector<MultiGetColumnFamilyData,
+ MultiGetContext::MAX_BATCH_SIZE>::iterator& cf_iter) {
+ return &(*cf_iter);
+ };
+
+ SequenceNumber consistent_seqnum;
+ bool unref_only = MultiCFSnapshot<
+ autovector<MultiGetColumnFamilyData, MultiGetContext::MAX_BATCH_SIZE>>(
+ read_options, nullptr, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+
+ GetWithTimestampReadCallback timestamp_read_callback(0);
+ ReadCallback* read_callback = nullptr;
+ if (read_options.timestamp && read_options.timestamp->size() > 0) {
+ timestamp_read_callback.Refresh(consistent_seqnum);
+ read_callback = &timestamp_read_callback;
+ }
+
+ Status s;
+ auto cf_iter = multiget_cf_data.begin();
+ for (; cf_iter != multiget_cf_data.end(); ++cf_iter) {
+ s = MultiGetImpl(read_options, cf_iter->start, cf_iter->num_keys,
+ &sorted_keys, cf_iter->super_version, consistent_seqnum,
+ read_callback);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (!s.ok()) {
+ assert(s.IsTimedOut() || s.IsAborted());
+ for (++cf_iter; cf_iter != multiget_cf_data.end(); ++cf_iter) {
+ for (size_t i = cf_iter->start; i < cf_iter->start + cf_iter->num_keys;
+ ++i) {
+ *sorted_keys[i]->s = s;
+ }
+ }
+ }
+
+ for (const auto& iter : multiget_cf_data) {
+ if (!unref_only) {
+ ReturnAndCleanupSuperVersion(iter.cfd, iter.super_version);
+ } else {
+ iter.cfd->GetSuperVersion()->Unref();
+ }
+ }
+}
+
+namespace {
+// Order keys by CF ID, followed by key contents
+struct CompareKeyContext {
+ inline bool operator()(const KeyContext* lhs, const KeyContext* rhs) {
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(lhs->column_family);
+ uint32_t cfd_id1 = cfh->cfd()->GetID();
+ const Comparator* comparator = cfh->cfd()->user_comparator();
+ cfh = static_cast<ColumnFamilyHandleImpl*>(rhs->column_family);
+ uint32_t cfd_id2 = cfh->cfd()->GetID();
+
+ if (cfd_id1 < cfd_id2) {
+ return true;
+ } else if (cfd_id1 > cfd_id2) {
+ return false;
+ }
+
+ // Both keys are from the same column family
+ int cmp = comparator->CompareWithoutTimestamp(
+ *(lhs->key), /*a_has_ts=*/false, *(rhs->key), /*b_has_ts=*/false);
+ if (cmp < 0) {
+ return true;
+ }
+ return false;
+ }
+};
+
+} // anonymous namespace
+
+void DBImpl::PrepareMultiGetKeys(
+ size_t num_keys, bool sorted_input,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+ if (sorted_input) {
+#ifndef NDEBUG
+ assert(std::is_sorted(sorted_keys->begin(), sorted_keys->end(),
+ CompareKeyContext()));
+#endif
+ return;
+ }
+
+ std::sort(sorted_keys->begin(), sorted_keys->begin() + num_keys,
+ CompareKeyContext());
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const size_t num_keys,
+ const Slice* keys, PinnableSlice* values,
+ Status* statuses, const bool sorted_input) {
+ return MultiGet(read_options, column_family, num_keys, keys, values,
+ /*timestamp=*/nullptr, statuses, sorted_input);
+}
+
+void DBImpl::MultiGet(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const size_t num_keys,
+ const Slice* keys, PinnableSlice* values,
+ std::string* timestamps, Status* statuses,
+ const bool sorted_input) {
+ if (tracer_) {
+ // TODO: This mutex should be removed later, to improve performance when
+ // tracing is enabled.
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ // TODO: maybe handle the tracing status?
+ tracer_->MultiGet(num_keys, column_family, keys).PermitUncheckedError();
+ }
+ }
+ autovector<KeyContext, MultiGetContext::MAX_BATCH_SIZE> key_context;
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE> sorted_keys;
+ sorted_keys.resize(num_keys);
+ for (size_t i = 0; i < num_keys; ++i) {
+ values[i].Reset();
+ key_context.emplace_back(column_family, keys[i], &values[i],
+ timestamps ? &timestamps[i] : nullptr,
+ &statuses[i]);
+ }
+ for (size_t i = 0; i < num_keys; ++i) {
+ sorted_keys[i] = &key_context[i];
+ }
+ PrepareMultiGetKeys(num_keys, sorted_input, &sorted_keys);
+ MultiGetWithCallback(read_options, column_family, nullptr, &sorted_keys);
+}
+
+void DBImpl::MultiGetWithCallback(
+ const ReadOptions& read_options, ColumnFamilyHandle* column_family,
+ ReadCallback* callback,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys) {
+ std::array<MultiGetColumnFamilyData, 1> multiget_cf_data;
+ multiget_cf_data[0] = MultiGetColumnFamilyData(column_family, nullptr);
+ std::function<MultiGetColumnFamilyData*(
+ std::array<MultiGetColumnFamilyData, 1>::iterator&)>
+ iter_deref_lambda =
+ [](std::array<MultiGetColumnFamilyData, 1>::iterator& cf_iter) {
+ return &(*cf_iter);
+ };
+
+ size_t num_keys = sorted_keys->size();
+ SequenceNumber consistent_seqnum;
+ bool unref_only = MultiCFSnapshot<std::array<MultiGetColumnFamilyData, 1>>(
+ read_options, callback, iter_deref_lambda, &multiget_cf_data,
+ &consistent_seqnum);
+#ifndef NDEBUG
+ assert(!unref_only);
+#else
+ // Silence unused variable warning
+ (void)unref_only;
+#endif // NDEBUG
+
+ if (callback && read_options.snapshot == nullptr) {
+ // The unprep_seqs are not published for write unprepared, so it could be
+ // that max_visible_seq is larger. Seek to the std::max of the two.
+ // However, we still want our callback to contain the actual snapshot so
+ // that it can do the correct visibility filtering.
+ callback->Refresh(consistent_seqnum);
+
+ // Internally, WriteUnpreparedTxnReadCallback::Refresh would set
+ // max_visible_seq = max(max_visible_seq, snapshot)
+ //
+ // Currently, the commented out assert is broken by
+ // InvalidSnapshotReadCallback, but if write unprepared recovery followed
+ // the regular transaction flow, then this special read callback would not
+ // be needed.
+ //
+ // assert(callback->max_visible_seq() >= snapshot);
+ consistent_seqnum = callback->max_visible_seq();
+ }
+
+ GetWithTimestampReadCallback timestamp_read_callback(0);
+ ReadCallback* read_callback = callback;
+ if (read_options.timestamp && read_options.timestamp->size() > 0) {
+ assert(!read_callback); // timestamp with callback is not supported
+ timestamp_read_callback.Refresh(consistent_seqnum);
+ read_callback = &timestamp_read_callback;
+ }
+
+ Status s = MultiGetImpl(read_options, 0, num_keys, sorted_keys,
+ multiget_cf_data[0].super_version, consistent_seqnum,
+ read_callback);
+ assert(s.ok() || s.IsTimedOut() || s.IsAborted());
+ ReturnAndCleanupSuperVersion(multiget_cf_data[0].cfd,
+ multiget_cf_data[0].super_version);
+}
+
+// The actual implementation of batched MultiGet. Parameters -
+// start_key - Index in the sorted_keys vector to start processing from
+// num_keys - Number of keys to lookup, starting with sorted_keys[start_key]
+// sorted_keys - The entire batch of sorted keys for this CF
+//
+// The per key status is returned in the KeyContext structures pointed to by
+// sorted_keys. An overall Status is also returned, with the possible values
+// being Status::OK(), Status::TimedOut() and Status::Aborted(). TimedOut
+// indicates that the call exceeded read_options.deadline, and Aborted
+// indicates that read_options.value_size_soft_limit was exceeded.
+Status DBImpl::MultiGetImpl(
+ const ReadOptions& read_options, size_t start_key, size_t num_keys,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+ SuperVersion* super_version, SequenceNumber snapshot,
+ ReadCallback* callback) {
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+ StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET);
+
+ assert(sorted_keys);
+ // Clear the timestamps for returning results so that we can distinguish
+ // between a tombstone and a key that has never been written
+ for (auto* kctx : *sorted_keys) {
+ assert(kctx);
+ if (kctx->timestamp) {
+ kctx->timestamp->clear();
+ }
+ }
+
+ // For each of the given keys, apply the entire "get" process as follows:
+ // First look in the memtable, then in the immutable memtable (if any).
+ // s is both in/out. When in, s could either be OK or MergeInProgress.
+ // merge_operands will contain the sequence of merges in the latter case.
+ size_t keys_left = num_keys;
+ Status s;
+ uint64_t curr_value_size = 0;
+ while (keys_left) {
+ if (read_options.deadline.count() &&
+ immutable_db_options_.clock->NowMicros() >
+ static_cast<uint64_t>(read_options.deadline.count())) {
+ s = Status::TimedOut();
+ break;
+ }
+
+ size_t batch_size = (keys_left > MultiGetContext::MAX_BATCH_SIZE)
+ ? MultiGetContext::MAX_BATCH_SIZE
+ : keys_left;
+ MultiGetContext ctx(sorted_keys, start_key + num_keys - keys_left,
+ batch_size, snapshot, read_options, GetFileSystem(),
+ stats_);
+ MultiGetRange range = ctx.GetMultiGetRange();
+ range.AddValueSize(curr_value_size);
+ bool lookup_current = false;
+
+ keys_left -= batch_size;
+ for (auto mget_iter = range.begin(); mget_iter != range.end();
+ ++mget_iter) {
+ mget_iter->merge_context.Clear();
+ *mget_iter->s = Status::OK();
+ }
+
+ bool skip_memtable =
+ (read_options.read_tier == kPersistedTier &&
+ has_unpersisted_data_.load(std::memory_order_relaxed));
+ if (!skip_memtable) {
+ super_version->mem->MultiGet(read_options, &range, callback,
+ false /* immutable_memtable */);
+ if (!range.empty()) {
+ super_version->imm->MultiGet(read_options, &range, callback);
+ }
+ if (!range.empty()) {
+ lookup_current = true;
+ uint64_t left = range.KeysLeft();
+ RecordTick(stats_, MEMTABLE_MISS, left);
+ }
+ }
+ if (lookup_current) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ super_version->current->MultiGet(read_options, &range, callback);
+ }
+ curr_value_size = range.GetValueSize();
+ if (curr_value_size > read_options.value_size_soft_limit) {
+ s = Status::Aborted();
+ break;
+ }
+ }
+
+ // Post processing (decrement reference counts and record statistics)
+ PERF_TIMER_GUARD(get_post_process_time);
+ size_t num_found = 0;
+ uint64_t bytes_read = 0;
+ for (size_t i = start_key; i < start_key + num_keys - keys_left; ++i) {
+ KeyContext* key = (*sorted_keys)[i];
+ if (key->s->ok()) {
+ bytes_read += key->value->size();
+ num_found++;
+ }
+ }
+ if (keys_left) {
+ assert(s.IsTimedOut() || s.IsAborted());
+ for (size_t i = start_key + num_keys - keys_left; i < start_key + num_keys;
+ ++i) {
+ KeyContext* key = (*sorted_keys)[i];
+ *key->s = s;
+ }
+ }
+
+ RecordTick(stats_, NUMBER_MULTIGET_CALLS);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
+ RecordTick(stats_, NUMBER_MULTIGET_KEYS_FOUND, num_found);
+ RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
+ RecordInHistogram(stats_, BYTES_PER_MULTIGET, bytes_read);
+ PERF_COUNTER_ADD(multiget_read_bytes, bytes_read);
+ PERF_TIMER_STOP(get_post_process_time);
+
+ return s;
+}
+
+Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family,
+ ColumnFamilyHandle** handle) {
+ assert(handle != nullptr);
+ Status s = CreateColumnFamilyImpl(cf_options, column_family, handle);
+ if (s.ok()) {
+ s = WriteOptionsFile(true /*need_mutex_lock*/,
+ true /*need_enter_write_thread*/);
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+ const ColumnFamilyOptions& cf_options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ assert(handles != nullptr);
+ handles->clear();
+ size_t num_cf = column_family_names.size();
+ Status s;
+ bool success_once = false;
+ for (size_t i = 0; i < num_cf; i++) {
+ ColumnFamilyHandle* handle;
+ s = CreateColumnFamilyImpl(cf_options, column_family_names[i], &handle);
+ if (!s.ok()) {
+ break;
+ }
+ handles->push_back(handle);
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) {
+ assert(handles != nullptr);
+ handles->clear();
+ size_t num_cf = column_families.size();
+ Status s;
+ bool success_once = false;
+ for (size_t i = 0; i < num_cf; i++) {
+ ColumnFamilyHandle* handle;
+ s = CreateColumnFamilyImpl(column_families[i].options,
+ column_families[i].name, &handle);
+ if (!s.ok()) {
+ break;
+ }
+ handles->push_back(handle);
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family_name,
+ ColumnFamilyHandle** handle) {
+ Status s;
+ *handle = nullptr;
+
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ s = ColumnFamilyData::ValidateOptions(db_options, cf_options);
+ if (s.ok()) {
+ for (auto& cf_path : cf_options.cf_paths) {
+ s = env_->CreateDirIfMissing(cf_path.path);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
+ nullptr) {
+ return Status::InvalidArgument("Column family already exists");
+ }
+ VersionEdit edit;
+ edit.AddColumnFamily(column_family_name);
+ uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+ edit.SetColumnFamily(new_id);
+ edit.SetLogNumber(logfile_number_);
+ edit.SetComparatorName(cf_options.comparator->Name());
+
+ // LogAndApply will both write the creation in MANIFEST and create
+ // ColumnFamilyData object
+ { // write thread
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit,
+ &mutex_, directories_.GetDbDir(), false,
+ &cf_options);
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (s.ok()) {
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+ assert(cfd != nullptr);
+ std::map<std::string, std::shared_ptr<FSDirectory>> dummy_created_dirs;
+ s = cfd->AddDirectories(&dummy_created_dirs);
+ }
+ if (s.ok()) {
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
+ assert(cfd != nullptr);
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context,
+ *cfd->GetLatestMutableCFOptions());
+
+ if (!cfd->mem()->IsSnapshotSupported()) {
+ is_snapshot_supported_ = false;
+ }
+
+ cfd->set_initialized();
+
+ *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Created column family [%s] (ID %u)",
+ column_family_name.c_str(), (unsigned)cfd->GetID());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Creating column family [%s] FAILED -- %s",
+ column_family_name.c_str(), s.ToString().c_str());
+ }
+ } // InstrumentedMutexLock l(&mutex_)
+
+ if (cf_options.preserve_internal_time_seconds > 0 ||
+ cf_options.preclude_last_level_data_seconds > 0) {
+ s = RegisterRecordSeqnoTimeWorker();
+ }
+ sv_context.Clean();
+ // this is outside the mutex
+ if (s.ok()) {
+ NewThreadStatusCfInfo(
+ static_cast_with_check<ColumnFamilyHandleImpl>(*handle)->cfd());
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
+ assert(column_family != nullptr);
+ Status s = DropColumnFamilyImpl(column_family);
+ if (s.ok()) {
+ s = WriteOptionsFile(true /*need_mutex_lock*/,
+ true /*need_enter_write_thread*/);
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) {
+ Status s;
+ bool success_once = false;
+ for (auto* handle : column_families) {
+ s = DropColumnFamilyImpl(handle);
+ if (!s.ok()) {
+ break;
+ }
+ success_once = true;
+ }
+ if (success_once) {
+ Status persist_options_status = WriteOptionsFile(
+ true /*need_mutex_lock*/, true /*need_enter_write_thread*/);
+ if (s.ok() && !persist_options_status.ok()) {
+ s = persist_options_status;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ if (cfd->GetID() == 0) {
+ return Status::InvalidArgument("Can't drop default column family");
+ }
+
+ bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported();
+
+ VersionEdit edit;
+ edit.DropColumnFamily();
+ edit.SetColumnFamily(cfd->GetID());
+
+ Status s;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ if (cfd->IsDropped()) {
+ s = Status::InvalidArgument("Column family already dropped!\n");
+ }
+ if (s.ok()) {
+ // we drop column family from a single write thread
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit,
+ &mutex_, directories_.GetDbDir());
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (s.ok()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+
+ if (!cf_support_snapshot) {
+      // The dropped column family didn't support snapshots, so recalculate
+      // is_snapshot_supported_.
+ bool new_is_snapshot_supported = true;
+ for (auto c : *versions_->GetColumnFamilySet()) {
+ if (!c->IsDropped() && !c->mem()->IsSnapshotSupported()) {
+ new_is_snapshot_supported = false;
+ break;
+ }
+ }
+ is_snapshot_supported_ = new_is_snapshot_supported;
+ }
+ bg_cv_.SignalAll();
+ }
+
+ if (cfd->ioptions()->preserve_internal_time_seconds > 0 ||
+ cfd->ioptions()->preclude_last_level_data_seconds > 0) {
+ s = RegisterRecordSeqnoTimeWorker();
+ }
+
+ if (s.ok()) {
+ // Note that here we erase the associated cf_info of the to-be-dropped
+ // cfd before its ref-count goes to zero to avoid having to erase cf_info
+    // later while holding the db mutex.
+ EraseThreadStatusCfInfo(cfd);
+ assert(cfd->IsDropped());
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Dropped column family with id %u\n", cfd->GetID());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Dropping column family with id %u FAILED -- %s\n",
+ cfd->GetID(), s.ToString().c_str());
+ }
+
+ return s;
+}
+
+bool DBImpl::KeyMayExist(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value, std::string* timestamp,
+ bool* value_found) {
+ assert(value != nullptr);
+ if (value_found != nullptr) {
+ // falsify later if key-may-exist but can't fetch value
+ *value_found = true;
+ }
+ ReadOptions roptions = read_options;
+ roptions.read_tier = kBlockCacheTier; // read from block cache only
+ PinnableSlice pinnable_val;
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = value_found;
+ get_impl_options.timestamp = timestamp;
+ auto s = GetImpl(roptions, key, get_impl_options);
+ value->assign(pinnable_val.data(), pinnable_val.size());
+
+  // If block_cache is enabled and the index block of the table is not
+  // present in block_cache, the return value will be Status::Incomplete.
+  // In this case, the key may still exist in the table.
+ return s.ok() || s.IsIncomplete();
+}
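+
+// Example (illustrative sketch): how KeyMayExist() above is typically used;
+// `db`, `cf` and `key` are hypothetical.
+//
+//   std::string value;
+//   bool value_found = false;
+//   if (db->KeyMayExist(ReadOptions(), cf, key, &value, &value_found)) {
+//     if (value_found) {
+//       // `value` was served from the memtables / block cache
+//     } else {
+//       // the key may exist, but the value was not readily available
+//     }
+//   } else {
+//     // the key definitely does not exist
+//   }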
+
+Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ if (read_options.managed) {
+ return NewErrorIterator(
+ Status::NotSupported("Managed iterator is not supported anymore."));
+ }
+ Iterator* result = nullptr;
+ if (read_options.read_tier == kPersistedTier) {
+ return NewErrorIterator(Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators."));
+ }
+
+ assert(column_family);
+
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ } else {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ }
+
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ assert(cfd != nullptr);
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+ // not supported in lite version
+ result = nullptr;
+
+#else
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ auto iter = new ForwardIterator(this, read_options, cfd, sv,
+ /* allow_unprepared_value */ true);
+ result = NewDBIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations, read_callback,
+ this, cfd);
+#endif
+ } else {
+ // Note: no need to consider the special case of
+ // last_seq_same_as_publish_seq_==false since NewIterator is overridden in
+ // WritePreparedTxnDB
+ result = NewIteratorImpl(read_options, cfd,
+ (read_options.snapshot != nullptr)
+ ? read_options.snapshot->GetSequenceNumber()
+ : kMaxSequenceNumber,
+ read_callback);
+ }
+ return result;
+}
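+
+// Example (illustrative sketch): iterating a column family through the
+// NewIterator() API above; `db` and `cf` are hypothetical.
+//
+//   std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions(), cf));
+//   for (it->SeekToFirst(); it->Valid(); it->Next()) {
+//     // it->key() and it->value() are valid here
+//   }
+//   assert(it->status().ok());  // check for errors after the scan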
+
+ArenaWrappedDBIter* DBImpl::NewIteratorImpl(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool expose_blob_index,
+ bool allow_refresh) {
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+
+ TEST_SYNC_POINT("DBImpl::NewIterator:1");
+ TEST_SYNC_POINT("DBImpl::NewIterator:2");
+
+ if (snapshot == kMaxSequenceNumber) {
+ // Note that the snapshot is assigned AFTER referencing the super
+ // version because otherwise a flush happening in between may compact away
+    // data for the snapshot, so the reader would see neither data that was
+ // visible to the snapshot before compaction nor the newer data inserted
+ // afterwards.
+ // Note that the super version might not contain all the data available
+ // to this snapshot, but in that case it can see all the data in the
+ // super version, which is a valid consistent state after the user
+ // calls NewIterator().
+ snapshot = versions_->LastSequence();
+ TEST_SYNC_POINT("DBImpl::NewIterator:3");
+ TEST_SYNC_POINT("DBImpl::NewIterator:4");
+ }
+
+  // Try to generate a DB iterator tree in a contiguous memory area to be
+  // cache friendly. Here is an example of the result:
+ // +-------------------------------+
+ // | |
+ // | ArenaWrappedDBIter |
+ // | + |
+ // | +---> Inner Iterator ------------+
+ // | | | |
+ // | | +-- -- -- -- -- -- -- --+ |
+ // | +--- | Arena | |
+ // | | | |
+ // | Allocated Memory: | |
+ // | | +-------------------+ |
+ // | | | DBIter | <---+
+ // | | + |
+ // | | | +-> iter_ ------------+
+ // | | | | |
+ // | | +-------------------+ |
+ // | | | MergingIterator | <---+
+ // | | + |
+ // | | | +->child iter1 ------------+
+ // | | | | | |
+ // | | +->child iter2 ----------+ |
+ // | | | | | | |
+ // | | | +->child iter3 --------+ | |
+ // | | | | | |
+ // | | +-------------------+ | | |
+ // | | | Iterator1 | <--------+
+ // | | +-------------------+ | |
+ // | | | Iterator2 | <------+
+ // | | +-------------------+ |
+ // | | | Iterator3 | <----+
+ // | | +-------------------+
+ // | | |
+ // +-------+-----------------------+
+ //
+ // ArenaWrappedDBIter inlines an arena area where all the iterators in
+ // the iterator tree are allocated in the order of being accessed when
+ // querying.
+ // Laying out the iterators in the order of being accessed makes it more
+ // likely that any iterator pointer is close to the iterator it points to so
+ // that they are likely to be in the same cache line and/or page.
+ ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options, sv->current,
+ snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ sv->version_number, read_callback, this, cfd, expose_blob_index,
+ read_options.snapshot != nullptr ? false : allow_refresh);
+
+ InternalIterator* internal_iter = NewInternalIterator(
+ db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), snapshot,
+ /* allow_unprepared_value */ true, db_iter);
+ db_iter->SetIterUnderDBIter(internal_iter);
+
+ return db_iter;
+}
+
+Status DBImpl::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.managed) {
+ return Status::NotSupported("Managed iterator is not supported anymore.");
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators.");
+ }
+
+ if (read_options.timestamp) {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp),
+ /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ } else {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfCfHasTs(cf);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ if (read_options.tailing) {
+#ifdef ROCKSDB_LITE
+ return Status::InvalidArgument(
+ "Tailing iterator not supported in RocksDB lite");
+#else
+ for (auto cfh : column_families) {
+ auto cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ auto iter = new ForwardIterator(this, read_options, cfd, sv,
+ /* allow_unprepared_value */ true);
+ iterators->push_back(NewDBIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ cfd->user_comparator(), iter, sv->current, kMaxSequenceNumber,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ read_callback, this, cfd));
+ }
+#endif
+ } else {
+ // Note: no need to consider the special case of
+ // last_seq_same_as_publish_seq_==false since NewIterators is overridden in
+ // WritePreparedTxnDB
+ auto snapshot = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : versions_->LastSequence();
+ for (size_t i = 0; i < column_families.size(); ++i) {
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_families[i])
+ ->cfd();
+ iterators->push_back(
+ NewIteratorImpl(read_options, cfd, snapshot, read_callback));
+ }
+ }
+
+ return Status::OK();
+}
+
+const Snapshot* DBImpl::GetSnapshot() { return GetSnapshotImpl(false); }
+
+#ifndef ROCKSDB_LITE
+const Snapshot* DBImpl::GetSnapshotForWriteConflictBoundary() {
+ return GetSnapshotImpl(true);
+}
+#endif // ROCKSDB_LITE
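+
+// Example (illustrative sketch): point-in-time reads using GetSnapshot() and
+// ReleaseSnapshot() defined in this file; `db` and `key` are hypothetical.
+// (GetSnapshot() may return nullptr if snapshots are not supported.)
+//
+//   const Snapshot* snap = db->GetSnapshot();
+//   ReadOptions ropts;
+//   ropts.snapshot = snap;
+//   std::string value;
+//   Status s = db->Get(ropts, key, &value);  // sees data as of the snapshot
+//   db->ReleaseSnapshot(snap);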
+
+std::pair<Status, std::shared_ptr<const Snapshot>>
+DBImpl::CreateTimestampedSnapshot(SequenceNumber snapshot_seq, uint64_t ts) {
+ assert(ts != std::numeric_limits<uint64_t>::max());
+
+ auto ret = CreateTimestampedSnapshotImpl(snapshot_seq, ts, /*lock=*/true);
+ return ret;
+}
+
+std::shared_ptr<const SnapshotImpl> DBImpl::GetTimestampedSnapshot(
+ uint64_t ts) const {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ return timestamped_snapshots_.GetSnapshot(ts);
+}
+
+void DBImpl::ReleaseTimestampedSnapshotsOlderThan(uint64_t ts,
+ size_t* remaining_total_ss) {
+ autovector<std::shared_ptr<const SnapshotImpl>> snapshots_to_release;
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ timestamped_snapshots_.ReleaseSnapshotsOlderThan(ts, snapshots_to_release);
+ }
+ snapshots_to_release.clear();
+
+ if (remaining_total_ss) {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ *remaining_total_ss = static_cast<size_t>(snapshots_.count());
+ }
+}
+
+Status DBImpl::GetTimestampedSnapshots(
+ uint64_t ts_lb, uint64_t ts_ub,
+ std::vector<std::shared_ptr<const Snapshot>>& timestamped_snapshots) const {
+ if (ts_lb >= ts_ub) {
+ return Status::InvalidArgument(
+ "timestamp lower bound must be smaller than upper bound");
+ }
+ timestamped_snapshots.clear();
+ InstrumentedMutexLock lock_guard(&mutex_);
+ timestamped_snapshots_.GetSnapshots(ts_lb, ts_ub, timestamped_snapshots);
+ return Status::OK();
+}
+
+SnapshotImpl* DBImpl::GetSnapshotImpl(bool is_write_conflict_boundary,
+ bool lock) {
+ int64_t unix_time = 0;
+ immutable_db_options_.clock->GetCurrentTime(&unix_time)
+ .PermitUncheckedError(); // Ignore error
+ SnapshotImpl* s = new SnapshotImpl;
+
+ if (lock) {
+ mutex_.Lock();
+ } else {
+ mutex_.AssertHeld();
+ }
+  // Returns nullptr if the underlying memtable does not support snapshots.
+ if (!is_snapshot_supported_) {
+ if (lock) {
+ mutex_.Unlock();
+ }
+ delete s;
+ return nullptr;
+ }
+ auto snapshot_seq = GetLastPublishedSequence();
+ SnapshotImpl* snapshot =
+ snapshots_.New(s, snapshot_seq, unix_time, is_write_conflict_boundary);
+ if (lock) {
+ mutex_.Unlock();
+ }
+ return snapshot;
+}
+
+std::pair<Status, std::shared_ptr<const SnapshotImpl>>
+DBImpl::CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
+ bool lock) {
+ int64_t unix_time = 0;
+ immutable_db_options_.clock->GetCurrentTime(&unix_time)
+ .PermitUncheckedError(); // Ignore error
+ SnapshotImpl* s = new SnapshotImpl;
+
+ const bool need_update_seq = (snapshot_seq != kMaxSequenceNumber);
+
+ if (lock) {
+ mutex_.Lock();
+ } else {
+ mutex_.AssertHeld();
+ }
+  // Return a NotSupported status (and a null snapshot) if the underlying
+  // memtable does not support snapshots.
+ if (!is_snapshot_supported_) {
+ if (lock) {
+ mutex_.Unlock();
+ }
+ delete s;
+ return std::make_pair(
+ Status::NotSupported("Memtable does not support snapshot"), nullptr);
+ }
+
+  // The caller is not the write thread and thus didn't provide a valid
+  // snapshot_seq. Obtain the seq from the db.
+ if (!need_update_seq) {
+ snapshot_seq = GetLastPublishedSequence();
+ }
+
+ std::shared_ptr<const SnapshotImpl> latest =
+ timestamped_snapshots_.GetSnapshot(std::numeric_limits<uint64_t>::max());
+
+ // If there is already a latest timestamped snapshot, then we need to do some
+ // checks.
+ if (latest) {
+ uint64_t latest_snap_ts = latest->GetTimestamp();
+ SequenceNumber latest_snap_seq = latest->GetSequenceNumber();
+ assert(latest_snap_seq <= snapshot_seq);
+ bool needs_create_snap = true;
+ Status status;
+ std::shared_ptr<const SnapshotImpl> ret;
+ if (latest_snap_ts > ts) {
+ // A snapshot created later cannot have smaller timestamp than a previous
+ // timestamped snapshot.
+ needs_create_snap = false;
+ std::ostringstream oss;
+ oss << "snapshot exists with larger timestamp " << latest_snap_ts << " > "
+ << ts;
+ status = Status::InvalidArgument(oss.str());
+ } else if (latest_snap_ts == ts) {
+ if (latest_snap_seq == snapshot_seq) {
+ // We are requesting the same sequence number and timestamp, thus can
+ // safely reuse (share) the current latest timestamped snapshot.
+ needs_create_snap = false;
+ ret = latest;
+ } else if (latest_snap_seq < snapshot_seq) {
+ // There may have been writes to the database since the latest
+ // timestamped snapshot, yet we are still requesting the same
+ // timestamp. In this case, we cannot create the new timestamped
+ // snapshot.
+ needs_create_snap = false;
+ std::ostringstream oss;
+ oss << "Allocated seq is " << snapshot_seq
+ << ", while snapshot exists with smaller seq " << latest_snap_seq
+ << " but same timestamp " << ts;
+ status = Status::InvalidArgument(oss.str());
+ }
+ }
+ if (!needs_create_snap) {
+ if (lock) {
+ mutex_.Unlock();
+ }
+ delete s;
+ return std::make_pair(status, ret);
+ } else {
+ status.PermitUncheckedError();
+ }
+ }
+
+ SnapshotImpl* snapshot =
+ snapshots_.New(s, snapshot_seq, unix_time,
+ /*is_write_conflict_boundary=*/true, ts);
+
+ std::shared_ptr<const SnapshotImpl> ret(
+ snapshot,
+ std::bind(&DBImpl::ReleaseSnapshot, this, std::placeholders::_1));
+ timestamped_snapshots_.AddSnapshot(ret);
+
+ // Caller is from write thread, and we need to update database's sequence
+ // number.
+ if (need_update_seq) {
+ assert(versions_);
+ if (last_seq_same_as_publish_seq_) {
+ versions_->SetLastSequence(snapshot_seq);
+ } else {
+ // TODO: support write-prepared/write-unprepared transactions with two
+ // write queues.
+ assert(false);
+ }
+ }
+
+ if (lock) {
+ mutex_.Unlock();
+ }
+ return std::make_pair(Status::OK(), ret);
+}
+
+namespace {
+using CfdList = autovector<ColumnFamilyData*, 2>;
+bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) {
+ for (const ColumnFamilyData* t : list) {
+ if (t == cfd) {
+ return true;
+ }
+ }
+ return false;
+}
+} // namespace
+
+void DBImpl::ReleaseSnapshot(const Snapshot* s) {
+ if (s == nullptr) {
+    // DBImpl::GetSnapshot() can return nullptr when snapshots are not
+    // supported, e.g. when inplace_update_support is enabled.
+ return;
+ }
+ const SnapshotImpl* casted_s = reinterpret_cast<const SnapshotImpl*>(s);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ snapshots_.Delete(casted_s);
+ uint64_t oldest_snapshot;
+ if (snapshots_.empty()) {
+ oldest_snapshot = GetLastPublishedSequence();
+ } else {
+ oldest_snapshot = snapshots_.oldest()->number_;
+ }
+    // Avoid going through every column family by checking a global threshold
+    // first.
+ if (oldest_snapshot > bottommost_files_mark_threshold_) {
+ CfdList cf_scheduled;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->ioptions()->allow_ingest_behind) {
+ cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
+ if (!cfd->current()
+ ->storage_info()
+ ->BottommostFilesMarkedForCompaction()
+ .empty()) {
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+ cf_scheduled.push_back(cfd);
+ }
+ }
+ }
+
+      // Calculate a new threshold, skipping those CFs where compactions are
+      // scheduled. We do not fold this into the previous loop because the
+      // mutex might be unlocked during that loop, making the result
+      // inaccurate.
+ SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (CfdListContains(cf_scheduled, cfd) ||
+ cfd->ioptions()->allow_ingest_behind) {
+ continue;
+ }
+ new_bottommost_files_mark_threshold = std::min(
+ new_bottommost_files_mark_threshold,
+ cfd->current()->storage_info()->bottommost_files_mark_threshold());
+ }
+ bottommost_files_mark_threshold_ = new_bottommost_files_mark_threshold;
+ }
+ }
+ delete casted_s;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+
+ // Increment the ref count
+ mutex_.Lock();
+ auto version = cfd->current();
+ version->Ref();
+ mutex_.Unlock();
+
+ auto s = version->GetPropertiesOfAllTables(props);
+
+ // Decrement the ref count
+ mutex_.Lock();
+ version->Unref();
+ mutex_.Unlock();
+
+ return s;
+}
+
+Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family,
+ const Range* range, std::size_t n,
+ TablePropertiesCollection* props) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+
+ // Increment the ref count
+ mutex_.Lock();
+ auto version = cfd->current();
+ version->Ref();
+ mutex_.Unlock();
+
+ auto s = version->GetPropertiesOfTablesInRange(range, n, props);
+
+ // Decrement the ref count
+ mutex_.Lock();
+ version->Unref();
+ mutex_.Unlock();
+
+ return s;
+}
+
+#endif // ROCKSDB_LITE
+
+const std::string& DBImpl::GetName() const { return dbname_; }
+
+Env* DBImpl::GetEnv() const { return env_; }
+
+FileSystem* DB::GetFileSystem() const {
+ const auto& fs = GetEnv()->GetFileSystem();
+ return fs.get();
+}
+
+FileSystem* DBImpl::GetFileSystem() const {
+ return immutable_db_options_.fs.get();
+}
+
+SystemClock* DBImpl::GetSystemClock() const {
+ return immutable_db_options_.clock;
+}
+
+#ifndef ROCKSDB_LITE
+
+Status DBImpl::StartIOTrace(const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ assert(trace_writer != nullptr);
+ return io_tracer_->StartIOTrace(GetSystemClock(), trace_options,
+ std::move(trace_writer));
+}
+
+Status DBImpl::EndIOTrace() {
+ io_tracer_->EndIOTrace();
+ return Status::OK();
+}
+
+#endif // ROCKSDB_LITE
+
+Options DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
+ InstrumentedMutexLock l(&mutex_);
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ return Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+ cfh->cfd()->GetLatestCFOptions());
+}
+
+DBOptions DBImpl::GetDBOptions() const {
+ InstrumentedMutexLock l(&mutex_);
+ return BuildDBOptions(immutable_db_options_, mutable_db_options_);
+}
+
+bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ value->clear();
+ auto cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ if (property_info == nullptr) {
+ return false;
+ } else if (property_info->handle_int) {
+ uint64_t int_value;
+ bool ret_value =
+ GetIntPropertyInternal(cfd, *property_info, false, &int_value);
+ if (ret_value) {
+ *value = std::to_string(int_value);
+ }
+ return ret_value;
+ } else if (property_info->handle_string) {
+ if (property_info->need_out_of_mutex) {
+ return cfd->internal_stats()->GetStringProperty(*property_info, property,
+ value);
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetStringProperty(*property_info, property,
+ value);
+ }
+ } else if (property_info->handle_string_dbimpl) {
+ if (property_info->need_out_of_mutex) {
+ return (this->*(property_info->handle_string_dbimpl))(value);
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ return (this->*(property_info->handle_string_dbimpl))(value);
+ }
+ }
+ // Shouldn't reach here since exactly one of handle_string and handle_int
+ // should be non-nullptr.
+ assert(false);
+ return false;
+}
+
+bool DBImpl::GetMapProperty(ColumnFamilyHandle* column_family,
+ const Slice& property,
+ std::map<std::string, std::string>* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ value->clear();
+ auto cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ if (property_info == nullptr) {
+ return false;
+ } else if (property_info->handle_map) {
+ if (property_info->need_out_of_mutex) {
+ return cfd->internal_stats()->GetMapProperty(*property_info, property,
+ value);
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetMapProperty(*property_info, property,
+ value);
+ }
+ }
+ // If we reach this point it means that handle_map is not provided for the
+ // requested property
+ return false;
+}
+
+bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ if (property_info == nullptr || property_info->handle_int == nullptr) {
+ return false;
+ }
+ auto cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ return GetIntPropertyInternal(cfd, *property_info, false, value);
+}
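+
+// Example (illustrative sketch): querying an integer property through the
+// API above; `db` and `cf` are hypothetical.
+//
+//   uint64_t num_keys = 0;
+//   if (db->GetIntProperty(cf, "rocksdb.estimate-num-keys", &num_keys)) {
+//     // num_keys now holds an estimate of the number of keys in `cf`
+//   }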
+
+bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd,
+ const DBPropertyInfo& property_info,
+ bool is_locked, uint64_t* value) {
+ assert(property_info.handle_int != nullptr);
+ if (!property_info.need_out_of_mutex) {
+ if (is_locked) {
+ mutex_.AssertHeld();
+ return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->internal_stats()->GetIntProperty(property_info, value, this);
+ }
+ } else {
+ SuperVersion* sv = nullptr;
+ if (is_locked) {
+ mutex_.Unlock();
+ }
+ sv = GetAndRefSuperVersion(cfd);
+
+ bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
+ property_info, sv->current, value);
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ if (is_locked) {
+ mutex_.Lock();
+ }
+
+ return ret;
+ }
+}
+
+bool DBImpl::GetPropertyHandleOptionsStatistics(std::string* value) {
+ assert(value != nullptr);
+ Statistics* statistics = immutable_db_options_.stats;
+ if (!statistics) {
+ return false;
+ }
+ *value = statistics->ToString();
+ return true;
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::ResetStats() {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->initialized()) {
+ cfd->internal_stats()->Clear();
+ }
+ }
+ return Status::OK();
+}
+#endif // ROCKSDB_LITE
+
+bool DBImpl::GetAggregatedIntProperty(const Slice& property,
+ uint64_t* aggregated_value) {
+ const DBPropertyInfo* property_info = GetPropertyInfo(property);
+ if (property_info == nullptr || property_info->handle_int == nullptr) {
+ return false;
+ }
+
+ uint64_t sum = 0;
+ bool ret = true;
+ {
+ // Needs mutex to protect the list of column families.
+ InstrumentedMutexLock l(&mutex_);
+ uint64_t value;
+ for (auto* cfd : versions_->GetRefedColumnFamilySet()) {
+ if (!cfd->initialized()) {
+ continue;
+ }
+ ret = GetIntPropertyInternal(cfd, *property_info, true, &value);
+ // GetIntPropertyInternal may release db mutex and re-acquire it.
+ mutex_.AssertHeld();
+ if (ret) {
+ sum += value;
+ } else {
+ ret = false;
+ break;
+ }
+ }
+ }
+ *aggregated_value = sum;
+ return ret;
+}
+
+SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
+ // TODO(ljin): consider using GetReferencedSuperVersion() directly
+ return cfd->GetThreadLocalSuperVersion(this);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+SuperVersion* DBImpl::GetAndRefSuperVersion(uint32_t column_family_id) {
+ auto column_family_set = versions_->GetColumnFamilySet();
+ auto cfd = column_family_set->GetColumnFamily(column_family_id);
+ if (!cfd) {
+ return nullptr;
+ }
+
+ return GetAndRefSuperVersion(cfd);
+}
+
+void DBImpl::CleanupSuperVersion(SuperVersion* sv) {
+ // Release SuperVersion
+ if (sv->Unref()) {
+ bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ sv->Cleanup();
+ if (defer_purge) {
+ AddSuperVersionsToFreeQueue(sv);
+ SchedulePurge();
+ }
+ }
+ if (!defer_purge) {
+ delete sv;
+ }
+ RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS);
+ }
+ RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES);
+}
+
+void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
+ SuperVersion* sv) {
+ if (!cfd->ReturnThreadLocalSuperVersion(sv)) {
+ CleanupSuperVersion(sv);
+ }
+}
+
+// REQUIRED: this function should only be called on the write thread.
+void DBImpl::ReturnAndCleanupSuperVersion(uint32_t column_family_id,
+ SuperVersion* sv) {
+ auto column_family_set = versions_->GetColumnFamilySet();
+ auto cfd = column_family_set->GetColumnFamily(column_family_id);
+
+ // If SuperVersion is held, and we successfully fetched a cfd using
+ // GetAndRefSuperVersion(), it must still exist.
+ assert(cfd != nullptr);
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+// REQUIRED: this function should only be called on the write thread or if the
+// mutex is held.
+ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) {
+ ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get();
+
+ if (!cf_memtables->Seek(column_family_id)) {
+ return nullptr;
+ }
+
+ return cf_memtables->GetColumnFamilyHandle();
+}
+
+// REQUIRED: mutex is NOT held.
+std::unique_ptr<ColumnFamilyHandle> DBImpl::GetColumnFamilyHandleUnlocked(
+ uint32_t column_family_id) {
+ InstrumentedMutexLock l(&mutex_);
+
+ auto* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(column_family_id);
+ if (cfd == nullptr) {
+ return nullptr;
+ }
+
+ return std::unique_ptr<ColumnFamilyHandleImpl>(
+ new ColumnFamilyHandleImpl(cfd, this, &mutex_));
+}
+
+void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) {
+ ColumnFamilyHandleImpl* cfh =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range.start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range.limit, kMaxSequenceNumber, kValueTypeForSeek);
+ MemTable::MemTableStats memStats =
+ sv->mem->ApproximateStats(k1.Encode(), k2.Encode());
+ MemTable::MemTableStats immStats =
+ sv->imm->ApproximateStats(k1.Encode(), k2.Encode());
+ *count = memStats.count + immStats.count;
+ *size = memStats.size + immStats.size;
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* range, int n, uint64_t* sizes) {
+ if (!options.include_memtables && !options.include_files) {
+ return Status::InvalidArgument("Invalid options");
+ }
+
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+
+ Version* v;
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ v = sv->current;
+
+ for (int i = 0; i < n; i++) {
+ Slice start = range[i].start;
+ Slice limit = range[i].limit;
+
+ // Add timestamp if needed
+ std::string start_with_ts, limit_with_ts;
+ if (ts_sz > 0) {
+      // Maximum timestamp means including all keys with any timestamp
+ AppendKeyWithMaxTimestamp(&start_with_ts, start, ts_sz);
+ // Append a maximum timestamp as the range limit is exclusive:
+ // [start, limit)
+ AppendKeyWithMaxTimestamp(&limit_with_ts, limit, ts_sz);
+ start = start_with_ts;
+ limit = limit_with_ts;
+ }
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(limit, kMaxSequenceNumber, kValueTypeForSeek);
+ sizes[i] = 0;
+ if (options.include_files) {
+ sizes[i] += versions_->ApproximateSize(
+ options, v, k1.Encode(), k2.Encode(), /*start_level=*/0,
+ /*end_level=*/-1, TableReaderCaller::kUserApproximateSize);
+ }
+ if (options.include_memtables) {
+ sizes[i] += sv->mem->ApproximateStats(k1.Encode(), k2.Encode()).size;
+ sizes[i] += sv->imm->ApproximateStats(k1.Encode(), k2.Encode()).size;
+ }
+ }
+
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ return Status::OK();
+}
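+
+// Example (illustrative sketch): estimating the size of a key range with the
+// API above; `db`, `cf` and the key bounds are hypothetical.
+//
+//   SizeApproximationOptions opts;
+//   opts.include_memtables = true;  // also count memtable data
+//   Range r("a", "z");
+//   uint64_t size = 0;
+//   Status s = db->GetApproximateSizes(opts, cf, &r, 1, &size);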
+
+std::list<uint64_t>::iterator
+DBImpl::CaptureCurrentFileNumberInPendingOutputs() {
+ // We need to remember the iterator of our insert, because after the
+ // background job is done, we need to remove that element from
+ // pending_outputs_.
+ pending_outputs_.push_back(versions_->current_next_file_number());
+ auto pending_outputs_inserted_elem = pending_outputs_.end();
+ --pending_outputs_inserted_elem;
+ return pending_outputs_inserted_elem;
+}
+
+void DBImpl::ReleaseFileNumberFromPendingOutputs(
+ std::unique_ptr<std::list<uint64_t>::iterator>& v) {
+ if (v.get() != nullptr) {
+ pending_outputs_.erase(*v.get());
+ v.reset();
+ }
+}
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::GetUpdatesSince(
+ SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options) {
+ RecordTick(stats_, GET_UPDATES_SINCE_CALLS);
+ if (seq_per_batch_) {
+ return Status::NotSupported(
+ "This API is not yet compatible with write-prepared/write-unprepared "
+ "transactions");
+ }
+ if (seq > versions_->LastSequence()) {
+ return Status::NotFound("Requested sequence not yet written in the db");
+ }
+ return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
+}
+
+Status DBImpl::DeleteFile(std::string name) {
+ uint64_t number;
+ FileType type;
+ WalFileType log_type;
+ if (!ParseFileName(name, &number, &type, &log_type) ||
+ (type != kTableFile && type != kWalFile)) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log, "DeleteFile %s failed.\n",
+ name.c_str());
+ return Status::InvalidArgument("Invalid file name");
+ }
+
+ if (type == kWalFile) {
+ // Only allow deleting archived log files
+ if (log_type != kArchivedLogFile) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DeleteFile %s failed - not archived log.\n",
+ name.c_str());
+ return Status::NotSupported("Delete only supported for archived logs");
+ }
+ Status status = wal_manager_.DeleteFile(name, number);
+ if (!status.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DeleteFile %s failed -- %s.\n", name.c_str(),
+ status.ToString().c_str());
+ }
+ return status;
+ }
+
+ Status status;
+ int level;
+ FileMetaData* metadata;
+ ColumnFamilyData* cfd;
+ VersionEdit edit;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s failed. File not found\n", name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File not found");
+ }
+ assert(level < cfd->NumberLevels());
+
+    // If the file is being compacted, there is no need to delete it.
+ if (metadata->being_compacted) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "DeleteFile %s Skipped. File about to be compacted\n",
+ name.c_str());
+ job_context.Clean();
+ return Status::OK();
+ }
+
+ // Only the files in the last level can be deleted externally.
+ // This is to make sure that any deletion tombstones are not
+ // lost. Check that the level passed is the last level.
+    auto* vstorage = cfd->current()->storage_info();
+ for (int i = level + 1; i < cfd->NumberLevels(); i++) {
+      if (vstorage->NumLevelFiles(i) != 0) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s FAILED. File not in last level\n",
+ name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File not in last level");
+ }
+ }
+ // if level == 0, it has to be the oldest file
+ if (level == 0 &&
+        vstorage->LevelFiles(0).back()->fd.GetNumber() != number) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "DeleteFile %s failed ---"
+ " target file in level 0 must be the oldest.",
+ name.c_str());
+ job_context.Clean();
+ return Status::InvalidArgument("File in level 0, but not oldest");
+ }
+ edit.SetColumnFamily(cfd->GetID());
+ edit.DeleteFile(level, number);
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ FindObsoleteFiles(&job_context, false);
+ } // lock released here
+
+ LogFlush(immutable_db_options_.info_log);
+ // remove files outside the db-lock
+ if (job_context.HaveSomethingToDelete()) {
+ // Call PurgeObsoleteFiles() without holding mutex.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ return status;
+}
+
+Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end) {
+ Status status = Status::OK();
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ VersionEdit edit;
+ std::set<FileMetaData*> deleted_files;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ Version* input_version = cfd->current();
+
+ auto* vstorage = input_version->storage_info();
+ for (size_t r = 0; r < n; r++) {
+ auto begin = ranges[r].start, end = ranges[r].limit;
+ for (int i = 1; i < cfd->NumberLevels(); i++) {
+ if (vstorage->LevelFiles(i).empty() ||
+ !vstorage->OverlapInLevel(i, begin, end)) {
+ continue;
+ }
+ std::vector<FileMetaData*> level_files;
+ InternalKey begin_storage, end_storage, *begin_key, *end_key;
+ if (begin == nullptr) {
+ begin_key = nullptr;
+ } else {
+ begin_storage.SetMinPossibleForUserKey(*begin);
+ begin_key = &begin_storage;
+ }
+ if (end == nullptr) {
+ end_key = nullptr;
+ } else {
+ end_storage.SetMaxPossibleForUserKey(*end);
+ end_key = &end_storage;
+ }
+
+ vstorage->GetCleanInputsWithinInterval(
+ i, begin_key, end_key, &level_files, -1 /* hint_index */,
+ nullptr /* file_index */);
+ FileMetaData* level_file;
+ for (uint32_t j = 0; j < level_files.size(); j++) {
+ level_file = level_files[j];
+ if (level_file->being_compacted) {
+ continue;
+ }
+ if (deleted_files.find(level_file) != deleted_files.end()) {
+ continue;
+ }
+ if (!include_end && end != nullptr &&
+ cfd->user_comparator()->Compare(level_file->largest.user_key(),
+ *end) == 0) {
+ continue;
+ }
+ edit.SetColumnFamily(cfd->GetID());
+ edit.DeleteFile(i, level_file->fd.GetNumber());
+ deleted_files.insert(level_file);
+ level_file->being_compacted = true;
+ }
+ vstorage->ComputeCompactionScore(*cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions());
+ }
+ }
+ if (edit.GetDeletedFiles().empty()) {
+ job_context.Clean();
+ return status;
+ }
+ input_version->Ref();
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ for (auto* deleted_file : deleted_files) {
+ deleted_file->being_compacted = false;
+ }
+ input_version->Unref();
+ FindObsoleteFiles(&job_context, false);
+ } // lock released here
+
+ LogFlush(immutable_db_options_.info_log);
+ // remove files outside the db-lock
+ if (job_context.HaveSomethingToDelete()) {
+ // Call PurgeObsoleteFiles() without holding mutex.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ return status;
+}
+
+void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+ InstrumentedMutexLock l(&mutex_);
+ versions_->GetLiveFilesMetaData(metadata);
+}
+
+Status DBImpl::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
+ InstrumentedMutexLock l(&mutex_);
+ return versions_->GetLiveFilesChecksumInfo(checksum_list);
+}
+
+void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* cf_meta) {
+ assert(column_family);
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ auto* sv = GetAndRefSuperVersion(cfd);
+ {
+    // Without the mutex, Version::GetColumnFamilyMetaData has a data race
+    // with Compaction::MarkFilesBeingCompacted. One solution is to take the
+    // mutex, but this may cause a regression. An alternative is to make
+    // FileMetaData::being_compacted atomic, but that would make FileMetaData
+    // non-copyable. Another option is to separate these variables from the
+    // original FileMetaData struct, which requires re-organizing the data
+    // structures. For now, we take the easy approach: if
+    // DB::GetColumnFamilyMetaData is not called frequently, the regression
+    // should not be significant. We still need to keep an eye on it.
+ InstrumentedMutexLock l(&mutex_);
+ sv->current->GetColumnFamilyMetaData(cf_meta);
+ }
+ ReturnAndCleanupSuperVersion(cfd, sv);
+}
+
+void DBImpl::GetAllColumnFamilyMetaData(
+ std::vector<ColumnFamilyMetaData>* metadata) {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *(versions_->GetColumnFamilySet())) {
+ {
+ metadata->emplace_back();
+ cfd->current()->GetColumnFamilyMetaData(&metadata->back());
+ }
+ }
+}
+
+#endif // ROCKSDB_LITE
+
+Status DBImpl::CheckConsistency() {
+ mutex_.AssertHeld();
+ std::vector<LiveFileMetaData> metadata;
+ versions_->GetLiveFilesMetaData(&metadata);
+ TEST_SYNC_POINT("DBImpl::CheckConsistency:AfterGetLiveFilesMetaData");
+
+ std::string corruption_messages;
+
+ if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+ // Instead of calling GetFileSize() for each expected file, call
+ // GetChildren() for the DB directory and check that all expected files
+ // are listed, without checking their sizes.
+ // Since sst files might be in different directories, do it for each
+ // directory separately.
+ std::map<std::string, std::vector<std::string>> files_by_directory;
+ for (const auto& md : metadata) {
+ // md.name has a leading "/". Remove it.
+ std::string fname = md.name;
+ if (!fname.empty() && fname[0] == '/') {
+ fname = fname.substr(1);
+ }
+ files_by_directory[md.db_path].push_back(fname);
+ }
+
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ for (const auto& dir_files : files_by_directory) {
+ std::string directory = dir_files.first;
+ std::vector<std::string> existing_files;
+ Status s = fs_->GetChildren(directory, io_opts, &existing_files,
+ /*IODebugContext*=*/nullptr);
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't list files in " + directory + ": " + s.ToString() + "\n";
+ continue;
+ }
+ std::sort(existing_files.begin(), existing_files.end());
+
+ for (const std::string& fname : dir_files.second) {
+ if (!std::binary_search(existing_files.begin(), existing_files.end(),
+ fname) &&
+ !std::binary_search(existing_files.begin(), existing_files.end(),
+ Rocks2LevelTableFileName(fname))) {
+ corruption_messages +=
+ "Missing sst file " + fname + " in " + directory + "\n";
+ }
+ }
+ }
+ } else {
+ for (const auto& md : metadata) {
+ // md.name has a leading "/".
+ std::string file_path = md.db_path + md.name;
+
+ uint64_t fsize = 0;
+ TEST_SYNC_POINT("DBImpl::CheckConsistency:BeforeGetFileSize");
+ Status s = env_->GetFileSize(file_path, &fsize);
+ if (!s.ok() &&
+ env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't access " + md.name + ": " + s.ToString() + "\n";
+ } else if (fsize != md.size) {
+ corruption_messages += "Sst file size mismatch: " + file_path +
+ ". Size recorded in manifest " +
+ std::to_string(md.size) + ", actual size " +
+ std::to_string(fsize) + "\n";
+ }
+ }
+ }
+
+ if (corruption_messages.size() == 0) {
+ return Status::OK();
+ } else {
+ return Status::Corruption(corruption_messages);
+ }
+}
+
+Status DBImpl::GetDbIdentity(std::string& identity) const {
+ identity.assign(db_id_);
+ return Status::OK();
+}
+
+Status DBImpl::GetDbIdentityFromIdentityFile(std::string* identity) const {
+ std::string idfilename = IdentityFileName(dbname_);
+ const FileOptions soptions;
+
+ Status s = ReadFileToString(fs_.get(), idfilename, identity);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // If last character is '\n' remove it from identity. (Old implementations
+ // of Env::GenerateUniqueId() would include a trailing '\n'.)
+ if (identity->size() > 0 && identity->back() == '\n') {
+ identity->pop_back();
+ }
+ return s;
+}
+
+Status DBImpl::GetDbSessionId(std::string& session_id) const {
+ session_id.assign(db_session_id_);
+ return Status::OK();
+}
+
+namespace {
+SemiStructuredUniqueIdGen* DbSessionIdGen() {
+ static SemiStructuredUniqueIdGen gen;
+ return &gen;
+}
+} // namespace
+
+void DBImpl::TEST_ResetDbSessionIdGen() { DbSessionIdGen()->Reset(); }
+
+std::string DBImpl::GenerateDbSessionId(Env*) {
+ // See SemiStructuredUniqueIdGen for its desirable properties.
+ auto gen = DbSessionIdGen();
+
+ uint64_t lo, hi;
+ gen->GenerateNext(&hi, &lo);
+ if (lo == 0) {
+ // Avoid emitting session ID with lo==0, so that SST unique
+ // IDs can be more easily ensured non-zero
+ gen->GenerateNext(&hi, &lo);
+ assert(lo != 0);
+ }
+ return EncodeSessionId(hi, lo);
+}
+
+void DBImpl::SetDbSessionId() {
+ db_session_id_ = GenerateDbSessionId(env_);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::SetDbSessionId", &db_session_id_);
+}
+
+// Default implementation -- returns not supported status
+Status DB::CreateColumnFamily(const ColumnFamilyOptions& /*cf_options*/,
+ const std::string& /*column_family_name*/,
+ ColumnFamilyHandle** /*handle*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+ const ColumnFamilyOptions& /*cf_options*/,
+ const std::vector<std::string>& /*column_family_names*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamily(ColumnFamilyHandle* /*column_family*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/) {
+ return Status::NotSupported("");
+}
+
+Status DB::DestroyColumnFamilyHandle(ColumnFamilyHandle* column_family) {
+ if (DefaultColumnFamily() == column_family) {
+ return Status::InvalidArgument(
+ "Cannot destroy the handle returned by DefaultColumnFamily()");
+ }
+ delete column_family;
+ return Status::OK();
+}
+
+DB::~DB() {}
+
+Status DBImpl::Close() {
+ InstrumentedMutexLock closing_lock_guard(&closing_mutex_);
+ if (closed_) {
+ return closing_status_;
+ }
+
+ {
+ const Status s = MaybeReleaseTimestampedSnapshotsAndCheck();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ closing_status_ = CloseImpl();
+ closed_ = true;
+ return closing_status_;
+}
+
+Status DB::ListColumnFamilies(const DBOptions& db_options,
+ const std::string& name,
+ std::vector<std::string>* column_families) {
+ const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
+ return VersionSet::ListColumnFamilies(column_families, name, fs.get());
+}
+
+Snapshot::~Snapshot() {}
+
+Status DestroyDB(const std::string& dbname, const Options& options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ ImmutableDBOptions soptions(SanitizeOptions(dbname, options));
+ Env* env = soptions.env;
+ std::vector<std::string> filenames;
+ bool wal_in_db_path = soptions.IsWalDirSameAsDBPath();
+
+ // Reset the logger because it holds a handle to the
+ // log file and prevents cleanup and directory removal
+ soptions.info_log.reset();
+ IOOptions io_opts;
+ // Ignore error in case directory does not exist
+ soptions.fs
+ ->GetChildren(dbname, io_opts, &filenames,
+ /*IODebugContext*=*/nullptr)
+ .PermitUncheckedError();
+
+ FileLock* lock;
+ const std::string lockname = LockFileName(dbname);
+ Status result = env->LockFile(lockname, &lock);
+ if (result.ok()) {
+ uint64_t number;
+ FileType type;
+ InfoLogPrefix info_log_prefix(!soptions.db_log_dir.empty(), dbname);
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, info_log_prefix.prefix, &type) &&
+ type != kDBLockFile) { // Lock file will be deleted at end
+ Status del;
+ std::string path_to_delete = dbname + "/" + fname;
+ if (type == kMetaDatabase) {
+ del = DestroyDB(path_to_delete, options);
+ } else if (type == kTableFile || type == kWalFile ||
+ type == kBlobFile) {
+ del = DeleteDBFile(
+ &soptions, path_to_delete, dbname,
+ /*force_bg=*/false,
+ /*force_fg=*/(type == kWalFile) ? !wal_in_db_path : false);
+ } else {
+ del = env->DeleteFile(path_to_delete);
+ }
+ if (!del.ok() && result.ok()) {
+ result = del;
+ }
+ }
+ }
+
+ std::set<std::string> paths;
+ for (const DbPath& db_path : options.db_paths) {
+ paths.insert(db_path.path);
+ }
+ for (const ColumnFamilyDescriptor& cf : column_families) {
+ for (const DbPath& cf_path : cf.options.cf_paths) {
+ paths.insert(cf_path.path);
+ }
+ }
+
+ for (const auto& path : paths) {
+ if (soptions.fs
+ ->GetChildren(path, io_opts, &filenames,
+ /*IODebugContext*=*/nullptr)
+ .ok()) {
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, &type) &&
+ (type == kTableFile ||
+               type == kBlobFile)) {
+ std::string file_path = path + "/" + fname;
+ Status del = DeleteDBFile(&soptions, file_path, dbname,
+ /*force_bg=*/false, /*force_fg=*/false);
+ if (!del.ok() && result.ok()) {
+ result = del;
+ }
+ }
+ }
+ // TODO: Should we return an error if we cannot delete the directory?
+ env->DeleteDir(path).PermitUncheckedError();
+ }
+ }
+
+ std::vector<std::string> walDirFiles;
+ std::string archivedir = ArchivalDirectory(dbname);
+ bool wal_dir_exists = false;
+ if (!soptions.IsWalDirSameAsDBPath(dbname)) {
+ wal_dir_exists =
+ soptions.fs
+ ->GetChildren(soptions.wal_dir, io_opts, &walDirFiles,
+ /*IODebugContext*=*/nullptr)
+ .ok();
+ archivedir = ArchivalDirectory(soptions.wal_dir);
+ }
+
+    // The archive dir may be inside the wal dir or dbname and should be
+    // processed and removed before those; otherwise we have issues
+    // removing them.
+ std::vector<std::string> archiveFiles;
+ if (soptions.fs
+ ->GetChildren(archivedir, io_opts, &archiveFiles,
+ /*IODebugContext*=*/nullptr)
+ .ok()) {
+ // Delete archival files.
+ for (const auto& file : archiveFiles) {
+ if (ParseFileName(file, &number, &type) && type == kWalFile) {
+ Status del =
+ DeleteDBFile(&soptions, archivedir + "/" + file, archivedir,
+ /*force_bg=*/false, /*force_fg=*/!wal_in_db_path);
+ if (!del.ok() && result.ok()) {
+ result = del;
+ }
+ }
+ }
+ // Ignore error in case dir contains other files
+ env->DeleteDir(archivedir).PermitUncheckedError();
+ }
+
+ // Delete log files in the WAL dir
+ if (wal_dir_exists) {
+ for (const auto& file : walDirFiles) {
+ if (ParseFileName(file, &number, &type) && type == kWalFile) {
+ Status del =
+ DeleteDBFile(&soptions, LogFileName(soptions.wal_dir, number),
+ soptions.wal_dir, /*force_bg=*/false,
+ /*force_fg=*/!wal_in_db_path);
+ if (!del.ok() && result.ok()) {
+ result = del;
+ }
+ }
+ }
+ // Ignore error in case dir contains other files
+ env->DeleteDir(soptions.wal_dir).PermitUncheckedError();
+ }
+
+ // Ignore error since state is already gone
+ env->UnlockFile(lock).PermitUncheckedError();
+ env->DeleteFile(lockname).PermitUncheckedError();
+
+ // sst_file_manager holds a ref to the logger. Make sure the logger is
+ // gone before trying to remove the directory.
+ soptions.sst_file_manager.reset();
+
+ // Ignore error in case dir contains other files
+ env->DeleteDir(dbname).PermitUncheckedError();
+ }
+ return result;
+}
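+
+// Example (illustrative sketch): removing all files of a closed database with
+// DestroyDB() above; the path is hypothetical.
+//
+//   Options options;
+//   Status s = DestroyDB("/tmp/testdb", options);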
+
+Status DBImpl::WriteOptionsFile(bool need_mutex_lock,
+ bool need_enter_write_thread) {
+#ifndef ROCKSDB_LITE
+ WriteThread::Writer w;
+ if (need_mutex_lock) {
+ mutex_.Lock();
+ } else {
+ mutex_.AssertHeld();
+ }
+ if (need_enter_write_thread) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ }
+
+ std::vector<std::string> cf_names;
+ std::vector<ColumnFamilyOptions> cf_opts;
+
+ // This part requires mutex to protect the column family options
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cf_names.push_back(cfd->GetName());
+ cf_opts.push_back(cfd->GetLatestCFOptions());
+ }
+
+ // Unlock during expensive operations. New writes cannot get here
+ // because the single write thread ensures all new writes get queued.
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ mutex_.Unlock();
+
+ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:1");
+ TEST_SYNC_POINT("DBImpl::WriteOptionsFile:2");
+ TEST_SYNC_POINT_CALLBACK("DBImpl::WriteOptionsFile:PersistOptions",
+ &db_options);
+
+ std::string file_name =
+ TempOptionsFileName(GetName(), versions_->NewFileNumber());
+ Status s = PersistRocksDBOptions(db_options, cf_names, cf_opts, file_name,
+ fs_.get());
+
+ if (s.ok()) {
+ s = RenameTempFileToOptionsFile(file_name);
+ }
+ // restore lock
+ if (!need_mutex_lock) {
+ mutex_.Lock();
+ }
+ if (need_enter_write_thread) {
+ write_thread_.ExitUnbatched(&w);
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Unnable to persist options -- %s", s.ToString().c_str());
+ if (immutable_db_options_.fail_if_options_file_error) {
+ return Status::IOError("Unable to persist options.",
+ s.ToString().c_str());
+ }
+ }
+#else
+ (void)need_mutex_lock;
+ (void)need_enter_write_thread;
+#endif // !ROCKSDB_LITE
+ return Status::OK();
+}
+
+#ifndef ROCKSDB_LITE
+namespace {
+void DeleteOptionsFilesHelper(const std::map<uint64_t, std::string>& filenames,
+ const size_t num_files_to_keep,
+ const std::shared_ptr<Logger>& info_log,
+ Env* env) {
+ if (filenames.size() <= num_files_to_keep) {
+ return;
+ }
+ for (auto iter = std::next(filenames.begin(), num_files_to_keep);
+ iter != filenames.end(); ++iter) {
+ if (!env->DeleteFile(iter->second).ok()) {
+ ROCKS_LOG_WARN(info_log, "Unable to delete options file %s",
+ iter->second.c_str());
+ }
+ }
+}
+} // namespace
+#endif // !ROCKSDB_LITE
+
+Status DBImpl::DeleteObsoleteOptionsFiles() {
+#ifndef ROCKSDB_LITE
+ std::vector<std::string> filenames;
+  // Use an ordered map to keep the filenames sorted from the newest
+  // to the oldest.
+ std::map<uint64_t, std::string> options_filenames;
+ Status s;
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ s = fs_->GetChildren(GetName(), io_opts, &filenames,
+ /*IODebugContext*=*/nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ for (auto& filename : filenames) {
+ uint64_t file_number;
+ FileType type;
+ if (ParseFileName(filename, &file_number, &type) && type == kOptionsFile) {
+ options_filenames.insert(
+ {std::numeric_limits<uint64_t>::max() - file_number,
+ GetName() + "/" + filename});
+ }
+ }
+
+  // Keep the latest 2 options files.
+ const size_t kNumOptionsFilesKept = 2;
+ DeleteOptionsFilesHelper(options_filenames, kNumOptionsFilesKept,
+ immutable_db_options_.info_log, GetEnv());
+ return Status::OK();
+#else
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
+
+Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) {
+#ifndef ROCKSDB_LITE
+ Status s;
+
+ uint64_t options_file_number = versions_->NewFileNumber();
+ std::string options_file_name =
+ OptionsFileName(GetName(), options_file_number);
+ uint64_t options_file_size = 0;
+ s = GetEnv()->GetFileSize(file_name, &options_file_size);
+ if (s.ok()) {
+    // Retry if the file name happens to conflict with an existing one.
+ s = GetEnv()->RenameFile(file_name, options_file_name);
+ std::unique_ptr<FSDirectory> dir_obj;
+ if (s.ok()) {
+ s = fs_->NewDirectory(GetName(), IOOptions(), &dir_obj, nullptr);
+ }
+ if (s.ok()) {
+ s = dir_obj->FsyncWithDirOptions(IOOptions(), nullptr,
+ DirFsyncOptions(options_file_name));
+ }
+ if (s.ok()) {
+ Status temp_s = dir_obj->Close(IOOptions(), nullptr);
+      // The default Close() could return "NotSupported" and we bypass it
+      // if it is not implemented. Detailed explanations can be found in
+      // db/db_impl/db_impl.h
+ if (!temp_s.ok()) {
+ if (temp_s.IsNotSupported()) {
+ temp_s.PermitUncheckedError();
+ } else {
+ s = temp_s;
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ versions_->options_file_number_ = options_file_number;
+ versions_->options_file_size_ = options_file_size;
+ }
+
+ if (0 == disable_delete_obsolete_files_) {
+ // TODO: Should we check for errors here?
+ DeleteObsoleteOptionsFiles().PermitUncheckedError();
+ }
+ return s;
+#else
+ (void)file_name;
+ return Status::OK();
+#endif // !ROCKSDB_LITE
+}
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(),
+ cfd->ioptions()->env);
+ }
+}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::EraseColumnFamilyInfo(cfd);
+ }
+}
+
+void DBImpl::EraseThreadStatusDbInfo() const {
+ if (immutable_db_options_.enable_thread_tracking) {
+ ThreadStatusUtil::EraseDatabaseInfo(this);
+ }
+}
+
+#else
+void DBImpl::NewThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusCfInfo(ColumnFamilyData* /*cfd*/) const {}
+
+void DBImpl::EraseThreadStatusDbInfo() const {}
+#endif // ROCKSDB_USING_THREAD_STATUS
+
+// A global method that can dump out the build version
+void DumpRocksDBBuildVersion(Logger* log) {
+ ROCKS_LOG_HEADER(log, "RocksDB version: %s\n",
+ GetRocksVersionAsString().c_str());
+ const auto& props = GetRocksBuildProperties();
+ const auto& sha = props.find("rocksdb_build_git_sha");
+ if (sha != props.end()) {
+ ROCKS_LOG_HEADER(log, "Git sha %s", sha->second.c_str());
+ }
+ const auto date = props.find("rocksdb_build_date");
+ if (date != props.end()) {
+ ROCKS_LOG_HEADER(log, "Compile date %s", date->second.c_str());
+ }
+}
+
+#ifndef ROCKSDB_LITE
+SequenceNumber DBImpl::GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+ bool include_history) {
+ // Find the earliest sequence number that we know we can rely on reading
+ // from the memtable without needing to check sst files.
+ SequenceNumber earliest_seq =
+ sv->imm->GetEarliestSequenceNumber(include_history);
+ if (earliest_seq == kMaxSequenceNumber) {
+ earliest_seq = sv->mem->GetEarliestSequenceNumber();
+ }
+ assert(sv->mem->GetEarliestSequenceNumber() >= earliest_seq);
+
+ return earliest_seq;
+}
+
+Status DBImpl::GetLatestSequenceForKey(
+ SuperVersion* sv, const Slice& key, bool cache_only,
+ SequenceNumber lower_bound_seq, SequenceNumber* seq, std::string* timestamp,
+ bool* found_record_for_key, bool* is_blob_index) {
+ Status s;
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+
+ ReadOptions read_options;
+ SequenceNumber current_seq = versions_->LastSequence();
+
+ ColumnFamilyData* cfd = sv->cfd;
+ assert(cfd);
+ const Comparator* const ucmp = cfd->user_comparator();
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+ std::string ts_buf;
+ if (ts_sz > 0) {
+ assert(timestamp);
+ ts_buf.assign(ts_sz, '\xff');
+ } else {
+ assert(!timestamp);
+ }
+ Slice ts(ts_buf);
+
+ LookupKey lkey(key, current_seq, ts_sz == 0 ? nullptr : &ts);
+
+ *seq = kMaxSequenceNumber;
+ *found_record_for_key = false;
+
+ // Check if there is a record for this key in the latest memtable
+ sv->mem->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s,
+ &merge_context, &max_covering_tombstone_seq, seq, read_options,
+ false /* immutable_memtable */, nullptr /*read_callback*/,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from MemTable::Get: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+ assert(!ts_sz ||
+ (*seq != kMaxSequenceNumber &&
+ *timestamp != std::string(ts_sz, '\xff')) ||
+ (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+ TEST_SYNC_POINT_CALLBACK("DBImpl::GetLatestSequenceForKey:mem", timestamp);
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check immutable memtables
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ SequenceNumber lower_bound_in_mem = sv->mem->GetEarliestSequenceNumber();
+ if (lower_bound_in_mem != kMaxSequenceNumber &&
+ lower_bound_in_mem < lower_bound_seq) {
+ *found_record_for_key = false;
+ return Status::OK();
+ }
+
+ // Check if there is a record for this key in the immutable memtables
+ sv->imm->Get(lkey, /*value=*/nullptr, /*columns=*/nullptr, timestamp, &s,
+ &merge_context, &max_covering_tombstone_seq, seq, read_options,
+ nullptr /*read_callback*/, is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from MemTableList::Get: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ assert(!ts_sz ||
+ (*seq != kMaxSequenceNumber &&
+ *timestamp != std::string(ts_sz, '\xff')) ||
+ (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check memtable history
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ SequenceNumber lower_bound_in_imm = sv->imm->GetEarliestSequenceNumber();
+ if (lower_bound_in_imm != kMaxSequenceNumber &&
+ lower_bound_in_imm < lower_bound_seq) {
+ *found_record_for_key = false;
+ return Status::OK();
+ }
+
+ // Check if there is a record for this key in the immutable memtables
+ sv->imm->GetFromHistory(lkey, /*value=*/nullptr, /*columns=*/nullptr,
+ timestamp, &s, &merge_context,
+ &max_covering_tombstone_seq, seq, read_options,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading memtable.
+ ROCKS_LOG_ERROR(
+ immutable_db_options_.info_log,
+ "Unexpected status returned from MemTableList::GetFromHistory: %s\n",
+ s.ToString().c_str());
+
+ return s;
+ }
+
+ assert(!ts_sz ||
+ (*seq != kMaxSequenceNumber &&
+ *timestamp != std::string(ts_sz, '\xff')) ||
+ (*seq == kMaxSequenceNumber && timestamp->empty()));
+
+ if (*seq != kMaxSequenceNumber) {
+ // Found a sequence number, no need to check SST files
+ assert(0 == ts_sz || *timestamp != std::string(ts_sz, '\xff'));
+ *found_record_for_key = true;
+ return Status::OK();
+ }
+
+ // We could do a sv->imm->GetEarliestSequenceNumber(/*include_history*/ true)
+ // check here to skip the history if possible. But currently the caller
+ // already does that. Maybe we should move the logic here later.
+
+ // TODO(agiardullo): possible optimization: consider checking cached
+ // SST files if cache_only=true?
+ if (!cache_only) {
+ // Check tables
+ PinnedIteratorsManager pinned_iters_mgr;
+ sv->current->Get(read_options, lkey, /*value=*/nullptr, /*columns=*/nullptr,
+ timestamp, &s, &merge_context, &max_covering_tombstone_seq,
+ &pinned_iters_mgr, nullptr /* value_found */,
+ found_record_for_key, seq, nullptr /*read_callback*/,
+ is_blob_index);
+
+ if (!(s.ok() || s.IsNotFound() || s.IsMergeInProgress())) {
+ // unexpected error reading SST files
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Unexpected status returned from Version::Get: %s\n",
+ s.ToString().c_str());
+ }
+ }
+
+ return s;
+}
+
+Status DBImpl::IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& ingestion_options) {
+ IngestExternalFileArg arg;
+ arg.column_family = column_family;
+ arg.external_files = external_files;
+ arg.options = ingestion_options;
+ return IngestExternalFiles({arg});
+}
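+
+// Illustrative usage sketch (comment only, not part of the implementation):
+// a typical caller-side use of the single-column-family entry point above.
+// The file path and option values are hypothetical.
+//
+//   IngestExternalFileOptions ifo;
+//   ifo.move_files = true;  // link/move the SST file instead of copying it
+//   Status s = db->IngestExternalFile(cf_handle, {"/tmp/bulk/000001.sst"},
+//                                     ifo);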
+
+Status DBImpl::IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) {
+ if (args.empty()) {
+ return Status::InvalidArgument("ingestion arg list is empty");
+ }
+ {
+ std::unordered_set<ColumnFamilyHandle*> unique_cfhs;
+ for (const auto& arg : args) {
+ if (arg.column_family == nullptr) {
+ return Status::InvalidArgument("column family handle is null");
+ } else if (unique_cfhs.count(arg.column_family) > 0) {
+ return Status::InvalidArgument(
+ "ingestion args have duplicate column families");
+ }
+ unique_cfhs.insert(arg.column_family);
+ }
+ }
+ // Ingest multiple external SST files atomically.
+ const size_t num_cfs = args.size();
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (args[i].external_files.empty()) {
+ char err_msg[128] = {0};
+ snprintf(err_msg, 128, "external_files[%zu] is empty", i);
+ return Status::InvalidArgument(err_msg);
+ }
+ }
+ for (const auto& arg : args) {
+ const IngestExternalFileOptions& ingest_opts = arg.options;
+ if (ingest_opts.ingest_behind &&
+ !immutable_db_options_.allow_ingest_behind) {
+ return Status::InvalidArgument(
+ "can't ingest_behind file in DB with allow_ingest_behind=false");
+ }
+ }
+
+ // TODO (yanqin) maybe handle the case in which column_families have
+ // duplicates
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+ size_t total = 0;
+ for (const auto& arg : args) {
+ total += arg.external_files.size();
+ }
+ uint64_t next_file_number = 0;
+ Status status = ReserveFileNumbersBeforeIngestion(
+ static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd(), total,
+ pending_output_elem, &next_file_number);
+ if (!status.ok()) {
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ return status;
+ }
+
+ std::vector<ExternalSstFileIngestionJob> ingestion_jobs;
+ for (const auto& arg : args) {
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(arg.column_family)->cfd();
+ ingestion_jobs.emplace_back(versions_.get(), cfd, immutable_db_options_,
+ file_options_, &snapshots_, arg.options,
+ &directories_, &event_logger_, io_tracer_);
+ }
+
+ // TODO(yanqin) maybe make jobs run in parallel
+ uint64_t start_file_number = next_file_number;
+ for (size_t i = 1; i != num_cfs; ++i) {
+ start_file_number += args[i - 1].external_files.size();
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ Status es = ingestion_jobs[i].Prepare(
+ args[i].external_files, args[i].files_checksums,
+ args[i].files_checksum_func_names, args[i].file_temperature,
+ start_file_number, super_version);
+ // capture first error only
+ if (!es.ok() && status.ok()) {
+ status = es;
+ }
+ CleanupSuperVersion(super_version);
+ }
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0");
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1");
+ {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[0].column_family)->cfd();
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ Status es = ingestion_jobs[0].Prepare(
+ args[0].external_files, args[0].files_checksums,
+ args[0].files_checksum_func_names, args[0].file_temperature,
+ next_file_number, super_version);
+ if (!es.ok()) {
+ status = es;
+ }
+ CleanupSuperVersion(super_version);
+ }
+ if (!status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ ingestion_jobs[i].Cleanup(status);
+ }
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ return status;
+ }
+
+ std::vector<SuperVersionContext> sv_ctxs;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ sv_ctxs.emplace_back(true /* create_superversion */);
+ }
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:0");
+ TEST_SYNC_POINT("DBImpl::IngestExternalFiles:BeforeJobsRun:1");
+ TEST_SYNC_POINT("DBImpl::AddFile:Start");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ TEST_SYNC_POINT("DBImpl::AddFile:MutexLock");
+
+ // Stop writes to the DB by entering both write threads
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+    // When unordered_write is enabled, keys are written to the memtable in an
+    // unordered way. If the ingestion job checks the memtable key range before
+    // the keys land in the memtable, the ingestion job may skip the necessary
+    // memtable flush.
+    // So wait here to ensure there is no pending write to the memtable.
+ WaitForPendingWrites();
+
+ num_running_ingest_file_ += static_cast<int>(num_cfs);
+ TEST_SYNC_POINT("DBImpl::IngestExternalFile:AfterIncIngestFileCounter");
+
+ bool at_least_one_cf_need_flush = false;
+ std::vector<bool> need_flush(num_cfs, false);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (cfd->IsDropped()) {
+ // TODO (yanqin) investigate whether we should abort ingestion or
+ // proceed with other non-dropped column families.
+ status = Status::InvalidArgument(
+ "cannot ingest an external file into a dropped CF");
+ break;
+ }
+ bool tmp = false;
+ status = ingestion_jobs[i].NeedsFlush(&tmp, cfd->GetSuperVersion());
+ need_flush[i] = tmp;
+ at_least_one_cf_need_flush = (at_least_one_cf_need_flush || tmp);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::IngestExternalFile:NeedFlush",
+ &at_least_one_cf_need_flush);
+
+ if (status.ok() && at_least_one_cf_need_flush) {
+ FlushOptions flush_opts;
+ flush_opts.allow_write_stall = true;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds_to_flush;
+ SelectColumnFamiliesForAtomicFlush(&cfds_to_flush);
+ mutex_.Unlock();
+ status = AtomicFlushMemTables(cfds_to_flush, flush_opts,
+ FlushReason::kExternalFileIngestion,
+ true /* entered_write_thread */);
+ mutex_.Lock();
+ } else {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ if (need_flush[i]) {
+ mutex_.Unlock();
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)
+ ->cfd();
+ status = FlushMemTable(cfd, flush_opts,
+ FlushReason::kExternalFileIngestion,
+ true /* entered_write_thread */);
+ mutex_.Lock();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ // Run ingestion jobs.
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ status = ingestion_jobs[i].Run();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ }
+ if (status.ok()) {
+ autovector<ColumnFamilyData*> cfds_to_commit;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ uint32_t num_entries = 0;
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ cfds_to_commit.push_back(cfd);
+ mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+ autovector<VersionEdit*> edit_list;
+ edit_list.push_back(ingestion_jobs[i].edit());
+ edit_lists.push_back(edit_list);
+ ++num_entries;
+ }
+ // Mark the version edits as an atomic group if the number of version
+ // edits exceeds 1.
+ if (cfds_to_commit.size() > 1) {
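+        // MarkAtomicGroup records, for each edit, how many edits of the group
+        // are still to come; the last edit records 0, which is how recovery
+        // detects that a complete atomic group has been read.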
+ for (auto& edits : edit_lists) {
+ assert(edits.size() == 1);
+ edits[0]->MarkAtomicGroup(--num_entries);
+ }
+ assert(0 == num_entries);
+ }
+ status =
+ versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list,
+ edit_lists, &mutex_, directories_.GetDbDir());
+      // It is safe to update the VersionSet last seqno here after LogAndApply
+      // since LogAndApply persists the last sequence number from the
+      // VersionEdits, which is taken from the files' largest seqnos and not
+      // from the VersionSet.
+ //
+ // It is necessary to update last seqno here since LogAndApply releases
+ // mutex when persisting MANIFEST file, and the snapshots taken during
+ // that period will not be stable if VersionSet last seqno is updated
+ // before LogAndApply.
+ int consumed_seqno_count =
+ ingestion_jobs[0].ConsumedSequenceNumbersCount();
+ for (size_t i = 1; i != num_cfs; ++i) {
+ consumed_seqno_count =
+ std::max(consumed_seqno_count,
+ ingestion_jobs[i].ConsumedSequenceNumbersCount());
+ }
+ if (consumed_seqno_count > 0) {
+ const SequenceNumber last_seqno = versions_->LastSequence();
+ versions_->SetLastAllocatedSequence(last_seqno + consumed_seqno_count);
+ versions_->SetLastPublishedSequence(last_seqno + consumed_seqno_count);
+ versions_->SetLastSequence(last_seqno + consumed_seqno_count);
+ }
+ }
+
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (!cfd->IsDropped()) {
+ InstallSuperVersionAndScheduleWork(cfd, &sv_ctxs[i],
+ *cfd->GetLatestMutableCFOptions());
+#ifndef NDEBUG
+ if (0 == i && num_cfs > 1) {
+ TEST_SYNC_POINT(
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:0");
+ TEST_SYNC_POINT(
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1");
+ }
+#endif // !NDEBUG
+ }
+ }
+ } else if (versions_->io_status().IsIOError()) {
+ // Error while writing to MANIFEST.
+ // In fact, versions_->io_status() can also be the result of renaming
+ // CURRENT file. With current code, it's just difficult to tell. So just
+      // be pessimistic and try writing to a new MANIFEST.
+ // TODO: distinguish between MANIFEST write and CURRENT renaming
+ const IOStatus& io_s = versions_->io_status();
+ // Should handle return error?
+ error_handler_.SetBGError(io_s, BackgroundErrorReason::kManifestWrite);
+ }
+
+ // Resume writes to the DB
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ write_thread_.ExitUnbatched(&w);
+
+ if (status.ok()) {
+ for (auto& job : ingestion_jobs) {
+ job.UpdateStats();
+ }
+ }
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ num_running_ingest_file_ -= static_cast<int>(num_cfs);
+ if (0 == num_running_ingest_file_) {
+ bg_cv_.SignalAll();
+ }
+ TEST_SYNC_POINT("DBImpl::AddFile:MutexUnlock");
+ }
+ // mutex_ is unlocked here
+
+ // Cleanup
+ for (size_t i = 0; i != num_cfs; ++i) {
+ sv_ctxs[i].Clean();
+    // This may roll back jobs that have completed successfully. This is
+    // intended for atomicity.
+ ingestion_jobs[i].Cleanup(status);
+ }
+ if (status.ok()) {
+ for (size_t i = 0; i != num_cfs; ++i) {
+ auto* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(args[i].column_family)->cfd();
+ if (!cfd->IsDropped()) {
+ NotifyOnExternalFileIngested(cfd, ingestion_jobs[i]);
+ }
+ }
+ }
+ return status;
+}
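+
+// Illustrative usage sketch (comment only, not part of the implementation):
+// atomic ingestion into multiple column families via the batched entry point
+// above. Column family handles and paths are hypothetical.
+//
+//   IngestExternalFileArg arg1, arg2;
+//   arg1.column_family = cf1;
+//   arg1.external_files = {"/tmp/bulk/cf1.sst"};
+//   arg2.column_family = cf2;
+//   arg2.external_files = {"/tmp/bulk/cf2.sst"};
+//   Status s = db->IngestExternalFiles({arg1, arg2});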
+
+Status DBImpl::CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) {
+ assert(handle != nullptr);
+ assert(*handle == nullptr);
+ std::string cf_comparator_name = options.comparator->Name();
+ if (cf_comparator_name != metadata.db_comparator_name) {
+ return Status::InvalidArgument("Comparator name mismatch");
+ }
+
+ // Create column family.
+ auto status = CreateColumnFamily(options, column_family_name, handle);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Import sst files from metadata.
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(*handle);
+ auto cfd = cfh->cfd();
+ ImportColumnFamilyJob import_job(versions_.get(), cfd, immutable_db_options_,
+ file_options_, import_options,
+ metadata.files, io_tracer_);
+
+ SuperVersionContext dummy_sv_ctx(/* create_superversion */ true);
+ VersionEdit dummy_edit;
+ uint64_t next_file_number = 0;
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_output_elem;
+ {
+ // Lock db mutex
+ InstrumentedMutexLock l(&mutex_);
+ if (error_handler_.IsDBStopped()) {
+ // Don't import files when there is a bg_error
+ status = error_handler_.GetBGError();
+ }
+
+    // Make sure that bg cleanup won't delete the files that we are importing.
+ pending_output_elem.reset(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ if (status.ok()) {
+      // If a crash happens after a hard link is established, the Recover
+      // function may reuse a file number that has already been assigned to the
+      // internal file, and this will overwrite the external file. To protect
+      // the external file, we have to make sure the file number will never be
+      // reused.
+ next_file_number = versions_->FetchAddFileNumber(metadata.files.size());
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+ }
+ }
+ }
+ dummy_sv_ctx.Clean();
+
+ if (status.ok()) {
+ SuperVersion* sv = cfd->GetReferencedSuperVersion(this);
+ status = import_job.Prepare(next_file_number, sv);
+ CleanupSuperVersion(sv);
+ }
+
+ if (status.ok()) {
+ SuperVersionContext sv_context(true /*create_superversion*/);
+ {
+ // Lock db mutex
+ InstrumentedMutexLock l(&mutex_);
+
+ // Stop writes to the DB by entering both write threads
+ WriteThread::Writer w;
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ num_running_ingest_file_++;
+ assert(!cfd->IsDropped());
+ status = import_job.Run();
+
+ // Install job edit [Mutex will be unlocked here]
+ if (status.ok()) {
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(),
+ &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options);
+ }
+ }
+
+ // Resume writes to the DB
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ write_thread_.ExitUnbatched(&w);
+
+ num_running_ingest_file_--;
+ if (num_running_ingest_file_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ }
+ // mutex_ is unlocked here
+
+ sv_context.Clean();
+ }
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ ReleaseFileNumberFromPendingOutputs(pending_output_elem);
+ }
+
+ import_job.Cleanup(status);
+ if (!status.ok()) {
+ Status temp_s = DropColumnFamily(*handle);
+ if (!temp_s.ok()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "DropColumnFamily failed with error %s",
+ temp_s.ToString().c_str());
+ }
+ // Always returns Status::OK()
+ temp_s = DestroyColumnFamilyHandle(*handle);
+ assert(temp_s.ok());
+ *handle = nullptr;
+ }
+ return status;
+}
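+
+// Illustrative usage sketch (comment only, not part of the implementation):
+// the ExportImportFilesMetaData consumed above is typically produced by
+// Checkpoint::ExportColumnFamily. The export directory and column family
+// name are hypothetical.
+//
+//   Checkpoint* checkpoint = nullptr;
+//   Status s = Checkpoint::Create(db, &checkpoint);
+//   ExportImportFilesMetaData* metadata = nullptr;
+//   if (s.ok()) {
+//     s = checkpoint->ExportColumnFamily(src_cf, "/tmp/cf_export", &metadata);
+//   }
+//   ColumnFamilyHandle* imported = nullptr;
+//   if (s.ok()) {
+//     s = db->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "imported",
+//                                          ImportColumnFamilyOptions(),
+//                                          *metadata, &imported);
+//   }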
+
+Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) {
+ return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true);
+}
+
+Status DBImpl::VerifyChecksum(const ReadOptions& read_options) {
+ return VerifyChecksumInternal(read_options, /*use_file_checksum=*/false);
+}
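+
+// Illustrative usage sketch (comment only, not part of the implementation):
+// both entry points above take a ReadOptions that controls verification I/O,
+// e.g. the readahead size. The values below are hypothetical.
+//
+//   ReadOptions ro;
+//   ro.readahead_size = 2 * 1024 * 1024;
+//   Status s1 = db->VerifyFileChecksums(ro);  // compare full-file checksums
+//   Status s2 = db->VerifyChecksum(ro);       // verify per-block checksums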
+
+Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options,
+ bool use_file_checksum) {
+ // `bytes_read` stat is enabled based on compile-time support and cannot
+ // be dynamically toggled. So we do not need to worry about `PerfLevel`
+ // here, unlike many other `IOStatsContext` / `PerfContext` stats.
+ uint64_t prev_bytes_read = IOSTATS(bytes_read);
+
+ Status s;
+
+ if (use_file_checksum) {
+ FileChecksumGenFactory* const file_checksum_gen_factory =
+ immutable_db_options_.file_checksum_gen_factory.get();
+ if (!file_checksum_gen_factory) {
+ s = Status::InvalidArgument(
+ "Cannot verify file checksum if options.file_checksum_gen_factory is "
+ "null");
+ return s;
+ }
+ }
+
+ // TODO: simplify using GetRefedColumnFamilySet?
+ std::vector<ColumnFamilyData*> cfd_list;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped() && cfd->initialized()) {
+ cfd->Ref();
+ cfd_list.push_back(cfd);
+ }
+ }
+ }
+ std::vector<SuperVersion*> sv_list;
+ for (auto cfd : cfd_list) {
+ sv_list.push_back(cfd->GetReferencedSuperVersion(this));
+ }
+
+ for (auto& sv : sv_list) {
+ VersionStorageInfo* vstorage = sv->current->storage_info();
+ ColumnFamilyData* cfd = sv->current->cfd();
+ Options opts;
+ if (!use_file_checksum) {
+ InstrumentedMutexLock l(&mutex_);
+ opts = Options(BuildDBOptions(immutable_db_options_, mutable_db_options_),
+ cfd->GetLatestCFOptions());
+ }
+ for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) {
+ for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok();
+ j++) {
+ const auto& fd_with_krange = vstorage->LevelFilesBrief(i).files[j];
+ const auto& fd = fd_with_krange.fd;
+ const FileMetaData* fmeta = fd_with_krange.file_metadata;
+ assert(fmeta);
+ std::string fname = TableFileName(cfd->ioptions()->cf_paths,
+ fd.GetNumber(), fd.GetPathId());
+ if (use_file_checksum) {
+ s = VerifyFullFileChecksum(fmeta->file_checksum,
+ fmeta->file_checksum_func_name, fname,
+ read_options);
+ } else {
+ s = ROCKSDB_NAMESPACE::VerifySstFileChecksum(
+ opts, file_options_, read_options, fname, fd.largest_seqno);
+ }
+ RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+ IOSTATS(bytes_read) - prev_bytes_read);
+ prev_bytes_read = IOSTATS(bytes_read);
+ }
+ }
+
+ if (s.ok() && use_file_checksum) {
+ const auto& blob_files = vstorage->GetBlobFiles();
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ const uint64_t blob_file_number = meta->GetBlobFileNumber();
+
+ const std::string blob_file_name = BlobFileName(
+ cfd->ioptions()->cf_paths.front().path, blob_file_number);
+ s = VerifyFullFileChecksum(meta->GetChecksumValue(),
+ meta->GetChecksumMethod(), blob_file_name,
+ read_options);
+ RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+ IOSTATS(bytes_read) - prev_bytes_read);
+ prev_bytes_read = IOSTATS(bytes_read);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ bool defer_purge = immutable_db_options().avoid_unnecessary_blocking_io;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto sv : sv_list) {
+ if (sv && sv->Unref()) {
+ sv->Cleanup();
+ if (defer_purge) {
+ AddSuperVersionsToFreeQueue(sv);
+ } else {
+ delete sv;
+ }
+ }
+ }
+ if (defer_purge) {
+ SchedulePurge();
+ }
+ for (auto cfd : cfd_list) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ RecordTick(stats_, VERIFY_CHECKSUM_READ_BYTES,
+ IOSTATS(bytes_read) - prev_bytes_read);
+ return s;
+}
+
+Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected,
+ const std::string& func_name_expected,
+ const std::string& fname,
+ const ReadOptions& read_options) {
+ Status s;
+ if (file_checksum_expected == kUnknownFileChecksum) {
+ return s;
+ }
+ std::string file_checksum;
+ std::string func_name;
+ s = ROCKSDB_NAMESPACE::GenerateOneFileChecksum(
+ fs_.get(), fname, immutable_db_options_.file_checksum_gen_factory.get(),
+ func_name_expected, &file_checksum, &func_name,
+ read_options.readahead_size, immutable_db_options_.allow_mmap_reads,
+ io_tracer_, immutable_db_options_.rate_limiter.get(),
+ read_options.rate_limiter_priority);
+ if (s.ok()) {
+ assert(func_name_expected == func_name);
+ if (file_checksum != file_checksum_expected) {
+ std::ostringstream oss;
+ oss << fname << " file checksum mismatch, ";
+ oss << "expecting "
+ << Slice(file_checksum_expected).ToString(/*hex=*/true);
+ oss << ", but actual " << Slice(file_checksum).ToString(/*hex=*/true);
+ s = Status::Corruption(oss.str());
+ TEST_SYNC_POINT_CALLBACK("DBImpl::VerifyFullFileChecksum:mismatch", &s);
+ }
+ }
+ return s;
+}
+
+void DBImpl::NotifyOnExternalFileIngested(
+ ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) {
+ if (immutable_db_options_.listeners.empty()) {
+ return;
+ }
+
+ for (const IngestedFileInfo& f : ingestion_job.files_to_ingest()) {
+ ExternalFileIngestionInfo info;
+ info.cf_name = cfd->GetName();
+ info.external_file_path = f.external_file_path;
+ info.internal_file_path = f.internal_file_path;
+ info.global_seqno = f.assigned_seqno;
+ info.table_properties = f.table_properties;
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnExternalFileIngested(this, info);
+ }
+ }
+}
+
+void DBImpl::WaitForIngestFile() {
+ mutex_.AssertHeld();
+ while (num_running_ingest_file_ > 0) {
+ bg_cv_.Wait();
+ }
+}
+
+Status DBImpl::StartTrace(const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ tracer_.reset(new Tracer(immutable_db_options_.clock, trace_options,
+ std::move(trace_writer)));
+ return Status::OK();
+}
+
+Status DBImpl::EndTrace() {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ Status s;
+ if (tracer_ != nullptr) {
+ s = tracer_->Close();
+ tracer_.reset();
+ } else {
+ s = Status::IOError("No trace file to close");
+ }
+ return s;
+}
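+
+// Illustrative usage sketch (comment only, not part of the implementation):
+// a trace session driven through the public DB API with a file-backed
+// TraceWriter. The trace file path is hypothetical.
+//
+//   std::unique_ptr<TraceWriter> trace_writer;
+//   Status s = NewFileTraceWriter(db->GetEnv(), EnvOptions(),
+//                                 "/tmp/rocksdb.trace", &trace_writer);
+//   if (s.ok()) {
+//     s = db->StartTrace(TraceOptions(), std::move(trace_writer));
+//   }
+//   // ... run the workload to be traced ...
+//   if (s.ok()) {
+//     s = db->EndTrace();
+//   }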
+
+Status DBImpl::NewDefaultReplayer(
+ const std::vector<ColumnFamilyHandle*>& handles,
+ std::unique_ptr<TraceReader>&& reader,
+ std::unique_ptr<Replayer>* replayer) {
+ replayer->reset(new ReplayerImpl(this, handles, std::move(reader)));
+ return Status::OK();
+}
+
+Status DBImpl::StartBlockCacheTrace(
+ const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) {
+ BlockCacheTraceOptions block_trace_opts;
+ block_trace_opts.sampling_frequency = trace_options.sampling_frequency;
+
+ BlockCacheTraceWriterOptions trace_writer_opt;
+ trace_writer_opt.max_trace_file_size = trace_options.max_trace_file_size;
+
+ std::unique_ptr<BlockCacheTraceWriter> block_cache_trace_writer =
+ NewBlockCacheTraceWriter(env_->GetSystemClock().get(), trace_writer_opt,
+ std::move(trace_writer));
+
+ return block_cache_tracer_.StartTrace(block_trace_opts,
+ std::move(block_cache_trace_writer));
+}
+
+Status DBImpl::StartBlockCacheTrace(
+ const BlockCacheTraceOptions& trace_options,
+ std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) {
+ return block_cache_tracer_.StartTrace(trace_options, std::move(trace_writer));
+}
+
+Status DBImpl::EndBlockCacheTrace() {
+ block_cache_tracer_.EndTrace();
+ return Status::OK();
+}
+
+Status DBImpl::TraceIteratorSeek(const uint32_t& cf_id, const Slice& key,
+ const Slice& lower_bound,
+ const Slice upper_bound) {
+ Status s;
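+  // tracer_ is checked twice: once without the mutex to keep the common
+  // (tracing disabled) path cheap, and again under trace_mutex_ because
+  // EndTrace() may reset tracer_ concurrently.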
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ s = tracer_->IteratorSeek(cf_id, key, lower_bound, upper_bound);
+ }
+ }
+ return s;
+}
+
+Status DBImpl::TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
+ const Slice& lower_bound,
+ const Slice upper_bound) {
+ Status s;
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ s = tracer_->IteratorSeekForPrev(cf_id, key, lower_bound, upper_bound);
+ }
+ }
+ return s;
+}
+
+Status DBImpl::ReserveFileNumbersBeforeIngestion(
+ ColumnFamilyData* cfd, uint64_t num,
+ std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+ uint64_t* next_file_number) {
+ Status s;
+ SuperVersionContext dummy_sv_ctx(true /* create_superversion */);
+ assert(nullptr != next_file_number);
+ InstrumentedMutexLock l(&mutex_);
+ if (error_handler_.IsDBStopped()) {
+ // Do not ingest files when there is a bg_error
+ return error_handler_.GetBGError();
+ }
+ pending_output_elem.reset(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ *next_file_number = versions_->FetchAddFileNumber(static_cast<uint64_t>(num));
+ auto cf_options = cfd->GetLatestMutableCFOptions();
+ VersionEdit dummy_edit;
+  // If a crash happens after a hard link is established, the Recover function
+  // may reuse a file number that has already been assigned to the internal
+  // file, and this will overwrite the external file. To protect the external
+  // file, we have to make sure the file number will never be reused.
+ s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_,
+ directories_.GetDbDir());
+ if (s.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options);
+ }
+ dummy_sv_ctx.Clean();
+ return s;
+}
+
+Status DBImpl::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+ if (mutable_db_options_.max_open_files == -1) {
+ uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (!cfd->IsDropped()) {
+ uint64_t ctime;
+ {
+ SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ Version* version = sv->current;
+ version->GetCreationTimeOfOldestFile(&ctime);
+ ReturnAndCleanupSuperVersion(cfd, sv);
+ }
+
+ if (ctime < oldest_time) {
+ oldest_time = ctime;
+ }
+ if (oldest_time == 0) {
+ break;
+ }
+ }
+ }
+ *creation_time = oldest_time;
+ return Status::OK();
+ } else {
+ return Status::NotSupported("This API only works if max_open_files = -1");
+ }
+}
+
+void DBImpl::RecordSeqnoToTimeMapping() {
+  // Get the time first and then the sequence number, so the actual time of the
+  // seqno is <= the recorded unix_time.
+ int64_t unix_time = 0;
+ immutable_db_options_.clock->GetCurrentTime(&unix_time)
+ .PermitUncheckedError(); // Ignore error
+ SequenceNumber seqno = GetLatestSequenceNumber();
+ bool appended = false;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ appended = seqno_time_mapping_.Append(seqno, unix_time);
+ }
+ if (!appended) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Failed to insert sequence number to time entry: %" PRIu64
+ " -> %" PRIu64,
+ seqno, unix_time);
+ }
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl.h b/src/rocksdb/db/db_impl/db_impl.h
new file mode 100644
index 000000000..725e77c18
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl.h
@@ -0,0 +1,2804 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/compaction/compaction_iterator.h"
+#include "db/compaction/compaction_job.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/flush_job.h"
+#include "db/flush_scheduler.h"
+#include "db/import_column_family_job.h"
+#include "db/internal_stats.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/periodic_task_scheduler.h"
+#include "db/post_memtable_callback.h"
+#include "db/pre_release_callback.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/snapshot_checker.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/version_edit.h"
+#include "db/wal_manager.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/status.h"
+#ifndef ROCKSDB_LITE
+#include "rocksdb/trace_reader_writer.h"
+#endif // ROCKSDB_LITE
+#include "rocksdb/transaction_log.h"
+#ifndef ROCKSDB_LITE
+#include "rocksdb/utilities/replayer.h"
+#endif // ROCKSDB_LITE
+#include "rocksdb/write_buffer_manager.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/hash.h"
+#include "util/repeatable_thread.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Arena;
+class ArenaWrappedDBIter;
+class InMemoryStatsHistoryIterator;
+class MemTable;
+class PersistentStatsHistoryIterator;
+class TableCache;
+class TaskLimiterToken;
+class Version;
+class VersionEdit;
+class VersionSet;
+class WriteCallback;
+struct JobContext;
+struct ExternalSstFileInfo;
+struct MemTableInfo;
+
+// Class to maintain directories for all database paths other than the main
+// one.
+class Directories {
+ public:
+ IOStatus SetDirectories(FileSystem* fs, const std::string& dbname,
+ const std::string& wal_dir,
+ const std::vector<DbPath>& data_paths);
+
+ FSDirectory* GetDataDir(size_t path_id) const {
+ assert(path_id < data_dirs_.size());
+ FSDirectory* ret_dir = data_dirs_[path_id].get();
+ if (ret_dir == nullptr) {
+ // Should use db_dir_
+ return db_dir_.get();
+ }
+ return ret_dir;
+ }
+
+ FSDirectory* GetWalDir() {
+ if (wal_dir_) {
+ return wal_dir_.get();
+ }
+ return db_dir_.get();
+ }
+
+ FSDirectory* GetDbDir() { return db_dir_.get(); }
+
+ IOStatus Close(const IOOptions& options, IODebugContext* dbg) {
+ // close all directories for all database paths
+ IOStatus s = IOStatus::OK();
+
+    // The default implementation of Close() in the Directory/FSDirectory
+    // class returns a "NotSupported" status; the upper level interface should
+    // be able to handle this error so that Close() does not fail after
+    // upgrading when run on FileSystems that have not implemented
+    // `Directory::Close()` or `FSDirectory::Close()` yet.
+
+ if (db_dir_) {
+ IOStatus temp_s = db_dir_->Close(options, dbg);
+ if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+ s = std::move(temp_s);
+ }
+ }
+
+ // Attempt to close everything even if one fails
+ s.PermitUncheckedError();
+
+ if (wal_dir_) {
+ IOStatus temp_s = wal_dir_->Close(options, dbg);
+ if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+ s = std::move(temp_s);
+ }
+ }
+
+ s.PermitUncheckedError();
+
+ for (auto& data_dir_ptr : data_dirs_) {
+ if (data_dir_ptr) {
+ IOStatus temp_s = data_dir_ptr->Close(options, dbg);
+ if (!temp_s.ok() && !temp_s.IsNotSupported() && s.ok()) {
+ s = std::move(temp_s);
+ }
+ }
+ }
+
+ // Ready for caller
+ s.MustCheck();
+ return s;
+ }
+
+ private:
+ std::unique_ptr<FSDirectory> db_dir_;
+ std::vector<std::unique_ptr<FSDirectory>> data_dirs_;
+ std::unique_ptr<FSDirectory> wal_dir_;
+};
+
+// While DB is the public interface of RocksDB, DBImpl is the actual
+// class implementing it. It's the entry point of the core RocksDB engine.
+// All other DB implementations, e.g. TransactionDB, BlobDB, etc, wrap a
+// DBImpl internally.
+// Other than functions implementing the DB interface, some public
+// functions are there for other internal components to call. For
+// example, TransactionDB directly calls DBImpl::WriteImpl() and
+// BlobDB directly calls DBImpl::GetImpl(). Some other functions
+// are for sub-components to call. For example, ColumnFamilyHandleImpl
+// calls DBImpl::FindObsoleteFiles().
+//
+// Since it's a very large class, the definitions of its functions are
+// split across several db_impl_*.cc files, besides db_impl.cc.
+class DBImpl : public DB {
+ public:
+ DBImpl(const DBOptions& options, const std::string& dbname,
+ const bool seq_per_batch = false, const bool batch_per_txn = true,
+ bool read_only = false);
+ // No copying allowed
+ DBImpl(const DBImpl&) = delete;
+ void operator=(const DBImpl&) = delete;
+
+ virtual ~DBImpl();
+
+ // ---- Implementations of the DB interface ----
+
+ using DB::Resume;
+ Status Resume() override;
+
+ using DB::Put;
+ Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) override;
+ Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& value) override;
+
+ using DB::PutEntity;
+ Status PutEntity(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const WideColumns& columns) override;
+
+ using DB::Merge;
+ Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) override;
+ Status Merge(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& value) override;
+
+ using DB::Delete;
+ Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts) override;
+
+ using DB::SingleDelete;
+ Status SingleDelete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) override;
+ Status SingleDelete(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) override;
+
+ using DB::DeleteRange;
+ Status DeleteRange(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& begin_key,
+ const Slice& end_key) override;
+ Status DeleteRange(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& begin_key,
+ const Slice& end_key, const Slice& ts) override;
+
+ using DB::Write;
+ virtual Status Write(const WriteOptions& options,
+ WriteBatch* updates) override;
+
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, std::string* timestamp) override;
+
+ using DB::GetEntity;
+ Status GetEntity(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableWideColumns* columns) override;
+
+ using DB::GetMergeOperands;
+ Status GetMergeOperands(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* merge_operands,
+ GetMergeOperandsOptions* get_merge_operands_options,
+ int* number_of_operands) override {
+ GetImplOptions get_impl_options;
+ get_impl_options.column_family = column_family;
+ get_impl_options.merge_operands = merge_operands;
+ get_impl_options.get_merge_operands_options = get_merge_operands_options;
+ get_impl_options.number_of_operands = number_of_operands;
+ get_impl_options.get_value = false;
+ return GetImpl(options, key, get_impl_options);
+ }
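+
+  // Illustrative usage sketch (comment only, not part of the implementation):
+  // a typical caller-side use of GetMergeOperands. The operand count is
+  // hypothetical.
+  //
+  //   GetMergeOperandsOptions gmo_options;
+  //   gmo_options.expected_max_number_of_operands = 16;
+  //   std::vector<PinnableSlice> operands(
+  //       gmo_options.expected_max_number_of_operands);
+  //   int number_of_operands = 0;
+  //   Status s = db->GetMergeOperands(ReadOptions(), cf_handle, "key",
+  //                                   operands.data(), &gmo_options,
+  //                                   &number_of_operands);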
+
+ using DB::MultiGet;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* values) override;
+ virtual std::vector<Status> MultiGet(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_family,
+ const std::vector<Slice>& keys, std::vector<std::string>* values,
+ std::vector<std::string>* timestamps) override;
+
+  // This MultiGet is a batched version, which may be faster than calling Get
+  // multiple times, especially if the keys have some spatial locality that
+  // enables them to be queried in the same SST files/set of files. The larger
+  // the batch size, the more scope for batching and performance improvement.
+  // The values and statuses parameters are arrays with the number of elements
+  // equal to the number of keys. This allows the storage for those to be
+  // allocated by the caller on the stack for small batches.
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
+ virtual void MultiGet(const ReadOptions& options,
+ ColumnFamilyHandle* column_family,
+ const size_t num_keys, const Slice* keys,
+ PinnableSlice* values, std::string* timestamps,
+ Status* statuses,
+ const bool sorted_input = false) override;
+
+ virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, Status* statuses,
+ const bool sorted_input = false) override;
+ virtual void MultiGet(const ReadOptions& options, const size_t num_keys,
+ ColumnFamilyHandle** column_families, const Slice* keys,
+ PinnableSlice* values, std::string* timestamps,
+ Status* statuses,
+ const bool sorted_input = false) override;
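+
+  // Illustrative usage sketch (comment only, not part of the implementation):
+  // the batched MultiGet overloads above take parallel arrays, so for small
+  // batches the caller can keep everything on the stack. Keys are
+  // hypothetical.
+  //
+  //   std::array<Slice, 3> keys{{"k1", "k2", "k3"}};
+  //   std::array<PinnableSlice, 3> values;
+  //   std::array<Status, 3> statuses;
+  //   db->MultiGet(ReadOptions(), cf_handle, keys.size(), keys.data(),
+  //                values.data(), statuses.data());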
+
+ virtual void MultiGetWithCallback(
+ const ReadOptions& options, ColumnFamilyHandle* column_family,
+ ReadCallback* callback,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys);
+
+ virtual Status CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const std::string& column_family,
+ ColumnFamilyHandle** handle) override;
+ virtual Status CreateColumnFamilies(
+ const ColumnFamilyOptions& cf_options,
+ const std::vector<std::string>& column_family_names,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+ virtual Status CreateColumnFamilies(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles) override;
+ virtual Status DropColumnFamily(ColumnFamilyHandle* column_family) override;
+ virtual Status DropColumnFamilies(
+ const std::vector<ColumnFamilyHandle*>& column_families) override;
+
+  // Returns false if the key doesn't exist in the database and true if it
+  // may. If value_found is not passed in as null, then the value is returned
+  // if it is found in memory. On return, if the value was found, then
+  // value_found will be set to true, otherwise false.
+ using DB::KeyMayExist;
+ virtual bool KeyMayExist(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ std::string* value, std::string* timestamp,
+ bool* value_found = nullptr) override;
+
+ using DB::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* column_family) override;
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ virtual const Snapshot* GetSnapshot() override;
+ virtual void ReleaseSnapshot(const Snapshot* snapshot) override;
+ // Create a timestamped snapshot. This snapshot can be shared by multiple
+ // readers. If any of them uses it for write conflict checking, then
+ // is_write_conflict_boundary is true. For simplicity, set it to true by
+ // default.
+ std::pair<Status, std::shared_ptr<const Snapshot>> CreateTimestampedSnapshot(
+ SequenceNumber snapshot_seq, uint64_t ts);
+ std::shared_ptr<const SnapshotImpl> GetTimestampedSnapshot(uint64_t ts) const;
+ void ReleaseTimestampedSnapshotsOlderThan(
+ uint64_t ts, size_t* remaining_total_ss = nullptr);
+ Status GetTimestampedSnapshots(uint64_t ts_lb, uint64_t ts_ub,
+ std::vector<std::shared_ptr<const Snapshot>>&
+ timestamped_snapshots) const;
+
+ using DB::GetProperty;
+ virtual bool GetProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, std::string* value) override;
+ using DB::GetMapProperty;
+ virtual bool GetMapProperty(
+ ColumnFamilyHandle* column_family, const Slice& property,
+ std::map<std::string, std::string>* value) override;
+ using DB::GetIntProperty;
+ virtual bool GetIntProperty(ColumnFamilyHandle* column_family,
+ const Slice& property, uint64_t* value) override;
+ using DB::GetAggregatedIntProperty;
+ virtual bool GetAggregatedIntProperty(const Slice& property,
+ uint64_t* aggregated_value) override;
+ using DB::GetApproximateSizes;
+ virtual Status GetApproximateSizes(const SizeApproximationOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Range* range, int n,
+ uint64_t* sizes) override;
+ using DB::GetApproximateMemTableStats;
+ virtual void GetApproximateMemTableStats(ColumnFamilyHandle* column_family,
+ const Range& range,
+ uint64_t* const count,
+ uint64_t* const size) override;
+ using DB::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override;
+
+ using DB::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names, const int output_level,
+ const int output_path_id = -1,
+ std::vector<std::string>* const output_file_names = nullptr,
+ CompactionJobInfo* compaction_job_info = nullptr) override;
+
+ virtual Status PauseBackgroundWork() override;
+ virtual Status ContinueBackgroundWork() override;
+
+ virtual Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) override;
+
+ virtual void EnableManualCompaction() override;
+ virtual void DisableManualCompaction() override;
+
+ using DB::SetOptions;
+ Status SetOptions(
+ ColumnFamilyHandle* column_family,
+ const std::unordered_map<std::string, std::string>& options_map) override;
+
+ virtual Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& options_map) override;
+
+ using DB::NumberLevels;
+ virtual int NumberLevels(ColumnFamilyHandle* column_family) override;
+ using DB::MaxMemCompactionLevel;
+ virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) override;
+ using DB::Level0StopWriteTrigger;
+ virtual int Level0StopWriteTrigger(
+ ColumnFamilyHandle* column_family) override;
+ virtual const std::string& GetName() const override;
+ virtual Env* GetEnv() const override;
+ virtual FileSystem* GetFileSystem() const override;
+ using DB::GetOptions;
+ virtual Options GetOptions(ColumnFamilyHandle* column_family) const override;
+ using DB::GetDBOptions;
+ virtual DBOptions GetDBOptions() const override;
+ using DB::Flush;
+ virtual Status Flush(const FlushOptions& options,
+ ColumnFamilyHandle* column_family) override;
+ virtual Status Flush(
+ const FlushOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families) override;
+ virtual Status FlushWAL(bool sync) override;
+ bool WALBufferIsEmpty(bool lock = true);
+ virtual Status SyncWAL() override;
+ virtual Status LockWAL() override;
+ virtual Status UnlockWAL() override;
+
+ virtual SequenceNumber GetLatestSequenceNumber() const override;
+
+ // IncreaseFullHistoryTsLow(ColumnFamilyHandle*, std::string) will acquire
+ // and release db_mutex
+ Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string ts_low) override;
+
+ // GetFullHistoryTsLow(ColumnFamilyHandle*, std::string*) will acquire and
+ // release db_mutex
+ Status GetFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string* ts_low) override;
+
+ virtual Status GetDbIdentity(std::string& identity) const override;
+
+ virtual Status GetDbIdentityFromIdentityFile(std::string* identity) const;
+
+ virtual Status GetDbSessionId(std::string& session_id) const override;
+
+ ColumnFamilyHandle* DefaultColumnFamily() const override;
+
+ ColumnFamilyHandle* PersistentStatsColumnFamily() const;
+
+ virtual Status Close() override;
+
+ virtual Status DisableFileDeletions() override;
+
+ virtual Status EnableFileDeletions(bool force) override;
+
+ virtual bool IsFileDeletionsEnabled() const;
+
+ Status GetStatsHistory(
+ uint64_t start_time, uint64_t end_time,
+ std::unique_ptr<StatsHistoryIterator>* stats_iterator) override;
+
+#ifndef ROCKSDB_LITE
+ using DB::ResetStats;
+ virtual Status ResetStats() override;
+ // All the returned filenames start with "/"
+ virtual Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* manifest_file_size,
+ bool flush_memtable = true) override;
+ virtual Status GetSortedWalFiles(VectorLogPtr& files) override;
+ virtual Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* current_log_file) override;
+ virtual Status GetCreationTimeOfOldestFile(
+ uint64_t* creation_time) override;
+
+ virtual Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options =
+ TransactionLogIterator::ReadOptions()) override;
+ virtual Status DeleteFile(std::string name) override;
+ Status DeleteFilesInRanges(ColumnFamilyHandle* column_family,
+ const RangePtr* ranges, size_t n,
+ bool include_end = true);
+
+ virtual void GetLiveFilesMetaData(
+ std::vector<LiveFileMetaData>* metadata) override;
+
+ virtual Status GetLiveFilesChecksumInfo(
+ FileChecksumList* checksum_list) override;
+
+ virtual Status GetLiveFilesStorageInfo(
+ const LiveFilesStorageInfoOptions& opts,
+ std::vector<LiveFileStorageInfo>* files) override;
+
+ // Obtains the meta data of the specified column family of the DB.
+ // TODO(yhchiang): output parameter is placed in the end in this codebase.
+ virtual void GetColumnFamilyMetaData(ColumnFamilyHandle* column_family,
+ ColumnFamilyMetaData* metadata) override;
+
+ void GetAllColumnFamilyMetaData(
+ std::vector<ColumnFamilyMetaData>* metadata) override;
+
+ Status SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) override;
+
+ Status PromoteL0(ColumnFamilyHandle* column_family,
+ int target_level) override;
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& external_files,
+ const IngestExternalFileOptions& ingestion_options) override;
+
+ using DB::IngestExternalFiles;
+ virtual Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& args) override;
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& options, const std::string& column_family_name,
+ const ImportColumnFamilyOptions& import_options,
+ const ExportImportFilesMetaData& metadata,
+ ColumnFamilyHandle** handle) override;
+
+ using DB::VerifyFileChecksums;
+ Status VerifyFileChecksums(const ReadOptions& read_options) override;
+
+ using DB::VerifyChecksum;
+ virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
+ // Verify the checksums of files in db. Currently only tables are checked.
+ //
+ // read_options: controls file I/O behavior, e.g. read ahead size while
+ // reading all the live table files.
+ //
+  // use_file_checksum: if false, verify the block checksums of all live
+  //                    tables in the db. Otherwise, obtain the file checksums
+  //                    and compare
+ // with the MANIFEST. Currently, file checksums are
+ // recomputed by reading all table files.
+ //
+ // Returns: OK if there is no file whose file or block checksum mismatches.
+ Status VerifyChecksumInternal(const ReadOptions& read_options,
+ bool use_file_checksum);
+
+ Status VerifyFullFileChecksum(const std::string& file_checksum_expected,
+ const std::string& func_name_expected,
+ const std::string& fpath,
+ const ReadOptions& read_options);
+
+ using DB::StartTrace;
+ virtual Status StartTrace(
+ const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ using DB::EndTrace;
+ virtual Status EndTrace() override;
+
+ using DB::NewDefaultReplayer;
+ virtual Status NewDefaultReplayer(
+ const std::vector<ColumnFamilyHandle*>& handles,
+ std::unique_ptr<TraceReader>&& reader,
+ std::unique_ptr<Replayer>* replayer) override;
+
+ using DB::StartBlockCacheTrace;
+ Status StartBlockCacheTrace(
+ const TraceOptions& trace_options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ Status StartBlockCacheTrace(
+ const BlockCacheTraceOptions& options,
+ std::unique_ptr<BlockCacheTraceWriter>&& trace_writer) override;
+
+ using DB::EndBlockCacheTrace;
+ Status EndBlockCacheTrace() override;
+
+ using DB::StartIOTrace;
+ Status StartIOTrace(const TraceOptions& options,
+ std::unique_ptr<TraceWriter>&& trace_writer) override;
+
+ using DB::EndIOTrace;
+ Status EndIOTrace() override;
+
+ using DB::GetPropertiesOfAllTables;
+ virtual Status GetPropertiesOfAllTables(
+ ColumnFamilyHandle* column_family,
+ TablePropertiesCollection* props) override;
+ virtual Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* column_family, const Range* range, std::size_t n,
+ TablePropertiesCollection* props) override;
+
+#endif // ROCKSDB_LITE
+
+ // ---- End of implementations of the DB interface ----
+ SystemClock* GetSystemClock() const;
+
+ struct GetImplOptions {
+ ColumnFamilyHandle* column_family = nullptr;
+ PinnableSlice* value = nullptr;
+ PinnableWideColumns* columns = nullptr;
+ std::string* timestamp = nullptr;
+ bool* value_found = nullptr;
+ ReadCallback* callback = nullptr;
+ bool* is_blob_index = nullptr;
+    // If true, return the value associated with key via the value pointer;
+    // otherwise, return all merge operands for key via the merge_operands
+    // pointer.
+ bool get_value = true;
+ // Pointer to an array of size
+ // get_merge_operands_options.expected_max_number_of_operands allocated by
+ // user
+ PinnableSlice* merge_operands = nullptr;
+ GetMergeOperandsOptions* get_merge_operands_options = nullptr;
+ int* number_of_operands = nullptr;
+ };
+
+  // Function that Get and KeyMayExist call with no_io true or false.
+  // Note: 'value_found' from KeyMayExist propagates here.
+  // This function is also called by GetMergeOperands.
+  // If get_impl_options.get_value = true, get the value associated with
+  // get_impl_options.key via get_impl_options.value.
+  // If get_impl_options.get_value = false, get the merge operands associated
+  // with get_impl_options.key via get_impl_options.merge_operands.
+ Status GetImpl(const ReadOptions& options, const Slice& key,
+ GetImplOptions& get_impl_options);
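+
+ // Illustrative sketch of how an internal caller might fill GetImplOptions
+ // (hypothetical values; `read_options`, `key` and `column_family` are
+ // assumed to be provided by the caller):
+ //
+ //   PinnableSlice pinnable_val;
+ //   GetImplOptions get_impl_options;
+ //   get_impl_options.column_family = column_family;
+ //   get_impl_options.value = &pinnable_val;
+ //   Status s = GetImpl(read_options, key, get_impl_options);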
+
+ // If `snapshot` == kMaxSequenceNumber, set a recent one inside the file.
+ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool expose_blob_index = false,
+ bool allow_refresh = true);
+
+ virtual SequenceNumber GetLastPublishedSequence() const {
+ if (last_seq_same_as_publish_seq_) {
+ return versions_->LastSequence();
+ } else {
+ return versions_->LastPublishedSequence();
+ }
+ }
+
+ // REQUIRES: joined the main write queue if two_write_queues is disabled, and
+ // the second write queue otherwise.
+ virtual void SetLastPublishedSequence(SequenceNumber seq);
+ // Returns LastSequence in last_seq_same_as_publish_seq_
+ // mode and LastAllocatedSequence otherwise. This is useful when visibility
+ // depends also on data written to the WAL but not to the memtable.
+ SequenceNumber TEST_GetLastVisibleSequence() const;
+
+#ifndef ROCKSDB_LITE
+ // Similar to Write() but will call the callback once on the single write
+ // thread to determine whether it is safe to perform the write.
+ virtual Status WriteWithCallback(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback);
+
+ // Returns the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into the current
+ // memtables. It can then be assumed that any write with a larger (or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ //
+ // If the earliest sequence number could not be determined,
+ // kMaxSequenceNumber will be returned.
+ //
+ // If include_history=true, will also search Memtables in MemTableList
+ // History.
+ SequenceNumber GetEarliestMemTableSequenceNumber(SuperVersion* sv,
+ bool include_history);
+
+ // For a given key, check to see if there are any records for this key
+ // in the memtables, including memtable history. If cache_only is false,
+ // SST files will also be checked.
+ //
+ // `key` should NOT have user-defined timestamp appended to user key even if
+ // timestamp is enabled.
+ //
+ // If a key is found, *found_record_for_key will be set to true and
+ // *seq will be set to the stored sequence number for the latest
+ // operation on this key or kMaxSequenceNumber if unknown. If user-defined
+ // timestamp is enabled for this column family and timestamp is not nullptr,
+ // then *timestamp will be set to the stored timestamp for the latest
+ // operation on this key.
+ // If no key is found, *found_record_for_key will be set to false.
+ //
+ // Note: If cache_only=false, it is possible for *seq to be set to 0 if
+ // the sequence number has been cleared from the record. If the caller is
+ // holding an active db snapshot, we know the missing sequence must be less
+ // than the snapshot's sequence number (sequence numbers are only cleared
+ // when there are no earlier active snapshots).
+ //
+ // If NotFound is returned and found_record_for_key is set to false, then no
+ // record for this key was found. If the caller is holding an active db
+ // snapshot, we know that no key could have existed after this snapshot
+ // (since we do not compact keys that have an earlier snapshot).
+ //
+ // Only records newer than or at `lower_bound_seq` are guaranteed to be
+ // returned. Memtables and files may not be checked if they only contain data
+ // older than `lower_bound_seq`.
+ //
+ // Returns OK or NotFound on success,
+ // other status on unexpected error.
+ // TODO(andrewkr): this API needs to be aware of range deletion operations
+ Status GetLatestSequenceForKey(SuperVersion* sv, const Slice& key,
+ bool cache_only,
+ SequenceNumber lower_bound_seq,
+ SequenceNumber* seq, std::string* timestamp,
+ bool* found_record_for_key,
+ bool* is_blob_index);
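+
+ // Illustrative sketch of a conflict-check style caller (hypothetical; `sv`,
+ // `key` and `snapshot_seq` are assumed to be provided by the caller):
+ //
+ //   SequenceNumber seq = kMaxSequenceNumber;
+ //   bool found_record_for_key = false;
+ //   Status s = GetLatestSequenceForKey(sv, key, /*cache_only=*/true,
+ //                                      /*lower_bound_seq=*/snapshot_seq,
+ //                                      &seq, /*timestamp=*/nullptr,
+ //                                      &found_record_for_key,
+ //                                      /*is_blob_index=*/nullptr);
+ //   if (s.ok() && found_record_for_key && seq > snapshot_seq) {
+ //     // A newer write to `key` exists, i.e. a potential write conflict.
+ //   }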
+
+ Status TraceIteratorSeek(const uint32_t& cf_id, const Slice& key,
+ const Slice& lower_bound, const Slice upper_bound);
+ Status TraceIteratorSeekForPrev(const uint32_t& cf_id, const Slice& key,
+ const Slice& lower_bound,
+ const Slice upper_bound);
+#endif // ROCKSDB_LITE
+
+ // Similar to GetSnapshot(), but also lets the db know that this snapshot
+ // will be used for transaction write-conflict checking. The DB can then
+ // make sure not to compact any keys that would prevent a write-conflict from
+ // being detected.
+ const Snapshot* GetSnapshotForWriteConflictBoundary();
+
+ // checks if all live files exist on file system and that their file sizes
+ // match to our in-memory records
+ virtual Status CheckConsistency();
+
+ // max_file_num_to_ignore allows bottom level compaction to filter out newly
+ // compacted SST files. Setting max_file_num_to_ignore to kMaxUint64 will
+ // disable the filtering
+ Status RunManualCompaction(ColumnFamilyData* cfd, int input_level,
+ int output_level,
+ const CompactRangeOptions& compact_range_options,
+ const Slice* begin, const Slice* end,
+ bool exclusive, bool disallow_trivial_move,
+ uint64_t max_file_num_to_ignore,
+ const std::string& trim_ts);
+
+ // Return an internal iterator over the current state of the database.
+ // The keys of this iterator are internal keys (see format.h).
+ // The returned iterator should be deleted when no longer needed.
+ // If allow_unprepared_value is true, the returned iterator may defer reading
+ // the value and so will require PrepareValue() to be called before value();
+ // allow_unprepared_value = false is convenient when this optimization is not
+ // useful, e.g. when reading the whole column family.
+ //
+ // read_options.ignore_range_deletions determines whether range tombstones are
+ // processed in the returned iterator internally, i.e., whether range
+ // tombstone covered keys are in this iterator's output.
+ // @param read_options Must outlive the returned iterator.
+ InternalIterator* NewInternalIterator(
+ const ReadOptions& read_options, Arena* arena, SequenceNumber sequence,
+ ColumnFamilyHandle* column_family = nullptr,
+ bool allow_unprepared_value = false);
+
+ // Note: to support DB iterator refresh, memtable range tombstones in the
+ // underlying merging iterator need to be refreshed. If db_iter is not
+ // nullptr, db_iter->SetMemtableRangetombstoneIter() is called with the
+ // memtable range tombstone iterator used by the underlying merging iterator.
+ // This range tombstone iterator can be refreshed later by db_iter.
+ // @param read_options Must outlive the returned iterator.
+ InternalIterator* NewInternalIterator(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SuperVersion* super_version,
+ Arena* arena, SequenceNumber sequence,
+ bool allow_unprepared_value,
+ ArenaWrappedDBIter* db_iter = nullptr);
+
+ LogsWithPrepTracker* logs_with_prep_tracker() {
+ return &logs_with_prep_tracker_;
+ }
+
+ struct BGJobLimits {
+ int max_flushes;
+ int max_compactions;
+ };
+ // Returns maximum background flushes and compactions allowed to be scheduled
+ BGJobLimits GetBGJobLimits() const;
+ // Need a static version that can be called during SanitizeOptions().
+ static BGJobLimits GetBGJobLimits(int max_background_flushes,
+ int max_background_compactions,
+ int max_background_jobs,
+ bool parallelize_compactions);
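+
+ // Illustrative sketch (hypothetical option values) of querying the static
+ // overload, e.g. while sanitizing options:
+ //
+ //   DBImpl::BGJobLimits limits = DBImpl::GetBGJobLimits(
+ //       /*max_background_flushes=*/-1, /*max_background_compactions=*/-1,
+ //       /*max_background_jobs=*/8, /*parallelize_compactions=*/true);
+ //   // limits.max_flushes / limits.max_compactions then bound scheduling.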
+
+ // move logs pending closing from job_context to the DB queue and
+ // schedule a purge
+ void ScheduleBgLogWriterClose(JobContext* job_context);
+
+ uint64_t MinLogNumberToKeep();
+
+ // Returns the lower bound file number for SSTs that won't be deleted, even if
+ // they're obsolete. This lower bound is used internally to prevent newly
+ // created flush/compaction output files from being deleted before they're
+ // installed. This technique avoids the need for tracking the exact numbers of
+ // files pending creation, although it prevents more files than necessary from
+ // being deleted.
+ uint64_t MinObsoleteSstNumberToKeep();
+
+ // Returns the list of live files in 'live' and the list
+ // of all files in the filesystem in 'candidate_files'.
+ // If force == false and the last call was less than
+ // db_options_.delete_obsolete_files_period_micros microseconds ago,
+ // it will not fill up the job_context
+ void FindObsoleteFiles(JobContext* job_context, bool force,
+ bool no_full_scan = false);
+
+ // Diffs the files listed in filenames against the live files; those that do
+ // not belong to live files are possibly removed. Also removes all the
+ // files in sst_delete_files and log_delete_files.
+ // It is not necessary to hold the mutex when invoking this method.
+ // If FindObsoleteFiles() was run, we need to also run
+ // PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
+ void PurgeObsoleteFiles(JobContext& background_context,
+ bool schedule_only = false);
+
+ // Schedule a background job to actually delete obsolete files.
+ void SchedulePurge();
+
+ const SnapshotList& snapshots() const { return snapshots_; }
+
+ // Load the list of snapshots that are no newer than `max_seq` into
+ // `snap_vector`, in ascending order.
+ // `oldest_write_conflict_snapshot` is filled with the oldest snapshot
+ // which satisfies SnapshotImpl.is_write_conflict_boundary_ = true.
+ void LoadSnapshots(std::vector<SequenceNumber>* snap_vector,
+ SequenceNumber* oldest_write_conflict_snapshot,
+ const SequenceNumber& max_seq) const {
+ InstrumentedMutexLock l(mutex());
+ snapshots().GetAll(snap_vector, oldest_write_conflict_snapshot, max_seq);
+ }
+
+ const ImmutableDBOptions& immutable_db_options() const {
+ return immutable_db_options_;
+ }
+
+ // Cancel all background jobs, including flush, compaction, background
+ // purging, stats dumping threads, etc. If `wait` = true, wait for the
+ // running jobs to abort or finish before returning. Otherwise, only
+ // sends the signals.
+ void CancelAllBackgroundWork(bool wait);
+
+ // Find Super version and reference it. Based on options, it might return
+ // the thread local cached one.
+ // Call ReturnAndCleanupSuperVersion() when it is no longer needed.
+ SuperVersion* GetAndRefSuperVersion(ColumnFamilyData* cfd);
+
+ // Similar to the previous function but looks up based on a column family id.
+ // nullptr will be returned if this column family no longer exists.
+ // REQUIRED: this function should only be called on the write thread or if the
+ // mutex is held.
+ SuperVersion* GetAndRefSuperVersion(uint32_t column_family_id);
+
+ // Un-reference the super version and clean it up if it is the last reference.
+ void CleanupSuperVersion(SuperVersion* sv);
+
+ // Un-reference the super version and return it to the thread-local cache if
+ // needed. If it is the last reference of the super version, clean it up
+ // after un-referencing it.
+ void ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv);
+
+ // Similar to the previous function but looks up based on a column family id.
+ // nullptr will be returned if this column family no longer exists.
+ // REQUIRED: this function should only be called on the write thread.
+ void ReturnAndCleanupSuperVersion(uint32_t column_family_id, SuperVersion* sv);
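+
+ // Illustrative sketch of the acquire/release pattern above (hypothetical;
+ // `cfd` is assumed to be a valid ColumnFamilyData*):
+ //
+ //   SuperVersion* sv = GetAndRefSuperVersion(cfd);
+ //   // ... read from sv->mem, sv->imm and sv->current ...
+ //   ReturnAndCleanupSuperVersion(cfd, sv);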
+
+ // REQUIRED: this function should only be called on the write thread or if the
+ // mutex is held. Return value only valid until next call to this function or
+ // mutex is released.
+ ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id);
+
+ // Same as above, but should be called without the mutex held and not on the write thread.
+ std::unique_ptr<ColumnFamilyHandle> GetColumnFamilyHandleUnlocked(
+ uint32_t column_family_id);
+
+ // Returns the number of currently running flushes.
+ // REQUIREMENT: mutex_ must be held when calling this function.
+ int num_running_flushes() {
+ mutex_.AssertHeld();
+ return num_running_flushes_;
+ }
+
+ // Returns the number of currently running compactions.
+ // REQUIREMENT: mutex_ must be held when calling this function.
+ int num_running_compactions() {
+ mutex_.AssertHeld();
+ return num_running_compactions_;
+ }
+
+ const WriteController& write_controller() { return write_controller_; }
+
+ // Hollow transaction shells used for recovery.
+ // These will then be passed to TransactionDB so that
+ // locks can be reacquired before writing can resume.
+ struct RecoveredTransaction {
+ std::string name_;
+ bool unprepared_;
+
+ struct BatchInfo {
+ uint64_t log_number_;
+ // TODO(lth): For unprepared, the memory usage here can be big for
+ // unprepared transactions. This is only useful for rollbacks, and we
+ // can in theory just keep keyset for that.
+ WriteBatch* batch_;
+ // Number of sub-batches. A new sub-batch is created if the txn attempts to
+ // insert a duplicate (key, seq) into the memtable. This is currently used in
+ // WritePreparedTxn/WriteUnpreparedTxn.
+ size_t batch_cnt_;
+ };
+
+ // This maps the seq of the first key in the batch to BatchInfo, which
+ // contains WriteBatch and other information relevant to the batch.
+ //
+ // For WriteUnprepared, batches_ can have size greater than 1, but for
+ // other write policies, it must be of size 1.
+ std::map<SequenceNumber, BatchInfo> batches_;
+
+ explicit RecoveredTransaction(const uint64_t log, const std::string& name,
+ WriteBatch* batch, SequenceNumber seq,
+ size_t batch_cnt, bool unprepared)
+ : name_(name), unprepared_(unprepared) {
+ batches_[seq] = {log, batch, batch_cnt};
+ }
+
+ ~RecoveredTransaction() {
+ for (auto& it : batches_) {
+ delete it.second.batch_;
+ }
+ }
+
+ void AddBatch(SequenceNumber seq, uint64_t log_number, WriteBatch* batch,
+ size_t batch_cnt, bool unprepared) {
+ assert(batches_.count(seq) == 0);
+ batches_[seq] = {log_number, batch, batch_cnt};
+ // Prior state must be unprepared, since the prepare batch must be the
+ // last batch.
+ assert(unprepared_);
+ unprepared_ = unprepared;
+ }
+ };
+
+ bool allow_2pc() const { return immutable_db_options_.allow_2pc; }
+
+ std::unordered_map<std::string, RecoveredTransaction*>
+ recovered_transactions() {
+ return recovered_transactions_;
+ }
+
+ RecoveredTransaction* GetRecoveredTransaction(const std::string& name) {
+ auto it = recovered_transactions_.find(name);
+ if (it == recovered_transactions_.end()) {
+ return nullptr;
+ } else {
+ return it->second;
+ }
+ }
+
+ void InsertRecoveredTransaction(const uint64_t log, const std::string& name,
+ WriteBatch* batch, SequenceNumber seq,
+ size_t batch_cnt, bool unprepared_batch) {
+ // For WriteUnpreparedTxn, InsertRecoveredTransaction is called multiple
+ // times for every unprepared batch encountered during recovery.
+ //
+ // If the transaction is prepared, then the last call to
+ // InsertRecoveredTransaction will have unprepared_batch = false.
+ auto rtxn = recovered_transactions_.find(name);
+ if (rtxn == recovered_transactions_.end()) {
+ recovered_transactions_[name] = new RecoveredTransaction(
+ log, name, batch, seq, batch_cnt, unprepared_batch);
+ } else {
+ rtxn->second->AddBatch(seq, log, batch, batch_cnt, unprepared_batch);
+ }
+ logs_with_prep_tracker_.MarkLogAsContainingPrepSection(log);
+ }
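+
+ // Illustrative sketch (hypothetical numbers): recovering a WriteUnprepared
+ // transaction that spans two WAL batches might call this twice for the same
+ // name, with the final call carrying unprepared_batch = false once the
+ // prepare marker is seen:
+ //
+ //   InsertRecoveredTransaction(/*log=*/10, "txn1", batch1, /*seq=*/100,
+ //                              /*batch_cnt=*/1, /*unprepared_batch=*/true);
+ //   InsertRecoveredTransaction(/*log=*/11, "txn1", batch2, /*seq=*/105,
+ //                              /*batch_cnt=*/1, /*unprepared_batch=*/false);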
+
+ void DeleteRecoveredTransaction(const std::string& name) {
+ auto it = recovered_transactions_.find(name);
+ assert(it != recovered_transactions_.end());
+ auto* trx = it->second;
+ recovered_transactions_.erase(it);
+ for (const auto& info : trx->batches_) {
+ logs_with_prep_tracker_.MarkLogAsHavingPrepSectionFlushed(
+ info.second.log_number_);
+ }
+ delete trx;
+ }
+
+ void DeleteAllRecoveredTransactions() {
+ for (auto it = recovered_transactions_.begin();
+ it != recovered_transactions_.end(); ++it) {
+ delete it->second;
+ }
+ recovered_transactions_.clear();
+ }
+
+ void AddToLogsToFreeQueue(log::Writer* log_writer) {
+ mutex_.AssertHeld();
+ logs_to_free_queue_.push_back(log_writer);
+ }
+
+ void AddSuperVersionsToFreeQueue(SuperVersion* sv) {
+ superversions_to_free_queue_.push_back(sv);
+ }
+
+ void SetSnapshotChecker(SnapshotChecker* snapshot_checker);
+
+ // Fill JobContext with snapshot information needed by flush and compaction.
+ void GetSnapshotContext(JobContext* job_context,
+ std::vector<SequenceNumber>* snapshot_seqs,
+ SequenceNumber* earliest_write_conflict_snapshot,
+ SnapshotChecker** snapshot_checker);
+
+ // Not thread-safe.
+ void SetRecoverableStatePreReleaseCallback(PreReleaseCallback* callback);
+
+ InstrumentedMutex* mutex() const { return &mutex_; }
+
+ // Initialize a brand new DB. The DB directory is expected to be empty before
+ // calling it. Push new manifest file name into `new_filenames`.
+ Status NewDB(std::vector<std::string>* new_filenames);
+
+ // This is to be used only by internal rocksdb classes.
+ static Status Open(const DBOptions& db_options, const std::string& name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ const bool seq_per_batch, const bool batch_per_txn);
+
+ static IOStatus CreateAndNewDirectory(
+ FileSystem* fs, const std::string& dirname,
+ std::unique_ptr<FSDirectory>* directory);
+
+ // Find the stats map from stats_history_ with the smallest timestamp in
+ // the range [start_time, end_time).
+ bool FindStatsByTime(uint64_t start_time, uint64_t end_time,
+ uint64_t* new_time,
+ std::map<std::string, uint64_t>* stats_map);
+
+ // Print information about all range tombstones from all iterators to the
+ // given std::string. This is only used by ldb. The output might be capped.
+ // Tombstones printed out are not guaranteed to be in any order.
+ Status TablesRangeTombstoneSummary(ColumnFamilyHandle* column_family,
+ int max_entries_to_print,
+ std::string* out_str);
+
+ VersionSet* GetVersionSet() const { return versions_.get(); }
+
+ // Wait for any compaction to finish.
+ // The bool parameter additionally waits until unscheduledCompactions_ == 0;
+ // this is only for the special CancelledCompactions test.
+ Status WaitForCompact(bool waitUnscheduled = false);
+
+#ifndef NDEBUG
+ // Compact any files in the named level that overlap [*begin, *end]
+ Status TEST_CompactRange(int level, const Slice* begin, const Slice* end,
+ ColumnFamilyHandle* column_family = nullptr,
+ bool disallow_trivial_move = false);
+
+ Status TEST_SwitchWAL();
+
+ bool TEST_UnableToReleaseOldestLog() { return unable_to_release_oldest_log_; }
+
+ bool TEST_IsLogGettingFlushed() {
+ return alive_log_files_.begin()->getting_flushed;
+ }
+
+ Status TEST_SwitchMemtable(ColumnFamilyData* cfd = nullptr);
+
+ // Force current memtable contents to be flushed.
+ Status TEST_FlushMemTable(bool wait = true, bool allow_write_stall = false,
+ ColumnFamilyHandle* cfh = nullptr);
+
+ Status TEST_FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_opts);
+
+ // Flush (multiple) ColumnFamilyData without using ColumnFamilyHandle. This
+ // is because in certain cases, we can flush column families, wait for the
+ // flush to complete, but delete the column family handle before the wait
+ // finishes. For example in CompactRange.
+ Status TEST_AtomicFlushMemTables(const autovector<ColumnFamilyData*>& cfds,
+ const FlushOptions& flush_opts);
+
+ // Wait for background threads to complete scheduled work.
+ Status TEST_WaitForBackgroundWork();
+
+ // Wait for memtable compaction
+ Status TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family = nullptr);
+
+ // Wait for any compaction to finish.
+ // The bool parameter additionally waits until unscheduledCompactions_ == 0;
+ // this is only for the special CancelledCompactions test.
+ Status TEST_WaitForCompact(bool waitUnscheduled = false);
+
+ // Wait for any background purge
+ Status TEST_WaitForPurge();
+
+ // Get the background error status
+ Status TEST_GetBGError();
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ uint64_t TEST_MaxNextLevelOverlappingBytes(
+ ColumnFamilyHandle* column_family = nullptr);
+
+ // Return the current manifest file no.
+ uint64_t TEST_Current_Manifest_FileNo();
+
+ // Returns the number that'll be assigned to the next file that's created.
+ uint64_t TEST_Current_Next_FileNo();
+
+ // get total level0 file size. Only for testing.
+ uint64_t TEST_GetLevel0TotalSize();
+
+ void TEST_GetFilesMetaData(
+ ColumnFamilyHandle* column_family,
+ std::vector<std::vector<FileMetaData>>* metadata,
+ std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata = nullptr);
+
+ void TEST_LockMutex();
+
+ void TEST_UnlockMutex();
+
+ // REQUIRES: mutex locked
+ void* TEST_BeginWrite();
+
+ // REQUIRES: mutex locked
+ // pass the pointer that you got from TEST_BeginWrite()
+ void TEST_EndWrite(void* w);
+
+ uint64_t TEST_MaxTotalInMemoryState() const {
+ return max_total_in_memory_state_;
+ }
+
+ size_t TEST_LogsToFreeSize();
+
+ uint64_t TEST_LogfileNumber();
+
+ uint64_t TEST_total_log_size() const { return total_log_size_; }
+
+ // Returns column family name to ImmutableCFOptions map.
+ Status TEST_GetAllImmutableCFOptions(
+ std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map);
+
+ // Return the latest MutableCFOptions of a column family
+ Status TEST_GetLatestMutableCFOptions(ColumnFamilyHandle* column_family,
+ MutableCFOptions* mutable_cf_options);
+
+ Cache* TEST_table_cache() { return table_cache_.get(); }
+
+ WriteController& TEST_write_controler() { return write_controller_; }
+
+ uint64_t TEST_FindMinLogContainingOutstandingPrep();
+ uint64_t TEST_FindMinPrepLogReferencedByMemTable();
+ size_t TEST_PreparedSectionCompletedSize();
+ size_t TEST_LogsWithPrepSize();
+
+ int TEST_BGCompactionsAllowed() const;
+ int TEST_BGFlushesAllowed() const;
+ size_t TEST_GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+ void TEST_WaitForPeridicTaskRun(std::function<void()> callback) const;
+ SeqnoToTimeMapping TEST_GetSeqnoToTimeMapping() const;
+ size_t TEST_EstimateInMemoryStatsHistorySize() const;
+
+ uint64_t TEST_GetCurrentLogNumber() const {
+ InstrumentedMutexLock l(mutex());
+ assert(!logs_.empty());
+ return logs_.back().number;
+ }
+
+ const std::unordered_set<uint64_t>& TEST_GetFilesGrabbedForPurge() const {
+ return files_grabbed_for_purge_;
+ }
+
+#ifndef ROCKSDB_LITE
+ const PeriodicTaskScheduler& TEST_GetPeriodicTaskScheduler() const;
+#endif // !ROCKSDB_LITE
+
+#endif // NDEBUG
+
+ // persist stats to column family "_persistent_stats"
+ void PersistStats();
+
+ // dump rocksdb.stats to LOG
+ void DumpStats();
+
+ // flush LOG out of application buffer
+ void FlushInfoLog();
+
+ // record current sequence number to time mapping
+ void RecordSeqnoToTimeMapping();
+
+ // Interface to block and signal the DB in case of stalling writes by
+ // WriteBufferManager. Each DBImpl object contains a ptr to WBMStallInterface.
+ // When DB needs to be blocked or signalled by WriteBufferManager,
+ // state_ is changed accordingly.
+ class WBMStallInterface : public StallInterface {
+ public:
+ enum State {
+ BLOCKED = 0,
+ RUNNING,
+ };
+
+ WBMStallInterface() : state_cv_(&state_mutex_) {
+ MutexLock lock(&state_mutex_);
+ state_ = State::RUNNING;
+ }
+
+ void SetState(State state) {
+ MutexLock lock(&state_mutex_);
+ state_ = state;
+ }
+
+ // Change the state_ to State::BLOCKED and wait until its state is
+ // changed by WriteBufferManager. When stall is cleared, Signal() is
+ // called to change the state and unblock the DB.
+ void Block() override {
+ MutexLock lock(&state_mutex_);
+ while (state_ == State::BLOCKED) {
+ TEST_SYNC_POINT("WBMStallInterface::BlockDB");
+ state_cv_.Wait();
+ }
+ }
+
+ // Called from WriteBufferManager. This function changes the state_
+ // to State::RUNNING indicating the stall is cleared and DB can proceed.
+ void Signal() override {
+ {
+ MutexLock lock(&state_mutex_);
+ state_ = State::RUNNING;
+ }
+ state_cv_.Signal();
+ }
+
+ private:
+ // Condition variable and mutex to block and
+ // signal the DB during the stalling process.
+ port::Mutex state_mutex_;
+ port::CondVar state_cv_;
+ // State representing whether the DB is running or blocked because of a
+ // stall by WriteBufferManager.
+ State state_;
+ };
+
+ static void TEST_ResetDbSessionIdGen();
+ static std::string GenerateDbSessionId(Env* env);
+
+ bool seq_per_batch() const { return seq_per_batch_; }
+
+ protected:
+ const std::string dbname_;
+ // TODO(peterd): unify with VersionSet::db_id_
+ std::string db_id_;
+ // db_session_id_ is an identifier that gets reset
+ // every time the DB is opened
+ std::string db_session_id_;
+ std::unique_ptr<VersionSet> versions_;
+ // Flag to check whether we allocated and own the info log file
+ bool own_info_log_;
+ Status init_logger_creation_s_;
+ const DBOptions initial_db_options_;
+ Env* const env_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ const ImmutableDBOptions immutable_db_options_;
+ FileSystemPtr fs_;
+ MutableDBOptions mutable_db_options_;
+ Statistics* stats_;
+ std::unordered_map<std::string, RecoveredTransaction*>
+ recovered_transactions_;
+ std::unique_ptr<Tracer> tracer_;
+ InstrumentedMutex trace_mutex_;
+ BlockCacheTracer block_cache_tracer_;
+
+ // constant false canceled flag, used when the compaction is not manual
+ const std::atomic<bool> kManualCompactionCanceledFalse_{false};
+
+ // State below is protected by mutex_
+ // With two_write_queues enabled, some of the variables that are accessed during
+ // WriteToWAL need different synchronization: log_empty_, alive_log_files_,
+ // logs_, logfile_number_. Refer to the definition of each variable below for
+ // more description.
+ //
+ // `mutex_` can be a hot lock in some workloads, so it deserves dedicated
+ // cachelines.
+ mutable CacheAlignedInstrumentedMutex mutex_;
+
+ ColumnFamilyHandleImpl* default_cf_handle_;
+ InternalStats* default_cf_internal_stats_;
+
+ // table_cache_ provides its own synchronization
+ std::shared_ptr<Cache> table_cache_;
+
+ ErrorHandler error_handler_;
+
+ // Unified interface for logging events
+ EventLogger event_logger_;
+
+ // Only used for dynamically adjusting max_total_wal_size. It is a sum of
+ // [write_buffer_size * max_write_buffer_number] over all column families.
+ std::atomic<uint64_t> max_total_in_memory_state_;
+
+ // The options to access storage files
+ const FileOptions file_options_;
+
+ // Additional options for compaction and flush
+ FileOptions file_options_for_compaction_;
+
+ std::unique_ptr<ColumnFamilyMemTablesImpl> column_family_memtables_;
+
+ // Increase the sequence number after writing each batch, whether memtable is
+ // disabled for that or not. Otherwise the sequence number is increased after
+ // writing each key into memtable. This implies that when disable_memtable is
+ // set, the seq is not increased at all.
+ //
+ // Default: false
+ const bool seq_per_batch_;
+ // This determines during recovery whether we expect one writebatch per
+ // recovered transaction, or potentially multiple writebatches per
+ // transaction. For WriteUnprepared, this is set to false, since multiple
+ // batches can exist per transaction.
+ //
+ // Default: true
+ const bool batch_per_txn_;
+
+ // Each flush or compaction gets its own job id. This counter makes sure
+ // they're unique.
+ std::atomic<int> next_job_id_;
+
+ std::atomic<bool> shutting_down_;
+
+ // RecoveryContext stores the context about version edits along
+ // with corresponding column_family_data and column_family_options.
+ class RecoveryContext {
+ public:
+ ~RecoveryContext() {
+ for (auto& edit_list : edit_lists_) {
+ for (auto* edit : edit_list) {
+ delete edit;
+ }
+ }
+ }
+
+ void UpdateVersionEdits(ColumnFamilyData* cfd, const VersionEdit& edit) {
+ assert(cfd != nullptr);
+ if (map_.find(cfd->GetID()) == map_.end()) {
+ uint32_t size = static_cast<uint32_t>(map_.size());
+ map_.emplace(cfd->GetID(), size);
+ cfds_.emplace_back(cfd);
+ mutable_cf_opts_.emplace_back(cfd->GetLatestMutableCFOptions());
+ edit_lists_.emplace_back(autovector<VersionEdit*>());
+ }
+ uint32_t i = map_[cfd->GetID()];
+ edit_lists_[i].emplace_back(new VersionEdit(edit));
+ }
+
+ std::unordered_map<uint32_t, uint32_t> map_; // cf_id to index;
+ autovector<ColumnFamilyData*> cfds_;
+ autovector<const MutableCFOptions*> mutable_cf_opts_;
+ autovector<autovector<VersionEdit*>> edit_lists_;
+ // files_to_delete_ contains sst files
+ std::unordered_set<std::string> files_to_delete_;
+ };
+
+ // Persist options to the options file.
+ // Except in DB::Open(), WriteOptionsFile can only be called when:
+ // if need_mutex_lock = false, the DB mutex is already held;
+ // if need_enter_write_thread = false, the caller is already in the write
+ // thread. Otherwise the method locks the mutex / enters the write thread itself.
+ Status WriteOptionsFile(bool need_mutex_lock, bool need_enter_write_thread);
+
+ Status CompactRangeInternal(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ const std::string& trim_ts);
+
+ // The following two functions can only be called when:
+ // 1. WriteThread::Writer::EnterUnbatched() is used.
+ // 2. db_mutex is NOT held
+ Status RenameTempFileToOptionsFile(const std::string& file_name);
+ Status DeleteObsoleteOptionsFiles();
+
+ void NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+ const MutableCFOptions& mutable_cf_options,
+ int job_id);
+
+ void NotifyOnFlushCompleted(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info);
+
+ void NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats, int job_id);
+
+ void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats,
+ int job_id);
+ void NotifyOnMemTableSealed(ColumnFamilyData* cfd,
+ const MemTableInfo& mem_table_info);
+
+#ifndef ROCKSDB_LITE
+ void NotifyOnExternalFileIngested(
+ ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job);
+
+ virtual Status FlushForGetLiveFiles();
+#endif // !ROCKSDB_LITE
+
+ void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const;
+
+ void EraseThreadStatusDbInfo() const;
+
+ // If disable_memtable is set, the application logic must guarantee that the
+ // batch will still be skipped from the memtable during recovery. An exception
+ // to this is seq_per_batch_ mode, in which since each batch already takes one
+ // seq, it is ok for the batch to write to memtable during recovery as long as
+ // it only takes one sequence number: i.e., no duplicate keys.
+ // In WriteCommitted it is guaranteed since disable_memtable is used for
+ // prepare batch which will be written to memtable later during the commit,
+ // and in WritePrepared it is guaranteed since it will be used only for WAL
+ // markers which will never be written to memtable. If the commit marker is
+ // accompanied with CommitTimeWriteBatch that is not written to memtable as
+ // long as it has no duplicate keys, it does not violate the one-seq-per-batch
+ // policy.
+ // batch_cnt is expected to be non-zero in seq_per_batch mode and
+ // indicates the number of sub-batches. A sub-batch is a subset of the write
+ // batch that does not have duplicate keys.
+ Status WriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false, uint64_t* seq_used = nullptr,
+ size_t batch_cnt = 0,
+ PreReleaseCallback* pre_release_callback = nullptr,
+ PostMemTableCallback* post_memtable_callback = nullptr);
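+
+ // Worked example of the sub-batch definition above: in seq_per_batch mode a
+ // batch {Put(k1), Put(k2), Put(k1)} splits into two sub-batches, {k1, k2}
+ // and {k1}, because the second Put(k1) duplicates a key; its batch_cnt
+ // would therefore be 2.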
+
+ Status PipelinedWriteImpl(const WriteOptions& options, WriteBatch* updates,
+ WriteCallback* callback = nullptr,
+ uint64_t* log_used = nullptr, uint64_t log_ref = 0,
+ bool disable_memtable = false,
+ uint64_t* seq_used = nullptr);
+
+ // Write only to memtables without joining any write queue
+ Status UnorderedWriteMemtable(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t log_ref, SequenceNumber seq,
+ const size_t sub_batch_cnt);
+
+ // Whether the batch requires to be assigned with an order
+ enum AssignOrder : bool { kDontAssignOrder, kDoAssignOrder };
+ // Whether it requires publishing last sequence or not
+ enum PublishLastSeq : bool { kDontPublishLastSeq, kDoPublishLastSeq };
+
+ // Join the write_thread to write the batch only to the WAL. It is the
+ // responsibility of the caller to also write the write batch to the memtable
+ // if it is required.
+ //
+ // sub_batch_cnt is expected to be non-zero when assign_order = kDoAssignOrder
+ // indicating the number of sub-batches in my_batch. A sub-batch is a subset
+ // of the write batch that does not have duplicate keys. When seq_per_batch is
+ // not set, each key is a separate sub_batch. Otherwise each duplicate key
+ // marks start of a new sub-batch.
+ Status WriteImplWALOnly(
+ WriteThread* write_thread, const WriteOptions& options,
+ WriteBatch* updates, WriteCallback* callback, uint64_t* log_used,
+ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+ PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+ const PublishLastSeq publish_last_seq, const bool disable_memtable);
+
+ // Write cached_recoverable_state_ to the memtable if it is not empty.
+ // The writer must be the leader in write_thread_ and hold mutex_.
+ Status WriteRecoverableState();
+
+ // Actual implementation of Close()
+ Status CloseImpl();
+
+ // Recover the descriptor from persistent storage. May do a significant
+ // amount of work to recover recently logged updates. Any changes to
+ // be made to the descriptor are added to *edit.
+ // recovered_seq is set to less than kMaxSequenceNumber if the log's tail is
+ // skipped.
+ // recovery_ctx stores the context about version edits and all those
+ // edits are persisted to new Manifest after successfully syncing the new WAL.
+ virtual Status Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only = false, bool error_if_wal_file_exists = false,
+ bool error_if_data_exists_in_wals = false,
+ uint64_t* recovered_seq = nullptr,
+ RecoveryContext* recovery_ctx = nullptr);
+
+ virtual bool OwnTablesAndLogs() const { return true; }
+
+ // Setup DB identity file, and write DB ID to manifest if necessary.
+ Status SetupDBId(bool read_only, RecoveryContext* recovery_ctx);
+ // Assign db_id_ and write DB ID to manifest if necessary.
+ void SetDBId(std::string&& id, bool read_only, RecoveryContext* recovery_ctx);
+
+ // REQUIRES: db mutex held when calling this function, but the db mutex can
+ // be released and re-acquired. Db mutex will be held when the function
+ // returns.
+ // After recovery, there may be SST files in db/cf paths that are
+ // not referenced in the MANIFEST, e.g. because
+ // 1. it's a best-effort recovery, or
+ // 2. the VersionEdits referencing the SST files were appended to
+ // RecoveryContext but the DB crashed while syncing the MANIFEST, so the
+ // VersionEdits were still not synced to the MANIFEST during recovery.
+ // It stores the SST files to be deleted in RecoveryContext. In the
+ // meantime, we find out the largest file number present in the paths, and
+ // bump up the version set's next_file_number_ to be 1 + largest_file_number.
+ // recovery_ctx stores the context about version edits and files to be
+ // deleted. All those edits are persisted to new Manifest after successfully
+ // syncing the new WAL.
+ Status DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx);
+
+ // SetDbSessionId() should be called in the constructor DBImpl()
+ // to ensure that db_session_id_ gets updated every time the DB is opened
+ void SetDbSessionId();
+
+ Status FailIfCfHasTs(const ColumnFamilyHandle* column_family) const;
+ Status FailIfTsMismatchCf(ColumnFamilyHandle* column_family, const Slice& ts,
+ bool ts_for_read) const;
+
+ // recovery_ctx stores the context about version edits;
+ // LogAndApplyForRecovery persists all those edits to a new Manifest after
+ // successfully syncing the new WAL.
+ // LogAndApplyForRecovery should be called only once during recovery, and it
+ // should be called when RocksDB writes its first new MANIFEST during this
+ // recovery.
+ Status LogAndApplyForRecovery(const RecoveryContext& recovery_ctx);
+
+ void InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
+
+ // Return true to proceed with current WAL record whose content is stored in
+ // `batch`. Return false to skip current WAL record.
+ bool InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
+ const std::string& wal_fname,
+ log::Reader::Reporter& reporter,
+ Status& status, bool& stop_replay,
+ WriteBatch& batch);
+
+ private:
+ friend class DB;
+ friend class ErrorHandler;
+ friend class InternalStats;
+ friend class PessimisticTransaction;
+ friend class TransactionBaseImpl;
+ friend class WriteCommittedTxn;
+ friend class WritePreparedTxn;
+ friend class WritePreparedTxnDB;
+ friend class WriteBatchWithIndex;
+ friend class WriteUnpreparedTxnDB;
+ friend class WriteUnpreparedTxn;
+
+#ifndef ROCKSDB_LITE
+ friend class ForwardIterator;
+#endif
+ friend struct SuperVersion;
+ friend class CompactedDBImpl;
+ friend class DBTest_ConcurrentFlushWAL_Test;
+ friend class DBTest_MixedSlowdownOptionsStop_Test;
+ friend class DBCompactionTest_CompactBottomLevelFilesWithDeletions_Test;
+ friend class DBCompactionTest_CompactionDuringShutdown_Test;
+ friend class StatsHistoryTest_PersistentStatsCreateColumnFamilies_Test;
+#ifndef NDEBUG
+ friend class DBTest2_ReadCallbackTest_Test;
+ friend class WriteCallbackPTest_WriteWithCallbackTest_Test;
+ friend class XFTransactionWriteHandler;
+ friend class DBBlobIndexTest;
+ friend class WriteUnpreparedTransactionTest_RecoveryTest_Test;
+#endif
+
+ struct CompactionState;
+ struct PrepickedCompaction;
+ struct PurgeFileInfo;
+
+ struct WriteContext {
+ SuperVersionContext superversion_context;
+ autovector<MemTable*> memtables_to_free_;
+
+ explicit WriteContext(bool create_superversion = false)
+ : superversion_context(create_superversion) {}
+
+ ~WriteContext() {
+ superversion_context.Clean();
+ for (auto& m : memtables_to_free_) {
+ delete m;
+ }
+ }
+ };
+
+ struct LogFileNumberSize {
+ explicit LogFileNumberSize(uint64_t _number) : number(_number) {}
+ LogFileNumberSize() {}
+ void AddSize(uint64_t new_size) { size += new_size; }
+ uint64_t number;
+ uint64_t size = 0;
+ bool getting_flushed = false;
+ };
+
+ struct LogWriterNumber {
+ // pass ownership of _writer
+ LogWriterNumber(uint64_t _number, log::Writer* _writer)
+ : number(_number), writer(_writer) {}
+
+ log::Writer* ReleaseWriter() {
+ auto* w = writer;
+ writer = nullptr;
+ return w;
+ }
+ Status ClearWriter() {
+ Status s = writer->WriteBuffer();
+ delete writer;
+ writer = nullptr;
+ return s;
+ }
+
+ bool IsSyncing() { return getting_synced; }
+
+ uint64_t GetPreSyncSize() {
+ assert(getting_synced);
+ return pre_sync_size;
+ }
+
+ void PrepareForSync() {
+ assert(!getting_synced);
+ // Size is expected to be monotonically increasing.
+ assert(writer->file()->GetFlushedSize() >= pre_sync_size);
+ getting_synced = true;
+ pre_sync_size = writer->file()->GetFlushedSize();
+ }
+
+ void FinishSync() {
+ assert(getting_synced);
+ getting_synced = false;
+ }
+
+ uint64_t number;
+ // Visual Studio doesn't support a deque whose elements are noncopyable,
+ // which a std::unique_ptr member would make this struct, so a raw pointer
+ // is used instead.
+ log::Writer* writer; // own
+
+ private:
+ // true for some prefix of logs_
+ bool getting_synced = false;
+ // The size of the file before the sync happens. This amount is guaranteed
+ // to be persisted even if appends happen during sync so it can be used for
+ // tracking the synced size in MANIFEST.
+ uint64_t pre_sync_size = 0;
+ };
+
+ struct LogContext {
+ explicit LogContext(bool need_sync = false)
+ : need_log_sync(need_sync), need_log_dir_sync(need_sync) {}
+ bool need_log_sync = false;
+ bool need_log_dir_sync = false;
+ log::Writer* writer = nullptr;
+ LogFileNumberSize* log_file_number_size = nullptr;
+ };
+
+ // PurgeFileInfo is a structure to hold information of files to be deleted in
+ // purge_files_
+ struct PurgeFileInfo {
+ std::string fname;
+ std::string dir_to_sync;
+ FileType type;
+ uint64_t number;
+ int job_id;
+ PurgeFileInfo(std::string fn, std::string d, FileType t, uint64_t num,
+ int jid)
+ : fname(fn), dir_to_sync(d), type(t), number(num), job_id(jid) {}
+ };
+
+ // Argument required by background flush thread.
+ struct BGFlushArg {
+ BGFlushArg()
+ : cfd_(nullptr), max_memtable_id_(0), superversion_context_(nullptr) {}
+ BGFlushArg(ColumnFamilyData* cfd, uint64_t max_memtable_id,
+ SuperVersionContext* superversion_context)
+ : cfd_(cfd),
+ max_memtable_id_(max_memtable_id),
+ superversion_context_(superversion_context) {}
+
+ // Column family to flush.
+ ColumnFamilyData* cfd_;
+ // Maximum ID of memtable to flush. In this column family, memtables with
+ // IDs smaller than this value must be flushed before this flush completes.
+ uint64_t max_memtable_id_;
+ // Pointer to a SuperVersionContext object. After flush completes, RocksDB
+ // installs a new superversion for the column family. This operation
+ // requires a SuperVersionContext object (currently embedded in JobContext).
+ SuperVersionContext* superversion_context_;
+ };
+
+ // Argument passed to flush thread.
+ struct FlushThreadArg {
+ DBImpl* db_;
+
+ Env::Priority thread_pri_;
+ };
+
+ // Information for a manual compaction
+ struct ManualCompactionState {
+ ManualCompactionState(ColumnFamilyData* _cfd, int _input_level,
+ int _output_level, uint32_t _output_path_id,
+ bool _exclusive, bool _disallow_trivial_move,
+ std::atomic<bool>* _canceled)
+ : cfd(_cfd),
+ input_level(_input_level),
+ output_level(_output_level),
+ output_path_id(_output_path_id),
+ exclusive(_exclusive),
+ disallow_trivial_move(_disallow_trivial_move),
+ canceled(_canceled ? *_canceled : canceled_internal_storage) {}
+ // When _canceled is not provided by the user, we assign the reference of
+ // canceled_internal_storage to it to consolidate canceled and
+ // manual_compaction_paused, since DisableManualCompaction() might be
+ // called.
+
+ ColumnFamilyData* cfd;
+ int input_level;
+ int output_level;
+ uint32_t output_path_id;
+ Status status;
+ bool done = false;
+ bool in_progress = false; // compaction request being processed?
+ bool incomplete = false; // only part of requested range compacted
+ bool exclusive; // current behavior of only one manual
+ bool disallow_trivial_move; // Force actual compaction to run
+ const InternalKey* begin = nullptr; // nullptr means beginning of key range
+ const InternalKey* end = nullptr; // nullptr means end of key range
+ InternalKey* manual_end = nullptr; // how far we are compacting
+ InternalKey tmp_storage; // Used to keep track of compaction progress
+ InternalKey tmp_storage1; // Used to keep track of compaction progress
+
+ // When the user provides a canceled pointer in CompactRangeOptions, the
+ // above variable is a reference to the user-provided `canceled`; otherwise,
+ // it is a reference to canceled_internal_storage.
+ std::atomic<bool> canceled_internal_storage = false;
+ std::atomic<bool>& canceled; // Compaction canceled pointer reference
+ };
+ struct PrepickedCompaction {
+ // background compaction takes ownership of `compaction`.
+ Compaction* compaction;
+ // caller retains ownership of `manual_compaction_state` as it is reused
+ // across background compactions.
+ ManualCompactionState* manual_compaction_state; // nullptr if non-manual
+ // task limiter token is requested during compaction picking.
+ std::unique_ptr<TaskLimiterToken> task_token;
+ };
+
+ struct CompactionArg {
+ // caller retains ownership of `db`.
+ DBImpl* db;
+ // background compaction takes ownership of `prepicked_compaction`.
+ PrepickedCompaction* prepicked_compaction;
+ Env::Priority compaction_pri_;
+ };
+
+ // Initialize the built-in column family for persistent stats. Depending on
+ // whether on-disk persistent stats have been enabled before, it may either
+ // create a new column family and column family handle or just a column family
+ // handle.
+ // Required: DB mutex held
+ Status InitPersistStatsColumnFamily();
+
+ // The Persistent Stats column family has two format version keys which are used
+ // for compatibility check. Write format version if it's created for the
+ // first time, read format version and check compatibility if recovering
+ // from disk. This function requires DB mutex held at entrance but may
+ // release and re-acquire DB mutex in the process.
+ // Required: DB mutex held
+ Status PersistentStatsProcessFormatVersion();
+
+ Status ResumeImpl(DBRecoverContext context);
+
+ void MaybeIgnoreError(Status* s) const;
+
+ const Status CreateArchivalDirectory();
+
+ Status CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options,
+ const std::string& cf_name,
+ ColumnFamilyHandle** handle);
+
+ Status DropColumnFamilyImpl(ColumnFamilyHandle* column_family);
+
+ // Delete any unneeded files and stale in-memory entries.
+ void DeleteObsoleteFiles();
+ // Delete obsolete files and log status and information of file deletion
+ void DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+ const std::string& path_to_sync, FileType type,
+ uint64_t number);
+
+ // Background process needs to call
+ // auto x = CaptureCurrentFileNumberInPendingOutputs()
+ // auto file_num = versions_->NewFileNumber();
+ // <do something>
+ // ReleaseFileNumberFromPendingOutputs(x)
+ // This will protect any file with number `file_num` or greater from being
+ // deleted while <do something> is running.
+ // -----------
+ // This function will capture current file number and append it to
+ // pending_outputs_. This will prevent any background process from deleting
+ // any file created after this point.
+ std::list<uint64_t>::iterator CaptureCurrentFileNumberInPendingOutputs();
+ // This function should be called with the result of
+ // CaptureCurrentFileNumberInPendingOutputs(). It then marks that any file
+ // created between the calls CaptureCurrentFileNumberInPendingOutputs() and
+ // ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live
+ // and blocked by any other pending_outputs_ calls)
+ void ReleaseFileNumberFromPendingOutputs(
+ std::unique_ptr<std::list<uint64_t>::iterator>& v);
+
+ IOStatus SyncClosedLogs(JobContext* job_context, VersionEdit* synced_wals);
+
+ // Flush the in-memory write buffer to storage. Switches to a new
+ // log-file/memtable and writes a new descriptor iff successful. Then
+ // installs a new super version for the column family.
+ Status FlushMemTableToOutputFile(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ bool* madeProgress, JobContext* job_context,
+ SuperVersionContext* superversion_context,
+ std::vector<SequenceNumber>& snapshot_seqs,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+ Env::Priority thread_pri);
+
+ // Flush the memtables of (multiple) column families to multiple files on
+ // persistent storage.
+ Status FlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ Status AtomicFlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri);
+
+ // REQUIRES: log_numbers are sorted in ascending order
+ // corrupted_log_found is set to true if we recover from a corrupted log file.
+ Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence, bool read_only,
+ bool* corrupted_log_found,
+ RecoveryContext* recovery_ctx);
+
+ // The following two methods are used to flush a memtable to
+ // storage. The first one is used at database recovery time (when the
+ // database is opened) and is heavyweight because it holds the mutex
+ // for the entire period. The second method, WriteLevel0Table, supports
+ // flushing memtables to storage concurrently.
+ Status WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+ MemTable* mem, VersionEdit* edit);
+
+ // Get the size of a log file and, if truncate is true, truncate the
+ // log file to its actual size, thereby freeing preallocated space.
+ // Return success even if truncate fails
+ Status GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
+ LogFileNumberSize* log);
+
+ // Restore alive_log_files_ and total_log_size_ after recovery.
+ // It needs to run only when there's no flush during recovery
+ // (e.g. avoid_flush_during_recovery=true). May also trigger flush
+ // in case total_log_size > max_total_wal_size.
+ Status RestoreAliveLogFiles(const std::vector<uint64_t>& log_numbers);
+
+ // num_bytes: for slowdown case, delay time is calculated based on
+ // `num_bytes` going through.
+ Status DelayWrite(uint64_t num_bytes, const WriteOptions& write_options);
+
+ // Begin stalling of writes when memory usage increases beyond a certain
+ // threshold.
+ void WriteBufferManagerStallWrites();
+
+ Status ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+ WriteBatch* my_batch);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status ScheduleFlushes(WriteContext* context);
+
+ void MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds);
+
+ Status TrimMemtableHistory(WriteContext* context);
+
+ Status SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context);
+
+ void SelectColumnFamiliesForAtomicFlush(autovector<ColumnFamilyData*>* cfds);
+
+ // Force current memtable contents to be flushed.
+ Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options,
+ FlushReason flush_reason,
+ bool entered_write_thread = false);
+
+ Status AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const FlushOptions& options, FlushReason flush_reason,
+ bool entered_write_thread = false);
+
+ // Wait until flushing this column family won't stall writes
+ Status WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+ bool* flush_needed);
+
+ // Wait for memtable flushed.
+ // If flush_memtable_id is non-null, wait until the memtable with the ID
+ // gets flushed. Otherwise, wait until the column family doesn't have any
+ // memtable pending flush.
+ // resuming_from_bg_err indicates whether the caller is attempting to resume
+ // from background error.
+ Status WaitForFlushMemTable(ColumnFamilyData* cfd,
+ const uint64_t* flush_memtable_id = nullptr,
+ bool resuming_from_bg_err = false) {
+ return WaitForFlushMemTables({cfd}, {flush_memtable_id},
+ resuming_from_bg_err);
+ }
+ // Wait for memtables to be flushed for multiple column families.
+ Status WaitForFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const uint64_t*>& flush_memtable_ids,
+ bool resuming_from_bg_err);
+
+ inline void WaitForPendingWrites() {
+ mutex_.AssertHeld();
+ TEST_SYNC_POINT("DBImpl::WaitForPendingWrites:BeforeBlock");
+ // If pipelined write is enabled, wait for all pending memtable
+ // writers.
+ if (immutable_db_options_.enable_pipelined_write) {
+ // Memtable writers may call DB::Get in case max_successive_merges > 0,
+ // which may lock mutex. Unlocking mutex here to avoid deadlock.
+ mutex_.Unlock();
+ write_thread_.WaitForMemTableWriters();
+ mutex_.Lock();
+ }
+
+ if (!immutable_db_options_.unordered_write) {
+ // Then the writes are finished before the next write group starts
+ return;
+ }
+
+ // Wait for the ones who already wrote to the WAL to finish their
+ // memtable write.
+ if (pending_memtable_writes_.load() != 0) {
+ std::unique_lock<std::mutex> guard(switch_mutex_);
+ switch_cv_.wait(guard,
+ [&] { return pending_memtable_writes_.load() == 0; });
+ }
+ }
+
+ // TaskType is used to identify tasks in the thread-pool; currently it only
+ // differentiates manual compaction, which can be unscheduled from the
+ // thread-pool.
+ enum class TaskType : uint8_t {
+ kDefault = 0,
+ kManualCompaction = 1,
+ kCount = 2,
+ };
+
+ // A task tag is used to identify tasks in the thread-pool; it is the
+ // DBImpl object's address + type.
+ inline void* GetTaskTag(TaskType type) {
+ return GetTaskTag(static_cast<uint8_t>(type));
+ }
+
+ inline void* GetTaskTag(uint8_t type) {
+ return static_cast<uint8_t*>(static_cast<void*>(this)) + type;
+ }
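+
+ // Example: the tag used to unschedule manual compactions is this DBImpl's
+ // address offset by the task type, i.e. GetTaskTag(TaskType::kManualCompaction)
+ // yields `this + 1` as an opaque pointer value.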
+
+ // REQUIRES: mutex locked and in write thread.
+ void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status SwitchWAL(WriteContext* write_context);
+
+ // REQUIRES: mutex locked and in write thread.
+ Status HandleWriteBufferManagerFlush(WriteContext* write_context);
+
+ // REQUIRES: mutex locked
+ Status PreprocessWrite(const WriteOptions& write_options,
+ LogContext* log_context, WriteContext* write_context);
+
+ // Merge write batches in the write group into merged_batch.
+ // Returns OK if merge is successful.
+ // Returns Corruption if corruption in write batch is detected.
+ Status MergeBatch(const WriteThread::WriteGroup& write_group,
+ WriteBatch* tmp_batch, WriteBatch** merged_batch,
+ size_t* write_with_wal, WriteBatch** to_be_cached_state);
+
+ // rate_limiter_priority is used to charge `DBOptions::rate_limiter`
+ // for automatic WAL flush (`Options::manual_wal_flush` == false)
+ // associated with this WriteToWAL
+ IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
+ uint64_t* log_used, uint64_t* log_size,
+ Env::IOPriority rate_limiter_priority,
+ LogFileNumberSize& log_file_number_size);
+
+ IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
+ log::Writer* log_writer, uint64_t* log_used,
+ bool need_log_sync, bool need_log_dir_sync,
+ SequenceNumber sequence,
+ LogFileNumberSize& log_file_number_size);
+
+ IOStatus ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
+ uint64_t* log_used,
+ SequenceNumber* last_sequence, size_t seq_inc);
+
+ // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+ // Caller must hold mutex_.
+ void WriteStatusCheckOnLocked(const Status& status);
+
+ // Used by WriteImpl to update bg_error_ if paranoid check is enabled.
+ void WriteStatusCheck(const Status& status);
+
+ // Used by WriteImpl to update bg_error_ when IO error happens, e.g., write
+ // WAL, sync WAL fails, if paranoid check is enabled.
+ void IOStatusCheck(const IOStatus& status);
+
+ // Used by WriteImpl to update bg_error_ in case of memtable insert error.
+ void MemTableInsertStatusCheck(const Status& memtable_insert_status);
+
+#ifndef ROCKSDB_LITE
+ Status CompactFilesImpl(const CompactionOptions& compact_options,
+ ColumnFamilyData* cfd, Version* version,
+ const std::vector<std::string>& input_file_names,
+ std::vector<std::string>* const output_file_names,
+ const int output_level, int output_path_id,
+ JobContext* job_context, LogBuffer* log_buffer,
+ CompactionJobInfo* compaction_job_info);
+
+ // Wait for current IngestExternalFile() calls to finish.
+ // REQUIRES: mutex_ held
+ void WaitForIngestFile();
+#else
+ // IngestExternalFile is not supported in ROCKSDB_LITE so this function
+ // is a no-op
+ void WaitForIngestFile() {}
+#endif // ROCKSDB_LITE
+
+ ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name);
+
+ void MaybeScheduleFlushOrCompaction();
+
+ // A flush request specifies the column families to flush as well as the
+ // largest memtable id to persist for each column family. Once all the
+ // memtables whose IDs are smaller than or equal to this per-column-family
+ // value have been persisted, the flush request is considered to have
+ // completed its work for that column family. After completing the work for
+ // all column families in this request, the flush is considered complete.
+ using FlushRequest = std::vector<std::pair<ColumnFamilyData*, uint64_t>>;
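+
+ // Illustrative example (hypothetical IDs): a request to flush `cf1` up to
+ // memtable ID 12 and `cf2` up to memtable ID 7 would be the vector
+ // {{cf1, 12}, {cf2, 7}}, where cf1/cf2 are ColumnFamilyData*.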
+
+ void GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+ FlushRequest* req);
+
+ void SchedulePendingFlush(const FlushRequest& req, FlushReason flush_reason);
+
+ void SchedulePendingCompaction(ColumnFamilyData* cfd);
+ void SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+ FileType type, uint64_t number, int job_id);
+ static void BGWorkCompaction(void* arg);
+ // Runs a pre-chosen universal compaction involving bottom level in a
+ // separate, bottom-pri thread pool.
+ static void BGWorkBottomCompaction(void* arg);
+ static void BGWorkFlush(void* arg);
+ static void BGWorkPurge(void* arg);
+ static void UnscheduleCompactionCallback(void* arg);
+ static void UnscheduleFlushCallback(void* arg);
+ void BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri);
+ void BackgroundCallFlush(Env::Priority thread_pri);
+ void BackgroundCallPurge();
+ Status BackgroundCompaction(bool* madeProgress, JobContext* job_context,
+ LogBuffer* log_buffer,
+ PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri);
+ Status BackgroundFlush(bool* madeProgress, JobContext* job_context,
+ LogBuffer* log_buffer, FlushReason* reason,
+ Env::Priority thread_pri);
+
+ bool EnoughRoomForCompaction(ColumnFamilyData* cfd,
+ const std::vector<CompactionInputFiles>& inputs,
+ bool* sfm_bookkeeping, LogBuffer* log_buffer);
+
+ // Request compaction tasks token from compaction thread limiter.
+ // It always succeeds if force = true or the limiter is disabled.
+ bool RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+ std::unique_ptr<TaskLimiterToken>* token,
+ LogBuffer* log_buffer);
+
+ // Schedule background tasks
+ Status StartPeriodicTaskScheduler();
+
+ Status RegisterRecordSeqnoTimeWorker();
+
+ void PrintStatistics();
+
+ size_t EstimateInMemoryStatsHistorySize() const;
+
+ // Return the minimum empty level that could hold the total data in the
+ // input level. Return the input level, if such a level cannot be found.
+ int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
+ const MutableCFOptions& mutable_cf_options,
+ int level);
+
+ // Move the files in the input level to the target level.
+ // If target_level < 0, automatically calculate the minimum level that could
+ // hold the data set.
+ Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1);
+
+ // helper functions for adding and removing from flush & compaction queues
+ void AddToCompactionQueue(ColumnFamilyData* cfd);
+ ColumnFamilyData* PopFirstFromCompactionQueue();
+ FlushRequest PopFirstFromFlushQueue();
+
+ // Pick the first unthrottled compaction with task token from queue.
+ ColumnFamilyData* PickCompactionFromQueue(
+ std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer);
+
+ // helper function to call after some of the logs_ were synced
+ void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit);
+ Status ApplyWALToManifest(VersionEdit* edit);
+ // WALs with log number up to up_to are not synced successfully.
+ void MarkLogsNotSynced(uint64_t up_to);
+
+ SnapshotImpl* GetSnapshotImpl(bool is_write_conflict_boundary,
+ bool lock = true);
+
+ // If snapshot_seq != kMaxSequenceNumber, then this function can only be
+ // called from the write thread that publishes sequence numbers to readers.
+ // For 1) write-committed, or 2) write-prepared + one-write-queue, this will
+ // be the write thread performing memtable writes. For write-prepared with
+ // two write queues, this will be the write thread writing commit marker to
+ // the WAL.
+ // If snapshot_seq == kMaxSequenceNumber, this function is called by a caller
+ // ensuring no writes to the database.
+ std::pair<Status, std::shared_ptr<const SnapshotImpl>>
+ CreateTimestampedSnapshotImpl(SequenceNumber snapshot_seq, uint64_t ts,
+ bool lock = true);
+
+ uint64_t GetMaxTotalWalSize() const;
+
+ FSDirectory* GetDataDir(ColumnFamilyData* cfd, size_t path_id) const;
+
+ Status MaybeReleaseTimestampedSnapshotsAndCheck();
+
+ Status CloseHelper();
+
+ void WaitForBackgroundWork();
+
+ // Background threads call this function, which is just a wrapper around
+ // the InstallSuperVersion() function. Background threads carry
+ // sv_context which can have new_superversion already
+ // allocated.
+ // All ColumnFamily state changes go through this function. Here we analyze
+ // the new state and we schedule background work if we detect that the new
+ // state needs flush or compaction.
+ void InstallSuperVersionAndScheduleWork(
+ ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options);
+
+ bool GetIntPropertyInternal(ColumnFamilyData* cfd,
+ const DBPropertyInfo& property_info,
+ bool is_locked, uint64_t* value);
+ bool GetPropertyHandleOptionsStatistics(std::string* value);
+
+ bool HasPendingManualCompaction();
+ bool HasExclusiveManualCompaction();
+ void AddManualCompaction(ManualCompactionState* m);
+ void RemoveManualCompaction(ManualCompactionState* m);
+ bool ShouldntRunManualCompaction(ManualCompactionState* m);
+ bool HaveManualCompaction(ColumnFamilyData* cfd);
+ bool MCOverlap(ManualCompactionState* m, ManualCompactionState* m1);
+#ifndef ROCKSDB_LITE
+ void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& compaction_job_stats,
+ const int job_id, const Version* current,
+ CompactionJobInfo* compaction_job_info) const;
+ // Reserve the next 'num' file numbers for to-be-ingested external SST files,
+ // and return the current file_number in 'next_file_number'.
+ // Write a version edit to the MANIFEST.
+ Status ReserveFileNumbersBeforeIngestion(
+ ColumnFamilyData* cfd, uint64_t num,
+ std::unique_ptr<std::list<uint64_t>::iterator>& pending_output_elem,
+ uint64_t* next_file_number);
+#endif //! ROCKSDB_LITE
+
+ bool ShouldPurge(uint64_t file_number) const;
+ void MarkAsGrabbedForPurge(uint64_t file_number);
+
+ size_t GetWalPreallocateBlockSize(uint64_t write_buffer_size) const;
+ Env::WriteLifeTimeHint CalculateWALWriteHint() { return Env::WLTH_SHORT; }
+
+ IOStatus CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+ size_t preallocate_block_size, log::Writer** new_log);
+
+ // Validate self-consistency of DB options
+ static Status ValidateOptions(const DBOptions& db_options);
+ // Validate self-consistency of DB options and its consistency with cf options
+ static Status ValidateOptions(
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families);
+
+ // Utility function to do some debug validation and sort the given vector
+ // of MultiGet keys
+ void PrepareMultiGetKeys(
+ const size_t num_keys, bool sorted,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* key_ptrs);
+
+ // A structure to hold the information required to process MultiGet of keys
+ // belonging to one column family. For a multi column family MultiGet, there
+ // will be a container of these objects.
+ struct MultiGetColumnFamilyData {
+ ColumnFamilyHandle* cf;
+ ColumnFamilyData* cfd;
+
+ // For the batched MultiGet which relies on sorted keys, start specifies
+ // the index of first key belonging to this column family in the sorted
+ // list.
+ size_t start;
+
+ // For the batched MultiGet case, num_keys specifies the number of keys
+ // belonging to this column family in the sorted list
+ size_t num_keys;
+
+ // SuperVersion for the column family obtained in a manner that ensures a
+ // consistent view across all column families in the DB
+ SuperVersion* super_version;
+ MultiGetColumnFamilyData(ColumnFamilyHandle* column_family,
+ SuperVersion* sv)
+ : cf(column_family),
+ cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+ start(0),
+ num_keys(0),
+ super_version(sv) {}
+
+ MultiGetColumnFamilyData(ColumnFamilyHandle* column_family, size_t first,
+ size_t count, SuperVersion* sv)
+ : cf(column_family),
+ cfd(static_cast<ColumnFamilyHandleImpl*>(cf)->cfd()),
+ start(first),
+ num_keys(count),
+ super_version(sv) {}
+
+ MultiGetColumnFamilyData() = default;
+ };
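+ // For illustration (handles and SuperVersions below are hypothetical): a
+ // batched MultiGet over two column families whose sorted keys occupy
+ // indices [0, 4) and [4, 6) of the key list would be described by:
+ //   MultiGetColumnFamilyData(cf1, /*first=*/0, /*count=*/4, sv1)
+ //   MultiGetColumnFamilyData(cf2, /*first=*/4, /*count=*/2, sv2)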
+
+ // A common function to obtain a consistent snapshot, which can be implicit
+ // if the user doesn't specify a snapshot in read_options, across
+ // multiple column families for MultiGet. It will attempt to get an implicit
+ // snapshot without acquiring the db_mutex, but will give up after a few
+ // tries and acquire the mutex if a memtable flush happens. The template
+ // allows both the batched and non-batched MultiGet to call this with
+ // either an std::unordered_map or autovector of column families.
+ //
+ // If callback is non-null, the callback is refreshed with the snapshot
+ // sequence number
+ //
+ // A return value of true indicates that the SuperVersions were obtained
+ // from the ColumnFamilyData, whereas false indicates they are thread
+ // local
+ template <class T>
+ bool MultiCFSnapshot(
+ const ReadOptions& read_options, ReadCallback* callback,
+ std::function<MultiGetColumnFamilyData*(typename T::iterator&)>&
+ iter_deref_func,
+ T* cf_list, SequenceNumber* snapshot);
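+ // Rough usage sketch (names are illustrative): the batched MultiGet builds
+ // a container of MultiGetColumnFamilyData, passes a functor that turns a
+ // container iterator into a MultiGetColumnFamilyData*, and afterwards
+ // either unrefs the SuperVersions through the ColumnFamilyData (return
+ // value true) or returns them to thread-local storage (return value false).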
+
+ // The actual implementation of the batching MultiGet. The caller is expected
+ // to have acquired the SuperVersion and pass in a snapshot sequence number
+ // in order to construct the LookupKeys. The start_key and num_keys specify
+ // the range of keys in the sorted_keys vector for a single column family.
+ Status MultiGetImpl(
+ const ReadOptions& read_options, size_t start_key, size_t num_keys,
+ autovector<KeyContext*, MultiGetContext::MAX_BATCH_SIZE>* sorted_keys,
+ SuperVersion* sv, SequenceNumber snap_seqnum, ReadCallback* callback);
+
+ Status DisableFileDeletionsWithLock();
+
+ Status IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
+ std::string ts_low);
+
+ bool ShouldReferenceSuperVersion(const MergeContext& merge_context);
+
+ // Lock over the persistent DB state. Non-nullptr iff successfully acquired.
+ FileLock* db_lock_;
+
+ // In addition to mutex_, stats_history_mutex_ protects writes to stats_history_
+ InstrumentedMutex stats_history_mutex_;
+ // In addition to mutex_, log_write_mutex_ protects writes to logs_ and
+ // logfile_number_. With two_write_queues it also protects alive_log_files_
+ // and log_empty_. Refer to the definition of each variable below for more
+ // details.
+ // Note: to avoid deadlock, if both log_write_mutex_ and mutex_ need to be
+ // acquired, the order should be first mutex_ and then log_write_mutex_.
+ InstrumentedMutex log_write_mutex_;
+
+ // If zero, manual compactions are allowed to proceed. If non-zero, manual
+ // compactions may still be running, but will quickly fail with
+ // `Status::Incomplete`. The value indicates how many threads have paused
+ // manual compactions. It is accessed in read mode outside the DB mutex in
+ // compaction code paths.
+ std::atomic<int> manual_compaction_paused_;
+
+ // This condition variable is signaled on these conditions:
+ // * whenever bg_compaction_scheduled_ goes down to 0
+ // * if AnyManualCompaction, whenever a compaction finishes, even if it hasn't
+ // made any progress
+ // * whenever a compaction made any progress
+ // * whenever bg_flush_scheduled_ or bg_purge_scheduled_ value decreases
+ // (i.e. whenever a flush is done, even if it didn't make any progress)
+ // * whenever there is an error in background purge, flush or compaction
+ // * whenever num_running_ingest_file_ goes to 0.
+ // * whenever pending_purge_obsolete_files_ goes to 0.
+ // * whenever disable_delete_obsolete_files_ goes to 0.
+ // * whenever SetOptions successfully updates options.
+ // * whenever a column family is dropped.
+ InstrumentedCondVar bg_cv_;
+ // Writes are protected by locking both mutex_ and log_write_mutex_, and reads
+ // must be under either mutex_ or log_write_mutex_. Since after ::Open,
+ // logfile_number_ is currently updated only in write_thread_, it can be read
+ // from the same write_thread_ without any locks.
+ uint64_t logfile_number_;
+ // Log files that we can recycle. Must be protected by db mutex_.
+ std::deque<uint64_t> log_recycle_files_;
+ // Protected by log_write_mutex_.
+ bool log_dir_synced_;
+ // Without two_write_queues, reads and writes to log_empty_ are protected by
+ // mutex_. Since it is currently updated/read only in write_thread_, it can be
+ // accessed from the same write_thread_ without any locks. With
+ // two_write_queues, where it can be updated by different threads, reads and
+ // writes are protected by log_write_mutex_ instead. This is to avoid an
+ // expensive mutex_ lock during WAL writes, which update log_empty_.
+ bool log_empty_;
+
+ ColumnFamilyHandleImpl* persist_stats_cf_handle_;
+
+ bool persistent_stats_cfd_exists_ = true;
+
+ // alive_log_files_ is protected by mutex_ and log_write_mutex_ with details
+ // as follows:
+ // 1. read by FindObsoleteFiles() which can be called in either application
+ // thread or RocksDB bg threads, both mutex_ and log_write_mutex_ are
+ // held.
+ // 2. pop_front() by FindObsoleteFiles(), both mutex_ and log_write_mutex_
+ // are held.
+ // 3. push_back() by DBImpl::Open() and DBImpl::RestoreAliveLogFiles()
+ // (actually called by Open()), only mutex_ is held because at this point,
+ // the DB::Open() call has not returned success to application, and the
+ // only other thread(s) that can conflict are bg threads calling
+ // FindObsoleteFiles() which ensure that both mutex_ and log_write_mutex_
+ // are held when accessing alive_log_files_.
+ // 4. read by DBImpl::Open() is protected by mutex_.
+ // 5. push_back() by SwitchMemtable(). Both mutex_ and log_write_mutex_ are
+ // held. This is done by the write group leader. Note that in the case of
+ // two-write-queues, another WAL-only write thread can be writing to the
+ // WAL concurrently. See 9.
+ // 6. read by SwitchWAL() with both mutex_ and log_write_mutex_ held. This is
+ // done by write group leader.
+ // 7. read by ConcurrentWriteToWAL() by the write group leader in the case of
+ // two-write-queues. Only log_write_mutex_ is held to protect concurrent
+ // pop_front() by FindObsoleteFiles().
+ // 8. read by PreprocessWrite() by the write group leader. log_write_mutex_
+ // is held to protect the data structure from concurrent pop_front() by
+ // FindObsoleteFiles().
+ // 9. read by ConcurrentWriteToWAL() by a WAL-only write thread in the case
+ // of two-write-queues. Only log_write_mutex_ is held. This suffices to
+ // protect the data structure from concurrent push_back() by current
+ // write group leader as well as pop_front() by FindObsoleteFiles().
+ std::deque<LogFileNumberSize> alive_log_files_;
+
+ // Log files that aren't fully synced, and the current log file.
+ // Synchronization:
+ // 1. read by FindObsoleteFiles() which can be called either in application
+ // thread or RocksDB bg threads. log_write_mutex_ is always held, while
+ // some reads are performed without mutex_.
+ // 2. pop_front() by FindObsoleteFiles() with only log_write_mutex_ held.
+ // 3. read by DBImpl::Open() with both mutex_ and log_write_mutex_.
+ // 4. emplace_back() by DBImpl::Open() with both mutex_ and log_write_mutex_.
+ // Note that at this point, DB::Open() has not returned success to
+ // application, thus the only other thread(s) that can conflict are bg
+ // threads calling FindObsoleteFiles(). See 1.
+ // 5. iteration and clear() from CloseHelper() always hold log_write_mutex_
+ // and mutex_.
+ // 6. back() called by APIs FlushWAL() and LockWAL() are protected by only
+ // log_write_mutex_. These two can be called by application threads after
+ // DB::Open() returns success to applications.
+ // 7. read by SyncWAL(), another API, protected by only log_write_mutex_.
+ // 8. read by MarkLogsNotSynced() and MarkLogsSynced() are protected by
+ // log_write_mutex_.
+ // 9. erase() by MarkLogsSynced() protected by log_write_mutex_.
+ // 10. read by SyncClosedLogs() protected by only log_write_mutex_. This can
+ // happen in bg flush threads after DB::Open() returns success to
+ // applications.
+ // 11. reads, e.g. front(), iteration, and back() called by PreprocessWrite()
+ // holds only the log_write_mutex_. This is done by the write group
+ // leader. A bg thread calling FindObsoleteFiles() or MarkLogsSynced()
+ // can happen concurrently. This is fine because log_write_mutex_ is used
+ // by all parties. See 2, 5, 9.
+ // 12. reads, empty(), back() called by SwitchMemtable() hold both mutex_ and
+ // log_write_mutex_. This happens in the write group leader.
+ // 13. emplace_back() by SwitchMemtable() hold both mutex_ and
+ // log_write_mutex_. This happens in the write group leader. Can conflict
+ // with bg threads calling FindObsoleteFiles(), MarkLogsSynced(),
+ // SyncClosedLogs(), etc. as well as application threads calling
+ // FlushWAL(), SyncWAL(), LockWAL(). This is fine because all parties
+ // require at least log_write_mutex_.
+ // 14. iteration called in WriteToWAL(write_group) protected by
+ // log_write_mutex_. This is done by write group leader when
+ // two-write-queues is disabled and write needs to sync logs.
+ // 15. back() called in ConcurrentWriteToWAL() protected by log_write_mutex_.
+ // This can be done by the write group leader if two-write-queues is
+ // enabled. It can also be done by another WAL-only write thread.
+ //
+ // Other observations:
+ // - back() and items with getting_synced=true are not popped,
+ // - The same thread that sets getting_synced=true will reset it.
+ // - it follows that the object referred to by back() can be safely read from
+ // the write_thread_ without using mutex. Note that calling back() without
+ // mutex may be unsafe because different implementations of deque::back() may
+ // access other member variables of deque, causing undefined behaviors.
+ // Generally, do not access stl containers without proper synchronization.
+ // - it follows that the items with getting_synced=true can be safely read
+ // from the same thread that has set getting_synced=true
+ std::deque<LogWriterNumber> logs_;
+
+ // Signaled when getting_synced becomes false for some of the logs_.
+ InstrumentedCondVar log_sync_cv_;
+ // This is the app-level state that is written to the WAL but will be used
+ // only during recovery. Using this feature avoids writing the state to the
+ // memtable on normal writes and hence improves throughput. Each new write of
+ // the state will replace the previous state entirely even if the keys in the
+ // two consecutive states do not overlap.
+ // It is protected by log_write_mutex_ when two_write_queues_ is enabled.
+ // Otherwise only the head of write_thread_ can access it.
+ WriteBatch cached_recoverable_state_;
+ std::atomic<bool> cached_recoverable_state_empty_ = {true};
+ std::atomic<uint64_t> total_log_size_;
+
+ // If this is non-empty, we need to delete these log files in background
+ // threads. Protected by log_write_mutex_.
+ autovector<log::Writer*> logs_to_free_;
+
+ bool is_snapshot_supported_;
+
+ std::map<uint64_t, std::map<std::string, uint64_t>> stats_history_;
+
+ std::map<std::string, uint64_t> stats_slice_;
+
+ bool stats_slice_initialized_ = false;
+
+ Directories directories_;
+
+ WriteBufferManager* write_buffer_manager_;
+
+ WriteThread write_thread_;
+ WriteBatch tmp_batch_;
+ // The write thread for writers that have no memtable write. This will be used
+ // in 2PC to batch the prepares separately from the serial commit.
+ WriteThread nonmem_write_thread_;
+
+ WriteController write_controller_;
+
+ // Size of the last batch group. In slowdown mode, next write needs to
+ // sleep if it uses up the quota.
+ // Note: This is to protect memtable and compaction. If the batch only writes
+ // to the WAL, its size need not be included in this.
+ uint64_t last_batch_group_size_;
+
+ FlushScheduler flush_scheduler_;
+
+ TrimHistoryScheduler trim_history_scheduler_;
+
+ SnapshotList snapshots_;
+
+ TimestampedSnapshotList timestamped_snapshots_;
+
+ // For each background job, pending_outputs_ keeps the current file number at
+ // the time that background job started.
+ // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has
+ // number bigger than any of the file number in pending_outputs_. Since file
+ // numbers grow monotonically, this also means that pending_outputs_ is always
+ // sorted. After a background job is done executing, its file number is
+ // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean
+ // it up.
+ // State is protected with db mutex.
+ std::list<uint64_t> pending_outputs_;
+
+ // flush_queue_ and compaction_queue_ hold column families that we need to
+ // flush and compact, respectively.
+ // A column family is inserted into flush_queue_ when it satisfies the
+ // condition cfd->imm()->IsFlushPending().
+ // A column family is inserted into compaction_queue_ when it satisfies the
+ // condition cfd->NeedsCompaction().
+ // Column families in this list are all Ref()-erenced
+ // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will
+ // do RAII on ColumnFamilyData
+ // Column families are in this queue when they need to be flushed or
+ // compacted. Consumers of these queues are flush and compaction threads. When
+ // column family is put on this queue, we increase unscheduled_flushes_ and
+ // unscheduled_compactions_. When these variables are bigger than zero, that
+ // means we need to schedule background threads for flush and compaction.
+ // Once the background threads are scheduled, we decrease unscheduled_flushes_
+ // and unscheduled_compactions_. That way we keep track of number of
+ // compaction and flush threads we need to schedule. This scheduling is done
+ // in MaybeScheduleFlushOrCompaction()
+ // invariant(column family present in flush_queue_ <==>
+ // ColumnFamilyData::pending_flush_ == true)
+ std::deque<FlushRequest> flush_queue_;
+ // invariant(column family present in compaction_queue_ <==>
+ // ColumnFamilyData::pending_compaction_ == true)
+ std::deque<ColumnFamilyData*> compaction_queue_;
+
+ // A map to store file numbers and filenames of the files to be purged
+ std::unordered_map<uint64_t, PurgeFileInfo> purge_files_;
+
+ // A set to store the file numbers that have been assigned to certain
+ // JobContext. Current implementation tracks table and blob files only.
+ std::unordered_set<uint64_t> files_grabbed_for_purge_;
+
+ // A queue to store log writers to close. Protected by db mutex_.
+ std::deque<log::Writer*> logs_to_free_queue_;
+
+ std::deque<SuperVersion*> superversions_to_free_queue_;
+
+ int unscheduled_flushes_;
+
+ int unscheduled_compactions_;
+
+ // count how many background compactions are running or have been scheduled in
+ // the BOTTOM pool
+ int bg_bottom_compaction_scheduled_;
+
+ // count how many background compactions are running or have been scheduled
+ int bg_compaction_scheduled_;
+
+ // stores the number of compactions that are currently running
+ int num_running_compactions_;
+
+ // number of background memtable flush jobs, submitted to the HIGH pool
+ int bg_flush_scheduled_;
+
+ // stores the number of flushes that are currently running
+ int num_running_flushes_;
+
+ // number of background obsolete file purge jobs, submitted to the HIGH pool
+ int bg_purge_scheduled_;
+
+ std::deque<ManualCompactionState*> manual_compaction_dequeue_;
+
+ // Controls whether deletion of obsolete files is disabled:
+ // if 0, deletion is enabled;
+ // if non-zero, files will not be deleted.
+ // This enables two different threads to call
+ // EnableFileDeletions() and DisableFileDeletions()
+ // without any synchronization.
+ int disable_delete_obsolete_files_;
+
+ // Number of times FindObsoleteFiles has found deletable files and the
+ // corresponding call to PurgeObsoleteFiles has not yet finished.
+ int pending_purge_obsolete_files_;
+
+ // last time when DeleteObsoleteFiles with full scan was executed. Originally
+ // initialized with startup time.
+ uint64_t delete_obsolete_files_last_run_;
+
+ // last time stats were dumped to LOG
+ std::atomic<uint64_t> last_stats_dump_time_microsec_;
+
+ // The thread that wants to switch memtable can wait on this cv until the
+ // pending writes to the memtable finish.
+ std::condition_variable switch_cv_;
+ // The mutex used by switch_cv_. mutex_ should be acquired beforehand.
+ std::mutex switch_mutex_;
+ // Number of threads intending to write to memtable
+ std::atomic<size_t> pending_memtable_writes_ = {};
+
+ // A flag indicating whether the current rocksdb database has any
+ // data that is not yet persisted into either WAL or SST file.
+ // Used when disableWAL is true.
+ std::atomic<bool> has_unpersisted_data_;
+
+ // Set when an attempt was made to flush all column families that
+ // the oldest log depends on, but uncommitted data in the oldest
+ // log prevents the log from being released.
+ // We must attempt to free the dependent memtables again
+ // at a later time after the transaction in the oldest
+ // log is fully committed.
+
+ // Number of running IngestExternalFile() or CreateColumnFamilyWithImport()
+ // calls.
+ // REQUIRES: mutex held
+ int num_running_ingest_file_;
+
+#ifndef ROCKSDB_LITE
+ WalManager wal_manager_;
+#endif // ROCKSDB_LITE
+
+ // A value of > 0 temporarily disables scheduling of background work
+ int bg_work_paused_;
+
+ // A value of > 0 temporarily disables scheduling of background compaction
+ int bg_compaction_paused_;
+
+ // Guard against multiple concurrent refitting
+ bool refitting_level_;
+
+ // Indicate DB was opened successfully
+ bool opened_successfully_;
+
+ // The min threshold to trigger bottommost compaction for removing
+ // garbage, among all column families.
+ SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+ LogsWithPrepTracker logs_with_prep_tracker_;
+
+ // Callback for compaction to check if a key is visible to a snapshot.
+ // REQUIRES: mutex held
+ std::unique_ptr<SnapshotChecker> snapshot_checker_;
+
+ // Callback for when the cached_recoverable_state_ is written to memtable
+ // Only to be set during initialization
+ std::unique_ptr<PreReleaseCallback> recoverable_state_pre_release_callback_;
+
+#ifndef ROCKSDB_LITE
+ // Scheduler to run DumpStats(), PersistStats(), and FlushInfoLog().
+ // Currently, internally it has a global timer instance for running the tasks.
+ PeriodicTaskScheduler periodic_task_scheduler_;
+
+ // It contains the implementations for each periodic task.
+ std::map<PeriodicTaskType, const PeriodicTaskFunc> periodic_task_functions_;
+#endif
+
+ // When set, we use a separate queue for writes that don't write to memtable.
+ // In 2PC these are the writes at Prepare phase.
+ const bool two_write_queues_;
+ const bool manual_wal_flush_;
+
+ // If true, LastSequence also indicates the last published sequence visible
+ // to the readers. Otherwise LastPublishedSequence should be used.
+ const bool last_seq_same_as_publish_seq_;
+ // It indicates that a customized gc algorithm must be used for
+ // flush/compaction and if it is not provided via SnapshotChecker, we should
+ // disable gc to be safe.
+ const bool use_custom_gc_;
+ // Flag to indicate that the DB instance shutdown has been initiated. This is
+ // different from the shutting_down_ atomic in that it is set at the beginning
+ // of shutdown sequence, specifically in order to prevent any background
+ // error recovery from going on in parallel. The latter, shutting_down_,
+ // is set a little later during the shutdown after scheduling memtable
+ // flushes
+ std::atomic<bool> shutdown_initiated_;
+ // Flag to indicate whether sst_file_manager object was allocated in
+ // DB::Open() or passed to us
+ bool own_sfm_;
+
+ // Flag to check whether Close() has been called on this DB
+ bool closed_;
+ // save the closing status, for re-calling the close()
+ Status closing_status_;
+ // mutex for DB::Close()
+ InstrumentedMutex closing_mutex_;
+
+ // Conditional variable to coordinate installation of atomic flush results.
+ // With atomic flush, each bg thread installs the result of flushing multiple
+ // column families, and different threads can flush different column
+ // families. It's difficult to rely on one thread to perform batch
+ // installation for all threads. This is different from the non-atomic flush
+ // case.
+ // atomic_flush_install_cv_ makes sure that threads install atomic flush
+ // results sequentially. Flush results of memtables with lower IDs get
+ // installed to MANIFEST first.
+ InstrumentedCondVar atomic_flush_install_cv_;
+
+ bool wal_in_db_path_;
+ std::atomic<uint64_t> max_total_wal_size_;
+
+ BlobFileCompletionCallback blob_callback_;
+
+ // Pointer to WriteBufferManager stalling interface.
+ std::unique_ptr<StallInterface> wbm_stall_;
+
+ // seqno_time_mapping_ stores the sequence number to time mapping; it is not
+ // thread safe, so both reads and writes need the db mutex held.
+ SeqnoToTimeMapping seqno_time_mapping_;
+};
+
+class GetWithTimestampReadCallback : public ReadCallback {
+ public:
+ explicit GetWithTimestampReadCallback(SequenceNumber seq)
+ : ReadCallback(seq) {}
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return seq <= max_visible_seq_;
+ }
+};
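+// Usage sketch (snap_seq is a hypothetical sequence number): visibility of a
+// read can be bounded by constructing the callback and passing its address
+// wherever a ReadCallback* is accepted:
+//   GetWithTimestampReadCallback read_cb(snap_seq);
+//   // ... pass &read_cb to the internal lookup path as the ReadCallback*.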
+
+extern Options SanitizeOptions(const std::string& db, const Options& src,
+ bool read_only = false,
+ Status* logger_creation_s = nullptr);
+
+extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
+ bool read_only = false,
+ Status* logger_creation_s = nullptr);
+
+extern CompressionType GetCompressionFlush(
+ const ImmutableCFOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+
+// Return the earliest log file to keep after the memtable flush is
+// finalized.
+// `cfd_to_flush` is the column family whose memtable (specified in
+// `memtables_to_flush`) will be flushed and thus will not depend on any WAL
+// file.
+// The function is only applicable to 2pc mode.
+extern uint64_t PrecomputeMinLogNumberToKeep2PC(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ const autovector<VersionEdit*>& edit_list,
+ const autovector<MemTable*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker);
+// For atomic flush.
+extern uint64_t PrecomputeMinLogNumberToKeep2PC(
+ VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ const autovector<const autovector<MemTable*>*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker);
+
+// In non-2PC mode, WALs with log number < the returned number can be
+// deleted after the cfd_to_flush column family is flushed successfully.
+extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ const autovector<VersionEdit*>& edit_list);
+// For atomic flush.
+extern uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+ VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+ const autovector<autovector<VersionEdit*>>& edit_lists);
+
+// `cfd_to_flush` is the column family whose memtable will be flushed and thus
+// will not depend on any WAL file. nullptr means no memtable is being flushed.
+// The function is only applicable to 2pc mode.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset, const autovector<MemTable*>& memtables_to_flush);
+// For atomic flush.
+extern uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset,
+ const autovector<const autovector<MemTable*>*>& memtables_to_flush);
+
+// Fix user-supplied options to be reasonable
+template <class T, class V>
+static void ClipToRange(T* ptr, V minvalue, V maxvalue) {
+ if (static_cast<V>(*ptr) > maxvalue) *ptr = maxvalue;
+ if (static_cast<V>(*ptr) < minvalue) *ptr = minvalue;
+}
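+// For example (bounds are hypothetical), a sanitizer could clamp a
+// user-supplied option into a supported range:
+//   int max_open_files = user_options.max_open_files;
+//   ClipToRange(&max_open_files, 20, 1000000);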
+
+inline Status DBImpl::FailIfCfHasTs(
+ const ColumnFamilyHandle* column_family) const {
+ column_family = column_family ? column_family : DefaultColumnFamily();
+ assert(column_family);
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ if (ucmp->timestamp_size() > 0) {
+ std::ostringstream oss;
+ oss << "cannot call this method on column family "
+ << column_family->GetName() << " that enables timestamp";
+ return Status::InvalidArgument(oss.str());
+ }
+ return Status::OK();
+}
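+// Typical call pattern (sketch): DB APIs that do not take a timestamp
+// validate the column family up front and bail out early:
+//   Status s = FailIfCfHasTs(column_family);
+//   if (!s.ok()) {
+//     return s;
+//   }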
+
+inline Status DBImpl::FailIfTsMismatchCf(ColumnFamilyHandle* column_family,
+ const Slice& ts,
+ bool ts_for_read) const {
+ if (!column_family) {
+ return Status::InvalidArgument("column family handle cannot be null");
+ }
+ assert(column_family);
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ if (0 == ucmp->timestamp_size()) {
+ std::stringstream oss;
+ oss << "cannot call this method on column family "
+ << column_family->GetName() << " that does not enable timestamp";
+ return Status::InvalidArgument(oss.str());
+ }
+ const size_t ts_sz = ts.size();
+ if (ts_sz != ucmp->timestamp_size()) {
+ std::stringstream oss;
+ oss << "Timestamp sizes mismatch: expect " << ucmp->timestamp_size() << ", "
+ << ts_sz << " given";
+ return Status::InvalidArgument(oss.str());
+ }
+ if (ts_for_read) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ std::string current_ts_low = cfd->GetFullHistoryTsLow();
+ if (!current_ts_low.empty() &&
+ ucmp->CompareTimestamp(ts, current_ts_low) < 0) {
+ std::stringstream oss;
+ oss << "Read timestamp: " << ts.ToString(true)
+ << " is smaller than full_history_ts_low: "
+ << Slice(current_ts_low).ToString(true) << std::endl;
+ return Status::InvalidArgument(oss.str());
+ }
+ }
+ return Status::OK();
+}
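+// Read-path counterpart (sketch; read_options is a hypothetical ReadOptions
+// with a non-null timestamp): APIs that accept a user timestamp validate it
+// against the column family comparator first:
+//   Status s = FailIfTsMismatchCf(column_family, *read_options.timestamp,
+//                                 /*ts_for_read=*/true);
+//   if (!s.ok()) {
+//     return s;
+//   }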
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
new file mode 100644
index 000000000..a605fac87
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_compaction_flush.cc
@@ -0,0 +1,3857 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+#include <deque>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_updater.h"
+#include "monitoring/thread_status_util.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/concurrent_task_limiter_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool DBImpl::EnoughRoomForCompaction(
+ ColumnFamilyData* cfd, const std::vector<CompactionInputFiles>& inputs,
+ bool* sfm_reserved_compact_space, LogBuffer* log_buffer) {
+ // Check if we have enough room to do the compaction
+ bool enough_room = true;
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm) {
+ // Pass the current bg_error_ to SFM so it can decide what checks to
+ // perform. If this DB instance hasn't seen any error yet, the SFM can be
+ // optimistic and not do disk space checks
+ Status bg_error = error_handler_.GetBGError();
+ enough_room = sfm->EnoughRoomForCompaction(cfd, inputs, bg_error);
+ bg_error.PermitUncheckedError(); // bg_error is just a copy of the Status
+ // from the error_handler_
+ if (enough_room) {
+ *sfm_reserved_compact_space = true;
+ }
+ }
+#else
+ (void)cfd;
+ (void)inputs;
+ (void)sfm_reserved_compact_space;
+#endif // ROCKSDB_LITE
+ if (!enough_room) {
+ // Just in case tests want to change the value of enough_room
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", &enough_room);
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Cancelled compaction because not enough room");
+ RecordTick(stats_, COMPACTION_CANCELLED, 1);
+ }
+ return enough_room;
+}
+
+bool DBImpl::RequestCompactionToken(ColumnFamilyData* cfd, bool force,
+ std::unique_ptr<TaskLimiterToken>* token,
+ LogBuffer* log_buffer) {
+ assert(*token == nullptr);
+ auto limiter = static_cast<ConcurrentTaskLimiterImpl*>(
+ cfd->ioptions()->compaction_thread_limiter.get());
+ if (limiter == nullptr) {
+ return true;
+ }
+ *token = limiter->GetToken(force);
+ if (*token != nullptr) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Thread limiter [%s] increase [%s] compaction task, "
+ "force: %s, tasks after: %d",
+ limiter->GetName().c_str(), cfd->GetName().c_str(),
+ force ? "true" : "false", limiter->GetOutstandingTask());
+ return true;
+ }
+ return false;
+}
+
+IOStatus DBImpl::SyncClosedLogs(JobContext* job_context,
+ VersionEdit* synced_wals) {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Start");
+ InstrumentedMutexLock l(&log_write_mutex_);
+ autovector<log::Writer*, 1> logs_to_sync;
+ uint64_t current_log_number = logfile_number_;
+ while (logs_.front().number < current_log_number &&
+ logs_.front().IsSyncing()) {
+ log_sync_cv_.Wait();
+ }
+ for (auto it = logs_.begin();
+ it != logs_.end() && it->number < current_log_number; ++it) {
+ auto& log = *it;
+ log.PrepareForSync();
+ logs_to_sync.push_back(log.writer);
+ }
+
+ IOStatus io_s;
+ if (!logs_to_sync.empty()) {
+ log_write_mutex_.Unlock();
+
+ assert(job_context);
+
+ for (log::Writer* log : logs_to_sync) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Syncing log #%" PRIu64, job_context->job_id,
+ log->get_log_number());
+ if (error_handler_.IsRecoveryInProgress()) {
+ log->file()->reset_seen_error();
+ }
+ io_s = log->file()->Sync(immutable_db_options_.use_fsync);
+ if (!io_s.ok()) {
+ break;
+ }
+
+ if (immutable_db_options_.recycle_log_file_num > 0) {
+ if (error_handler_.IsRecoveryInProgress()) {
+ log->file()->reset_seen_error();
+ }
+ io_s = log->Close();
+ if (!io_s.ok()) {
+ break;
+ }
+ }
+ }
+ if (io_s.ok()) {
+ io_s = directories_.GetWalDir()->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+
+ TEST_SYNC_POINT_CALLBACK("DBImpl::SyncClosedLogs:BeforeReLock",
+ /*arg=*/nullptr);
+ log_write_mutex_.Lock();
+
+ // "number <= current_log_number - 1" is equivalent to
+ // "number < current_log_number".
+ if (io_s.ok()) {
+ MarkLogsSynced(current_log_number - 1, true, synced_wals);
+ } else {
+ MarkLogsNotSynced(current_log_number - 1);
+ }
+ if (!io_s.ok()) {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Failed");
+ return io_s;
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:end");
+ return io_s;
+}
+
+Status DBImpl::FlushMemTableToOutputFile(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ bool* made_progress, JobContext* job_context,
+ SuperVersionContext* superversion_context,
+ std::vector<SequenceNumber>& snapshot_seqs,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, LogBuffer* log_buffer,
+ Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+ assert(cfd);
+ assert(cfd->imm());
+ assert(cfd->imm()->NumNotFlushed() != 0);
+ assert(cfd->imm()->IsFlushPending());
+ assert(versions_);
+ assert(versions_->GetColumnFamilySet());
+ // If there is more than one column family, we need to make sure that
+ // all the log files except the most recent one are synced. Otherwise if
+ // the host crashes after flushing and before WAL is persistent, the
+ // flushed SST may contain data from write batches whose updates to
+ // other (unflushed) column families are missing.
+ const bool needs_to_sync_closed_wals =
+ logfile_number_ > 0 &&
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1;
+
+ // If needs_to_sync_closed_wals is true, we need to record the current
+ // maximum memtable ID of this column family so that a later PickMemtables()
+ // call will not pick memtables whose IDs are higher. This is due to the fact
+ // that SyncClosedLogs() may release the db mutex, and memtable switch can
+ // happen for this column family in the meantime. The newly created memtables
+ // have their data backed by unsynced WALs, thus they cannot be included in
+ // this flush job.
+ // Another reason why we must record the current maximum memtable ID of this
+ // column family: SyncClosedLogs() may release db mutex, thus it's possible
+ // for application to continue to insert into memtables increasing db's
+ // sequence number. The application may take a snapshot, but this snapshot is
+ // not included in `snapshot_seqs` which will be passed to flush job because
+ // `snapshot_seqs` has already been computed before this function starts.
+ // Recording the max memtable ID ensures that the flush job does not flush
+ // a memtable without knowing such snapshot(s).
+ uint64_t max_memtable_id = needs_to_sync_closed_wals
+ ? cfd->imm()->GetLatestMemTableID()
+ : std::numeric_limits<uint64_t>::max();
+
+ // If needs_to_sync_closed_wals is false, then the flush job will pick ALL
+ // existing memtables of the column family when PickMemTable() is called
+ // later. Although we won't call SyncClosedLogs() in this case, we may still
+ // call the callbacks of the listeners, i.e. NotifyOnFlushBegin() which also
+ // releases and re-acquires the db mutex. In the meantime, the application
+ // can still insert into the memtables and increase the db's sequence number.
+ // The application can take a snapshot, hoping that the latest visible state
+ // to this snapshot is preserved. This is hard to guarantee since the db
+ // mutex is not held. This newly-created snapshot is not in `snapshot_seqs`
+ // and the flush job is unaware of its presence. Consequently, the flush job
+ // may drop certain keys when generating the L0, causing incorrect data to be
+ // returned for snapshot read using this snapshot.
+ // To address this, we make sure NotifyOnFlushBegin() executes after memtable
+ // picking so that no new snapshot can be taken between the two functions.
+
+ FlushJob flush_job(
+ dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id,
+ file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_,
+ snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+ job_context, log_buffer, directories_.GetDbDir(), GetDataDir(cfd, 0U),
+ GetCompressionFlush(*cfd->ioptions(), mutable_cf_options), stats_,
+ &event_logger_, mutable_cf_options.report_bg_io_stats,
+ true /* sync_output_directory */, true /* write_manifest */, thread_pri,
+ io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
+ cfd->GetFullHistoryTsLow(), &blob_callback_);
+ FileMetaData file_meta;
+
+ Status s;
+ bool need_cancel = false;
+ IOStatus log_io_s = IOStatus::OK();
+ if (needs_to_sync_closed_wals) {
+ // SyncClosedLogs() may unlock and re-lock the log_write_mutex multiple
+ // times.
+ VersionEdit synced_wals;
+ mutex_.Unlock();
+ log_io_s = SyncClosedLogs(job_context, &synced_wals);
+ mutex_.Lock();
+ if (log_io_s.ok() && synced_wals.IsWalAddition()) {
+ log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
+ TEST_SYNC_POINT_CALLBACK("DBImpl::FlushMemTableToOutputFile:CommitWal:1",
+ nullptr);
+ }
+
+ if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
+ !log_io_s.IsColumnFamilyDropped()) {
+ error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
+ }
+ } else {
+ TEST_SYNC_POINT("DBImpl::SyncClosedLogs:Skip");
+ }
+ s = log_io_s;
+
+ // If the log sync failed, we do not need to pick the memtable. Otherwise,
+ // num_flush_not_started_ needs to be rolled back.
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:BeforePickMemtables");
+ if (s.ok()) {
+ flush_job.PickMemTable();
+ need_cancel = true;
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", &flush_job);
+
+#ifndef ROCKSDB_LITE
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id);
+#endif // ROCKSDB_LITE
+
+ bool switched_to_mempurge = false;
+ // Within flush_job.Run, rocksdb may call event listener to notify
+ // file creation and deletion.
+ //
+ // Note that flush_job.Run will unlock and lock the db_mutex,
+ // and EventListener callback will be called when the db_mutex
+ // is unlocked by the current thread.
+ if (s.ok()) {
+ s = flush_job.Run(&logs_with_prep_tracker_, &file_meta,
+ &switched_to_mempurge);
+ need_cancel = false;
+ }
+
+ if (!s.ok() && need_cancel) {
+ flush_job.Cancel();
+ }
+
+ if (s.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd, superversion_context,
+ mutable_cf_options);
+ if (made_progress) {
+ *made_progress = true;
+ }
+
+ const std::string& column_family_name = cfd->GetName();
+
+ Version* const current = cfd->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+ column_family_name.c_str(),
+ storage_info->LevelSummary(&tmp));
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ assert(blob_files.back());
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+ column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+ blob_files.back()->GetBlobFileNumber());
+ }
+ }
+
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped()) {
+ if (log_io_s.ok()) {
+ // Error while writing to MANIFEST.
+ // In fact, versions_->io_status() can also be the result of renaming
+ // CURRENT file. With current code, it's just difficult to tell. So just
+ // be pessimistic and try writing to a new MANIFEST.
+ // TODO: distinguish between MANIFEST write and CURRENT renaming
+ if (!versions_->io_status().ok()) {
+ // If WAL sync is successful (either WAL size is 0 or there is no IO
+ // error), all the MANIFEST write errors will be mapped to soft errors.
+ // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor is
+ // needed.
+ error_handler_.SetBGError(s,
+ BackgroundErrorReason::kManifestWriteNoWAL);
+ } else {
+ // If WAL sync is successful (either WAL size is 0 or there is no IO
+ // error), all the other SST file write errors will be set as
+ // kFlushNoWAL.
+ error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL);
+ }
+ } else {
+ assert(s == log_io_s);
+ Status new_bg_error = s;
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ }
+ // If flush ran smoothly and no mempurge happened
+ // install new SST file path.
+ if (s.ok() && (!switched_to_mempurge)) {
+#ifndef ROCKSDB_LITE
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushCompleted(cfd, mutable_cf_options,
+ flush_job.GetCommittedFlushJobsInfo());
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm) {
+ // Notify sst_file_manager that a new file was added
+ std::string file_path = MakeTableFileName(
+ cfd->ioptions()->cf_paths[0].path, file_meta.fd.GetNumber());
+ // TODO (PR7798). We should only add the file to the FileManager if it
+ // exists. Otherwise, some tests may fail. Ignore the error in the
+ // interim.
+ sfm->OnAddFile(file_path).PermitUncheckedError();
+ if (sfm->IsMaxAllowedSpaceReached()) {
+ Status new_bg_error =
+ Status::SpaceLimit("Max allowed space was reached");
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+ &new_bg_error);
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ }
+#endif // ROCKSDB_LITE
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTableToOutputFile:Finish");
+ return s;
+}
+
+Status DBImpl::FlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+ if (immutable_db_options_.atomic_flush) {
+ return AtomicFlushMemTablesToOutputFiles(
+ bg_flush_args, made_progress, job_context, log_buffer, thread_pri);
+ }
+ assert(bg_flush_args.size() == 1);
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+ const auto& bg_flush_arg = bg_flush_args[0];
+ ColumnFamilyData* cfd = bg_flush_arg.cfd_;
+ // intentional infrequent copy for each flush
+ MutableCFOptions mutable_cf_options_copy = *cfd->GetLatestMutableCFOptions();
+ SuperVersionContext* superversion_context =
+ bg_flush_arg.superversion_context_;
+ Status s = FlushMemTableToOutputFile(
+ cfd, mutable_cf_options_copy, made_progress, job_context,
+ superversion_context, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, log_buffer, thread_pri);
+ return s;
+}
+
+/*
+ * Atomically flushes multiple column families.
+ *
+ * For each column family, all memtables with ID smaller than or equal to the
+ * ID specified in bg_flush_args will be flushed. Only after all column
+ * families finish flush will this function commit to MANIFEST. If any of the
+ * column families are not flushed successfully, this function does not have
+ * any side-effect on the state of the database.
+ */
+Status DBImpl::AtomicFlushMemTablesToOutputFiles(
+ const autovector<BGFlushArg>& bg_flush_args, bool* made_progress,
+ JobContext* job_context, LogBuffer* log_buffer, Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+
+ autovector<ColumnFamilyData*> cfds;
+ for (const auto& arg : bg_flush_args) {
+ cfds.emplace_back(arg.cfd_);
+ }
+
+#ifndef NDEBUG
+ for (const auto cfd : cfds) {
+ assert(cfd->imm()->NumNotFlushed() != 0);
+ assert(cfd->imm()->IsFlushPending());
+ assert(cfd->GetFlushReason() == cfds[0]->GetFlushReason());
+ }
+#endif /* !NDEBUG */
+
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+
+ autovector<FSDirectory*> distinct_output_dirs;
+ autovector<std::string> distinct_output_dir_paths;
+ std::vector<std::unique_ptr<FlushJob>> jobs;
+ std::vector<MutableCFOptions> all_mutable_cf_options;
+ int num_cfs = static_cast<int>(cfds.size());
+ all_mutable_cf_options.reserve(num_cfs);
+ for (int i = 0; i < num_cfs; ++i) {
+ auto cfd = cfds[i];
+ FSDirectory* data_dir = GetDataDir(cfd, 0U);
+ const std::string& curr_path = cfd->ioptions()->cf_paths[0].path;
+
+ // Add to distinct output directories if eligible. Use linear search. Since
+ // the number of elements in the vector is not large, performance should be
+ // tolerable.
+ bool found = false;
+ for (const auto& path : distinct_output_dir_paths) {
+ if (path == curr_path) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ distinct_output_dir_paths.emplace_back(curr_path);
+ distinct_output_dirs.emplace_back(data_dir);
+ }
+
+ all_mutable_cf_options.emplace_back(*cfd->GetLatestMutableCFOptions());
+ const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.back();
+ uint64_t max_memtable_id = bg_flush_args[i].max_memtable_id_;
+ jobs.emplace_back(new FlushJob(
+ dbname_, cfd, immutable_db_options_, mutable_cf_options,
+ max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_,
+ &shutting_down_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, job_context, log_buffer, directories_.GetDbDir(),
+ data_dir, GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+ stats_, &event_logger_, mutable_cf_options.report_bg_io_stats,
+ false /* sync_output_directory */, false /* write_manifest */,
+ thread_pri, io_tracer_, seqno_time_mapping_, db_id_, db_session_id_,
+ cfd->GetFullHistoryTsLow(), &blob_callback_));
+ }
+
+ std::vector<FileMetaData> file_meta(num_cfs);
+ // Use of deque<bool> because vector<bool>
+ // is specialized and doesn't allow &v[i].
+ std::deque<bool> switched_to_mempurge(num_cfs, false);
+ Status s;
+ IOStatus log_io_s = IOStatus::OK();
+ assert(num_cfs == static_cast<int>(jobs.size()));
+
+#ifndef ROCKSDB_LITE
+ for (int i = 0; i != num_cfs; ++i) {
+ const MutableCFOptions& mutable_cf_options = all_mutable_cf_options.at(i);
+ // may temporarily unlock and lock the mutex.
+ NotifyOnFlushBegin(cfds[i], &file_meta[i], mutable_cf_options,
+ job_context->job_id);
+ }
+#endif /* !ROCKSDB_LITE */
+
+ if (logfile_number_ > 0) {
+ // TODO (yanqin) investigate whether we should sync the closed logs for
+ // single column family case.
+ VersionEdit synced_wals;
+ mutex_.Unlock();
+ log_io_s = SyncClosedLogs(job_context, &synced_wals);
+ mutex_.Lock();
+ if (log_io_s.ok() && synced_wals.IsWalAddition()) {
+ log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals));
+ }
+
+ if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() &&
+ !log_io_s.IsColumnFamilyDropped()) {
+ if (total_log_size_ > 0) {
+ error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlush);
+ } else {
+ // If the WAL is empty, we use different error reason
+ error_handler_.SetBGError(log_io_s, BackgroundErrorReason::kFlushNoWAL);
+ }
+ }
+ }
+ s = log_io_s;
+
+ // exec_status stores the execution status of flush_jobs as
+ // <bool /* executed */, Status /* status code */>
+ autovector<std::pair<bool, Status>> exec_status;
+ std::vector<bool> pick_status;
+ for (int i = 0; i != num_cfs; ++i) {
+ // Initially all jobs are not executed, with status OK.
+ exec_status.emplace_back(false, Status::OK());
+ pick_status.push_back(false);
+ }
+
+ if (s.ok()) {
+ for (int i = 0; i != num_cfs; ++i) {
+ jobs[i]->PickMemTable();
+ pick_status[i] = true;
+ }
+ }
+
+ if (s.ok()) {
+ assert(switched_to_mempurge.size() ==
+ static_cast<long unsigned int>(num_cfs));
+ // TODO (yanqin): parallelize jobs with threads.
+ for (int i = 1; i != num_cfs; ++i) {
+ exec_status[i].second =
+ jobs[i]->Run(&logs_with_prep_tracker_, &file_meta[i],
+ &(switched_to_mempurge.at(i)));
+ exec_status[i].first = true;
+ }
+ if (num_cfs > 1) {
+ TEST_SYNC_POINT(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:1");
+ TEST_SYNC_POINT(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:SomeFlushJobsComplete:2");
+ }
+ assert(exec_status.size() > 0);
+ assert(!file_meta.empty());
+ exec_status[0].second = jobs[0]->Run(
+ &logs_with_prep_tracker_, file_meta.data() /* &file_meta[0] */,
+ switched_to_mempurge.empty() ? nullptr : &(switched_to_mempurge.at(0)));
+ exec_status[0].first = true;
+
+ Status error_status;
+ for (const auto& e : exec_status) {
+ if (!e.second.ok()) {
+ s = e.second;
+ if (!e.second.IsShutdownInProgress() &&
+ !e.second.IsColumnFamilyDropped()) {
+ // If a flush job did not return OK, and the CF is not dropped, and
+ // the DB is not shutting down, then we have to return this result to
+ // the caller later.
+ error_status = e.second;
+ }
+ }
+ }
+
+ s = error_status.ok() ? s : error_status;
+ }
+
+ if (s.IsColumnFamilyDropped()) {
+ s = Status::OK();
+ }
+
+ if (s.ok() || s.IsShutdownInProgress()) {
+ // Sync on all distinct output directories.
+ for (auto dir : distinct_output_dirs) {
+ if (dir != nullptr) {
+ Status error_status = dir->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ if (!error_status.ok()) {
+ s = error_status;
+ break;
+ }
+ }
+ }
+ } else {
+ // Need to undo atomic flush if something went wrong, i.e. s is not OK and
+ // it is not because of CF drop.
+ // Have to cancel the flush jobs that have NOT executed because we need to
+ // unref the versions.
+ for (int i = 0; i != num_cfs; ++i) {
+ if (pick_status[i] && !exec_status[i].first) {
+ jobs[i]->Cancel();
+ }
+ }
+ for (int i = 0; i != num_cfs; ++i) {
+ if (exec_status[i].second.ok() && exec_status[i].first) {
+ auto& mems = jobs[i]->GetMemTables();
+ cfds[i]->imm()->RollbackMemtableFlush(mems,
+ file_meta[i].fd.GetNumber());
+ }
+ }
+ }
+
+ if (s.ok()) {
+ const auto wait_to_install_func =
+ [&]() -> std::pair<Status, bool /*continue to wait*/> {
+ if (!versions_->io_status().ok()) {
+ // Something went wrong elsewhere, we cannot count on waiting for our
+ // turn to write/sync to MANIFEST or CURRENT. Just return.
+ return std::make_pair(versions_->io_status(), false);
+ } else if (shutting_down_.load(std::memory_order_acquire)) {
+ return std::make_pair(Status::ShutdownInProgress(), false);
+ }
+ bool ready = true;
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ const auto& mems = jobs[i]->GetMemTables();
+ if (cfds[i]->IsDropped()) {
+ // If the column family is dropped, then do not wait.
+ continue;
+ } else if (!mems.empty() &&
+ cfds[i]->imm()->GetEarliestMemTableID() < mems[0]->GetID()) {
+ // If a flush job needs to install the flush result for mems and
+ // mems[0] is not the earliest memtable, it means another thread must
+ // be installing flush results for the same column family, then the
+ // current thread needs to wait.
+ ready = false;
+ break;
+ } else if (mems.empty() && cfds[i]->imm()->GetEarliestMemTableID() <=
+ bg_flush_args[i].max_memtable_id_) {
+ // If a flush job does not need to install flush results, then it has
+ // to wait until all memtables up to max_memtable_id_ (inclusive) are
+ // installed.
+ ready = false;
+ break;
+ }
+ }
+ return std::make_pair(Status::OK(), !ready);
+ };
+
+ bool resuming_from_bg_err =
+ error_handler_.IsDBStopped() ||
+ (cfds[0]->GetFlushReason() == FlushReason::kErrorRecovery ||
+ cfds[0]->GetFlushReason() == FlushReason::kErrorRecoveryRetryFlush);
+ while ((!resuming_from_bg_err || error_handler_.GetRecoveryError().ok())) {
+ std::pair<Status, bool> res = wait_to_install_func();
+
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::AtomicFlushMemTablesToOutputFiles:WaitToCommit", &res);
+
+ if (!res.first.ok()) {
+ s = res.first;
+ break;
+ } else if (!res.second) {
+ break;
+ }
+ atomic_flush_install_cv_.Wait();
+
+ resuming_from_bg_err =
+ error_handler_.IsDBStopped() ||
+ (cfds[0]->GetFlushReason() == FlushReason::kErrorRecovery ||
+ cfds[0]->GetFlushReason() == FlushReason::kErrorRecoveryRetryFlush);
+ }
+
+ if (!resuming_from_bg_err) {
+ // If not resuming from bg err, then we determine future action based on
+      // whether we hit a background error.
+ if (s.ok()) {
+ s = error_handler_.GetBGError();
+ }
+ } else if (s.ok()) {
+ // If resuming from bg err, we still rely on wait_to_install_func()'s
+ // result to determine future action. If wait_to_install_func() returns
+ // non-ok already, then we should not proceed to flush result
+ // installation.
+ s = error_handler_.GetRecoveryError();
+ }
+ }
+
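+  // Commit the flush results for all column families that were not dropped
+  // and still have memtables to install, writing them to the MANIFEST in a
+  // single atomic operation via InstallMemtableAtomicFlushResults().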
+ if (s.ok()) {
+ autovector<ColumnFamilyData*> tmp_cfds;
+ autovector<const autovector<MemTable*>*> mems_list;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<FileMetaData*> tmp_file_meta;
+ autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
+ committed_flush_jobs_info;
+ for (int i = 0; i != num_cfs; ++i) {
+ const auto& mems = jobs[i]->GetMemTables();
+ if (!cfds[i]->IsDropped() && !mems.empty()) {
+ tmp_cfds.emplace_back(cfds[i]);
+ mems_list.emplace_back(&mems);
+ mutable_cf_options_list.emplace_back(&all_mutable_cf_options[i]);
+ tmp_file_meta.emplace_back(&file_meta[i]);
+#ifndef ROCKSDB_LITE
+ committed_flush_jobs_info.emplace_back(
+ jobs[i]->GetCommittedFlushJobsInfo());
+#endif //! ROCKSDB_LITE
+ }
+ }
+
+ s = InstallMemtableAtomicFlushResults(
+ nullptr /* imm_lists */, tmp_cfds, mutable_cf_options_list, mems_list,
+ versions_.get(), &logs_with_prep_tracker_, &mutex_, tmp_file_meta,
+ committed_flush_jobs_info, &job_context->memtables_to_free,
+ directories_.GetDbDir(), log_buffer);
+ }
+
+ if (s.ok()) {
+ assert(num_cfs ==
+ static_cast<int>(job_context->superversion_contexts.size()));
+ for (int i = 0; i != num_cfs; ++i) {
+ assert(cfds[i]);
+
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ InstallSuperVersionAndScheduleWork(cfds[i],
+ &job_context->superversion_contexts[i],
+ all_mutable_cf_options[i]);
+
+ const std::string& column_family_name = cfds[i]->GetName();
+
+ Version* const current = cfds[i]->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
+ column_family_name.c_str(),
+ storage_info->LevelSummary(&tmp));
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ assert(blob_files.back());
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Blob file summary: head=%" PRIu64 ", tail=%" PRIu64 "\n",
+ column_family_name.c_str(), blob_files.front()->GetBlobFileNumber(),
+ blob_files.back()->GetBlobFileNumber());
+ }
+ }
+ if (made_progress) {
+ *made_progress = true;
+ }
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ assert(all_mutable_cf_options.size() == static_cast<size_t>(num_cfs));
+ for (int i = 0; s.ok() && i != num_cfs; ++i) {
+      // If a mempurge happened instead of a flush, there is no
+      // NotifyOnFlushCompleted call (no SST file was created).
+ if (switched_to_mempurge[i]) {
+ continue;
+ }
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ NotifyOnFlushCompleted(cfds[i], all_mutable_cf_options[i],
+ jobs[i]->GetCommittedFlushJobsInfo());
+ if (sfm) {
+ std::string file_path = MakeTableFileName(
+ cfds[i]->ioptions()->cf_paths[0].path, file_meta[i].fd.GetNumber());
+ // TODO (PR7798). We should only add the file to the FileManager if it
+ // exists. Otherwise, some tests may fail. Ignore the error in the
+ // interim.
+ sfm->OnAddFile(file_path).PermitUncheckedError();
+ if (sfm->IsMaxAllowedSpaceReached() &&
+ error_handler_.GetBGError().ok()) {
+ Status new_bg_error =
+ Status::SpaceLimit("Max allowed space was reached");
+ error_handler_.SetBGError(new_bg_error,
+ BackgroundErrorReason::kFlush);
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+ }
+
+ // Need to undo atomic flush if something went wrong, i.e. s is not OK and
+ // it is not because of CF drop.
+ if (!s.ok() && !s.IsColumnFamilyDropped()) {
+ if (log_io_s.ok()) {
+ // Error while writing to MANIFEST.
+      // In fact, versions_->io_status() can also be the result of renaming
+      // the CURRENT file. With the current code, it's just difficult to tell.
+      // So just be pessimistic and try writing to a new MANIFEST.
+ // TODO: distinguish between MANIFEST write and CURRENT renaming
+ if (!versions_->io_status().ok()) {
+        // If WAL sync is successful (either the WAL size is 0 or there is no
+        // IO error), all MANIFEST write errors will be mapped to soft errors.
+ // TODO: kManifestWriteNoWAL and kFlushNoWAL are misleading. Refactor
+ // is needed.
+ error_handler_.SetBGError(s,
+ BackgroundErrorReason::kManifestWriteNoWAL);
+ } else {
+ // If WAL sync is successful (either WAL size is 0 or there is no IO
+ // error), all the other SST file write errors will be set as
+ // kFlushNoWAL.
+ error_handler_.SetBGError(s, BackgroundErrorReason::kFlushNoWAL);
+ }
+ } else {
+ assert(s == log_io_s);
+ Status new_bg_error = s;
+ error_handler_.SetBGError(new_bg_error, BackgroundErrorReason::kFlush);
+ }
+ }
+
+ return s;
+}
+
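+// Notify the EventListeners registered in DBOptions::listeners that a flush
+// for `cfd` is about to begin. The DB mutex must be held on entry; it is
+// released while the OnFlushBegin callbacks run and re-acquired afterwards.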
+void DBImpl::NotifyOnFlushBegin(ColumnFamilyData* cfd, FileMetaData* file_meta,
+ const MutableCFOptions& mutable_cf_options,
+ int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ bool triggered_writes_slowdown =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_slowdown_writes_trigger);
+ bool triggered_writes_stop =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_stop_writes_trigger);
+ // release lock while notifying events
+ mutex_.Unlock();
+ {
+ FlushJobInfo info{};
+ info.cf_id = cfd->GetID();
+ info.cf_name = cfd->GetName();
+ // TODO(yhchiang): make db_paths dynamic in case flush does not
+ // go to L0 in the future.
+ const uint64_t file_number = file_meta->fd.GetNumber();
+ info.file_path =
+ MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_number);
+ info.file_number = file_number;
+ info.thread_id = env_->GetThreadID();
+ info.job_id = job_id;
+ info.triggered_writes_slowdown = triggered_writes_slowdown;
+ info.triggered_writes_stop = triggered_writes_stop;
+ info.smallest_seqno = file_meta->fd.smallest_seqno;
+ info.largest_seqno = file_meta->fd.largest_seqno;
+ info.flush_reason = cfd->GetFlushReason();
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnFlushBegin(this, info);
+ }
+ }
+ mutex_.Lock();
+// no need to signal bg_cv_ as it will be signaled at the end of the
+// flush process.
+#else
+ (void)cfd;
+ (void)file_meta;
+ (void)mutable_cf_options;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
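+// Notify the registered EventListeners that the flushes described by
+// `flush_jobs_info` have completed. As with NotifyOnFlushBegin(), the DB
+// mutex is released while the OnFlushCompleted callbacks run.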
+void DBImpl::NotifyOnFlushCompleted(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ std::list<std::unique_ptr<FlushJobInfo>>* flush_jobs_info) {
+#ifndef ROCKSDB_LITE
+ assert(flush_jobs_info != nullptr);
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ bool triggered_writes_slowdown =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_slowdown_writes_trigger);
+ bool triggered_writes_stop =
+ (cfd->current()->storage_info()->NumLevelFiles(0) >=
+ mutable_cf_options.level0_stop_writes_trigger);
+ // release lock while notifying events
+ mutex_.Unlock();
+ {
+ for (auto& info : *flush_jobs_info) {
+ info->triggered_writes_slowdown = triggered_writes_slowdown;
+ info->triggered_writes_stop = triggered_writes_stop;
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnFlushCompleted(this, *info);
+ }
+ TEST_SYNC_POINT(
+ "DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted");
+ }
+ flush_jobs_info->clear();
+ }
+ mutex_.Lock();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // flush process.
+#else
+ (void)cfd;
+ (void)mutable_cf_options;
+ (void)flush_jobs_info;
+#endif // ROCKSDB_LITE
+}
+
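+// Public entry point for manual range compaction. When the column family's
+// comparator has no user-defined timestamp, this forwards directly to
+// CompactRangeInternal(); otherwise the begin/end user keys are expanded with
+// the maximum/minimum timestamps so that every timestamped version in the
+// range is covered.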
+Status DBImpl::CompactRange(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin_without_ts,
+ const Slice* end_without_ts) {
+ if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+ if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+ if (ts_sz == 0) {
+ return CompactRangeInternal(options, column_family, begin_without_ts,
+ end_without_ts, "" /*trim_ts*/);
+ }
+
+ std::string begin_str;
+ std::string end_str;
+
+  // CompactRange compacts all keys in [begin, end] inclusively. Append the
+  // maximum timestamp to include all `begin` keys, and the minimum timestamp
+  // to include all `end` keys.
+ if (begin_without_ts != nullptr) {
+ AppendKeyWithMaxTimestamp(&begin_str, *begin_without_ts, ts_sz);
+ }
+ if (end_without_ts != nullptr) {
+ AppendKeyWithMinTimestamp(&end_str, *end_without_ts, ts_sz);
+ }
+ Slice begin(begin_str);
+ Slice end(end_str);
+
+ Slice* begin_with_ts = begin_without_ts ? &begin : nullptr;
+ Slice* end_with_ts = end_without_ts ? &end : nullptr;
+
+ return CompactRangeInternal(options, column_family, begin_with_ts,
+ end_with_ts, "" /*trim_ts*/);
+}
+
+Status DBImpl::IncreaseFullHistoryTsLow(ColumnFamilyHandle* column_family,
+ std::string ts_low) {
+ ColumnFamilyData* cfd = nullptr;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ assert(cfh != nullptr);
+ cfd = cfh->cfd();
+ }
+ assert(cfd != nullptr && cfd->user_comparator() != nullptr);
+ if (cfd->user_comparator()->timestamp_size() == 0) {
+ return Status::InvalidArgument(
+ "Timestamp is not enabled in this column family");
+ }
+ if (cfd->user_comparator()->timestamp_size() != ts_low.size()) {
+ return Status::InvalidArgument("ts_low size mismatch");
+ }
+ return IncreaseFullHistoryTsLowImpl(cfd, ts_low);
+}
+
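+// Record the new full_history_ts_low for `cfd` in a VersionEdit and persist
+// it through VersionSet::LogAndApply(). Decreasing the value is rejected, and
+// a TryAgain status is returned if a concurrently applied value ends up
+// higher than the requested one.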
+Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd,
+ std::string ts_low) {
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ edit.SetFullHistoryTsLow(ts_low);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit",
+ &edit);
+
+ InstrumentedMutexLock l(&mutex_);
+ std::string current_ts_low = cfd->GetFullHistoryTsLow();
+ const Comparator* ucmp = cfd->user_comparator();
+ assert(ucmp->timestamp_size() == ts_low.size() && !ts_low.empty());
+ if (!current_ts_low.empty() &&
+ ucmp->CompareTimestamp(ts_low, current_ts_low) < 0) {
+ return Status::InvalidArgument("Cannot decrease full_history_ts_low");
+ }
+
+ Status s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (!s.ok()) {
+ return s;
+ }
+ current_ts_low = cfd->GetFullHistoryTsLow();
+ if (!current_ts_low.empty() &&
+ ucmp->CompareTimestamp(current_ts_low, ts_low) > 0) {
+ std::stringstream oss;
+ oss << "full_history_ts_low: " << Slice(current_ts_low).ToString(true)
+ << " is set to be higher than the requested "
+ "timestamp: "
+ << Slice(ts_low).ToString(true) << std::endl;
+ return Status::TryAgain(oss.str());
+ }
+ return Status::OK();
+}
+
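+// Core of the manual compaction path: optionally flush memtables that overlap
+// the requested range, then run manual compactions level by level from the
+// first to the last overlapped level, and finally refit the output to
+// `options.target_level` if `options.change_level` is set.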
+Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
+ ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end,
+ const std::string& trim_ts) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+
+ if (options.target_path_id >= cfd->ioptions()->cf_paths.size()) {
+ return Status::InvalidArgument("Invalid target path ID");
+ }
+
+ bool flush_needed = true;
+
+ // Update full_history_ts_low if it's set
+ if (options.full_history_ts_low != nullptr &&
+ !options.full_history_ts_low->empty()) {
+ std::string ts_low = options.full_history_ts_low->ToString();
+ if (begin != nullptr || end != nullptr) {
+ return Status::InvalidArgument(
+ "Cannot specify compaction range with full_history_ts_low");
+ }
+ Status s = IncreaseFullHistoryTsLowImpl(cfd, ts_low);
+ if (!s.ok()) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+ }
+
+ Status s;
+ if (begin != nullptr && end != nullptr) {
+ // TODO(ajkr): We could also optimize away the flush in certain cases where
+ // one/both sides of the interval are unbounded. But it requires more
+ // changes to RangesOverlapWithMemtables.
+ Range range(*begin, *end);
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ s = cfd->RangesOverlapWithMemtables(
+ {range}, super_version, immutable_db_options_.allow_data_in_errors,
+ &flush_needed);
+ CleanupSuperVersion(super_version);
+ }
+
+ if (s.ok() && flush_needed) {
+ FlushOptions fo;
+ fo.allow_write_stall = options.allow_write_stall;
+ if (immutable_db_options_.atomic_flush) {
+ autovector<ColumnFamilyData*> cfds;
+ mutex_.Lock();
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ mutex_.Unlock();
+ s = AtomicFlushMemTables(cfds, fo, FlushReason::kManualCompaction,
+ false /* entered_write_thread */);
+ } else {
+ s = FlushMemTable(cfd, fo, FlushReason::kManualCompaction,
+ false /* entered_write_thread */);
+ }
+ if (!s.ok()) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+ }
+
+ constexpr int kInvalidLevel = -1;
+ int final_output_level = kInvalidLevel;
+ bool exclusive = options.exclusive_manual_compaction;
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal &&
+ cfd->NumberLevels() > 1) {
+ // Always compact all files together.
+ final_output_level = cfd->NumberLevels() - 1;
+    // if the bottommost level is reserved
+ if (immutable_db_options_.allow_ingest_behind) {
+ final_output_level--;
+ }
+ s = RunManualCompaction(cfd, ColumnFamilyData::kCompactAllLevels,
+ final_output_level, options, begin, end, exclusive,
+ false, std::numeric_limits<uint64_t>::max(),
+ trim_ts);
+ } else {
+ int first_overlapped_level = kInvalidLevel;
+ int max_overlapped_level = kInvalidLevel;
+ {
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ Version* current_version = super_version->current;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ bool overlap;
+ for (int level = 0;
+ level < current_version->storage_info()->num_non_empty_levels();
+ level++) {
+ overlap = true;
+ if (begin != nullptr && end != nullptr) {
+ Status status = current_version->OverlapWithLevelIterator(
+ ro, file_options_, *begin, *end, level, &overlap);
+ if (!status.ok()) {
+ overlap = current_version->storage_info()->OverlapInLevel(
+ level, begin, end);
+ }
+ } else {
+ overlap = current_version->storage_info()->OverlapInLevel(level,
+ begin, end);
+ }
+ if (overlap) {
+ if (first_overlapped_level == kInvalidLevel) {
+ first_overlapped_level = level;
+ }
+ max_overlapped_level = level;
+ }
+ }
+ CleanupSuperVersion(super_version);
+ }
+ if (s.ok() && first_overlapped_level != kInvalidLevel) {
+ // max_file_num_to_ignore can be used to filter out newly created SST
+ // files, useful for bottom level compaction in a manual compaction
+ uint64_t max_file_num_to_ignore = std::numeric_limits<uint64_t>::max();
+ uint64_t next_file_number = versions_->current_next_file_number();
+ final_output_level = max_overlapped_level;
+ int output_level;
+ for (int level = first_overlapped_level; level <= max_overlapped_level;
+ level++) {
+ bool disallow_trivial_move = false;
+        // In case the compaction is universal or we're compacting the
+        // bottommost level, the output level will be the same as the input
+        // one. Level 0 can never be the bottommost level (i.e. if all files
+        // are in level 0, we will compact to level 1).
+ if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ output_level = level;
+ } else if (level == max_overlapped_level && level > 0) {
+ if (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kSkip) {
+ // Skip bottommost level compaction
+ continue;
+ } else if (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kIfHaveCompactionFilter &&
+ cfd->ioptions()->compaction_filter == nullptr &&
+ cfd->ioptions()->compaction_filter_factory == nullptr) {
+ // Skip bottommost level compaction since we don't have a compaction
+ // filter
+ continue;
+ }
+ output_level = level;
+ // update max_file_num_to_ignore only for bottom level compaction
+ // because data in newly compacted files in middle levels may still
+ // need to be pushed down
+ max_file_num_to_ignore = next_file_number;
+ } else {
+ output_level = level + 1;
+ if (cfd->ioptions()->compaction_style == kCompactionStyleLevel &&
+ cfd->ioptions()->level_compaction_dynamic_level_bytes &&
+ level == 0) {
+ output_level = ColumnFamilyData::kCompactToBaseLevel;
+ }
+ // if it's a BottommostLevel compaction and `kForce*` compaction is
+ // set, disallow trivial move
+ if (level == max_overlapped_level &&
+ (options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kForce ||
+ options.bottommost_level_compaction ==
+ BottommostLevelCompaction::kForceOptimized)) {
+ disallow_trivial_move = true;
+ }
+ }
+        // trim_ts needs a real compaction to remove the latest record
+ if (!trim_ts.empty()) {
+ disallow_trivial_move = true;
+ }
+ s = RunManualCompaction(cfd, level, output_level, options, begin, end,
+ exclusive, disallow_trivial_move,
+ max_file_num_to_ignore, trim_ts);
+ if (!s.ok()) {
+ break;
+ }
+ if (output_level == ColumnFamilyData::kCompactToBaseLevel) {
+ final_output_level = cfd->NumberLevels() - 1;
+ } else if (output_level > final_output_level) {
+ final_output_level = output_level;
+ }
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction()::1");
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction()::2");
+ }
+ }
+ }
+ if (!s.ok() || final_output_level == kInvalidLevel) {
+ LogFlush(immutable_db_options_.info_log);
+ return s;
+ }
+
+ if (options.change_level) {
+ TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:1");
+ TEST_SYNC_POINT("DBImpl::CompactRange:BeforeRefit:2");
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[RefitLevel] waiting for background threads to stop");
+ DisableManualCompaction();
+ s = PauseBackgroundWork();
+ if (s.ok()) {
+ TEST_SYNC_POINT("DBImpl::CompactRange:PreRefitLevel");
+ s = ReFitLevel(cfd, final_output_level, options.target_level);
+ TEST_SYNC_POINT("DBImpl::CompactRange:PostRefitLevel");
+      // ContinueBackgroundWork always returns Status::OK().
+ Status temp_s = ContinueBackgroundWork();
+ assert(temp_s.ok());
+ }
+ EnableManualCompaction();
+ TEST_SYNC_POINT(
+ "DBImpl::CompactRange:PostRefitLevel:ManualCompactionEnabled");
+ }
+ LogFlush(immutable_db_options_.info_log);
+
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // an automatic compaction that has been scheduled might have been
+ // preempted by the manual compactions. Need to schedule it back.
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ return s;
+}
+
+Status DBImpl::CompactFiles(const CompactionOptions& compact_options,
+ ColumnFamilyHandle* column_family,
+ const std::vector<std::string>& input_file_names,
+ const int output_level, const int output_path_id,
+ std::vector<std::string>* const output_file_names,
+ CompactionJobInfo* compaction_job_info) {
+#ifdef ROCKSDB_LITE
+ (void)compact_options;
+ (void)column_family;
+ (void)input_file_names;
+ (void)output_level;
+ (void)output_path_id;
+ (void)output_file_names;
+ (void)compaction_job_info;
+ // not supported in lite version
+ return Status::NotSupported("Not supported in ROCKSDB LITE");
+#else
+ if (column_family == nullptr) {
+ return Status::InvalidArgument("ColumnFamilyHandle must be non-null.");
+ }
+
+ auto cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ assert(cfd);
+
+ Status s;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+
+ // Perform CompactFiles
+ TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile2");
+ TEST_SYNC_POINT_CALLBACK(
+ "TestCompactFiles:PausingManualCompaction:3",
+ reinterpret_cast<void*>(
+ const_cast<std::atomic<int>*>(&manual_compaction_paused_)));
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ // This call will unlock/lock the mutex to wait for current running
+ // IngestExternalFile() calls to finish.
+ WaitForIngestFile();
+
+ // We need to get current after `WaitForIngestFile`, because
+ // `IngestExternalFile` may add files that overlap with `input_file_names`
+ auto* current = cfd->current();
+ current->Ref();
+
+ s = CompactFilesImpl(compact_options, cfd, current, input_file_names,
+ output_file_names, output_level, output_path_id,
+ &job_context, &log_buffer, compaction_job_info);
+
+ current->Unref();
+ }
+
+ // Find and delete obsolete files
+ {
+ InstrumentedMutexLock l(&mutex_);
+ // If !s.ok(), this means that Compaction failed. In that case, we want
+ // to delete all obsolete files we might have created and we force
+ // FindObsoleteFiles(). This is because job_context does not
+ // catch all created files if compaction failed.
+ FindObsoleteFiles(&job_context, !s.ok());
+ } // release the mutex
+
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+    // Have to flush the info logs before bg_compaction_scheduled_--
+    // because if bg_flush_scheduled_ becomes 0 and the lock is
+    // released, the destructor of DB can kick in and destroy all the
+    // state of the DB, so info_log might not be available after that point.
+    // The same applies to accessing any other state that the DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ // no mutex is locked here. No need to Unlock() and Lock() here.
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ }
+
+ return s;
+#endif // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
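+// Implementation of CompactFiles(). Called with the DB mutex held: it
+// sanitizes the requested input files, forms a Compaction via the compaction
+// picker, runs the CompactionJob with the mutex released, and installs the
+// result while reporting any failure to the error handler.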
+Status DBImpl::CompactFilesImpl(
+ const CompactionOptions& compact_options, ColumnFamilyData* cfd,
+ Version* version, const std::vector<std::string>& input_file_names,
+ std::vector<std::string>* const output_file_names, const int output_level,
+ int output_path_id, JobContext* job_context, LogBuffer* log_buffer,
+ CompactionJobInfo* compaction_job_info) {
+ mutex_.AssertHeld();
+
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+ if (manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+
+ std::unordered_set<uint64_t> input_set;
+ for (const auto& file_name : input_file_names) {
+ input_set.insert(TableFileNameToNumber(file_name));
+ }
+
+ ColumnFamilyMetaData cf_meta;
+ // TODO(yhchiang): can directly use version here if none of the
+  // following function calls is pluggable to external developers.
+ version->GetColumnFamilyMetaData(&cf_meta);
+
+ if (output_path_id < 0) {
+ if (cfd->ioptions()->cf_paths.size() == 1U) {
+ output_path_id = 0;
+ } else {
+ return Status::NotSupported(
+ "Automatic output path selection is not "
+ "yet supported in CompactFiles()");
+ }
+ }
+
+ Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles(
+ &input_set, cf_meta, output_level);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::vector<CompactionInputFiles> input_files;
+ s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, version->storage_info(), compact_options);
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (const auto& inputs : input_files) {
+ if (cfd->compaction_picker()->AreFilesInCompaction(inputs.files)) {
+ return Status::Aborted(
+ "Some of the necessary compaction input "
+ "files are already being compacted");
+ }
+ }
+ bool sfm_reserved_compact_space = false;
+ // First check if we have enough room to do the compaction
+ bool enough_room = EnoughRoomForCompaction(
+ cfd, input_files, &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // m's vars will get set properly at the end of this function,
+ // as long as status == CompactionTooLarge
+ return Status::CompactionTooLarge();
+ }
+
+ // At this point, CompactFiles will be run.
+ bg_compaction_scheduled_++;
+
+ std::unique_ptr<Compaction> c;
+ assert(cfd->compaction_picker());
+ c.reset(cfd->compaction_picker()->CompactFiles(
+ compact_options, input_files, output_level, version->storage_info(),
+ *cfd->GetLatestMutableCFOptions(), mutable_db_options_, output_path_id));
+ // we already sanitized the set of input files and checked for conflicts
+ // without releasing the lock, so we're guaranteed a compaction can be formed.
+ assert(c != nullptr);
+
+ c->SetInputVersion(version);
+ // deletion compaction currently not allowed in CompactFiles.
+ assert(!c->deletion_compaction());
+
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+ new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ assert(is_snapshot_supported_ || snapshots_.empty());
+ CompactionJobStats compaction_job_stats;
+ CompactionJob compaction_job(
+ job_context->job_id, c.get(), immutable_db_options_, mutable_db_options_,
+ file_options_for_compaction_, versions_.get(), &shutting_down_,
+ log_buffer, directories_.GetDbDir(),
+ GetDataDir(c->column_family_data(), c->output_path_id()),
+ GetDataDir(c->column_family_data(), 0), stats_, &mutex_, &error_handler_,
+ snapshot_seqs, earliest_write_conflict_snapshot, snapshot_checker,
+ job_context, table_cache_, &event_logger_,
+ c->mutable_cf_options()->paranoid_file_checks,
+ c->mutable_cf_options()->report_bg_io_stats, dbname_,
+ &compaction_job_stats, Env::Priority::USER, io_tracer_,
+ kManualCompactionCanceledFalse_, db_id_, db_session_id_,
+ c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(),
+ &blob_callback_, &bg_compaction_scheduled_,
+ &bg_bottom_compaction_scheduled_);
+
+ // Creating a compaction influences the compaction score because the score
+ // takes running compactions into account (by skipping files that are already
+ // being compacted). Since we just changed compaction score, we recalculate it
+ // here.
+ version->storage_info()->ComputeCompactionScore(*cfd->ioptions(),
+ *c->mutable_cf_options());
+
+ compaction_job.Prepare();
+
+ mutex_.Unlock();
+ TEST_SYNC_POINT("CompactFilesImpl:0");
+ TEST_SYNC_POINT("CompactFilesImpl:1");
+ // Ignore the status here, as it will be checked in the Install down below...
+ compaction_job.Run().PermitUncheckedError();
+ TEST_SYNC_POINT("CompactFilesImpl:2");
+ TEST_SYNC_POINT("CompactFilesImpl:3");
+ mutex_.Lock();
+
+ Status status = compaction_job.Install(*c->mutable_cf_options());
+ if (status.ok()) {
+ assert(compaction_job.io_status().ok());
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ }
+  // status above captures any error during compaction_job.Install, so it's OK
+  // not to check compaction_job.io_status() explicitly if we're not calling
+  // SetBGError.
+ compaction_job.io_status().PermitUncheckedError();
+ c->ReleaseCompactionFiles(s);
+#ifndef ROCKSDB_LITE
+ // Need to make sure SstFileManager does its bookkeeping
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm && sfm_reserved_compact_space) {
+ sfm->OnCompactionCompletion(c.get());
+ }
+#endif // ROCKSDB_LITE
+
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ if (compaction_job_info != nullptr) {
+ BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
+ job_context->job_id, version, compaction_job_info);
+ }
+
+ if (status.ok()) {
+ // Done
+ } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+ // Ignore compaction errors found during shutting down
+ } else if (status.IsManualCompactionPaused()) {
+ // Don't report stopping manual compaction as error
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] [JOB %d] Stopping manual compaction",
+ c->column_family_data()->GetName().c_str(),
+ job_context->job_id);
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "[%s] [JOB %d] Compaction error: %s",
+ c->column_family_data()->GetName().c_str(),
+ job_context->job_id, status.ToString().c_str());
+ IOStatus io_s = compaction_job.io_status();
+ if (!io_s.ok()) {
+ error_handler_.SetBGError(io_s, BackgroundErrorReason::kCompaction);
+ } else {
+ error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+ }
+ }
+
+ if (output_file_names != nullptr) {
+ for (const auto& newf : c->edit()->GetNewFiles()) {
+ output_file_names->push_back(TableFileName(
+ c->immutable_options()->cf_paths, newf.second.fd.GetNumber(),
+ newf.second.fd.GetPathId()));
+ }
+
+ for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) {
+ output_file_names->push_back(
+ BlobFileName(c->immutable_options()->cf_paths.front().path,
+ blob_file.GetBlobFileNumber()));
+ }
+ }
+
+ c.reset();
+
+ bg_compaction_scheduled_--;
+ if (bg_compaction_scheduled_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ MaybeScheduleFlushOrCompaction();
+ TEST_SYNC_POINT("CompactFilesImpl:End");
+
+ return status;
+}
+#endif // ROCKSDB_LITE
+
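+// Block new background work and wait for the currently scheduled flushes and
+// compactions to drain before returning. A later ContinueBackgroundWork()
+// call decrements the pause counters bumped here.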
+Status DBImpl::PauseBackgroundWork() {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ bg_compaction_paused_++;
+ while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0 ||
+ bg_flush_scheduled_ > 0) {
+ bg_cv_.Wait();
+ }
+ bg_work_paused_++;
+ return Status::OK();
+}
+
+Status DBImpl::ContinueBackgroundWork() {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ if (bg_work_paused_ == 0) {
+ return Status::InvalidArgument();
+ }
+ assert(bg_work_paused_ > 0);
+ assert(bg_compaction_paused_ > 0);
+ bg_compaction_paused_--;
+ bg_work_paused_--;
+ // It's sufficient to check just bg_work_paused_ here since
+ // bg_work_paused_ is always no greater than bg_compaction_paused_
+ if (bg_work_paused_ == 0) {
+ MaybeScheduleFlushOrCompaction();
+ }
+ return Status::OK();
+}
+
+void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
+ const Status& st,
+ const CompactionJobStats& job_stats,
+ int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.empty()) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+ if (c->is_manual_compaction() &&
+ manual_compaction_paused_.load(std::memory_order_acquire) > 0) {
+ return;
+ }
+
+ c->SetNotifyOnCompactionCompleted();
+ Version* current = cfd->current();
+ current->Ref();
+ // release lock while notifying events
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
+ {
+ CompactionJobInfo info{};
+ BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, current, &info);
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnCompactionBegin(this, info);
+ }
+ info.status.PermitUncheckedError();
+ }
+ mutex_.Lock();
+ current->Unref();
+#else
+ (void)cfd;
+ (void)c;
+ (void)st;
+ (void)job_stats;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+void DBImpl::NotifyOnCompactionCompleted(
+ ColumnFamilyData* cfd, Compaction* c, const Status& st,
+ const CompactionJobStats& compaction_job_stats, const int job_id) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ mutex_.AssertHeld();
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+
+ if (c->ShouldNotifyOnCompactionCompleted() == false) {
+ return;
+ }
+
+ Version* current = cfd->current();
+ current->Ref();
+ // release lock while notifying events
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
+ {
+ CompactionJobInfo info{};
+ BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current,
+ &info);
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnCompactionCompleted(this, info);
+ }
+ }
+ mutex_.Lock();
+ current->Unref();
+ // no need to signal bg_cv_ as it will be signaled at the end of the
+ // flush process.
+#else
+ (void)cfd;
+ (void)c;
+ (void)st;
+ (void)compaction_job_stats;
+ (void)job_id;
+#endif // ROCKSDB_LITE
+}
+
+// REQUIREMENT: block all background work by calling PauseBackgroundWork()
+// before calling this function
+Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
+ assert(level < cfd->NumberLevels());
+ if (target_level >= cfd->NumberLevels()) {
+ return Status::InvalidArgument("Target level exceeds number of levels");
+ }
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ // only allow one thread refitting
+ if (refitting_level_) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[ReFitLevel] another thread is refitting");
+ return Status::NotSupported("another thread is refitting");
+ }
+ refitting_level_ = true;
+
+ const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ // move to a smaller level
+ int to_level = target_level;
+ if (target_level < 0) {
+ to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
+ }
+
+ auto* vstorage = cfd->current()->storage_info();
+ if (to_level != level) {
+ if (to_level > level) {
+ if (level == 0) {
+ refitting_level_ = false;
+ return Status::NotSupported(
+ "Cannot change from level 0 to other levels.");
+ }
+ // Check levels are empty for a trivial move
+ for (int l = level + 1; l <= to_level; l++) {
+ if (vstorage->NumLevelFiles(l) > 0) {
+ refitting_level_ = false;
+ return Status::NotSupported(
+ "Levels between source and target are not empty for a move.");
+ }
+ }
+ } else {
+ // to_level < level
+ // Check levels are empty for a trivial move
+ for (int l = to_level; l < level; l++) {
+ if (vstorage->NumLevelFiles(l) > 0) {
+ refitting_level_ = false;
+ return Status::NotSupported(
+ "Levels between source and target are not empty for a move.");
+ }
+ }
+ }
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Before refitting:\n%s", cfd->GetName().c_str(),
+ cfd->current()->DebugString().data());
+
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ for (const auto& f : vstorage->LevelFiles(level)) {
+ edit.DeleteFile(level, f->fd.GetNumber());
+ edit.AddFile(
+ to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(),
+ f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->temperature, f->oldest_blob_file_number,
+ f->oldest_ancester_time, f->file_creation_time, f->file_checksum,
+ f->file_checksum_func_name, f->unique_id);
+ }
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Apply version edit:\n%s", cfd->GetName().c_str(),
+ edit.DebugString().data());
+
+ Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit,
+ &mutex_, directories_.GetDbDir());
+
+ InstallSuperVersionAndScheduleWork(cfd, &sv_context, mutable_cf_options);
+
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] LogAndApply: %s\n",
+ cfd->GetName().c_str(), status.ToString().data());
+
+ if (status.ok()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] After refitting:\n%s", cfd->GetName().c_str(),
+ cfd->current()->DebugString().data());
+ }
+ sv_context.Clean();
+ refitting_level_ = false;
+
+ return status;
+ }
+
+ refitting_level_ = false;
+ return Status::OK();
+}
+
+int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ return cfh->cfd()->NumberLevels();
+}
+
+int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) {
+ return 0;
+}
+
+int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ InstrumentedMutexLock l(&mutex_);
+ return cfh->cfd()
+ ->GetSuperVersion()
+ ->mutable_cf_options.level0_stop_writes_trigger;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+ ColumnFamilyHandle* column_family) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "[%s] Manual flush start.",
+ cfh->GetName().c_str());
+ Status s;
+ if (immutable_db_options_.atomic_flush) {
+ s = AtomicFlushMemTables({cfh->cfd()}, flush_options,
+ FlushReason::kManualFlush);
+ } else {
+ s = FlushMemTable(cfh->cfd(), flush_options, FlushReason::kManualFlush);
+ }
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] Manual flush finished, status: %s\n",
+ cfh->GetName().c_str(), s.ToString().c_str());
+ return s;
+}
+
+Status DBImpl::Flush(const FlushOptions& flush_options,
+ const std::vector<ColumnFamilyHandle*>& column_families) {
+ Status s;
+ if (!immutable_db_options_.atomic_flush) {
+ for (auto cfh : column_families) {
+ s = Flush(flush_options, cfh);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ } else {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Manual atomic flush start.\n"
+ "=====Column families:=====");
+ for (auto cfh : column_families) {
+ auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+ cfhi->GetName().c_str());
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "=====End of column families list=====");
+ autovector<ColumnFamilyData*> cfds;
+ std::for_each(column_families.begin(), column_families.end(),
+ [&cfds](ColumnFamilyHandle* elem) {
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(elem);
+ cfds.emplace_back(cfh->cfd());
+ });
+ s = AtomicFlushMemTables(cfds, flush_options, FlushReason::kManualFlush);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Manual atomic flush finished, status: %s\n"
+ "=====Column families:=====",
+ s.ToString().c_str());
+ for (auto cfh : column_families) {
+ auto cfhi = static_cast<ColumnFamilyHandleImpl*>(cfh);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "%s",
+ cfhi->GetName().c_str());
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "=====End of column families list=====");
+ }
+ return s;
+}
+
+Status DBImpl::RunManualCompaction(
+ ColumnFamilyData* cfd, int input_level, int output_level,
+ const CompactRangeOptions& compact_range_options, const Slice* begin,
+ const Slice* end, bool exclusive, bool disallow_trivial_move,
+ uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
+ assert(input_level == ColumnFamilyData::kCompactAllLevels ||
+ input_level >= 0);
+
+ InternalKey begin_storage, end_storage;
+ CompactionArg* ca = nullptr;
+
+ bool scheduled = false;
+ bool unscheduled = false;
+ Env::Priority thread_pool_priority = Env::Priority::TOTAL;
+ bool manual_conflict = false;
+
+ ManualCompactionState manual(
+ cfd, input_level, output_level, compact_range_options.target_path_id,
+ exclusive, disallow_trivial_move, compact_range_options.canceled);
+ // For universal compaction, we enforce every manual compaction to compact
+ // all files.
+ if (begin == nullptr ||
+ cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ manual.begin = nullptr;
+ } else {
+ begin_storage.SetMinPossibleForUserKey(*begin);
+ manual.begin = &begin_storage;
+ }
+ if (end == nullptr ||
+ cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
+ manual.end = nullptr;
+ } else {
+ end_storage.SetMaxPossibleForUserKey(*end);
+ manual.end = &end_storage;
+ }
+
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:0");
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:1");
+ InstrumentedMutexLock l(&mutex_);
+
+ if (manual_compaction_paused_ > 0) {
+ // Does not make sense to `AddManualCompaction()` in this scenario since
+ // `DisableManualCompaction()` just waited for the manual compaction queue
+ // to drain. So return immediately.
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:PausedAtStart");
+ manual.status =
+ Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ manual.done = true;
+ return manual.status;
+ }
+
+ // When a manual compaction arrives, temporarily disable scheduling of
+ // non-manual compactions and wait until the number of scheduled compaction
+ // jobs drops to zero. This used to be needed to ensure that this manual
+ // compaction can compact any range of keys/files. Now it is optional
+ // (see `CompactRangeOptions::exclusive_manual_compaction`). The use case for
+ // `exclusive_manual_compaction=true` is unclear beyond not trusting the code.
+ //
+ // HasPendingManualCompaction() is true when at least one thread is inside
+ // RunManualCompaction(), i.e. during that time no other compaction will
+ // get scheduled (see MaybeScheduleFlushOrCompaction).
+ //
+  // Note that the following loop doesn't stop more than one thread calling
+ // RunManualCompaction() from getting to the second while loop below.
+ // However, only one of them will actually schedule compaction, while
+ // others will wait on a condition variable until it completes.
+
+ AddManualCompaction(&manual);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_);
+ if (exclusive) {
+ // Limitation: there's no way to wake up the below loop when user sets
+ // `*manual.canceled`. So `CompactRangeOptions::exclusive_manual_compaction`
+ // and `CompactRangeOptions::canceled` might not work well together.
+ while (bg_bottom_compaction_scheduled_ > 0 ||
+ bg_compaction_scheduled_ > 0) {
+ if (manual_compaction_paused_ > 0 || manual.canceled == true) {
+ // Pretend the error came from compaction so the below cleanup/error
+ // handling code can process it.
+ manual.done = true;
+ manual.status =
+ Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ break;
+ }
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:WaitScheduled");
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[%s] Manual compaction waiting for all other scheduled background "
+ "compactions to finish",
+ cfd->GetName().c_str());
+ bg_cv_.Wait();
+ }
+ }
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+
+ ROCKS_LOG_BUFFER(&log_buffer, "[%s] Manual compaction starting",
+ cfd->GetName().c_str());
+
+ // We don't check bg_error_ here, because if we get the error in compaction,
+ // the compaction will set manual.status to bg_error_ and set manual.done to
+ // true.
+ while (!manual.done) {
+ assert(HasPendingManualCompaction());
+ manual_conflict = false;
+ Compaction* compaction = nullptr;
+ if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) ||
+ scheduled ||
+ (((manual.manual_end = &manual.tmp_storage1) != nullptr) &&
+ ((compaction = manual.cfd->CompactRange(
+ *manual.cfd->GetLatestMutableCFOptions(), mutable_db_options_,
+ manual.input_level, manual.output_level, compact_range_options,
+ manual.begin, manual.end, &manual.manual_end, &manual_conflict,
+ max_file_num_to_ignore, trim_ts)) == nullptr &&
+ manual_conflict))) {
+ // exclusive manual compactions should not see a conflict during
+ // CompactRange
+ assert(!exclusive || !manual_conflict);
+ // Running either this or some other manual compaction
+ bg_cv_.Wait();
+ if (manual_compaction_paused_ > 0 && scheduled && !unscheduled) {
+ assert(thread_pool_priority != Env::Priority::TOTAL);
+ // unschedule all manual compactions
+ auto unscheduled_task_num = env_->UnSchedule(
+ GetTaskTag(TaskType::kManualCompaction), thread_pool_priority);
+ if (unscheduled_task_num > 0) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[%s] Unscheduled %d number of manual compactions from the "
+ "thread-pool",
+ cfd->GetName().c_str(), unscheduled_task_num);
+ // it may unschedule other manual compactions, notify others.
+ bg_cv_.SignalAll();
+ }
+ unscheduled = true;
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:Unscheduled");
+ }
+ if (scheduled && manual.incomplete == true) {
+ assert(!manual.in_progress);
+ scheduled = false;
+ manual.incomplete = false;
+ }
+ } else if (!scheduled) {
+ if (compaction == nullptr) {
+ manual.done = true;
+ bg_cv_.SignalAll();
+ continue;
+ }
+ ca = new CompactionArg;
+ ca->db = this;
+ ca->prepicked_compaction = new PrepickedCompaction;
+ ca->prepicked_compaction->manual_compaction_state = &manual;
+ ca->prepicked_compaction->compaction = compaction;
+ if (!RequestCompactionToken(
+ cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) {
+ // Don't throttle manual compaction, only count outstanding tasks.
+ assert(false);
+ }
+ manual.incomplete = false;
+ if (compaction->bottommost_level() &&
+ env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+ bg_bottom_compaction_scheduled_++;
+ ca->compaction_pri_ = Env::Priority::BOTTOM;
+ env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca,
+ Env::Priority::BOTTOM,
+ GetTaskTag(TaskType::kManualCompaction),
+ &DBImpl::UnscheduleCompactionCallback);
+ thread_pool_priority = Env::Priority::BOTTOM;
+ } else {
+ bg_compaction_scheduled_++;
+ ca->compaction_pri_ = Env::Priority::LOW;
+ env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW,
+ GetTaskTag(TaskType::kManualCompaction),
+ &DBImpl::UnscheduleCompactionCallback);
+ thread_pool_priority = Env::Priority::LOW;
+ }
+ scheduled = true;
+ TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled");
+ }
+ }
+
+ log_buffer.FlushBufferToLog();
+ assert(!manual.in_progress);
+ assert(HasPendingManualCompaction());
+ RemoveManualCompaction(&manual);
+  // If the manual job is unscheduled, try to schedule other jobs in case
+  // there are any unscheduled compaction jobs which were blocked by the
+  // exclusive manual compaction.
+ if (manual.status.IsIncomplete() &&
+ manual.status.subcode() == Status::SubCode::kManualCompactionPaused) {
+ MaybeScheduleFlushOrCompaction();
+ }
+ bg_cv_.SignalAll();
+ return manual.status;
+}
+
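+// Build a FlushRequest that pairs each non-null column family with the ID of
+// its latest immutable memtable, i.e. the newest memtable the flush needs to
+// cover.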
+void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
+ FlushRequest* req) {
+ assert(req != nullptr);
+ req->reserve(cfds.size());
+ for (const auto cfd : cfds) {
+ if (nullptr == cfd) {
+ // cfd may be null, see DBImpl::ScheduleFlushes
+ continue;
+ }
+ uint64_t max_memtable_id = cfd->imm()->GetLatestMemTableID();
+ req->emplace_back(cfd, max_memtable_id);
+ }
+}
+
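+// Flush path for the non-atomic-flush case: switch the active memtable of
+// `cfd` (and, if needed, of the persistent stats column family), schedule the
+// resulting flush requests, and optionally wait for the flushes to finish
+// when flush_options.wait is set.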
+Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_options,
+ FlushReason flush_reason,
+ bool entered_write_thread) {
+ // This method should not be called if atomic_flush is true.
+ assert(!immutable_db_options_.atomic_flush);
+ if (!flush_options.wait && write_controller_.IsStopped()) {
+ std::ostringstream oss;
+ oss << "Writes have been stopped, thus unable to perform manual flush. "
+ "Please try again later after writes are resumed";
+ return Status::TryAgain(oss.str());
+ }
+ Status s;
+ if (!flush_options.allow_write_stall) {
+ bool flush_needed = true;
+ s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:StallWaitDone");
+ if (!s.ok() || !flush_needed) {
+ return s;
+ }
+ }
+
+ const bool needs_to_join_write_thread = !entered_write_thread;
+ autovector<FlushRequest> flush_reqs;
+ autovector<uint64_t> memtable_ids_to_wait;
+ {
+ WriteContext context;
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ WriteThread::Writer w;
+ WriteThread::Writer nonmem_w;
+ if (needs_to_join_write_thread) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ }
+ WaitForPendingWrites();
+
+ if (flush_reason != FlushReason::kErrorRecoveryRetryFlush &&
+ (!cfd->mem()->IsEmpty() || !cached_recoverable_state_empty_.load())) {
+ // Note that, when flush reason is kErrorRecoveryRetryFlush, during the
+ // auto retry resume, we want to avoid creating new small memtables.
+ // Therefore, SwitchMemtable will not be called. Also, since ResumeImpl
+ // will iterate through all the CFs and call FlushMemtable during auto
+ // retry resume, it is possible that in some CFs,
+      // cfd->imm()->NumNotFlushed() == 0. In this case, no flush request will
+      // be created or scheduled, and Status::OK() will be returned.
+ s = SwitchMemtable(cfd, &context);
+ }
+ const uint64_t flush_memtable_id = std::numeric_limits<uint64_t>::max();
+ if (s.ok()) {
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ FlushRequest req{{cfd, flush_memtable_id}};
+ flush_reqs.emplace_back(std::move(req));
+ memtable_ids_to_wait.emplace_back(cfd->imm()->GetLatestMemTableID());
+ }
+ if (immutable_db_options_.persist_stats_to_disk &&
+ flush_reason != FlushReason::kErrorRecoveryRetryFlush) {
+ ColumnFamilyData* cfd_stats =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ if (cfd_stats != nullptr && cfd_stats != cfd &&
+ !cfd_stats->mem()->IsEmpty()) {
+ // only force flush stats CF when it will be the only CF lagging
+ // behind after the current flush
+ bool stats_cf_flush_needed = true;
+ for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+ if (loop_cfd == cfd_stats || loop_cfd == cfd) {
+ continue;
+ }
+ if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+ stats_cf_flush_needed = false;
+ }
+ }
+ if (stats_cf_flush_needed) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Force flushing stats CF with manual flush of %s "
+ "to avoid holding old logs",
+ cfd->GetName().c_str());
+ s = SwitchMemtable(cfd_stats, &context);
+ FlushRequest req{{cfd_stats, flush_memtable_id}};
+ flush_reqs.emplace_back(std::move(req));
+ memtable_ids_to_wait.emplace_back(
+ cfd->imm()->GetLatestMemTableID());
+ }
+ }
+ }
+ }
+
+ if (s.ok() && !flush_reqs.empty()) {
+ for (const auto& req : flush_reqs) {
+ assert(req.size() == 1);
+ ColumnFamilyData* loop_cfd = req[0].first;
+ loop_cfd->imm()->FlushRequested();
+ }
+ // If the caller wants to wait for this flush to complete, it indicates
+      // that the caller expects the ColumnFamilyData not to be freed by
+ // other threads which may drop the column family concurrently.
+ // Therefore, we increase the cfd's ref count.
+ if (flush_options.wait) {
+ for (const auto& req : flush_reqs) {
+ assert(req.size() == 1);
+ ColumnFamilyData* loop_cfd = req[0].first;
+ loop_cfd->Ref();
+ }
+ }
+ for (const auto& req : flush_reqs) {
+ SchedulePendingFlush(req, flush_reason);
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (needs_to_join_write_thread) {
+ write_thread_.ExitUnbatched(&w);
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:AfterScheduleFlush");
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:BeforeWaitForBgFlush");
+ if (s.ok() && flush_options.wait) {
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const uint64_t*> flush_memtable_ids;
+ assert(flush_reqs.size() == memtable_ids_to_wait.size());
+ for (size_t i = 0; i < flush_reqs.size(); ++i) {
+ assert(flush_reqs[i].size() == 1);
+ cfds.push_back(flush_reqs[i][0].first);
+ flush_memtable_ids.push_back(&(memtable_ids_to_wait[i]));
+ }
+ s = WaitForFlushMemTables(
+ cfds, flush_memtable_ids,
+ (flush_reason == FlushReason::kErrorRecovery ||
+ flush_reason == FlushReason::kErrorRecoveryRetryFlush));
+ InstrumentedMutexLock lock_guard(&mutex_);
+ for (auto* tmp_cfd : cfds) {
+ tmp_cfd->UnrefAndTryDelete();
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::FlushMemTable:FlushMemTableFinished");
+ return s;
+}
+
+// Flush all elements in 'column_family_datas'
+// and atomically record the result to the MANIFEST.
+Status DBImpl::AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const FlushOptions& flush_options, FlushReason flush_reason,
+ bool entered_write_thread) {
+ assert(immutable_db_options_.atomic_flush);
+ if (!flush_options.wait && write_controller_.IsStopped()) {
+ std::ostringstream oss;
+ oss << "Writes have been stopped, thus unable to perform manual flush. "
+ "Please try again later after writes are resumed";
+ return Status::TryAgain(oss.str());
+ }
+ Status s;
+ if (!flush_options.allow_write_stall) {
+ int num_cfs_to_flush = 0;
+ for (auto cfd : column_family_datas) {
+ bool flush_needed = true;
+ s = WaitUntilFlushWouldNotStallWrites(cfd, &flush_needed);
+ if (!s.ok()) {
+ return s;
+ } else if (flush_needed) {
+ ++num_cfs_to_flush;
+ }
+ }
+ if (0 == num_cfs_to_flush) {
+ return s;
+ }
+ }
+ const bool needs_to_join_write_thread = !entered_write_thread;
+ FlushRequest flush_req;
+ autovector<ColumnFamilyData*> cfds;
+ {
+ WriteContext context;
+ InstrumentedMutexLock guard_lock(&mutex_);
+
+ WriteThread::Writer w;
+ WriteThread::Writer nonmem_w;
+ if (needs_to_join_write_thread) {
+ write_thread_.EnterUnbatched(&w, &mutex_);
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ }
+ WaitForPendingWrites();
+
+ for (auto cfd : column_family_datas) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ cfds.emplace_back(cfd);
+ }
+ }
+ for (auto cfd : cfds) {
+ if ((cfd->mem()->IsEmpty() && cached_recoverable_state_empty_.load()) ||
+ flush_reason == FlushReason::kErrorRecoveryRetryFlush) {
+ continue;
+ }
+ cfd->Ref();
+ s = SwitchMemtable(cfd, &context);
+ cfd->UnrefAndTryDelete();
+ if (!s.ok()) {
+ break;
+ }
+ }
+ if (s.ok()) {
+ AssignAtomicFlushSeq(cfds);
+ for (auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ }
+ // If the caller wants to wait for this flush to complete, it indicates
+      // that the caller expects the ColumnFamilyData not to be freed by
+ // other threads which may drop the column family concurrently.
+ // Therefore, we increase the cfd's ref count.
+ if (flush_options.wait) {
+ for (auto cfd : cfds) {
+ cfd->Ref();
+ }
+ }
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, flush_reason);
+ MaybeScheduleFlushOrCompaction();
+ }
+
+ if (needs_to_join_write_thread) {
+ write_thread_.ExitUnbatched(&w);
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:AfterScheduleFlush");
+ TEST_SYNC_POINT("DBImpl::AtomicFlushMemTables:BeforeWaitForBgFlush");
+ if (s.ok() && flush_options.wait) {
+ autovector<const uint64_t*> flush_memtable_ids;
+ for (auto& iter : flush_req) {
+ flush_memtable_ids.push_back(&(iter.second));
+ }
+ s = WaitForFlushMemTables(
+ cfds, flush_memtable_ids,
+ (flush_reason == FlushReason::kErrorRecovery ||
+ flush_reason == FlushReason::kErrorRecoveryRetryFlush));
+ InstrumentedMutexLock lock_guard(&mutex_);
+ for (auto* cfd : cfds) {
+ cfd->UnrefAndTryDelete();
+ }
+ }
+ return s;
+}
+
+// Calling FlushMemTable(), whether from DB::Flush() or from Backup Engine, can
+// cause a write stall, for example if one memtable is already being flushed.
+// This method tries to avoid write stalls (similar to CompactRange() behavior):
+// it emulates how the SuperVersion / LSM would change if the flush happened,
+// checks that against various constraints, and delays the flush if it would
+// cause a write stall.
+// The caller should check status and flush_needed to see if a flush already
+// happened.
+Status DBImpl::WaitUntilFlushWouldNotStallWrites(ColumnFamilyData* cfd,
+ bool* flush_needed) {
+ {
+ *flush_needed = true;
+ InstrumentedMutexLock l(&mutex_);
+ uint64_t orig_active_memtable_id = cfd->mem()->GetID();
+ WriteStallCondition write_stall_condition = WriteStallCondition::kNormal;
+ do {
+ if (write_stall_condition != WriteStallCondition::kNormal) {
+ // Same error handling as user writes: Don't wait if there's a
+ // background error, even if it's a soft error. We might wait here
+ // indefinitely as the pending flushes/compactions may never finish
+ // successfully, resulting in the stall condition lasting indefinitely
+ if (error_handler_.IsBGWorkStopped()) {
+ return error_handler_.GetBGError();
+ }
+
+ TEST_SYNC_POINT("DBImpl::WaitUntilFlushWouldNotStallWrites:StallWait");
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] WaitUntilFlushWouldNotStallWrites"
+ " waiting on stall conditions to clear",
+ cfd->GetName().c_str());
+ bg_cv_.Wait();
+ }
+ if (cfd->IsDropped()) {
+ return Status::ColumnFamilyDropped();
+ }
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return Status::ShutdownInProgress();
+ }
+
+ uint64_t earliest_memtable_id =
+ std::min(cfd->mem()->GetID(), cfd->imm()->GetEarliestMemTableID());
+ if (earliest_memtable_id > orig_active_memtable_id) {
+ // We waited so long that the memtable we were originally waiting on was
+ // flushed.
+ *flush_needed = false;
+ return Status::OK();
+ }
+
+ const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ const auto* vstorage = cfd->current()->storage_info();
+
+ // Skip stalling check if we're below auto-flush and auto-compaction
+ // triggers. If it stalled in these conditions, that'd mean the stall
+ // triggers are so low that stalling is needed for any background work. In
+ // that case we shouldn't wait since background work won't be scheduled.
+ if (cfd->imm()->NumNotFlushed() <
+ cfd->ioptions()->min_write_buffer_number_to_merge &&
+ vstorage->l0_delay_trigger_count() <
+ mutable_cf_options.level0_file_num_compaction_trigger) {
+ break;
+ }
+
+ // Check whether one extra immutable memtable or an extra L0 file would
+ // cause write stalling mode to be entered. It could still enter stall
+ // mode due to pending compaction bytes, but that's less common.
+ write_stall_condition = ColumnFamilyData::GetWriteStallConditionAndCause(
+ cfd->imm()->NumNotFlushed() + 1,
+ vstorage->l0_delay_trigger_count() + 1,
+ vstorage->estimated_compaction_needed_bytes(),
+ mutable_cf_options, *cfd->ioptions())
+ .first;
+ } while (write_stall_condition != WriteStallCondition::kNormal);
+ }
+ return Status::OK();
+}
+
+// Wait for memtables to be flushed for multiple column families.
+// let N = cfds.size()
+// for i in [0, N),
+//  1) if flush_memtable_ids[i] is not null, then the memtables with IDs no
+//     greater than *flush_memtable_ids[i] have to be flushed for THIS column
+//     family;
+// 2) if flush_memtable_ids[i] is null, then all memtables in THIS column
+// family have to be flushed.
+// Finish waiting when ALL column families finish flushing memtables.
+// resuming_from_bg_err indicates whether the caller is trying to resume from a
+// background error or is in normal processing.
+Status DBImpl::WaitForFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const uint64_t*>& flush_memtable_ids,
+ bool resuming_from_bg_err) {
+ int num = static_cast<int>(cfds.size());
+ // Wait until the flushes complete
+ InstrumentedMutexLock l(&mutex_);
+ Status s;
+ // If the caller is trying to resume from bg error, then
+ // error_handler_.IsDBStopped() is true.
+ while (resuming_from_bg_err || !error_handler_.IsDBStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ s = Status::ShutdownInProgress();
+ return s;
+ }
+ // If an error has occurred during resumption, then no need to wait.
+ // But the flush operation may fail because of this error, so we need to
+ // return the status.
+ if (!error_handler_.GetRecoveryError().ok()) {
+ s = error_handler_.GetRecoveryError();
+ break;
+ }
+ // If BGWorkStopped, there is a BG error and either 1) it is a soft error
+ // that requires no BG work, or 2) auto_recovery_ is not enabled.
+ if (!resuming_from_bg_err && error_handler_.IsBGWorkStopped() &&
+ error_handler_.GetBGError().severity() < Status::Severity::kHardError) {
+ s = error_handler_.GetBGError();
+ return s;
+ }
+
+ // Number of column families that have been dropped.
+ int num_dropped = 0;
+ // Number of column families that have finished flush.
+ int num_finished = 0;
+ for (int i = 0; i < num; ++i) {
+ if (cfds[i]->IsDropped()) {
+ ++num_dropped;
+ } else if (cfds[i]->imm()->NumNotFlushed() == 0 ||
+ (flush_memtable_ids[i] != nullptr &&
+ cfds[i]->imm()->GetEarliestMemTableID() >
+ *flush_memtable_ids[i])) {
+ ++num_finished;
+ }
+ }
+ if (1 == num_dropped && 1 == num) {
+ s = Status::ColumnFamilyDropped();
+ return s;
+ }
+ // Column families involved in this flush request have either been dropped
+ // or finished flushing, so it's time to finish waiting.
+ if (num_dropped + num_finished == num) {
+ break;
+ }
+ bg_cv_.Wait();
+ }
+ // If not resuming from bg error, and an error has caused the DB to stop,
+ // then report the bg error to caller.
+ if (!resuming_from_bg_err && error_handler_.IsDBStopped()) {
+ s = error_handler_.GetBGError();
+ }
+ return s;
+}
+
+Status DBImpl::EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& column_family_handles) {
+ Status s;
+ for (auto cf_ptr : column_family_handles) {
+ Status status =
+ this->SetOptions(cf_ptr, {{"disable_auto_compactions", "false"}});
+ if (!status.ok()) {
+ s = status;
+ }
+ }
+
+ return s;
+}
+
+// NOTE: Calling DisableManualCompaction() may overwrite the
+// user-provided canceled variable in CompactRangeOptions
+void DBImpl::DisableManualCompaction() {
+ InstrumentedMutexLock l(&mutex_);
+ manual_compaction_paused_.fetch_add(1, std::memory_order_release);
+
+ // Mark `canceled` as true when the cancellation is triggered by
+ // manual_compaction_paused (this may overwrite a user-provided `canceled`).
+ for (const auto& manual_compaction : manual_compaction_dequeue_) {
+ manual_compaction->canceled = true;
+ }
+
+ // Wake up manual compactions waiting to start.
+ bg_cv_.SignalAll();
+
+ // Wait for any pending manual compactions to finish (typically through
+ // failing with `Status::Incomplete`) prior to returning. This way we are
+ // guaranteed no pending manual compaction will commit while manual
+ // compactions are "disabled".
+ while (HasPendingManualCompaction()) {
+ bg_cv_.Wait();
+ }
+}
+
+// NOTE: In contrast to DisableManualCompaction(), calling
+// EnableManualCompaction() does NOT reset the user-provided *canceled
+// variable to false, since a canceled compaction can never be un-canceled.
+// In other words, a canceled compaction must already have been dropped out
+// of the manual compaction queue when manual compaction was disabled.
+void DBImpl::EnableManualCompaction() {
+ InstrumentedMutexLock l(&mutex_);
+ assert(manual_compaction_paused_ > 0);
+ manual_compaction_paused_.fetch_sub(1, std::memory_order_release);
+}
+
+void DBImpl::MaybeScheduleFlushOrCompaction() {
+ mutex_.AssertHeld();
+ if (!opened_successfully_) {
+ // Compaction may introduce a data race with DB open
+ return;
+ }
+ if (bg_work_paused_ > 0) {
+ // we paused the background work
+ return;
+ } else if (error_handler_.IsBGWorkStopped() &&
+ !error_handler_.IsRecoveryInProgress()) {
+ // There has been a hard error and this call is not part of the recovery
+ // sequence. Bail out here so we don't get into an endless loop of
+ // scheduling BG work which will again call this function
+ return;
+ } else if (shutting_down_.load(std::memory_order_acquire)) {
+ // DB is being deleted; no more background compactions
+ return;
+ }
+ auto bg_job_limits = GetBGJobLimits();
+ bool is_flush_pool_empty =
+ env_->GetBackgroundThreads(Env::Priority::HIGH) == 0;
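+ // If the high-priority pool has threads, schedule as many pending flushes
+ // there as the flush job limit allows.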
+ while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
+ bg_flush_scheduled_ < bg_job_limits.max_flushes) {
+ bg_flush_scheduled_++;
+ FlushThreadArg* fta = new FlushThreadArg;
+ fta->db_ = this;
+ fta->thread_pri_ = Env::Priority::HIGH;
+ env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::HIGH, this,
+ &DBImpl::UnscheduleFlushCallback);
+ --unscheduled_flushes_;
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::MaybeScheduleFlushOrCompaction:AfterSchedule:0",
+ &unscheduled_flushes_);
+ }
+
+ // special case -- if high-pri (flush) thread pool is empty, then schedule
+ // flushes in low-pri (compaction) thread pool.
+ if (is_flush_pool_empty) {
+ while (unscheduled_flushes_ > 0 &&
+ bg_flush_scheduled_ + bg_compaction_scheduled_ <
+ bg_job_limits.max_flushes) {
+ bg_flush_scheduled_++;
+ FlushThreadArg* fta = new FlushThreadArg;
+ fta->db_ = this;
+ fta->thread_pri_ = Env::Priority::LOW;
+ env_->Schedule(&DBImpl::BGWorkFlush, fta, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleFlushCallback);
+ --unscheduled_flushes_;
+ }
+ }
+
+ if (bg_compaction_paused_ > 0) {
+ // we paused the background compaction
+ return;
+ } else if (error_handler_.IsBGWorkStopped()) {
+ // Compaction is not part of the recovery sequence from a hard error. We
+ // might get here because recovery might do a flush and install a new
+ // super version, which will try to schedule pending compactions. Bail
+ // out here and let the higher level recovery handle compactions
+ return;
+ }
+
+ if (HasExclusiveManualCompaction()) {
+ // only manual compactions are allowed to run. don't schedule automatic
+ // compactions
+ TEST_SYNC_POINT("DBImpl::MaybeScheduleFlushOrCompaction:Conflict");
+ return;
+ }
+
+ while (bg_compaction_scheduled_ + bg_bottom_compaction_scheduled_ <
+ bg_job_limits.max_compactions &&
+ unscheduled_compactions_ > 0) {
+ CompactionArg* ca = new CompactionArg;
+ ca->db = this;
+ ca->compaction_pri_ = Env::Priority::LOW;
+ ca->prepicked_compaction = nullptr;
+ bg_compaction_scheduled_++;
+ unscheduled_compactions_--;
+ env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+ &DBImpl::UnscheduleCompactionCallback);
+ }
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits() const {
+ mutex_.AssertHeld();
+ return GetBGJobLimits(mutable_db_options_.max_background_flushes,
+ mutable_db_options_.max_background_compactions,
+ mutable_db_options_.max_background_jobs,
+ write_controller_.NeedSpeedupCompaction());
+}
+
+DBImpl::BGJobLimits DBImpl::GetBGJobLimits(int max_background_flushes,
+ int max_background_compactions,
+ int max_background_jobs,
+ bool parallelize_compactions) {
+ BGJobLimits res;
+ if (max_background_flushes == -1 && max_background_compactions == -1) {
+ // For our first stab at implementing max_background_jobs, simply allocate a
+ // quarter of the threads to flushes.
+ res.max_flushes = std::max(1, max_background_jobs / 4);
+ res.max_compactions = std::max(1, max_background_jobs - res.max_flushes);
+ } else {
+ // compatibility code in case users haven't migrated to max_background_jobs,
+ // which automatically computes flush/compaction limits
+ res.max_flushes = std::max(1, max_background_flushes);
+ res.max_compactions = std::max(1, max_background_compactions);
+ }
+ if (!parallelize_compactions) {
+ // throttle background compactions until we deem it necessary
+ res.max_compactions = 1;
+ }
+ return res;
+}
+
+void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) {
+ assert(!cfd->queued_for_compaction());
+ cfd->Ref();
+ compaction_queue_.push_back(cfd);
+ cfd->set_queued_for_compaction(true);
+}
+
+ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() {
+ assert(!compaction_queue_.empty());
+ auto cfd = *compaction_queue_.begin();
+ compaction_queue_.pop_front();
+ assert(cfd->queued_for_compaction());
+ cfd->set_queued_for_compaction(false);
+ return cfd;
+}
+
+DBImpl::FlushRequest DBImpl::PopFirstFromFlushQueue() {
+ assert(!flush_queue_.empty());
+ FlushRequest flush_req = flush_queue_.front();
+ flush_queue_.pop_front();
+ if (!immutable_db_options_.atomic_flush) {
+ assert(flush_req.size() == 1);
+ }
+ for (const auto& elem : flush_req) {
+ if (!immutable_db_options_.atomic_flush) {
+ ColumnFamilyData* cfd = elem.first;
+ assert(cfd);
+ assert(cfd->queued_for_flush());
+ cfd->set_queued_for_flush(false);
+ }
+ }
+ // TODO: need to unset flush reason?
+ return flush_req;
+}
+
+ColumnFamilyData* DBImpl::PickCompactionFromQueue(
+ std::unique_ptr<TaskLimiterToken>* token, LogBuffer* log_buffer) {
+ assert(!compaction_queue_.empty());
+ assert(*token == nullptr);
+ autovector<ColumnFamilyData*> throttled_candidates;
+ ColumnFamilyData* cfd = nullptr;
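+ // Pop column families off the queue until one acquires a compaction token
+ // from the thread limiter; throttled candidates are re-queued below.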
+ while (!compaction_queue_.empty()) {
+ auto first_cfd = *compaction_queue_.begin();
+ compaction_queue_.pop_front();
+ assert(first_cfd->queued_for_compaction());
+ if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) {
+ throttled_candidates.push_back(first_cfd);
+ continue;
+ }
+ cfd = first_cfd;
+ cfd->set_queued_for_compaction(false);
+ break;
+ }
+ // Add throttled compaction candidates back to queue in the original order.
+ for (auto iter = throttled_candidates.rbegin();
+ iter != throttled_candidates.rend(); ++iter) {
+ compaction_queue_.push_front(*iter);
+ }
+ return cfd;
+}
+
+void DBImpl::SchedulePendingFlush(const FlushRequest& flush_req,
+ FlushReason flush_reason) {
+ mutex_.AssertHeld();
+ if (flush_req.empty()) {
+ return;
+ }
+ if (!immutable_db_options_.atomic_flush) {
+ // For the non-atomic flush case, we never schedule multiple column
+ // families in the same flush request.
+ assert(flush_req.size() == 1);
+ ColumnFamilyData* cfd = flush_req[0].first;
+ assert(cfd);
+
+ if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) {
+ cfd->Ref();
+ cfd->set_queued_for_flush(true);
+ cfd->SetFlushReason(flush_reason);
+ ++unscheduled_flushes_;
+ flush_queue_.push_back(flush_req);
+ }
+ } else {
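+ // Atomic flush: all column families in the request are scheduled together
+ // as a single flush unit.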
+ for (auto& iter : flush_req) {
+ ColumnFamilyData* cfd = iter.first;
+ cfd->Ref();
+ cfd->SetFlushReason(flush_reason);
+ }
+ ++unscheduled_flushes_;
+ flush_queue_.push_back(flush_req);
+ }
+}
+
+void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) {
+ mutex_.AssertHeld();
+ if (!cfd->queued_for_compaction() && cfd->NeedsCompaction()) {
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ }
+}
+
+void DBImpl::SchedulePendingPurge(std::string fname, std::string dir_to_sync,
+ FileType type, uint64_t number, int job_id) {
+ mutex_.AssertHeld();
+ PurgeFileInfo file_info(fname, dir_to_sync, type, number, job_id);
+ purge_files_.insert({{number, std::move(file_info)}});
+}
+
+void DBImpl::BGWorkFlush(void* arg) {
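+ // Take ownership of the heap-allocated argument: copy it, then free it
+ // before running the flush.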
+ FlushThreadArg fta = *(reinterpret_cast<FlushThreadArg*>(arg));
+ delete reinterpret_cast<FlushThreadArg*>(arg);
+
+ IOSTATS_SET_THREAD_POOL_ID(fta.thread_pri_);
+ TEST_SYNC_POINT("DBImpl::BGWorkFlush");
+ static_cast_with_check<DBImpl>(fta.db_)->BackgroundCallFlush(fta.thread_pri_);
+ TEST_SYNC_POINT("DBImpl::BGWorkFlush:done");
+}
+
+void DBImpl::BGWorkCompaction(void* arg) {
+ CompactionArg ca = *(reinterpret_cast<CompactionArg*>(arg));
+ delete reinterpret_cast<CompactionArg*>(arg);
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
+ TEST_SYNC_POINT("DBImpl::BGWorkCompaction");
+ auto prepicked_compaction =
+ static_cast<PrepickedCompaction*>(ca.prepicked_compaction);
+ static_cast_with_check<DBImpl>(ca.db)->BackgroundCallCompaction(
+ prepicked_compaction, Env::Priority::LOW);
+ delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkBottomCompaction(void* arg) {
+ CompactionArg ca = *(static_cast<CompactionArg*>(arg));
+ delete static_cast<CompactionArg*>(arg);
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::BOTTOM);
+ TEST_SYNC_POINT("DBImpl::BGWorkBottomCompaction");
+ auto* prepicked_compaction = ca.prepicked_compaction;
+ assert(prepicked_compaction && prepicked_compaction->compaction);
+ ca.db->BackgroundCallCompaction(prepicked_compaction, Env::Priority::BOTTOM);
+ delete prepicked_compaction;
+}
+
+void DBImpl::BGWorkPurge(void* db) {
+ IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
+ TEST_SYNC_POINT("DBImpl::BGWorkPurge:start");
+ reinterpret_cast<DBImpl*>(db)->BackgroundCallPurge();
+ TEST_SYNC_POINT("DBImpl::BGWorkPurge:end");
+}
+
+void DBImpl::UnscheduleCompactionCallback(void* arg) {
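+ // Invoked when a scheduled compaction job is unscheduled from the thread
+ // pool without running: undo the scheduling bookkeeping and clean up the
+ // prepicked compaction, marking any manual compaction as paused.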
+ CompactionArg* ca_ptr = reinterpret_cast<CompactionArg*>(arg);
+ Env::Priority compaction_pri = ca_ptr->compaction_pri_;
+ if (Env::Priority::BOTTOM == compaction_pri) {
+ // Decrement bg_bottom_compaction_scheduled_ if priority is BOTTOM
+ ca_ptr->db->bg_bottom_compaction_scheduled_--;
+ } else if (Env::Priority::LOW == compaction_pri) {
+ // Decrement bg_compaction_scheduled_ if priority is LOW
+ ca_ptr->db->bg_compaction_scheduled_--;
+ }
+ CompactionArg ca = *(ca_ptr);
+ delete reinterpret_cast<CompactionArg*>(arg);
+ if (ca.prepicked_compaction != nullptr) {
+ // if it's a manual compaction, set status to ManualCompactionPaused
+ if (ca.prepicked_compaction->manual_compaction_state) {
+ ca.prepicked_compaction->manual_compaction_state->done = true;
+ ca.prepicked_compaction->manual_compaction_state->status =
+ Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ if (ca.prepicked_compaction->compaction != nullptr) {
+ ca.prepicked_compaction->compaction->ReleaseCompactionFiles(
+ Status::Incomplete(Status::SubCode::kManualCompactionPaused));
+ delete ca.prepicked_compaction->compaction;
+ }
+ delete ca.prepicked_compaction;
+ }
+ TEST_SYNC_POINT("DBImpl::UnscheduleCompactionCallback");
+}
+
+void DBImpl::UnscheduleFlushCallback(void* arg) {
+ // Decrement bg_flush_scheduled_ in flush callback
+ reinterpret_cast<FlushThreadArg*>(arg)->db_->bg_flush_scheduled_--;
+ Env::Priority flush_pri = reinterpret_cast<FlushThreadArg*>(arg)->thread_pri_;
+ if (Env::Priority::LOW == flush_pri) {
+ TEST_SYNC_POINT("DBImpl::UnscheduleLowFlushCallback");
+ } else if (Env::Priority::HIGH == flush_pri) {
+ TEST_SYNC_POINT("DBImpl::UnscheduleHighFlushCallback");
+ }
+ delete reinterpret_cast<FlushThreadArg*>(arg);
+ TEST_SYNC_POINT("DBImpl::UnscheduleFlushCallback");
+}
+
+Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
+ LogBuffer* log_buffer, FlushReason* reason,
+ Env::Priority thread_pri) {
+ mutex_.AssertHeld();
+
+ Status status;
+ *reason = FlushReason::kOthers;
+ // If BG work is stopped due to an error, but a recovery is in progress,
+ // that means this flush is part of the recovery. So allow it to go through
+ if (!error_handler_.IsBGWorkStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ status = Status::ShutdownInProgress();
+ }
+ } else if (!error_handler_.IsRecoveryInProgress()) {
+ status = error_handler_.GetBGError();
+ }
+
+ if (!status.ok()) {
+ return status;
+ }
+
+ autovector<BGFlushArg> bg_flush_args;
+ std::vector<SuperVersionContext>& superversion_contexts =
+ job_context->superversion_contexts;
+ autovector<ColumnFamilyData*> column_families_not_to_flush;
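+ // Pop flush requests until one yields at least one column family that can
+ // actually be flushed; dropped or non-pending ones are unreferenced below.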
+ while (!flush_queue_.empty()) {
+ // This cfd is already referenced
+ const FlushRequest& flush_req = PopFirstFromFlushQueue();
+ superversion_contexts.clear();
+ superversion_contexts.reserve(flush_req.size());
+
+ for (const auto& iter : flush_req) {
+ ColumnFamilyData* cfd = iter.first;
+ if (cfd->GetMempurgeUsed()) {
+ // If imm() contains silent memtables (e.g., because
+ // MemPurge was activated), requesting a flush will
+ // mark imm_needed as true.
+ cfd->imm()->FlushRequested();
+ }
+
+ if (cfd->IsDropped() || !cfd->imm()->IsFlushPending()) {
+ // can't flush this CF, try next one
+ column_families_not_to_flush.push_back(cfd);
+ continue;
+ }
+ superversion_contexts.emplace_back(SuperVersionContext(true));
+ bg_flush_args.emplace_back(cfd, iter.second,
+ &(superversion_contexts.back()));
+ }
+ if (!bg_flush_args.empty()) {
+ break;
+ }
+ }
+
+ if (!bg_flush_args.empty()) {
+ auto bg_job_limits = GetBGJobLimits();
+ for (const auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "Calling FlushMemTableToOutputFile with column "
+ "family [%s], flush slots available %d, compaction slots available "
+ "%d, "
+ "flush slots scheduled %d, compaction slots scheduled %d",
+ cfd->GetName().c_str(), bg_job_limits.max_flushes,
+ bg_job_limits.max_compactions, bg_flush_scheduled_,
+ bg_compaction_scheduled_);
+ }
+ status = FlushMemTablesToOutputFiles(bg_flush_args, made_progress,
+ job_context, log_buffer, thread_pri);
+ TEST_SYNC_POINT("DBImpl::BackgroundFlush:BeforeFlush");
+ // All the CFDs in the FlushReq must have the same flush reason, so just
+ // grab the first one
+ *reason = bg_flush_args[0].cfd_->GetFlushReason();
+ for (auto& arg : bg_flush_args) {
+ ColumnFamilyData* cfd = arg.cfd_;
+ if (cfd->UnrefAndTryDelete()) {
+ arg.cfd_ = nullptr;
+ }
+ }
+ }
+ for (auto cfd : column_families_not_to_flush) {
+ cfd->UnrefAndTryDelete();
+ }
+ return status;
+}
+
+void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) {
+ bool made_progress = false;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCallFlush:start", nullptr);
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:1");
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:Start:2");
+ {
+ InstrumentedMutexLock l(&mutex_);
+ assert(bg_flush_scheduled_);
+ num_running_flushes_++;
+
+ std::unique_ptr<std::list<uint64_t>::iterator>
+ pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ FlushReason reason;
+
+ Status s = BackgroundFlush(&made_progress, &job_context, &log_buffer,
+ &reason, thread_pri);
+ if (!s.ok() && !s.IsShutdownInProgress() && !s.IsColumnFamilyDropped() &&
+ reason != FlushReason::kErrorRecovery) {
+ // Wait a little bit before retrying background flush in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed flushes for the duration of
+ // the problem.
+ uint64_t error_cnt =
+ default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Waiting after background flush error: %s, "
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ log_buffer.FlushBufferToLog();
+ LogFlush(immutable_db_options_.info_log);
+ immutable_db_options_.clock->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ }
+
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FlushFinish:0");
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // If flush failed, we want to delete all temporary files that we might have
+ // created. Thus, we force full scan in FindObsoleteFiles()
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsColumnFamilyDropped());
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:FilesFound");
+ // Have to flush the info logs before bg_flush_scheduled_--
+ // because if bg_flush_scheduled_ becomes 0 and the lock is
+ // released, the destructor of DB can kick in and destroy all the
+ // state of the DB, so info_log might not be available after that point.
+ // The same applies to accessing other state that the DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+ TEST_SYNC_POINT("DBImpl::BackgroundCallFlush:ContextCleanedUp");
+
+ assert(num_running_flushes_ > 0);
+ num_running_flushes_--;
+ bg_flush_scheduled_--;
+ // See if there's more work to be done
+ MaybeScheduleFlushOrCompaction();
+ atomic_flush_install_cv_.SignalAll();
+ bg_cv_.SignalAll();
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+ // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+}
+
+void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
+ Env::Priority bg_thread_pri) {
+ bool made_progress = false;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ TEST_SYNC_POINT("BackgroundCallCompaction:0");
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+ {
+ InstrumentedMutexLock l(&mutex_);
+
+ // This call will unlock/lock the mutex to wait for current running
+ // IngestExternalFile() calls to finish.
+ WaitForIngestFile();
+
+ num_running_compactions_++;
+
+ std::unique_ptr<std::list<uint64_t>::iterator>
+ pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+
+ assert((bg_thread_pri == Env::Priority::BOTTOM &&
+ bg_bottom_compaction_scheduled_) ||
+ (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_));
+ Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer,
+ prepicked_compaction, bg_thread_pri);
+ TEST_SYNC_POINT("BackgroundCallCompaction:1");
+ if (s.IsBusy()) {
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ immutable_db_options_.clock->SleepForMicroseconds(
+ 10000); // prevent hot loop
+ mutex_.Lock();
+ } else if (!s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
+ // Wait a little bit before retrying background compaction in
+ // case this is an environmental problem and we do not want to
+ // chew up resources for failed compactions for the duration of
+ // the problem.
+ uint64_t error_cnt =
+ default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
+ bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
+ mutex_.Unlock();
+ log_buffer.FlushBufferToLog();
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Waiting after background compaction error: %s, "
+ "Accumulated background error counts: %" PRIu64,
+ s.ToString().c_str(), error_cnt);
+ LogFlush(immutable_db_options_.info_log);
+ immutable_db_options_.clock->SleepForMicroseconds(1000000);
+ mutex_.Lock();
+ } else if (s.IsManualCompactionPaused()) {
+ assert(prepicked_compaction);
+ ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
+ assert(m);
+ ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
+ m->cfd->GetName().c_str(), job_context.job_id);
+ }
+
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // If compaction failed, we want to delete all temporary files that we
+ // might have created (they might not be all recorded in job_context in
+ // case of a failure). Thus, we force full scan in FindObsoleteFiles()
+ FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
+ !s.IsManualCompactionPaused() &&
+ !s.IsColumnFamilyDropped() &&
+ !s.IsBusy());
+ TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles");
+
+ // delete unnecessary files if any, this is done outside the mutex
+ if (job_context.HaveSomethingToClean() ||
+ job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
+ mutex_.Unlock();
+ // Have to flush the info logs before bg_compaction_scheduled_--
+ // because if bg_compaction_scheduled_ becomes 0 and the lock is
+ // released, the destructor of DB can kick in and destroy all the
+ // state of the DB, so info_log might not be available after that point.
+ // The same applies to accessing other state that the DB owns.
+ log_buffer.FlushBufferToLog();
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles");
+ }
+ job_context.Clean();
+ mutex_.Lock();
+ }
+
+ assert(num_running_compactions_ > 0);
+ num_running_compactions_--;
+
+ if (bg_thread_pri == Env::Priority::LOW) {
+ bg_compaction_scheduled_--;
+ } else {
+ assert(bg_thread_pri == Env::Priority::BOTTOM);
+ bg_bottom_compaction_scheduled_--;
+ }
+
+ // See if there's more work to be done
+ MaybeScheduleFlushOrCompaction();
+
+ if (prepicked_compaction != nullptr &&
+ prepicked_compaction->task_token != nullptr) {
+ // Releasing task tokens affects (and asserts on) the DB state, so
+ // must be done before we potentially signal the DB close process to
+ // proceed below.
+ prepicked_compaction->task_token.reset();
+ }
+
+ if (made_progress ||
+ (bg_compaction_scheduled_ == 0 &&
+ bg_bottom_compaction_scheduled_ == 0) ||
+ HasPendingManualCompaction() || unscheduled_compactions_ == 0) {
+ // signal if
+ // * made_progress -- need to wake up DelayWrite
+ // * bg_{bottom,}_compaction_scheduled_ == 0 -- need to wake up ~DBImpl
+ // * HasPendingManualCompaction -- need to wake up RunManualCompaction
+ // If none of this is true, there is no need to signal since nobody is
+ // waiting for it
+ bg_cv_.SignalAll();
+ }
+ // IMPORTANT: there should be no code after calling SignalAll. This call may
+ // signal the DB destructor that it's OK to proceed with destruction. In
+ // that case, all DB variables will be deallocated and referencing them
+ // will cause trouble.
+ }
+}
+
+Status DBImpl::BackgroundCompaction(bool* made_progress,
+ JobContext* job_context,
+ LogBuffer* log_buffer,
+ PrepickedCompaction* prepicked_compaction,
+ Env::Priority thread_pri) {
+ ManualCompactionState* manual_compaction =
+ prepicked_compaction == nullptr
+ ? nullptr
+ : prepicked_compaction->manual_compaction_state;
+ *made_progress = false;
+ mutex_.AssertHeld();
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start");
+
+ bool is_manual = (manual_compaction != nullptr);
+ std::unique_ptr<Compaction> c;
+ if (prepicked_compaction != nullptr &&
+ prepicked_compaction->compaction != nullptr) {
+ c.reset(prepicked_compaction->compaction);
+ }
+ bool is_prepicked = is_manual || c;
+
+ // (manual_compaction->in_progress == false);
+ bool trivial_move_disallowed =
+ is_manual && manual_compaction->disallow_trivial_move;
+
+ CompactionJobStats compaction_job_stats;
+ Status status;
+ if (!error_handler_.IsBGWorkStopped()) {
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ status = Status::ShutdownInProgress();
+ } else if (is_manual &&
+ manual_compaction->canceled.load(std::memory_order_acquire)) {
+ status = Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ } else {
+ status = error_handler_.GetBGError();
+ // If we get here, it means a hard error happened after this compaction
+ // was scheduled by MaybeScheduleFlushOrCompaction(), but before it got
+ // a chance to execute. Since we didn't pop a cfd from the compaction
+ // queue, increment unscheduled_compactions_
+ unscheduled_compactions_++;
+ }
+
+ if (!status.ok()) {
+ if (is_manual) {
+ manual_compaction->status = status;
+ manual_compaction->done = true;
+ manual_compaction->in_progress = false;
+ manual_compaction = nullptr;
+ }
+ if (c) {
+ c->ReleaseCompactionFiles(status);
+ c.reset();
+ }
+ return status;
+ }
+
+ if (is_manual) {
+ // another thread cannot pick up the same work
+ manual_compaction->in_progress = true;
+ }
+
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:InProgress");
+
+ std::unique_ptr<TaskLimiterToken> task_token;
+
+ // InternalKey manual_end_storage;
+ // InternalKey* manual_end = &manual_end_storage;
+ bool sfm_reserved_compact_space = false;
+ if (is_manual) {
+ ManualCompactionState* m = manual_compaction;
+ assert(m->in_progress);
+ if (!c) {
+ m->done = true;
+ m->manual_end = nullptr;
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Manual compaction from level-%d from %s .. "
+ "%s; nothing to do\n",
+ m->cfd->GetName().c_str(), m->input_level,
+ (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
+ (m->end ? m->end->DebugString(true).c_str() : "(end)"));
+ } else {
+ // First check if we have enough room to do the compaction
+ bool enough_room = EnoughRoomForCompaction(
+ m->cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // Then don't do the compaction
+ c->ReleaseCompactionFiles(status);
+ c.reset();
+ // m's vars will get set properly at the end of this function,
+ // as long as status == CompactionTooLarge
+ status = Status::CompactionTooLarge();
+ } else {
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Manual compaction from level-%d to level-%d from %s .. "
+ "%s; will stop at %s\n",
+ m->cfd->GetName().c_str(), m->input_level, c->output_level(),
+ (m->begin ? m->begin->DebugString(true).c_str() : "(begin)"),
+ (m->end ? m->end->DebugString(true).c_str() : "(end)"),
+ ((m->done || m->manual_end == nullptr)
+ ? "(end)"
+ : m->manual_end->DebugString(true).c_str()));
+ }
+ }
+ } else if (!is_prepicked && !compaction_queue_.empty()) {
+ if (HasExclusiveManualCompaction()) {
+ // Can't compact right now, but try again later
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction()::Conflict");
+
+ // Stay in the compaction queue.
+ unscheduled_compactions_++;
+
+ return Status::OK();
+ }
+
+ auto cfd = PickCompactionFromQueue(&task_token, log_buffer);
+ if (cfd == nullptr) {
+ // Can't find any executable task from the compaction queue.
+ // All tasks have been throttled by compaction thread limiter.
+ ++unscheduled_compactions_;
+ return Status::Busy();
+ }
+
+ // We unreference here because the following code will take a Ref() on
+ // this cfd if it is going to use it (Compaction class holds a
+ // reference).
+ // This will all happen under a mutex so we don't have to be afraid of
+ // somebody else deleting it.
+ if (cfd->UnrefAndTryDelete()) {
+ // This was the last reference of the column family, so no need to
+ // compact.
+ return Status::OK();
+ }
+
+ // Pick up latest mutable CF Options and use it throughout the
+ // compaction job
+ // Compaction makes a copy of the latest MutableCFOptions. It should be used
+ // throughout the compaction procedure to ensure consistency. It will
+ // eventually be installed into the SuperVersion.
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) {
+ // NOTE: try to avoid unnecessary copy of MutableCFOptions if
+ // compaction is not necessary. Need to make sure mutex is held
+ // until we make a copy in the following code
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction():BeforePickCompaction");
+ c.reset(cfd->PickCompaction(*mutable_cf_options, mutable_db_options_,
+ log_buffer));
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction():AfterPickCompaction");
+
+ if (c != nullptr) {
+ bool enough_room = EnoughRoomForCompaction(
+ cfd, *(c->inputs()), &sfm_reserved_compact_space, log_buffer);
+
+ if (!enough_room) {
+ // Then don't do the compaction
+ c->ReleaseCompactionFiles(status);
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->ComputeCompactionScore(*(c->immutable_options()),
+ *(c->mutable_cf_options()));
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+
+ c.reset();
+ // Don't need to sleep here, because BackgroundCallCompaction
+ // will sleep if !s.ok()
+ status = Status::CompactionTooLarge();
+ } else {
+ // update statistics
+ size_t num_files = 0;
+ for (auto& each_level : *c->inputs()) {
+ num_files += each_level.files.size();
+ }
+ RecordInHistogram(stats_, NUM_FILES_IN_SINGLE_COMPACTION, num_files);
+
+ // There are three things that can change compaction score:
+ // 1) When flush or compaction finish. This case is covered by
+ // InstallSuperVersionAndScheduleWork
+ // 2) When MutableCFOptions changes. This case is also covered by
+ // InstallSuperVersionAndScheduleWork, because this is when the new
+ // options take effect.
+ // 3) When we Pick a new compaction, we "remove" those files being
+ // compacted from the calculation, which then influences compaction
+ // score. Here we check if we need the new compaction even without the
+ // files that are currently being compacted. If we need another
+ // compaction, we might be able to execute it in parallel, so we add
+ // it to the queue and schedule a new thread.
+ if (cfd->NeedsCompaction()) {
+ // Yes, we need more compactions!
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ MaybeScheduleFlushOrCompaction();
+ }
+ }
+ }
+ }
+ }
+
+ IOStatus io_s;
+ if (!c) {
+ // Nothing to do
+ ROCKS_LOG_BUFFER(log_buffer, "Compaction nothing to do");
+ } else if (c->deletion_compaction()) {
+ // TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
+ // file if there is a live snapshot pointing to it
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ assert(c->num_input_files(1) == 0);
+ assert(c->column_family_data()->ioptions()->compaction_style ==
+ kCompactionStyleFIFO);
+
+ compaction_job_stats.num_input_files = c->num_input_files(0);
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ for (const auto& f : *c->inputs(0)) {
+ c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
+ }
+ status = versions_->LogAndApply(c->column_family_data(),
+ *c->mutable_cf_options(), c->edit(),
+ &mutex_, directories_.GetDbDir());
+ io_s = versions_->io_status();
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ ROCKS_LOG_BUFFER(log_buffer, "[%s] Deleted %d files\n",
+ c->column_family_data()->GetName().c_str(),
+ c->num_input_files(0));
+ *made_progress = true;
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ } else if (!trivial_move_disallowed && c->IsTrivialMove()) {
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove");
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ // Instrument for event update
+ // TODO(yhchiang): add op details for showing trivial-move.
+ ThreadStatusUtil::SetColumnFamily(
+ c->column_family_data(), c->column_family_data()->ioptions()->env,
+ immutable_db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+
+ compaction_job_stats.num_input_files = c->num_input_files(0);
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+
+ // Move files to next level
+ int32_t moved_files = 0;
+ int64_t moved_bytes = 0;
+ for (unsigned int l = 0; l < c->num_input_levels(); l++) {
+ if (c->level(l) == c->output_level()) {
+ continue;
+ }
+ for (size_t i = 0; i < c->num_input_files(l); i++) {
+ FileMetaData* f = c->input(l, i);
+ c->edit()->DeleteFile(c->level(l), f->fd.GetNumber());
+ c->edit()->AddFile(
+ c->output_level(), f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno,
+ f->fd.largest_seqno, f->marked_for_compaction, f->temperature,
+ f->oldest_blob_file_number, f->oldest_ancester_time,
+ f->file_creation_time, f->file_checksum, f->file_checksum_func_name,
+ f->unique_id);
+
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Moving #%" PRIu64 " to level-%d %" PRIu64 " bytes\n",
+ c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
+ c->output_level(), f->fd.GetFileSize());
+ ++moved_files;
+ moved_bytes += f->fd.GetFileSize();
+ }
+ }
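+ // For round-robin compaction priority, record the updated compaction cursor
+ // so the next compaction at this level starts after the files just moved.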
+ if (c->compaction_reason() == CompactionReason::kLevelMaxLevelSize &&
+ c->immutable_options()->compaction_pri == kRoundRobin) {
+ int start_level = c->start_level();
+ if (start_level > 0) {
+ auto vstorage = c->input_version()->storage_info();
+ c->edit()->AddCompactCursor(
+ start_level,
+ vstorage->GetNextCompactCursor(start_level, c->num_input_files(0)));
+ }
+ }
+ status = versions_->LogAndApply(c->column_family_data(),
+ *c->mutable_cf_options(), c->edit(),
+ &mutex_, directories_.GetDbDir());
+ io_s = versions_->io_status();
+ // Use latest MutableCFOptions
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ c->column_family_data()->internal_stats()->IncBytesMoved(c->output_level(),
+ moved_bytes);
+ {
+ event_logger_.LogToBuffer(log_buffer)
+ << "job" << job_context->job_id << "event"
+ << "trivial_move"
+ << "destination_level" << c->output_level() << "files" << moved_files
+ << "total_files_size" << moved_bytes;
+ }
+ ROCKS_LOG_BUFFER(
+ log_buffer,
+ "[%s] Moved #%d files to level-%d %" PRIu64 " bytes %s: %s\n",
+ c->column_family_data()->GetName().c_str(), moved_files,
+ c->output_level(), moved_bytes, status.ToString().c_str(),
+ c->column_family_data()->current()->storage_info()->LevelSummary(&tmp));
+ *made_progress = true;
+
+ // Clear Instrument
+ ThreadStatusUtil::ResetThreadStatus();
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ } else if (!is_prepicked && c->output_level() > 0 &&
+ c->output_level() ==
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->MaxOutputLevel(
+ immutable_db_options_.allow_ingest_behind) &&
+ env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
+ // Forward compactions involving last level to the bottom pool if it exists,
+ // such that compactions unlikely to contribute to write stalls can be
+ // delayed or deprioritized.
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:ForwardToBottomPriPool");
+ CompactionArg* ca = new CompactionArg;
+ ca->db = this;
+ ca->compaction_pri_ = Env::Priority::BOTTOM;
+ ca->prepicked_compaction = new PrepickedCompaction;
+ ca->prepicked_compaction->compaction = c.release();
+ ca->prepicked_compaction->manual_compaction_state = nullptr;
+ // Transfer the requested token so the bottom-priority job doesn't need to
+ // request it again.
+ ca->prepicked_compaction->task_token = std::move(task_token);
+ ++bg_bottom_compaction_scheduled_;
+ env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca, Env::Priority::BOTTOM,
+ this, &DBImpl::UnscheduleCompactionCallback);
+ } else {
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:BeforeCompaction",
+ c->column_family_data());
+ int output_level __attribute__((__unused__));
+ output_level = c->output_level();
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:NonTrivial",
+ &output_level);
+ std::vector<SequenceNumber> snapshot_seqs;
+ SequenceNumber earliest_write_conflict_snapshot;
+ SnapshotChecker* snapshot_checker;
+ GetSnapshotContext(job_context, &snapshot_seqs,
+ &earliest_write_conflict_snapshot, &snapshot_checker);
+ assert(is_snapshot_supported_ || snapshots_.empty());
+
+ CompactionJob compaction_job(
+ job_context->job_id, c.get(), immutable_db_options_,
+ mutable_db_options_, file_options_for_compaction_, versions_.get(),
+ &shutting_down_, log_buffer, directories_.GetDbDir(),
+ GetDataDir(c->column_family_data(), c->output_path_id()),
+ GetDataDir(c->column_family_data(), 0), stats_, &mutex_,
+ &error_handler_, snapshot_seqs, earliest_write_conflict_snapshot,
+ snapshot_checker, job_context, table_cache_, &event_logger_,
+ c->mutable_cf_options()->paranoid_file_checks,
+ c->mutable_cf_options()->report_bg_io_stats, dbname_,
+ &compaction_job_stats, thread_pri, io_tracer_,
+ is_manual ? manual_compaction->canceled
+ : kManualCompactionCanceledFalse_,
+ db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(),
+ c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_,
+ &bg_bottom_compaction_scheduled_);
+ compaction_job.Prepare();
+
+ NotifyOnCompactionBegin(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+ mutex_.Unlock();
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr);
+ // Should we handle the error?
+ compaction_job.Run().PermitUncheckedError();
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun");
+ mutex_.Lock();
+
+ status = compaction_job.Install(*c->mutable_cf_options());
+ io_s = compaction_job.io_status();
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(c->column_family_data(),
+ &job_context->superversion_contexts[0],
+ *c->mutable_cf_options());
+ }
+ *made_progress = true;
+ TEST_SYNC_POINT_CALLBACK("DBImpl::BackgroundCompaction:AfterCompaction",
+ c->column_family_data());
+ }
+
+ if (status.ok() && !io_s.ok()) {
+ status = io_s;
+ } else {
+ io_s.PermitUncheckedError();
+ }
+
+ if (c != nullptr) {
+ c->ReleaseCompactionFiles(status);
+ *made_progress = true;
+
+#ifndef ROCKSDB_LITE
+ // Need to make sure SstFileManager does its bookkeeping
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ immutable_db_options_.sst_file_manager.get());
+ if (sfm && sfm_reserved_compact_space) {
+ sfm->OnCompactionCompletion(c.get());
+ }
+#endif // ROCKSDB_LITE
+
+ NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status,
+ compaction_job_stats, job_context->job_id);
+ }
+
+ if (status.ok() || status.IsCompactionTooLarge() ||
+ status.IsManualCompactionPaused()) {
+ // Done
+ } else if (status.IsColumnFamilyDropped() || status.IsShutdownInProgress()) {
+ // Ignore compaction errors found during shutting down
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log, "Compaction error: %s",
+ status.ToString().c_str());
+ if (!io_s.ok()) {
+ // Error while writing to MANIFEST.
+ // In fact, versions_->io_status() can also be the result of renaming the
+ // CURRENT file. With the current code, it's just difficult to tell. So just
+ // be pessimistic and try writing to a new MANIFEST.
+ // TODO: distinguish between MANIFEST write and CURRENT renaming
+ auto err_reason = versions_->io_status().ok()
+ ? BackgroundErrorReason::kCompaction
+ : BackgroundErrorReason::kManifestWrite;
+ error_handler_.SetBGError(io_s, err_reason);
+ } else {
+ error_handler_.SetBGError(status, BackgroundErrorReason::kCompaction);
+ }
+ if (c != nullptr && !is_manual && !error_handler_.IsBGWorkStopped()) {
+ // Put this cfd back in the compaction queue so we can retry after some
+ // time
+ auto cfd = c->column_family_data();
+ assert(cfd != nullptr);
+ // Since this compaction failed, we need to recompute the score so it
+ // takes the original input files into account
+ c->column_family_data()
+ ->current()
+ ->storage_info()
+ ->ComputeCompactionScore(*(c->immutable_options()),
+ *(c->mutable_cf_options()));
+ if (!cfd->queued_for_compaction()) {
+ AddToCompactionQueue(cfd);
+ ++unscheduled_compactions_;
+ }
+ }
+ }
+ // this will unref its input_version and column_family_data
+ c.reset();
+
+ if (is_manual) {
+ ManualCompactionState* m = manual_compaction;
+ if (!status.ok()) {
+ m->status = status;
+ m->done = true;
+ }
+ // For universal compaction:
+ // Universal compaction always happens at level 0, so one
+ // compaction will pick up all overlapped files. No files will be
+ // filtered out due to the size limit and left for a successive compaction.
+ // So we can safely conclude the current compaction.
+ //
+ // Also note that, if we don't stop here, then the current compaction
+ // writes a new file back to level 0, which will be used in successive
+ // compaction. Hence the manual compaction will never finish.
+ //
+ // Stop the compaction if manual_end points to nullptr -- this means
+ // that we compacted the whole range. manual_end should always point
+ // to nullptr in case of universal compaction
+ if (m->manual_end == nullptr) {
+ m->done = true;
+ }
+ if (!m->done) {
+ // We only compacted part of the requested range. Update *m
+ // to the range that is left to be compacted.
+ // Universal and FIFO compactions should always compact the whole range
+ assert(m->cfd->ioptions()->compaction_style !=
+ kCompactionStyleUniversal ||
+ m->cfd->ioptions()->num_levels > 1);
+ assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
+ m->tmp_storage = *m->manual_end;
+ m->begin = &m->tmp_storage;
+ m->incomplete = true;
+ }
+ m->in_progress = false; // not being processed anymore
+ }
+ TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Finish");
+ return status;
+}
+
+bool DBImpl::HasPendingManualCompaction() {
+ return (!manual_compaction_dequeue_.empty());
+}
+
+void DBImpl::AddManualCompaction(DBImpl::ManualCompactionState* m) {
+ assert(manual_compaction_paused_ == 0);
+ manual_compaction_dequeue_.push_back(m);
+}
+
+void DBImpl::RemoveManualCompaction(DBImpl::ManualCompactionState* m) {
+ // Remove from queue
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if (m == (*it)) {
+ it = manual_compaction_dequeue_.erase(it);
+ return;
+ }
+ ++it;
+ }
+ assert(false);
+ return;
+}
+
+bool DBImpl::ShouldntRunManualCompaction(ManualCompactionState* m) {
+ if (num_running_ingest_file_ > 0) {
+ // We need to wait for other IngestExternalFile() calls to finish
+ // before running a manual compaction.
+ return true;
+ }
+ if (m->exclusive) {
+ return (bg_bottom_compaction_scheduled_ > 0 ||
+ bg_compaction_scheduled_ > 0);
+ }
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ bool seen = false;
+ while (it != manual_compaction_dequeue_.end()) {
+ if (m == (*it)) {
+ ++it;
+ seen = true;
+ continue;
+ } else if (MCOverlap(m, (*it)) && (!seen && !(*it)->in_progress)) {
+ // The other manual compaction *it conflicts with m if it
+ // overlaps with m, is ahead of m in the queue, and is not yet
+ // in progress.
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::HaveManualCompaction(ColumnFamilyData* cfd) {
+ // Scan the manual compaction queue
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if ((*it)->exclusive) {
+ return true;
+ }
+ if ((cfd == (*it)->cfd) && (!((*it)->in_progress || (*it)->done))) {
+ // A manual compaction for this CF is queued but not yet started;
+ // in-progress or finished manual compactions do not block automatic
+ // compactions.
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::HasExclusiveManualCompaction() {
+ // Scan the manual compaction queue
+ std::deque<ManualCompactionState*>::iterator it =
+ manual_compaction_dequeue_.begin();
+ while (it != manual_compaction_dequeue_.end()) {
+ if ((*it)->exclusive) {
+ return true;
+ }
+ ++it;
+ }
+ return false;
+}
+
+bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
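+ // Only exclusive manual compactions are treated as conflicting here;
+ // non-exclusive manual compactions never report overlap with each other.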
+ if ((m->exclusive) || (m1->exclusive)) {
+ return true;
+ }
+ if (m->cfd != m1->cfd) {
+ return false;
+ }
+ return false;
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::BuildCompactionJobInfo(
+ const ColumnFamilyData* cfd, Compaction* c, const Status& st,
+ const CompactionJobStats& compaction_job_stats, const int job_id,
+ const Version* current, CompactionJobInfo* compaction_job_info) const {
+ assert(compaction_job_info != nullptr);
+ compaction_job_info->cf_id = cfd->GetID();
+ compaction_job_info->cf_name = cfd->GetName();
+ compaction_job_info->status = st;
+ compaction_job_info->thread_id = env_->GetThreadID();
+ compaction_job_info->job_id = job_id;
+ compaction_job_info->base_input_level = c->start_level();
+ compaction_job_info->output_level = c->output_level();
+ compaction_job_info->stats = compaction_job_stats;
+ compaction_job_info->table_properties = c->GetOutputTableProperties();
+ compaction_job_info->compaction_reason = c->compaction_reason();
+ compaction_job_info->compression = c->output_compression();
+ for (size_t i = 0; i < c->num_input_levels(); ++i) {
+ for (const auto fmd : *c->inputs(i)) {
+ const FileDescriptor& desc = fmd->fd;
+ const uint64_t file_number = desc.GetNumber();
+ auto fn = TableFileName(c->immutable_options()->cf_paths, file_number,
+ desc.GetPathId());
+ compaction_job_info->input_files.push_back(fn);
+ compaction_job_info->input_file_infos.push_back(CompactionFileInfo{
+ static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
+ if (compaction_job_info->table_properties.count(fn) == 0) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = current->GetTableProperties(&tp, fmd, &fn);
+ if (s.ok()) {
+ compaction_job_info->table_properties[fn] = tp;
+ }
+ }
+ }
+ }
+ for (const auto& newf : c->edit()->GetNewFiles()) {
+ const FileMetaData& meta = newf.second;
+ const FileDescriptor& desc = meta.fd;
+ const uint64_t file_number = desc.GetNumber();
+ compaction_job_info->output_files.push_back(TableFileName(
+ c->immutable_options()->cf_paths, file_number, desc.GetPathId()));
+ compaction_job_info->output_file_infos.push_back(CompactionFileInfo{
+ newf.first, file_number, meta.oldest_blob_file_number});
+ }
+ compaction_job_info->blob_compression_type =
+ c->mutable_cf_options()->blob_compression_type;
+
+ // Update BlobFilesInfo.
+ for (const auto& blob_file : c->edit()->GetBlobFileAdditions()) {
+ BlobFileAdditionInfo blob_file_addition_info(
+ BlobFileName(c->immutable_options()->cf_paths.front().path,
+ blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+ blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(),
+ blob_file.GetTotalBlobBytes());
+ compaction_job_info->blob_file_addition_infos.emplace_back(
+ std::move(blob_file_addition_info));
+ }
+
+ // Update BlobFilesGarbageInfo.
+ for (const auto& blob_file : c->edit()->GetBlobFileGarbages()) {
+ BlobFileGarbageInfo blob_file_garbage_info(
+ BlobFileName(c->immutable_options()->cf_paths.front().path,
+ blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+ blob_file.GetBlobFileNumber(), blob_file.GetGarbageBlobCount(),
+ blob_file.GetGarbageBlobBytes());
+ compaction_job_info->blob_file_garbage_infos.emplace_back(
+ std::move(blob_file_garbage_info));
+ }
+}
+#endif
+
+// SuperVersionContext gets created and destructed outside of the lock --
+// we use this conveniently to:
+// * malloc one SuperVersion() outside of the lock -- new_superversion
+// * delete SuperVersion()s outside of the lock -- superversions_to_free
+//
+// However, if InstallSuperVersionAndScheduleWork() gets called twice with the
+// same sv_context, we can't reuse the SuperVersion() that got malloced,
+// because the first call already used it. In that rare case, we take a hit
+// and create a new SuperVersion() inside of the mutex. We do a similar thing
+// for superversions_to_free.
+
+void DBImpl::InstallSuperVersionAndScheduleWork(
+ ColumnFamilyData* cfd, SuperVersionContext* sv_context,
+ const MutableCFOptions& mutable_cf_options) {
+ mutex_.AssertHeld();
+
+ // Update max_total_in_memory_state_
+ size_t old_memtable_size = 0;
+ auto* old_sv = cfd->GetSuperVersion();
+ if (old_sv) {
+ old_memtable_size = old_sv->mutable_cf_options.write_buffer_size *
+ old_sv->mutable_cf_options.max_write_buffer_number;
+ }
+
+ // This branch is unlikely to be taken.
+ if (UNLIKELY(sv_context->new_superversion == nullptr)) {
+ sv_context->NewSuperVersion();
+ }
+ cfd->InstallSuperVersion(sv_context, mutable_cf_options);
+
+ // There may be a small data race here. The snapshot whose release would
+ // trigger bottommost compaction may already have been released here. But
+ // assuming newer snapshots are always created and released frequently, the
+ // compaction will be triggered soon anyway.
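+ // Recompute bottommost_files_mark_threshold_ as the minimum across all
+ // column families that do not allow ingest-behind.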
+ bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+ for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
+ if (!my_cfd->ioptions()->allow_ingest_behind) {
+ bottommost_files_mark_threshold_ = std::min(
+ bottommost_files_mark_threshold_,
+ my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+ }
+ }
+
+ // Whenever we install a new SuperVersion, we might need to issue new flushes
+ // or compactions.
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+
+ // Update max_total_in_memory_state_
+ max_total_in_memory_state_ = max_total_in_memory_state_ - old_memtable_size +
+ mutable_cf_options.write_buffer_size *
+ mutable_cf_options.max_write_buffer_number;
+}
+
+// ShouldPurge is called by FindObsoleteFiles when doing a full scan,
+// and db mutex (mutex_) should already be held.
+// Actually, the current implementation of FindObsoleteFiles with
+// full_scan=true can issue I/O requests to obtain the list of files in
+// directories, e.g. env_->GetChildren(), while holding the db mutex.
+bool DBImpl::ShouldPurge(uint64_t file_number) const {
+ return files_grabbed_for_purge_.find(file_number) ==
+ files_grabbed_for_purge_.end() &&
+ purge_files_.find(file_number) == purge_files_.end();
+}
+
+// MarkAsGrabbedForPurge is called by FindObsoleteFiles, and db mutex
+// (mutex_) should already be held.
+void DBImpl::MarkAsGrabbedForPurge(uint64_t file_number) {
+ files_grabbed_for_purge_.insert(file_number);
+}
+
+void DBImpl::SetSnapshotChecker(SnapshotChecker* snapshot_checker) {
+ InstrumentedMutexLock l(&mutex_);
+ // snapshot_checker_ should only be set once. If we need to set it multiple
+ // times, we need to make sure the old one is not deleted while it is still
+ // being used by a compaction job.
+ assert(!snapshot_checker_);
+ snapshot_checker_.reset(snapshot_checker);
+}
+
+void DBImpl::GetSnapshotContext(
+ JobContext* job_context, std::vector<SequenceNumber>* snapshot_seqs,
+ SequenceNumber* earliest_write_conflict_snapshot,
+ SnapshotChecker** snapshot_checker_ptr) {
+ mutex_.AssertHeld();
+ assert(job_context != nullptr);
+ assert(snapshot_seqs != nullptr);
+ assert(earliest_write_conflict_snapshot != nullptr);
+ assert(snapshot_checker_ptr != nullptr);
+
+ *snapshot_checker_ptr = snapshot_checker_.get();
+ if (use_custom_gc_ && *snapshot_checker_ptr == nullptr) {
+ *snapshot_checker_ptr = DisableGCSnapshotChecker::Instance();
+ }
+ if (*snapshot_checker_ptr != nullptr) {
+ // If snapshot_checker is used, that means the flush/compaction may
+ // contain values not visible to a snapshot taken after the
+ // flush/compaction job starts. Take a snapshot and it will appear
+ // in snapshot_seqs and force the compaction iterator to consider such
+ // snapshots.
+ const Snapshot* job_snapshot =
+ GetSnapshotImpl(false /*write_conflict_boundary*/, false /*lock*/);
+ job_context->job_snapshot.reset(new ManagedSnapshot(this, job_snapshot));
+ }
+ *snapshot_seqs = snapshots_.GetAll(earliest_write_conflict_snapshot);
+}
+
+Status DBImpl::WaitForCompact(bool wait_unscheduled) {
+ // Wait until the compaction completes
+ InstrumentedMutexLock l(&mutex_);
+ while ((bg_bottom_compaction_scheduled_ || bg_compaction_scheduled_ ||
+ bg_flush_scheduled_ ||
+ (wait_unscheduled && unscheduled_compactions_)) &&
+ (error_handler_.GetBGError().ok())) {
+ bg_cv_.Wait();
+ }
+ return error_handler_.GetBGError();
+}
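+
+// For illustration only: WaitForCompact() above follows the standard
+// "wait on a condition variable and re-check the predicate after every
+// wakeup" pattern. Below is a minimal sketch of the same pattern using only
+// the C++ standard library (hypothetical names; it assumes <mutex> and
+// <condition_variable> are available, whereas DBImpl itself uses
+// InstrumentedMutex and InstrumentedCondVar).
+class BackgroundWorkTrackerExample {
+ public:
+  void WaitUntilIdle() {
+    std::unique_lock<std::mutex> lock(mu_);
+    // Re-check the predicate after every wakeup; spurious wakeups are
+    // harmless because the condition is evaluated again under the lock.
+    cv_.wait(lock, [this] { return scheduled_jobs_ == 0; });
+  }
+  void JobScheduled() {
+    std::lock_guard<std::mutex> lock(mu_);
+    ++scheduled_jobs_;
+  }
+  void JobFinished() {
+    {
+      std::lock_guard<std::mutex> lock(mu_);
+      --scheduled_jobs_;
+    }
+    cv_.notify_all();
+  }
+
+ private:
+  std::mutex mu_;
+  std::condition_variable cv_;
+  int scheduled_jobs_ = 0;
+};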
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_debug.cc b/src/rocksdb/db/db_impl/db_impl_debug.cc
new file mode 100644
index 000000000..7054b0669
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_debug.cc
@@ -0,0 +1,312 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef NDEBUG
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/periodic_task_scheduler.h"
+#include "monitoring/thread_status_updater.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+uint64_t DBImpl::TEST_GetLevel0TotalSize() {
+ InstrumentedMutexLock l(&mutex_);
+ return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0);
+}
+
+Status DBImpl::TEST_SwitchWAL() {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ void* writer = TEST_BeginWrite();
+ auto s = SwitchWAL(&write_context);
+ TEST_EndWrite(writer);
+ return s;
+}
+
+uint64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
+ ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ cfd = cfh->cfd();
+ }
+ InstrumentedMutexLock l(&mutex_);
+ return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes();
+}
+
+void DBImpl::TEST_GetFilesMetaData(
+ ColumnFamilyHandle* column_family,
+ std::vector<std::vector<FileMetaData>>* metadata,
+ std::vector<std::shared_ptr<BlobFileMetaData>>* blob_metadata) {
+ assert(metadata);
+
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ assert(cfh);
+
+ auto cfd = cfh->cfd();
+ assert(cfd);
+
+ InstrumentedMutexLock l(&mutex_);
+
+ const auto* current = cfd->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ metadata->resize(NumberLevels());
+
+ for (int level = 0; level < NumberLevels(); ++level) {
+ const std::vector<FileMetaData*>& files = vstorage->LevelFiles(level);
+
+ (*metadata)[level].clear();
+ (*metadata)[level].reserve(files.size());
+
+ for (const auto& f : files) {
+ (*metadata)[level].push_back(*f);
+ }
+ }
+
+ if (blob_metadata) {
+ *blob_metadata = vstorage->GetBlobFiles();
+ }
+}
+
+uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
+ return versions_->manifest_file_number();
+}
+
+uint64_t DBImpl::TEST_Current_Next_FileNo() {
+ return versions_->current_next_file_number();
+}
+
+Status DBImpl::TEST_CompactRange(int level, const Slice* begin,
+ const Slice* end,
+ ColumnFamilyHandle* column_family,
+ bool disallow_trivial_move) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ cfd = cfh->cfd();
+ }
+ int output_level =
+ (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
+ cfd->ioptions()->compaction_style == kCompactionStyleFIFO)
+ ? level
+ : level + 1;
+ return RunManualCompaction(
+ cfd, level, output_level, CompactRangeOptions(), begin, end, true,
+ disallow_trivial_move,
+ std::numeric_limits<uint64_t>::max() /*max_file_num_to_ignore*/,
+ "" /*trim_ts*/);
+}
+
+Status DBImpl::TEST_SwitchMemtable(ColumnFamilyData* cfd) {
+ WriteContext write_context;
+ InstrumentedMutexLock l(&mutex_);
+ if (cfd == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ }
+
+ Status s;
+ void* writer = TEST_BeginWrite();
+ if (two_write_queues_) {
+ WriteThread::Writer nonmem_w;
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ s = SwitchMemtable(cfd, &write_context);
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ } else {
+ s = SwitchMemtable(cfd, &write_context);
+ }
+ TEST_EndWrite(writer);
+ return s;
+}
+
+Status DBImpl::TEST_FlushMemTable(bool wait, bool allow_write_stall,
+ ColumnFamilyHandle* cfh) {
+ FlushOptions fo;
+ fo.wait = wait;
+ fo.allow_write_stall = allow_write_stall;
+ ColumnFamilyData* cfd;
+ if (cfh == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
+ cfd = cfhi->cfd();
+ }
+ return FlushMemTable(cfd, fo, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_FlushMemTable(ColumnFamilyData* cfd,
+ const FlushOptions& flush_opts) {
+ return FlushMemTable(cfd, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_AtomicFlushMemTables(
+ const autovector<ColumnFamilyData*>& cfds, const FlushOptions& flush_opts) {
+ return AtomicFlushMemTables(cfds, flush_opts, FlushReason::kTest);
+}
+
+Status DBImpl::TEST_WaitForBackgroundWork() {
+ InstrumentedMutexLock l(&mutex_);
+ WaitForBackgroundWork();
+ return error_handler_.GetBGError();
+}
+
+Status DBImpl::TEST_WaitForFlushMemTable(ColumnFamilyHandle* column_family) {
+ ColumnFamilyData* cfd;
+ if (column_family == nullptr) {
+ cfd = default_cf_handle_->cfd();
+ } else {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ cfd = cfh->cfd();
+ }
+ return WaitForFlushMemTable(cfd, nullptr, false);
+}
+
+Status DBImpl::TEST_WaitForCompact(bool wait_unscheduled) {
+ // Wait until the compaction completes
+ return WaitForCompact(wait_unscheduled);
+}
+
+Status DBImpl::TEST_WaitForPurge() {
+ InstrumentedMutexLock l(&mutex_);
+ while (bg_purge_scheduled_ && error_handler_.GetBGError().ok()) {
+ bg_cv_.Wait();
+ }
+ return error_handler_.GetBGError();
+}
+
+Status DBImpl::TEST_GetBGError() {
+ InstrumentedMutexLock l(&mutex_);
+ return error_handler_.GetBGError();
+}
+
+void DBImpl::TEST_LockMutex() { mutex_.Lock(); }
+
+void DBImpl::TEST_UnlockMutex() { mutex_.Unlock(); }
+
+void* DBImpl::TEST_BeginWrite() {
+ auto w = new WriteThread::Writer();
+ write_thread_.EnterUnbatched(w, &mutex_);
+ return reinterpret_cast<void*>(w);
+}
+
+void DBImpl::TEST_EndWrite(void* w) {
+ auto writer = reinterpret_cast<WriteThread::Writer*>(w);
+ write_thread_.ExitUnbatched(writer);
+ delete writer;
+}
+
+size_t DBImpl::TEST_LogsToFreeSize() {
+ InstrumentedMutexLock l(&log_write_mutex_);
+ return logs_to_free_.size();
+}
+
+uint64_t DBImpl::TEST_LogfileNumber() {
+ InstrumentedMutexLock l(&mutex_);
+ return logfile_number_;
+}
+
+Status DBImpl::TEST_GetAllImmutableCFOptions(
+ std::unordered_map<std::string, const ImmutableCFOptions*>* iopts_map) {
+ std::vector<std::string> cf_names;
+ std::vector<const ImmutableCFOptions*> iopts;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cf_names.push_back(cfd->GetName());
+ iopts.push_back(cfd->ioptions());
+ }
+ }
+ iopts_map->clear();
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ iopts_map->insert({cf_names[i], iopts[i]});
+ }
+
+ return Status::OK();
+}
+
+uint64_t DBImpl::TEST_FindMinLogContainingOutstandingPrep() {
+ return logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+}
+
+size_t DBImpl::TEST_PreparedSectionCompletedSize() {
+ return logs_with_prep_tracker_.TEST_PreparedSectionCompletedSize();
+}
+
+size_t DBImpl::TEST_LogsWithPrepSize() {
+ return logs_with_prep_tracker_.TEST_LogsWithPrepSize();
+}
+
+uint64_t DBImpl::TEST_FindMinPrepLogReferencedByMemTable() {
+ autovector<MemTable*> empty_list;
+ return FindMinPrepLogReferencedByMemTable(versions_.get(), empty_list);
+}
+
+Status DBImpl::TEST_GetLatestMutableCFOptions(
+ ColumnFamilyHandle* column_family, MutableCFOptions* mutable_cf_options) {
+ InstrumentedMutexLock l(&mutex_);
+
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ *mutable_cf_options = *cfh->cfd()->GetLatestMutableCFOptions();
+ return Status::OK();
+}
+
+int DBImpl::TEST_BGCompactionsAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_compactions;
+}
+
+int DBImpl::TEST_BGFlushesAllowed() const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetBGJobLimits().max_flushes;
+}
+
+SequenceNumber DBImpl::TEST_GetLastVisibleSequence() const {
+ if (last_seq_same_as_publish_seq_) {
+ return versions_->LastSequence();
+ } else {
+ return versions_->LastAllocatedSequence();
+ }
+}
+
+size_t DBImpl::TEST_GetWalPreallocateBlockSize(
+ uint64_t write_buffer_size) const {
+ InstrumentedMutexLock l(&mutex_);
+ return GetWalPreallocateBlockSize(write_buffer_size);
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::TEST_WaitForPeridicTaskRun(std::function<void()> callback) const {
+ periodic_task_scheduler_.TEST_WaitForRun(callback);
+}
+
+const PeriodicTaskScheduler& DBImpl::TEST_GetPeriodicTaskScheduler() const {
+ return periodic_task_scheduler_;
+}
+
+SeqnoToTimeMapping DBImpl::TEST_GetSeqnoToTimeMapping() const {
+ InstrumentedMutexLock l(&mutex_);
+ return seqno_time_mapping_;
+}
+
+#endif // !ROCKSDB_LITE
+
+size_t DBImpl::TEST_EstimateInMemoryStatsHistorySize() const {
+ return EstimateInMemoryStatsHistorySize();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // NDEBUG
diff --git a/src/rocksdb/db/db_impl/db_impl_experimental.cc b/src/rocksdb/db/db_impl/db_impl_experimental.cc
new file mode 100644
index 000000000..c1b1e4137
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_experimental.cc
@@ -0,0 +1,158 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <cinttypes>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "logging/logging.h"
+#include "rocksdb/status.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::SuggestCompactRange(ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ InternalKey start_key, end_key;
+ if (begin != nullptr) {
+ start_key.SetMinPossibleForUserKey(*begin);
+ }
+ if (end != nullptr) {
+ end_key.SetMaxPossibleForUserKey(*end);
+ }
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto vstorage = cfd->current()->storage_info();
+ for (int level = 0; level < vstorage->num_non_empty_levels() - 1; ++level) {
+ std::vector<FileMetaData*> inputs;
+ vstorage->GetOverlappingInputs(
+ level, begin == nullptr ? nullptr : &start_key,
+ end == nullptr ? nullptr : &end_key, &inputs);
+ for (auto f : inputs) {
+ f->marked_for_compaction = true;
+ }
+ }
+ // Since we have some more files to compact, we should also recompute
+ // compaction score
+ vstorage->ComputeCompactionScore(*cfd->ioptions(),
+ *cfd->GetLatestMutableCFOptions());
+ SchedulePendingCompaction(cfd);
+ MaybeScheduleFlushOrCompaction();
+ }
+ return Status::OK();
+}
+
+Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) {
+ assert(column_family);
+
+ if (target_level < 1) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Invalid target level %d\n", target_level);
+ return Status::InvalidArgument("Invalid target level");
+ }
+
+ Status status;
+ VersionEdit edit;
+ JobContext job_context(next_job_id_.fetch_add(1), true);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ auto* cfd = static_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
+ const auto* vstorage = cfd->current()->storage_info();
+
+ if (target_level >= vstorage->num_levels()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Target level %d does not exist\n",
+ target_level);
+ job_context.Clean();
+ status = Status::InvalidArgument("Target level does not exist");
+ return status;
+ }
+
+ // Sort L0 files by range.
+ const InternalKeyComparator* icmp = &cfd->internal_comparator();
+ auto l0_files = vstorage->LevelFiles(0);
+ std::sort(l0_files.begin(), l0_files.end(),
+ [icmp](FileMetaData* f1, FileMetaData* f2) {
+ return icmp->Compare(f1->largest, f2->largest) < 0;
+ });
+
+ // Check that no L0 file is being compacted and that they have
+ // non-overlapping ranges.
+ for (size_t i = 0; i < l0_files.size(); ++i) {
+ auto f = l0_files[i];
+ if (f->being_compacted) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. File %" PRIu64 " being compacted\n",
+ f->fd.GetNumber());
+ job_context.Clean();
+ status =
+ Status::InvalidArgument("PromoteL0 called during L0 compaction");
+ return status;
+ }
+
+ if (i == 0) continue;
+ auto prev_f = l0_files[i - 1];
+ if (icmp->Compare(prev_f->largest, f->smallest) >= 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Files %" PRIu64 " and %" PRIu64
+ " have overlapping ranges\n",
+ prev_f->fd.GetNumber(), f->fd.GetNumber());
+ job_context.Clean();
+ status = Status::InvalidArgument("L0 has overlapping files");
+ return status;
+ }
+ }
+
+ // Check that all levels up to target_level are empty.
+ for (int level = 1; level <= target_level; ++level) {
+ if (vstorage->NumLevelFiles(level) > 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "PromoteL0 FAILED. Level %d not empty\n", level);
+ job_context.Clean();
+ status = Status::InvalidArgument(
+ "All levels up to target_level "
+ "must be empty");
+ return status;
+ }
+ }
+
+ edit.SetColumnFamily(cfd->GetID());
+ for (const auto& f : l0_files) {
+ edit.DeleteFile(0, f->fd.GetNumber());
+ edit.AddFile(target_level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->temperature,
+ f->oldest_blob_file_number, f->oldest_ancester_time,
+ f->file_creation_time, f->file_checksum,
+ f->file_checksum_func_name, f->unique_id);
+ }
+
+ status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, directories_.GetDbDir());
+ if (status.ok()) {
+ InstallSuperVersionAndScheduleWork(cfd,
+ &job_context.superversion_contexts[0],
+ *cfd->GetLatestMutableCFOptions());
+ }
+ } // lock released here
+ LogFlush(immutable_db_options_.info_log);
+ job_context.Clean();
+
+ return status;
+}
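+
+// For illustration only: applications typically reach PromoteL0() and
+// SuggestCompactRange() through the free functions declared in
+// "rocksdb/experimental.h". The sketch below assumes that header is included
+// and exposes the signatures used here (consult the header for the
+// authoritative API); error handling is intentionally minimal.
+void PromoteAllL0FilesExample(DB* db, ColumnFamilyHandle* cf) {
+  // Hint that the whole key range of this column family is worth compacting.
+  Status s = experimental::SuggestCompactRange(db, cf, /*begin=*/nullptr,
+                                               /*end=*/nullptr);
+  if (s.ok()) {
+    // Move the current non-overlapping, not-being-compacted L0 files straight
+    // to the target level, subject to the checks implemented above.
+    s = experimental::PromoteL0(db, cf, /*target_level=*/1);
+  }
+}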
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_files.cc b/src/rocksdb/db/db_impl/db_impl_files.cc
new file mode 100644
index 000000000..058df4da7
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_files.cc
@@ -0,0 +1,1013 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+#include <set>
+#include <unordered_set>
+
+#include "db/db_impl/db_impl.h"
+#include "db/event_helpers.h"
+#include "db/memtable_list.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "util/autovector.h"
+#include "util/defer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t DBImpl::MinLogNumberToKeep() {
+ return versions_->min_log_number_to_keep();
+}
+
+uint64_t DBImpl::MinObsoleteSstNumberToKeep() {
+ mutex_.AssertHeld();
+ if (!pending_outputs_.empty()) {
+ return *pending_outputs_.begin();
+ }
+ return std::numeric_limits<uint64_t>::max();
+}
+
+Status DBImpl::DisableFileDeletions() {
+ Status s;
+ int my_disable_delete_obsolete_files;
+ {
+ InstrumentedMutexLock l(&mutex_);
+ s = DisableFileDeletionsWithLock();
+ my_disable_delete_obsolete_files = disable_delete_obsolete_files_;
+ }
+ if (my_disable_delete_obsolete_files == 1) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Disabled");
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "File Deletions Disabled, but already disabled. Counter: %d",
+ my_disable_delete_obsolete_files);
+ }
+ return s;
+}
+
+// FIXME: can be inconsistent with DisableFileDeletions in cases like
+// DBImplReadOnly
+Status DBImpl::DisableFileDeletionsWithLock() {
+ mutex_.AssertHeld();
+ ++disable_delete_obsolete_files_;
+ return Status::OK();
+}
+
+Status DBImpl::EnableFileDeletions(bool force) {
+ // Job id == 0 means that this is not one of our background processes, but
+ // rather a user thread.
+ JobContext job_context(0);
+ int saved_counter; // initialize on all paths
+ {
+ InstrumentedMutexLock l(&mutex_);
+ if (force) {
+ // if force, we need to enable file deletions right away
+ disable_delete_obsolete_files_ = 0;
+ } else if (disable_delete_obsolete_files_ > 0) {
+ --disable_delete_obsolete_files_;
+ }
+ saved_counter = disable_delete_obsolete_files_;
+ if (saved_counter == 0) {
+ FindObsoleteFiles(&job_context, true);
+ bg_cv_.SignalAll();
+ }
+ }
+ if (saved_counter == 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "File Deletions Enabled");
+ if (job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(job_context);
+ }
+ } else {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "File Deletions Enable, but not really enabled. Counter: %d",
+ saved_counter);
+ }
+ job_context.Clean();
+ LogFlush(immutable_db_options_.info_log);
+ return Status::OK();
+}
+
+bool DBImpl::IsFileDeletionsEnabled() const {
+ return 0 == disable_delete_obsolete_files_;
+}
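+
+// For illustration only: Disable/EnableFileDeletions() above behave like a
+// reference-counted latch. A minimal sketch of the counter semantics, with
+// hypothetical names and without any locking or I/O:
+class FileDeletionGuardExample {
+ public:
+  void Disable() { ++disable_count_; }
+
+  // With force == true the counter is reset, re-enabling deletions even if
+  // Disable() was called several times; otherwise a single Disable() is
+  // undone.
+  void Enable(bool force) {
+    if (force) {
+      disable_count_ = 0;
+    } else if (disable_count_ > 0) {
+      --disable_count_;
+    }
+  }
+
+  // Deletions are allowed only when every Disable() has been matched.
+  bool DeletionsEnabled() const { return disable_count_ == 0; }
+
+ private:
+  int disable_count_ = 0;
+};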
+
+// * Returns the list of live files in 'sst_live' and 'blob_live'.
+// If it is doing a full scan:
+// * Returns the list of all files in the filesystem in
+// 'full_scan_candidate_files'.
+// Otherwise, gets obsolete files from VersionSet.
+// no_full_scan = true -- never do the full scan using GetChildren()
+// force = false -- don't force the full scan, except once every
+// mutable_db_options_.delete_obsolete_files_period_micros
+// force = true -- force the full scan
+void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
+ bool no_full_scan) {
+ mutex_.AssertHeld();
+
+ // if deletion is disabled, do nothing
+ if (disable_delete_obsolete_files_ > 0) {
+ return;
+ }
+
+ bool doing_the_full_scan = false;
+
+ // logic for figuring out if we're doing the full scan
+ if (no_full_scan) {
+ doing_the_full_scan = false;
+ } else if (force ||
+ mutable_db_options_.delete_obsolete_files_period_micros == 0) {
+ doing_the_full_scan = true;
+ } else {
+ const uint64_t now_micros = immutable_db_options_.clock->NowMicros();
+ if ((delete_obsolete_files_last_run_ +
+ mutable_db_options_.delete_obsolete_files_period_micros) <
+ now_micros) {
+ doing_the_full_scan = true;
+ delete_obsolete_files_last_run_ = now_micros;
+ }
+ }
+
+ // Don't delete files that might currently be written to by compaction
+ // threads.
+ // Since job_context->min_pending_output is set here, mutex_ cannot be
+ // released until the file scan finishes. Otherwise, we might see no
+ // min_pending_output here but later find newly generated, unfinalized files
+ // while scanning.
+ job_context->min_pending_output = MinObsoleteSstNumberToKeep();
+
+ // Get obsolete files. This function will also update the list of
+ // pending files in VersionSet().
+ versions_->GetObsoleteFiles(
+ &job_context->sst_delete_files, &job_context->blob_delete_files,
+ &job_context->manifest_delete_files, job_context->min_pending_output);
+
+ // Mark the elements in job_context->sst_delete_files and
+ // job_context->blob_delete_files as "grabbed for purge" so that other threads
+ // calling FindObsoleteFiles with full_scan=true will not add these files to
+ // candidate list for purge.
+ for (const auto& sst_to_del : job_context->sst_delete_files) {
+ MarkAsGrabbedForPurge(sst_to_del.metadata->fd.GetNumber());
+ }
+
+ for (const auto& blob_file : job_context->blob_delete_files) {
+ MarkAsGrabbedForPurge(blob_file.GetBlobFileNumber());
+ }
+
+ // store the current filenum, lognum, etc
+ job_context->manifest_file_number = versions_->manifest_file_number();
+ job_context->pending_manifest_file_number =
+ versions_->pending_manifest_file_number();
+ job_context->log_number = MinLogNumberToKeep();
+ job_context->prev_log_number = versions_->prev_log_number();
+
+ if (doing_the_full_scan) {
+ versions_->AddLiveFiles(&job_context->sst_live, &job_context->blob_live);
+ InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+ dbname_);
+ std::set<std::string> paths;
+ for (size_t path_id = 0; path_id < immutable_db_options_.db_paths.size();
+ path_id++) {
+ paths.insert(immutable_db_options_.db_paths[path_id].path);
+ }
+
+ // Note that if cf_paths is not specified in the ColumnFamilyOptions
+ // of a particular column family, we use db_paths as the cf_paths
+ // setting. Hence, there can be multiple duplicates of files from db_paths
+ // in the following code. The duplicate are removed while identifying
+ // unique files in PurgeObsoleteFiles.
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ for (size_t path_id = 0; path_id < cfd->ioptions()->cf_paths.size();
+ path_id++) {
+ auto& path = cfd->ioptions()->cf_paths[path_id].path;
+
+ if (paths.find(path) == paths.end()) {
+ paths.insert(path);
+ }
+ }
+ }
+
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ for (auto& path : paths) {
+ // Set of all files in the directory. We'll exclude files that are still
+ // alive in the subsequent processing.
+ std::vector<std::string> files;
+ Status s = immutable_db_options_.fs->GetChildren(
+ path, io_opts, &files, /*IODebugContext*=*/nullptr);
+ s.PermitUncheckedError(); // TODO: What should we do on error?
+ for (const std::string& file : files) {
+ uint64_t number;
+ FileType type;
+ // 1. If we cannot parse the file name, we skip it;
+ // 2. If the file whose file_number equals number has already been
+ // grabbed for purge by another compaction job, or has already been
+ // scheduled for purge, we also skip it during a full scan in order to
+ // avoid double deletion of the same file under race conditions. See
+ // https://github.com/facebook/rocksdb/issues/3573
+ if (!ParseFileName(file, &number, info_log_prefix.prefix, &type) ||
+ !ShouldPurge(number)) {
+ continue;
+ }
+
+ // TODO(icanadi) clean up this mess to avoid having one-off "/"
+ // prefixes
+ job_context->full_scan_candidate_files.emplace_back("/" + file, path);
+ }
+ }
+
+ // Add log files in wal_dir
+ if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) {
+ std::vector<std::string> log_files;
+ Status s = immutable_db_options_.fs->GetChildren(
+ immutable_db_options_.wal_dir, io_opts, &log_files,
+ /*IODebugContext*=*/nullptr);
+ s.PermitUncheckedError(); // TODO: What should we do on error?
+ for (const std::string& log_file : log_files) {
+ job_context->full_scan_candidate_files.emplace_back(
+ log_file, immutable_db_options_.wal_dir);
+ }
+ }
+
+ // Add info log files in db_log_dir
+ if (!immutable_db_options_.db_log_dir.empty() &&
+ immutable_db_options_.db_log_dir != dbname_) {
+ std::vector<std::string> info_log_files;
+ Status s = immutable_db_options_.fs->GetChildren(
+ immutable_db_options_.db_log_dir, io_opts, &info_log_files,
+ /*IODebugContext*=*/nullptr);
+ s.PermitUncheckedError(); // TODO: What should we do on error?
+ for (std::string& log_file : info_log_files) {
+ job_context->full_scan_candidate_files.emplace_back(
+ log_file, immutable_db_options_.db_log_dir);
+ }
+ }
+ } else {
+ // Instead of filling job_context->sst_live and job_context->blob_live,
+ // directly remove files that show up in any Version. This is because
+ // candidate files tend to be a small percentage of all files, so it is
+ // usually cheaper to check them against every version, compared to
+ // building a map for all files.
+ versions_->RemoveLiveFiles(job_context->sst_delete_files,
+ job_context->blob_delete_files);
+ }
+
+ // Before potentially releasing mutex and waiting on condvar, increment
+ // pending_purge_obsolete_files_ so that another thread executing
+ // `GetSortedWals` will wait until this thread finishes execution since the
+ // other thread will be waiting for `pending_purge_obsolete_files_`.
+ // pending_purge_obsolete_files_ MUST be decremented if there is nothing to
+ // delete.
+ ++pending_purge_obsolete_files_;
+
+ Defer cleanup([job_context, this]() {
+ assert(job_context != nullptr);
+ if (!job_context->HaveSomethingToDelete()) {
+ mutex_.AssertHeld();
+ --pending_purge_obsolete_files_;
+ }
+ });
+
+ // logs_ is empty when called during recovery, in which case there can't yet
+ // be any tracked obsolete logs
+ log_write_mutex_.Lock();
+
+ if (alive_log_files_.empty() || logs_.empty()) {
+ mutex_.AssertHeld();
+ // We may reach here if the db is DBImplSecondary
+ log_write_mutex_.Unlock();
+ return;
+ }
+
+ if (!alive_log_files_.empty() && !logs_.empty()) {
+ uint64_t min_log_number = job_context->log_number;
+ size_t num_alive_log_files = alive_log_files_.size();
+ // find newly obsoleted log files
+ while (alive_log_files_.begin()->number < min_log_number) {
+ auto& earliest = *alive_log_files_.begin();
+ if (immutable_db_options_.recycle_log_file_num >
+ log_recycle_files_.size()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "adding log %" PRIu64 " to recycle list\n",
+ earliest.number);
+ log_recycle_files_.push_back(earliest.number);
+ } else {
+ job_context->log_delete_files.push_back(earliest.number);
+ }
+ if (job_context->size_log_to_delete == 0) {
+ job_context->prev_total_log_size = total_log_size_;
+ job_context->num_alive_log_files = num_alive_log_files;
+ }
+ job_context->size_log_to_delete += earliest.size;
+ total_log_size_ -= earliest.size;
+ alive_log_files_.pop_front();
+
+ // Current log should always stay alive since it can't have
+ // number < MinLogNumber().
+ assert(alive_log_files_.size());
+ }
+ log_write_mutex_.Unlock();
+ mutex_.Unlock();
+ log_write_mutex_.Lock();
+ while (!logs_.empty() && logs_.front().number < min_log_number) {
+ auto& log = logs_.front();
+ if (log.IsSyncing()) {
+ log_sync_cv_.Wait();
+ // logs_ could have changed while we were waiting.
+ continue;
+ }
+ logs_to_free_.push_back(log.ReleaseWriter());
+ logs_.pop_front();
+ }
+ // Current log cannot be obsolete.
+ assert(!logs_.empty());
+ }
+
+ // We're just cleaning up for DB::Write().
+ assert(job_context->logs_to_free.empty());
+ job_context->logs_to_free = logs_to_free_;
+
+ logs_to_free_.clear();
+ log_write_mutex_.Unlock();
+ mutex_.Lock();
+ job_context->log_recycle_files.assign(log_recycle_files_.begin(),
+ log_recycle_files_.end());
+}
+
+// Delete an obsolete file and log the status and details of the deletion.
+void DBImpl::DeleteObsoleteFileImpl(int job_id, const std::string& fname,
+ const std::string& path_to_sync,
+ FileType type, uint64_t number) {
+ TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl::BeforeDeletion",
+ const_cast<std::string*>(&fname));
+
+ Status file_deletion_status;
+ if (type == kTableFile || type == kBlobFile || type == kWalFile) {
+ // Rate limit WAL deletion only if it's in the DB dir
+ file_deletion_status = DeleteDBFile(
+ &immutable_db_options_, fname, path_to_sync,
+ /*force_bg=*/false,
+ /*force_fg=*/(type == kWalFile) ? !wal_in_db_path_ : false);
+ } else {
+ file_deletion_status = env_->DeleteFile(fname);
+ }
+ TEST_SYNC_POINT_CALLBACK("DBImpl::DeleteObsoleteFileImpl:AfterDeletion",
+ &file_deletion_status);
+ if (file_deletion_status.ok()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[JOB %d] Delete %s type=%d #%" PRIu64 " -- %s\n", job_id,
+ fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ } else if (env_->FileExists(fname).IsNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[JOB %d] Tried to delete a non-existing file %s type=%d #%" PRIu64
+ " -- %s\n",
+ job_id, fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "[JOB %d] Failed to delete %s type=%d #%" PRIu64 " -- %s\n",
+ job_id, fname.c_str(), type, number,
+ file_deletion_status.ToString().c_str());
+ }
+ if (type == kTableFile) {
+ EventHelpers::LogAndNotifyTableFileDeletion(
+ &event_logger_, job_id, number, fname, file_deletion_status, GetName(),
+ immutable_db_options_.listeners);
+ }
+ if (type == kBlobFile) {
+ EventHelpers::LogAndNotifyBlobFileDeletion(
+ &event_logger_, immutable_db_options_.listeners, job_id, number, fname,
+ file_deletion_status, GetName());
+ }
+}
+
+// Diffs the candidate files against the live files; those that do not
+// belong to the live set may be removed. Also removes all the
+// files in sst_delete_files and log_delete_files.
+// It is not necessary to hold the mutex when invoking this method.
+void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
+ TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:Begin");
+ // We'd better have something to delete.
+ assert(state.HaveSomethingToDelete());
+
+ // FindObsoleteFiles() should have populated this, so it is nonzero.
+ assert(state.manifest_file_number != 0);
+
+ // Now, convert lists to unordered sets, WITHOUT mutex held; set is slow.
+ std::unordered_set<uint64_t> sst_live_set(state.sst_live.begin(),
+ state.sst_live.end());
+ std::unordered_set<uint64_t> blob_live_set(state.blob_live.begin(),
+ state.blob_live.end());
+ std::unordered_set<uint64_t> log_recycle_files_set(
+ state.log_recycle_files.begin(), state.log_recycle_files.end());
+
+ auto candidate_files = state.full_scan_candidate_files;
+ candidate_files.reserve(
+ candidate_files.size() + state.sst_delete_files.size() +
+ state.blob_delete_files.size() + state.log_delete_files.size() +
+ state.manifest_delete_files.size());
+ // We may ignore the dbname when generating the file names.
+ for (auto& file : state.sst_delete_files) {
+ if (!file.only_delete_metadata) {
+ candidate_files.emplace_back(
+ MakeTableFileName(file.metadata->fd.GetNumber()), file.path);
+ }
+ if (file.metadata->table_reader_handle) {
+ table_cache_->Release(file.metadata->table_reader_handle);
+ }
+ file.DeleteMetadata();
+ }
+
+ for (const auto& blob_file : state.blob_delete_files) {
+ candidate_files.emplace_back(BlobFileName(blob_file.GetBlobFileNumber()),
+ blob_file.GetPath());
+ }
+
+ auto wal_dir = immutable_db_options_.GetWalDir();
+ for (auto file_num : state.log_delete_files) {
+ if (file_num > 0) {
+ candidate_files.emplace_back(LogFileName(file_num), wal_dir);
+ }
+ }
+ for (const auto& filename : state.manifest_delete_files) {
+ candidate_files.emplace_back(filename, dbname_);
+ }
+
+ // dedup state.candidate_files so we don't try to delete the same
+ // file twice
+ std::sort(candidate_files.begin(), candidate_files.end(),
+ [](const JobContext::CandidateFileInfo& lhs,
+ const JobContext::CandidateFileInfo& rhs) {
+ if (lhs.file_name > rhs.file_name) {
+ return true;
+ } else if (lhs.file_name < rhs.file_name) {
+ return false;
+ } else {
+ return (lhs.file_path > rhs.file_path);
+ }
+ });
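+ // After the sort, identical (file_name, file_path) pairs are adjacent, so
+ // the std::unique() call below leaves exactly one entry per candidate file.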
+ candidate_files.erase(
+ std::unique(candidate_files.begin(), candidate_files.end()),
+ candidate_files.end());
+
+ if (state.prev_total_log_size > 0) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Try to delete WAL files size %" PRIu64
+ ", prev total WAL file size %" PRIu64
+ ", number of live WAL files %" ROCKSDB_PRIszt ".\n",
+ state.job_id, state.size_log_to_delete,
+ state.prev_total_log_size, state.num_alive_log_files);
+ }
+
+ std::vector<std::string> old_info_log_files;
+ InfoLogPrefix info_log_prefix(!immutable_db_options_.db_log_dir.empty(),
+ dbname_);
+
+ // File numbers of the two most recent OPTIONS files in candidate_files
+ // (found in the previous FindObsoleteFiles(full_scan=true)).
+ // At this point, there must not be any duplicate file numbers in
+ // candidate_files.
+ uint64_t optsfile_num1 = std::numeric_limits<uint64_t>::min();
+ uint64_t optsfile_num2 = std::numeric_limits<uint64_t>::min();
+ for (const auto& candidate_file : candidate_files) {
+ const std::string& fname = candidate_file.file_name;
+ uint64_t number;
+ FileType type;
+ if (!ParseFileName(fname, &number, info_log_prefix.prefix, &type) ||
+ type != kOptionsFile) {
+ continue;
+ }
+ if (number > optsfile_num1) {
+ optsfile_num2 = optsfile_num1;
+ optsfile_num1 = number;
+ } else if (number > optsfile_num2) {
+ optsfile_num2 = number;
+ }
+ }
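+ // optsfile_num1 and optsfile_num2 now hold the two largest (i.e. most
+ // recent) OPTIONS file numbers seen, so the kOptionsFile case below keeps
+ // exactly the two most recent OPTIONS files.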
+
+ // Close WALs before trying to delete them.
+ for (const auto w : state.logs_to_free) {
+ // TODO: maybe check the return value of Close.
+ auto s = w->Close();
+ s.PermitUncheckedError();
+ }
+
+ bool own_files = OwnTablesAndLogs();
+ std::unordered_set<uint64_t> files_to_del;
+ for (const auto& candidate_file : candidate_files) {
+ const std::string& to_delete = candidate_file.file_name;
+ uint64_t number;
+ FileType type;
+ // Ignore file if we cannot recognize it.
+ if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
+ continue;
+ }
+
+ bool keep = true;
+ switch (type) {
+ case kWalFile:
+ keep = ((number >= state.log_number) ||
+ (number == state.prev_log_number) ||
+ (log_recycle_files_set.find(number) !=
+ log_recycle_files_set.end()));
+ break;
+ case kDescriptorFile:
+ // Keep my manifest file, and any newer incarnations
+ // (these can appear during a manifest roll).
+ keep = (number >= state.manifest_file_number);
+ break;
+ case kTableFile:
+ // If the second condition were not there, the
+ // DontDeletePendingOutputs test would fail.
+ keep = (sst_live_set.find(number) != sst_live_set.end()) ||
+ number >= state.min_pending_output;
+ if (!keep) {
+ files_to_del.insert(number);
+ }
+ break;
+ case kBlobFile:
+ keep = number >= state.min_pending_output ||
+ (blob_live_set.find(number) != blob_live_set.end());
+ if (!keep) {
+ files_to_del.insert(number);
+ }
+ break;
+ case kTempFile:
+ // Any temp files that are currently being written to must
+ // be recorded in pending_outputs_, which is inserted into "live".
+ // Also, SetCurrentFile creates a temp file when writing out the new
+ // manifest; its number is equal to state.pending_manifest_file_number.
+ // We should not delete that file.
+ //
+ // TODO(yhchiang): carefully modify the third condition to safely
+ // remove the temp options files.
+ keep = (sst_live_set.find(number) != sst_live_set.end()) ||
+ (blob_live_set.find(number) != blob_live_set.end()) ||
+ (number == state.pending_manifest_file_number) ||
+ (to_delete.find(kOptionsFileNamePrefix) != std::string::npos);
+ break;
+ case kInfoLogFile:
+ keep = true;
+ if (number != 0) {
+ old_info_log_files.push_back(to_delete);
+ }
+ break;
+ case kOptionsFile:
+ keep = (number >= optsfile_num2);
+ break;
+ case kCurrentFile:
+ case kDBLockFile:
+ case kIdentityFile:
+ case kMetaDatabase:
+ keep = true;
+ break;
+ }
+
+ if (keep) {
+ continue;
+ }
+
+ std::string fname;
+ std::string dir_to_sync;
+ if (type == kTableFile) {
+ // evict from cache
+ TableCache::Evict(table_cache_.get(), number);
+ fname = MakeTableFileName(candidate_file.file_path, number);
+ dir_to_sync = candidate_file.file_path;
+ } else if (type == kBlobFile) {
+ fname = BlobFileName(candidate_file.file_path, number);
+ dir_to_sync = candidate_file.file_path;
+ } else {
+ dir_to_sync = (type == kWalFile) ? wal_dir : dbname_;
+ fname = dir_to_sync +
+ ((!dir_to_sync.empty() && dir_to_sync.back() == '/') ||
+ (!to_delete.empty() && to_delete.front() == '/')
+ ? ""
+ : "/") +
+ to_delete;
+ }
+
+#ifndef ROCKSDB_LITE
+ if (type == kWalFile && (immutable_db_options_.WAL_ttl_seconds > 0 ||
+ immutable_db_options_.WAL_size_limit_MB > 0)) {
+ wal_manager_.ArchiveWALFile(fname, number);
+ continue;
+ }
+#endif // !ROCKSDB_LITE
+
+ // If I do not own these files, e.g. a secondary instance with
+ // max_open_files = -1, then there is no need to delete or schedule
+ // deletion of these files since they will be removed by their owner,
+ // e.g. the primary instance.
+ if (!own_files) {
+ continue;
+ }
+ if (schedule_only) {
+ InstrumentedMutexLock guard_lock(&mutex_);
+ SchedulePendingPurge(fname, dir_to_sync, type, number, state.job_id);
+ } else {
+ DeleteObsoleteFileImpl(state.job_id, fname, dir_to_sync, type, number);
+ }
+ }
+
+ {
+ // After purging obsolete files, remove them from files_grabbed_for_purge_.
+ InstrumentedMutexLock guard_lock(&mutex_);
+ autovector<uint64_t> to_be_removed;
+ for (auto fn : files_grabbed_for_purge_) {
+ if (files_to_del.count(fn) != 0) {
+ to_be_removed.emplace_back(fn);
+ }
+ }
+ for (auto fn : to_be_removed) {
+ files_grabbed_for_purge_.erase(fn);
+ }
+ }
+
+ // Delete old info log files.
+ size_t old_info_log_file_count = old_info_log_files.size();
+ if (old_info_log_file_count != 0 &&
+ old_info_log_file_count >= immutable_db_options_.keep_log_file_num) {
+ std::sort(old_info_log_files.begin(), old_info_log_files.end());
+ size_t end =
+ old_info_log_file_count - immutable_db_options_.keep_log_file_num;
+ for (unsigned int i = 0; i <= end; i++) {
+ std::string& to_delete = old_info_log_files.at(i);
+ std::string full_path_to_delete =
+ (immutable_db_options_.db_log_dir.empty()
+ ? dbname_
+ : immutable_db_options_.db_log_dir) +
+ "/" + to_delete;
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[JOB %d] Delete info log file %s\n", state.job_id,
+ full_path_to_delete.c_str());
+ Status s = env_->DeleteFile(full_path_to_delete);
+ if (!s.ok()) {
+ if (env_->FileExists(full_path_to_delete).IsNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "[JOB %d] Tried to delete non-existing info log file %s FAILED "
+ "-- %s\n",
+ state.job_id, to_delete.c_str(), s.ToString().c_str());
+ } else {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "[JOB %d] Delete info log file %s FAILED -- %s\n",
+ state.job_id, to_delete.c_str(),
+ s.ToString().c_str());
+ }
+ }
+ }
+ }
+#ifndef ROCKSDB_LITE
+ wal_manager_.PurgeObsoleteWALFiles();
+#endif // ROCKSDB_LITE
+ LogFlush(immutable_db_options_.info_log);
+ InstrumentedMutexLock l(&mutex_);
+ --pending_purge_obsolete_files_;
+ assert(pending_purge_obsolete_files_ >= 0);
+ if (schedule_only) {
+ // Must change from pending_purge_obsolete_files_ to bg_purge_scheduled_
+ // while holding mutex (for GetSortedWalFiles() etc.)
+ SchedulePurge();
+ }
+ if (pending_purge_obsolete_files_ == 0) {
+ bg_cv_.SignalAll();
+ }
+ TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:End");
+}
+
+void DBImpl::DeleteObsoleteFiles() {
+ mutex_.AssertHeld();
+ JobContext job_context(next_job_id_.fetch_add(1));
+ FindObsoleteFiles(&job_context, true);
+
+ mutex_.Unlock();
+ if (job_context.HaveSomethingToDelete()) {
+ bool defer_purge = immutable_db_options_.avoid_unnecessary_blocking_io;
+ PurgeObsoleteFiles(job_context, defer_purge);
+ }
+ job_context.Clean();
+ mutex_.Lock();
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset, const autovector<MemTable*>& memtables_to_flush) {
+ uint64_t min_log = 0;
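+ // A min_log of 0 means "no prepared section found yet"; it doubles as the
+ // sentinel value in the comparisons below.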
+
+ // We must look through the memtables for two-phase transactions
+ // that have been committed but not yet flushed.
+ std::unordered_set<MemTable*> memtables_to_flush_set(
+ memtables_to_flush.begin(), memtables_to_flush.end());
+ for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+ if (loop_cfd->IsDropped()) {
+ continue;
+ }
+
+ auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+ &memtables_to_flush_set);
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+
+ log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+ }
+
+ return min_log;
+}
+
+uint64_t FindMinPrepLogReferencedByMemTable(
+ VersionSet* vset,
+ const autovector<const autovector<MemTable*>*>& memtables_to_flush) {
+ uint64_t min_log = 0;
+
+ std::unordered_set<MemTable*> memtables_to_flush_set;
+ for (const autovector<MemTable*>* memtables : memtables_to_flush) {
+ memtables_to_flush_set.insert(memtables->begin(), memtables->end());
+ }
+ for (auto loop_cfd : *vset->GetColumnFamilySet()) {
+ if (loop_cfd->IsDropped()) {
+ continue;
+ }
+
+ auto log = loop_cfd->imm()->PrecomputeMinLogContainingPrepSection(
+ &memtables_to_flush_set);
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+
+ log = loop_cfd->mem()->GetMinLogContainingPrepSection();
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+ }
+
+ return min_log;
+}
+
+uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ const autovector<VersionEdit*>& edit_list) {
+ assert(vset != nullptr);
+
+ // Precompute the min log number containing unflushed data for the column
+ // family being flushed (`cfd_to_flush`).
+ uint64_t cf_min_log_number_to_keep = 0;
+ for (auto& e : edit_list) {
+ if (e->HasLogNumber()) {
+ cf_min_log_number_to_keep =
+ std::max(cf_min_log_number_to_keep, e->GetLogNumber());
+ }
+ }
+ if (cf_min_log_number_to_keep == 0) {
+ // No version edit contains information on log number. The log number
+ // for this column family should stay the same as it is.
+ cf_min_log_number_to_keep = cfd_to_flush.GetLogNumber();
+ }
+
+ // Get min log number containing unflushed data for other column families.
+ uint64_t min_log_number_to_keep =
+ vset->PreComputeMinLogNumberWithUnflushedData(&cfd_to_flush);
+ if (cf_min_log_number_to_keep != 0) {
+ min_log_number_to_keep =
+ std::min(cf_min_log_number_to_keep, min_log_number_to_keep);
+ }
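+ // Example: if this flush advances the flushed column family's log number to
+ // 12 while another column family still has unflushed data in WAL 9, the
+ // result is 9: WALs numbered below 9 become obsolete, while WAL 9 and
+ // everything newer are kept.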
+ return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeepNon2PC(
+ VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+ const autovector<autovector<VersionEdit*>>& edit_lists) {
+ assert(vset != nullptr);
+ assert(!cfds_to_flush.empty());
+ assert(cfds_to_flush.size() == edit_lists.size());
+
+ uint64_t min_log_number_to_keep = std::numeric_limits<uint64_t>::max();
+ for (const auto& edit_list : edit_lists) {
+ uint64_t log = 0;
+ for (const auto& e : edit_list) {
+ if (e->HasLogNumber()) {
+ log = std::max(log, e->GetLogNumber());
+ }
+ }
+ if (log != 0) {
+ min_log_number_to_keep = std::min(min_log_number_to_keep, log);
+ }
+ }
+ if (min_log_number_to_keep == std::numeric_limits<uint64_t>::max()) {
+ min_log_number_to_keep = cfds_to_flush[0]->GetLogNumber();
+ for (size_t i = 1; i < cfds_to_flush.size(); i++) {
+ min_log_number_to_keep =
+ std::min(min_log_number_to_keep, cfds_to_flush[i]->GetLogNumber());
+ }
+ }
+
+ std::unordered_set<const ColumnFamilyData*> flushed_cfds(
+ cfds_to_flush.begin(), cfds_to_flush.end());
+ min_log_number_to_keep =
+ std::min(min_log_number_to_keep,
+ vset->PreComputeMinLogNumberWithUnflushedData(flushed_cfds));
+
+ return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep2PC(
+ VersionSet* vset, const ColumnFamilyData& cfd_to_flush,
+ const autovector<VersionEdit*>& edit_list,
+ const autovector<MemTable*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker) {
+ assert(vset != nullptr);
+ assert(prep_tracker != nullptr);
+ // Calculate the updated min_log_number_to_keep.
+ // Since this function should only be called in 2PC mode, the log number in
+ // the version edit should be sufficient.
+
+ uint64_t min_log_number_to_keep =
+ PrecomputeMinLogNumberToKeepNon2PC(vset, cfd_to_flush, edit_list);
+
+ // In 2PC mode we must consider logs containing prepared
+ // sections of outstanding transactions.
+ //
+ // We must check the min log with outstanding prepared sections before we
+ // check logs referenced by memtables, because a log referenced by the
+ // first data structure could transition to the second under us.
+ //
+ // TODO: we iterate over all column families under the db mutex;
+ // we should find a more optimal solution.
+ auto min_log_in_prep_heap =
+ prep_tracker->FindMinLogContainingOutstandingPrep();
+
+ if (min_log_in_prep_heap != 0 &&
+ min_log_in_prep_heap < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_in_prep_heap;
+ }
+
+ uint64_t min_log_refed_by_mem =
+ FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);
+
+ if (min_log_refed_by_mem != 0 &&
+ min_log_refed_by_mem < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_refed_by_mem;
+ }
+ return min_log_number_to_keep;
+}
+
+uint64_t PrecomputeMinLogNumberToKeep2PC(
+ VersionSet* vset, const autovector<ColumnFamilyData*>& cfds_to_flush,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ const autovector<const autovector<MemTable*>*>& memtables_to_flush,
+ LogsWithPrepTracker* prep_tracker) {
+ assert(vset != nullptr);
+ assert(prep_tracker != nullptr);
+ assert(cfds_to_flush.size() == edit_lists.size());
+ assert(cfds_to_flush.size() == memtables_to_flush.size());
+
+ uint64_t min_log_number_to_keep =
+ PrecomputeMinLogNumberToKeepNon2PC(vset, cfds_to_flush, edit_lists);
+
+ uint64_t min_log_in_prep_heap =
+ prep_tracker->FindMinLogContainingOutstandingPrep();
+
+ if (min_log_in_prep_heap != 0 &&
+ min_log_in_prep_heap < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_in_prep_heap;
+ }
+
+ uint64_t min_log_refed_by_mem =
+ FindMinPrepLogReferencedByMemTable(vset, memtables_to_flush);
+
+ if (min_log_refed_by_mem != 0 &&
+ min_log_refed_by_mem < min_log_number_to_keep) {
+ min_log_number_to_keep = min_log_refed_by_mem;
+ }
+
+ return min_log_number_to_keep;
+}
+
+void DBImpl::SetDBId(std::string&& id, bool read_only,
+ RecoveryContext* recovery_ctx) {
+ assert(db_id_.empty());
+ assert(!id.empty());
+ db_id_ = std::move(id);
+ if (!read_only && immutable_db_options_.write_dbid_to_manifest) {
+ assert(recovery_ctx != nullptr);
+ assert(versions_->GetColumnFamilySet() != nullptr);
+ VersionEdit edit;
+ edit.SetDBId(db_id_);
+ versions_->db_id_ = db_id_;
+ recovery_ctx->UpdateVersionEdits(
+ versions_->GetColumnFamilySet()->GetDefault(), edit);
+ }
+}
+
+Status DBImpl::SetupDBId(bool read_only, RecoveryContext* recovery_ctx) {
+ Status s;
+ // Check for the IDENTITY file and create it if it is missing, broken, or
+ // does not match the manifest.
+ std::string db_id_in_file;
+ s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
+ if (s.ok()) {
+ s = GetDbIdentityFromIdentityFile(&db_id_in_file);
+ if (s.ok() && !db_id_in_file.empty()) {
+ if (db_id_.empty()) {
+ // Loaded from file and wasn't already known from manifest
+ SetDBId(std::move(db_id_in_file), read_only, recovery_ctx);
+ return s;
+ } else if (db_id_ == db_id_in_file) {
+ // Loaded from file and matches manifest
+ return s;
+ }
+ }
+ }
+ if (s.IsNotFound()) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ assert(s.IsIOError());
+ return s;
+ }
+ // Otherwise IDENTITY file is missing or no good.
+ // Generate new id if needed
+ if (db_id_.empty()) {
+ SetDBId(env_->GenerateUniqueId(), read_only, recovery_ctx);
+ }
+ // Persist it to IDENTITY file if allowed
+ if (!read_only) {
+ s = SetIdentityFile(env_, dbname_, db_id_);
+ }
+ return s;
+}
+
+Status DBImpl::DeleteUnreferencedSstFiles(RecoveryContext* recovery_ctx) {
+ mutex_.AssertHeld();
+ std::vector<std::string> paths;
+ paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator)));
+ for (const auto& db_path : immutable_db_options_.db_paths) {
+ paths.push_back(
+ NormalizePath(db_path.path + std::string(1, kFilePathSeparator)));
+ }
+ for (const auto* cfd : *versions_->GetColumnFamilySet()) {
+ for (const auto& cf_path : cfd->ioptions()->cf_paths) {
+ paths.push_back(
+ NormalizePath(cf_path.path + std::string(1, kFilePathSeparator)));
+ }
+ }
+ // Dedup paths
+ std::sort(paths.begin(), paths.end());
+ paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+
+ uint64_t next_file_number = versions_->current_next_file_number();
+ uint64_t largest_file_number = next_file_number;
+ Status s;
+ for (const auto& path : paths) {
+ std::vector<std::string> files;
+ s = env_->GetChildren(path, &files);
+ if (!s.ok()) {
+ break;
+ }
+ for (const auto& fname : files) {
+ uint64_t number = 0;
+ FileType type;
+ if (!ParseFileName(fname, &number, &type)) {
+ continue;
+ }
+ // path ends with '/' or '\\'
+ const std::string normalized_fpath = path + fname;
+ largest_file_number = std::max(largest_file_number, number);
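+ // An SST file numbered at or above the recovered next_file_number cannot be
+ // referenced by the recovered MANIFEST, so it is a leftover from an
+ // interrupted run and is safe to schedule for deletion.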
+ if (type == kTableFile && number >= next_file_number &&
+ recovery_ctx->files_to_delete_.find(normalized_fpath) ==
+ recovery_ctx->files_to_delete_.end()) {
+ recovery_ctx->files_to_delete_.emplace(normalized_fpath);
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (largest_file_number >= next_file_number) {
+ versions_->next_file_number_.store(largest_file_number + 1);
+ }
+
+ VersionEdit edit;
+ edit.SetNextFile(versions_->next_file_number_.load());
+ assert(versions_->GetColumnFamilySet());
+ ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault();
+ assert(default_cfd);
+ recovery_ctx->UpdateVersionEdits(default_cfd, edit);
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_open.cc b/src/rocksdb/db/db_impl/db_impl_open.cc
new file mode 100644
index 000000000..40ffa2e85
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_open.cc
@@ -0,0 +1,2106 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/periodic_task_scheduler.h"
+#include "env/composite_env_wrapper.h"
+#include "file/filename.h"
+#include "file/read_write_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/table.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/sync_point.h"
+#include "util/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+Options SanitizeOptions(const std::string& dbname, const Options& src,
+ bool read_only, Status* logger_creation_s) {
+ auto db_options =
+ SanitizeOptions(dbname, DBOptions(src), read_only, logger_creation_s);
+ ImmutableDBOptions immutable_db_options(db_options);
+ auto cf_options =
+ SanitizeOptions(immutable_db_options, ColumnFamilyOptions(src));
+ return Options(db_options, cf_options);
+}
+
+DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
+ bool read_only, Status* logger_creation_s) {
+ DBOptions result(src);
+
+ if (result.env == nullptr) {
+ result.env = Env::Default();
+ }
+
+ // A result.max_open_files of -1 means an "infinite" number of open files.
+ if (result.max_open_files != -1) {
+ int max_max_open_files = port::GetMaxOpenFiles();
+ if (max_max_open_files == -1) {
+ max_max_open_files = 0x400000;
+ }
+ ClipToRange(&result.max_open_files, 20, max_max_open_files);
+ TEST_SYNC_POINT_CALLBACK("SanitizeOptions::AfterChangeMaxOpenFiles",
+ &result.max_open_files);
+ }
+
+ if (result.info_log == nullptr && !read_only) {
+ Status s = CreateLoggerFromOptions(dbname, result, &result.info_log);
+ if (!s.ok()) {
+ // No place suitable for logging
+ result.info_log = nullptr;
+ if (logger_creation_s) {
+ *logger_creation_s = s;
+ }
+ }
+ }
+
+ if (!result.write_buffer_manager) {
+ result.write_buffer_manager.reset(
+ new WriteBufferManager(result.db_write_buffer_size));
+ }
+ auto bg_job_limits = DBImpl::GetBGJobLimits(
+ result.max_background_flushes, result.max_background_compactions,
+ result.max_background_jobs, true /* parallelize_compactions */);
+ result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_compactions,
+ Env::Priority::LOW);
+ result.env->IncBackgroundThreadsIfNeeded(bg_job_limits.max_flushes,
+ Env::Priority::HIGH);
+
+ if (result.rate_limiter.get() != nullptr) {
+ if (result.bytes_per_sync == 0) {
+ result.bytes_per_sync = 1024 * 1024;
+ }
+ }
+
+ if (result.delayed_write_rate == 0) {
+ if (result.rate_limiter.get() != nullptr) {
+ result.delayed_write_rate = result.rate_limiter->GetBytesPerSecond();
+ }
+ if (result.delayed_write_rate == 0) {
+ result.delayed_write_rate = 16 * 1024 * 1024;
+ }
+ }
+
+ if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
+ result.recycle_log_file_num = 0;
+ }
+
+ if (result.recycle_log_file_num &&
+ (result.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords ||
+ result.wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery ||
+ result.wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency)) {
+ // - kTolerateCorruptedTailRecords is inconsistent with recycle log file
+ // feature. WAL recycling expects recovery success upon encountering a
+ // corrupt record at the point where new data ends and recycled data
+ // remains at the tail. However, `kTolerateCorruptedTailRecords` must fail
+ // upon encountering any such corrupt record, as it cannot differentiate
+ // between this and a real corruption, which would cause committed updates
+ // to be truncated -- a violation of the recovery guarantee.
+ // - kPointInTimeRecovery and kAbsoluteConsistency are incompatible with
+ // recycle log file feature temporarily due to a bug found introducing a
+ // hole in the recovered data
+ // (https://github.com/facebook/rocksdb/pull/7252#issuecomment-673766236).
+ // Besides this bug, we believe the features are fundamentally compatible.
+ result.recycle_log_file_num = 0;
+ }
+
+ if (result.db_paths.size() == 0) {
+ result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
+ } else if (result.wal_dir.empty()) {
+ // Use dbname as default
+ result.wal_dir = dbname;
+ }
+ if (!result.wal_dir.empty()) {
+ // If there is a wal_dir already set, check to see if the wal_dir is the
+ // same as the dbname AND the same as the db_path[0] (which must exist from
+ // a few lines ago). If the wal_dir matches both of these values, then clear
+ // the wal_dir value, which will make wal_dir == dbname. Most likely this
+ // condition was the result of reading an old options file where we forced
+ // wal_dir to be set (to dbname).
+ auto npath = NormalizePath(dbname + "/");
+ if (npath == NormalizePath(result.wal_dir + "/") &&
+ npath == NormalizePath(result.db_paths[0].path + "/")) {
+ result.wal_dir.clear();
+ }
+ }
+
+ if (!result.wal_dir.empty() && result.wal_dir.back() == '/') {
+ result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
+ }
+
+ if (result.use_direct_reads && result.compaction_readahead_size == 0) {
+ TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr);
+ result.compaction_readahead_size = 1024 * 1024 * 2;
+ }
+
+ // Force flush on DB open if 2PC is enabled, since with 2PC we have no
+ // guarantee that consecutive log files have consecutive sequence ids, which
+ // makes recovery complicated.
+ if (result.allow_2pc) {
+ result.avoid_flush_during_recovery = false;
+ }
+
+#ifndef ROCKSDB_LITE
+ ImmutableDBOptions immutable_db_options(result);
+ if (!immutable_db_options.IsWalDirSameAsDBPath()) {
+ // Either the WAL dir and db_paths[0]/db_name are not the same, or we
+ // cannot tell for sure. In either case, assume they're different and
+ // explicitly cleanup the trash log files (bypass DeleteScheduler)
+ // Do this first so even if we end up calling
+ // DeleteScheduler::CleanupDirectory on the same dir later, it will be
+ // safe
+ std::vector<std::string> filenames;
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ auto wal_dir = immutable_db_options.GetWalDir();
+ Status s = immutable_db_options.fs->GetChildren(
+ wal_dir, io_opts, &filenames, /*IODebugContext*=*/nullptr);
+ s.PermitUncheckedError();  // TODO: What should we do on error?
+ for (std::string& filename : filenames) {
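+ // find() starting at length() - strlen(".log.trash") can only match when
+ // the filename actually ends with ".log.trash".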
+ if (filename.find(".log.trash", filename.length() -
+ std::string(".log.trash").length()) !=
+ std::string::npos) {
+ std::string trash_file = wal_dir + "/" + filename;
+ result.env->DeleteFile(trash_file).PermitUncheckedError();
+ }
+ }
+ }
+ // When the DB is stopped, it's possible that there are some .trash files
+ // that were not deleted yet. When we open the DB we will find these .trash
+ // files and schedule them to be deleted (or delete them immediately if
+ // SstFileManager was not used).
+ auto sfm = static_cast<SstFileManagerImpl*>(result.sst_file_manager.get());
+ for (size_t i = 0; i < result.db_paths.size(); i++) {
+ DeleteScheduler::CleanupDirectory(result.env, sfm, result.db_paths[i].path)
+ .PermitUncheckedError();
+ }
+
+ // Create a default SstFileManager for purposes of tracking compaction size
+ // and facilitating recovery from out of space errors.
+ if (result.sst_file_manager.get() == nullptr) {
+ std::shared_ptr<SstFileManager> sst_file_manager(
+ NewSstFileManager(result.env, result.info_log));
+ result.sst_file_manager = sst_file_manager;
+ }
+#endif // !ROCKSDB_LITE
+
+ // Supported wal compression types
+ if (!StreamingCompressionTypeSupported(result.wal_compression)) {
+ result.wal_compression = kNoCompression;
+ ROCKS_LOG_WARN(result.info_log,
+ "wal_compression is disabled since only zstd is supported");
+ }
+
+ if (!result.paranoid_checks) {
+ result.skip_checking_sst_file_sizes_on_db_open = true;
+ ROCKS_LOG_INFO(result.info_log,
+ "file size check will be skipped during open.");
+ }
+
+ return result;
+}
+
+namespace {
+Status ValidateOptionsByTable(
+ const DBOptions& db_opts,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ Status s;
+ for (auto cf : column_families) {
+ s = ValidateOptions(db_opts, cf.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return Status::OK();
+}
+} // namespace
+
+Status DBImpl::ValidateOptions(
+ const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ Status s;
+ for (auto& cfd : column_families) {
+ s = ColumnFamilyData::ValidateOptions(db_options, cfd.options);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ s = ValidateOptions(db_options);
+ return s;
+}
+
+Status DBImpl::ValidateOptions(const DBOptions& db_options) {
+ if (db_options.db_paths.size() > 4) {
+ return Status::NotSupported(
+ "More than four DB paths are not supported yet. ");
+ }
+
+ if (db_options.allow_mmap_reads && db_options.use_direct_reads) {
+ // Protect against assert in PosixMMapReadableFile constructor
+ return Status::NotSupported(
+ "If memory mapped reads (allow_mmap_reads) are enabled "
+ "then direct I/O reads (use_direct_reads) must be disabled. ");
+ }
+
+ if (db_options.allow_mmap_writes &&
+ db_options.use_direct_io_for_flush_and_compaction) {
+ return Status::NotSupported(
+ "If memory mapped writes (allow_mmap_writes) are enabled "
+ "then direct I/O writes (use_direct_io_for_flush_and_compaction) must "
+ "be disabled. ");
+ }
+
+ if (db_options.keep_log_file_num == 0) {
+ return Status::InvalidArgument("keep_log_file_num must be greater than 0");
+ }
+
+ if (db_options.unordered_write &&
+ !db_options.allow_concurrent_memtable_write) {
+ return Status::InvalidArgument(
+ "unordered_write is incompatible with "
+ "!allow_concurrent_memtable_write");
+ }
+
+ if (db_options.unordered_write && db_options.enable_pipelined_write) {
+ return Status::InvalidArgument(
+ "unordered_write is incompatible with enable_pipelined_write");
+ }
+
+ if (db_options.atomic_flush && db_options.enable_pipelined_write) {
+ return Status::InvalidArgument(
+ "atomic_flush is incompatible with enable_pipelined_write");
+ }
+
+ // TODO remove this restriction
+ if (db_options.atomic_flush && db_options.best_efforts_recovery) {
+ return Status::InvalidArgument(
+ "atomic_flush is currently incompatible with best-efforts recovery");
+ }
+
+ if (db_options.use_direct_io_for_flush_and_compaction &&
+ 0 == db_options.writable_file_max_buffer_size) {
+ return Status::InvalidArgument(
+ "writes in direct IO require writable_file_max_buffer_size > 0");
+ }
+
+ return Status::OK();
+}
+
+Status DBImpl::NewDB(std::vector<std::string>* new_filenames) {
+ VersionEdit new_db;
+ Status s = SetIdentityFile(env_, dbname_);
+ if (!s.ok()) {
+ return s;
+ }
+ if (immutable_db_options_.write_dbid_to_manifest) {
+ std::string temp_db_id;
+ GetDbIdentityFromIdentityFile(&temp_db_id);
+ new_db.SetDBId(temp_db_id);
+ }
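+ // A brand new DB starts from an empty VersionEdit: no WAL yet (log number
+ // 0), and since MANIFEST file number 1 is created below, the next file
+ // number is 2.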
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Creating manifest 1 \n");
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ {
+ if (fs_->FileExists(manifest, IOOptions(), nullptr).ok()) {
+ fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
+ }
+ std::unique_ptr<FSWritableFile> file;
+ FileOptions file_options = fs_->OptimizeForManifestWrite(file_options_);
+ s = NewWritableFile(fs_.get(), manifest, &file, file_options);
+ if (!s.ok()) {
+ return s;
+ }
+ FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
+ file->SetPreallocationBlockSize(
+ immutable_db_options_.manifest_preallocation_size);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(file), manifest, file_options, immutable_db_options_.clock,
+ io_tracer_, nullptr /* stats */, immutable_db_options_.listeners,
+ nullptr, tmp_set.Contains(FileType::kDescriptorFile),
+ tmp_set.Contains(FileType::kDescriptorFile)));
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ if (s.ok()) {
+ s = SyncManifest(&immutable_db_options_, log.file());
+ }
+ }
+ if (s.ok()) {
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(fs_.get(), dbname_, 1, directories_.GetDbDir());
+ if (new_filenames) {
+ new_filenames->emplace_back(
+ manifest.substr(manifest.find_last_of("/\\") + 1));
+ }
+ } else {
+ fs_->DeleteFile(manifest, IOOptions(), nullptr).PermitUncheckedError();
+ }
+ return s;
+}
+
+IOStatus DBImpl::CreateAndNewDirectory(
+ FileSystem* fs, const std::string& dirname,
+ std::unique_ptr<FSDirectory>* directory) {
+ // We call CreateDirIfMissing() as the directory may already exist (if we
+ // are reopening a DB); when this happens we don't want creating the
+ // directory to cause an error. However, we need to check if creating the
+ // directory fails or else we may get an obscure message about the lock
+ // file not existing. One real-world example of this occurring is if
+ // env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
+ // when dbname_ is "dir/db" but "dir" doesn't exist.
+ IOStatus io_s = fs->CreateDirIfMissing(dirname, IOOptions(), nullptr);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ return fs->NewDirectory(dirname, IOOptions(), directory, nullptr);
+}
+
+IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname,
+ const std::string& wal_dir,
+ const std::vector<DbPath>& data_paths) {
+ IOStatus io_s = DBImpl::CreateAndNewDirectory(fs, dbname, &db_dir_);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ if (!wal_dir.empty() && dbname != wal_dir) {
+ io_s = DBImpl::CreateAndNewDirectory(fs, wal_dir, &wal_dir_);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+
+ data_dirs_.clear();
+ for (auto& p : data_paths) {
+ const std::string db_path = p.path;
+ if (db_path == dbname) {
+ data_dirs_.emplace_back(nullptr);
+ } else {
+ std::unique_ptr<FSDirectory> path_directory;
+ io_s = DBImpl::CreateAndNewDirectory(fs, db_path, &path_directory);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ data_dirs_.emplace_back(path_directory.release());
+ }
+ }
+ assert(data_dirs_.size() == data_paths.size());
+ return IOStatus::OK();
+}
+
+Status DBImpl::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ bool error_if_wal_file_exists, bool error_if_data_exists_in_wals,
+ uint64_t* recovered_seq, RecoveryContext* recovery_ctx) {
+ mutex_.AssertHeld();
+
+ bool is_new_db = false;
+ assert(db_lock_ == nullptr);
+ std::vector<std::string> files_in_dbname;
+ if (!read_only) {
+ Status s = directories_.SetDirectories(fs_.get(), dbname_,
+ immutable_db_options_.wal_dir,
+ immutable_db_options_.db_paths);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = env_->LockFile(LockFileName(dbname_), &db_lock_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ std::string current_fname = CurrentFileName(dbname_);
+ // Path to any MANIFEST file in the db dir. It does not matter which one.
+ // Since best-efforts recovery ignores the CURRENT file, the existence of a
+ // MANIFEST indicates that recovery should recover an existing db. If no
+ // MANIFEST can be found, a new db will be created.
+ std::string manifest_path;
+ if (!immutable_db_options_.best_efforts_recovery) {
+ s = env_->FileExists(current_fname);
+ } else {
+ s = Status::NotFound();
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ Status io_s = immutable_db_options_.fs->GetChildren(
+ dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
+ if (!io_s.ok()) {
+ s = io_s;
+ files_in_dbname.clear();
+ }
+ for (const std::string& file : files_in_dbname) {
+ uint64_t number = 0;
+ FileType type = kWalFile; // initialize
+ if (ParseFileName(file, &number, &type) && type == kDescriptorFile) {
+ uint64_t bytes;
+ s = env_->GetFileSize(DescriptorFileName(dbname_, number), &bytes);
+ if (s.ok() && bytes != 0) {
+ // Found non-empty MANIFEST (descriptor log), thus best-efforts
+ // recovery does not have to treat the db as empty.
+ manifest_path = dbname_ + "/" + file;
+ break;
+ }
+ }
+ }
+ }
+ if (s.IsNotFound()) {
+ if (immutable_db_options_.create_if_missing) {
+ s = NewDB(&files_in_dbname);
+ is_new_db = true;
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ return Status::InvalidArgument(
+ current_fname, "does not exist (create_if_missing is false)");
+ }
+ } else if (s.ok()) {
+ if (immutable_db_options_.error_if_exists) {
+ return Status::InvalidArgument(dbname_,
+ "exists (error_if_exists is true)");
+ }
+ } else {
+ // Unexpected error reading file
+ assert(s.IsIOError());
+ return s;
+ }
+ // Verify compatibility of file_options_ and filesystem
+ {
+ std::unique_ptr<FSRandomAccessFile> idfile;
+ FileOptions customized_fs(file_options_);
+ customized_fs.use_direct_reads |=
+ immutable_db_options_.use_direct_io_for_flush_and_compaction;
+ const std::string& fname =
+ manifest_path.empty() ? current_fname : manifest_path;
+ s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
+ if (!s.ok()) {
+ std::string error_str = s.ToString();
+ // Check if unsupported Direct I/O is the root cause
+ customized_fs.use_direct_reads = false;
+ s = fs_->NewRandomAccessFile(fname, customized_fs, &idfile, nullptr);
+ if (s.ok()) {
+ return Status::InvalidArgument(
+ "Direct I/O is not supported by the specified DB.");
+ } else {
+ return Status::InvalidArgument(
+ "Found options incompatible with filesystem", error_str.c_str());
+ }
+ }
+ }
+ } else if (immutable_db_options_.best_efforts_recovery) {
+ assert(files_in_dbname.empty());
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ Status s = immutable_db_options_.fs->GetChildren(
+ dbname_, io_opts, &files_in_dbname, /*IODebugContext*=*/nullptr);
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument(dbname_,
+ "does not exist (open for read only)");
+ } else if (s.IsIOError()) {
+ return s;
+ }
+ assert(s.ok());
+ }
+ assert(db_id_.empty());
+ Status s;
+ bool missing_table_file = false;
+ if (!immutable_db_options_.best_efforts_recovery) {
+ s = versions_->Recover(column_families, read_only, &db_id_);
+ } else {
+ assert(!files_in_dbname.empty());
+ s = versions_->TryRecover(column_families, read_only, files_in_dbname,
+ &db_id_, &missing_table_file);
+ if (s.ok()) {
+ // TryRecover may delete previous column_family_set_.
+ column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ s = SetupDBId(read_only, recovery_ctx);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "DB ID: %s\n", db_id_.c_str());
+ if (s.ok() && !read_only) {
+ s = DeleteUnreferencedSstFiles(recovery_ctx);
+ }
+
+ if (immutable_db_options_.paranoid_checks && s.ok()) {
+ s = CheckConsistency();
+ }
+ if (s.ok() && !read_only) {
+ // TODO: share file descriptors (FSDirectory) with SetDirectories above
+ std::map<std::string, std::shared_ptr<FSDirectory>> created_dirs;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ s = cfd->AddDirectories(&created_dirs);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ std::vector<std::string> files_in_wal_dir;
+ if (s.ok()) {
+ // Initialize max_total_in_memory_state_ before recovering WALs. Log recovery
+ // may check this value to decide whether to flush.
+ max_total_in_memory_state_ = 0;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+
+ SequenceNumber next_sequence(kMaxSequenceNumber);
+ default_cf_handle_ = new ColumnFamilyHandleImpl(
+ versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+ default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+
+ // Recover from all newer log files than the ones named in the
+ // descriptor (new log files may have been added by the previous
+ // incarnation without registering them in the descriptor).
+ //
+ // Note that prev_log_number() is no longer used, but we pay
+ // attention to it in case we are recovering a database
+ // produced by an older version of rocksdb.
+ auto wal_dir = immutable_db_options_.GetWalDir();
+ if (!immutable_db_options_.best_efforts_recovery) {
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ s = immutable_db_options_.fs->GetChildren(
+ wal_dir, io_opts, &files_in_wal_dir, /*IODebugContext*=*/nullptr);
+ }
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument("wal_dir not found", wal_dir);
+ } else if (!s.ok()) {
+ return s;
+ }
+
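+ // Map each WAL number found in the wal_dir to its full file name so the
+ // WALs can be verified against the MANIFEST and replayed in order below.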
+ std::unordered_map<uint64_t, std::string> wal_files;
+ for (const auto& file : files_in_wal_dir) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type) && type == kWalFile) {
+ if (is_new_db) {
+ return Status::Corruption(
+ "While creating a new Db, wal_dir contains "
+ "existing log file: ",
+ file);
+ } else {
+ wal_files[number] = LogFileName(wal_dir, number);
+ }
+ }
+ }
+
+ if (immutable_db_options_.track_and_verify_wals_in_manifest) {
+ if (!immutable_db_options_.best_efforts_recovery) {
+ // Verify WALs in MANIFEST.
+ s = versions_->GetWalSet().CheckWals(env_, wal_files);
+ } // else since best effort recovery does not recover from WALs, no need
+ // to check WALs.
+ } else if (!versions_->GetWalSet().GetWals().empty()) {
+ // Tracking is disabled, clear previously tracked WALs from MANIFEST,
+ // otherwise, in the future, if WAL tracking is enabled again,
+ // since the WALs deleted when WAL tracking is disabled are not persisted
+ // into MANIFEST, WAL check may fail.
+ VersionEdit edit;
+ WalNumber max_wal_number =
+ versions_->GetWalSet().GetWals().rbegin()->first;
+ edit.DeleteWalsBefore(max_wal_number + 1);
+ assert(recovery_ctx != nullptr);
+ assert(versions_->GetColumnFamilySet() != nullptr);
+ recovery_ctx->UpdateVersionEdits(
+ versions_->GetColumnFamilySet()->GetDefault(), edit);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (!wal_files.empty()) {
+ if (error_if_wal_file_exists) {
+ return Status::Corruption(
+ "The db was opened in readonly mode with the error_if_wal_file_exists "
+ "flag but a WAL file already exists");
+ } else if (error_if_data_exists_in_wals) {
+ for (auto& wal_file : wal_files) {
+ uint64_t bytes;
+ s = env_->GetFileSize(wal_file.second, &bytes);
+ if (s.ok()) {
+ if (bytes > 0) {
+ return Status::Corruption(
+ "error_if_data_exists_in_wals is set but there is data "
+ "in WAL files.");
+ }
+ }
+ }
+ }
+ }
+
+ if (!wal_files.empty()) {
+ // Recover in the order in which the wals were generated
+ std::vector<uint64_t> wals;
+ wals.reserve(wal_files.size());
+ for (const auto& wal_file : wal_files) {
+ wals.push_back(wal_file.first);
+ }
+ std::sort(wals.begin(), wals.end());
+
+ bool corrupted_wal_found = false;
+ s = RecoverLogFiles(wals, &next_sequence, read_only, &corrupted_wal_found,
+ recovery_ctx);
+ if (corrupted_wal_found && recovered_seq != nullptr) {
+ *recovered_seq = next_sequence;
+ }
+ if (!s.ok()) {
+ // Clear memtables if recovery failed
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ }
+ }
+ }
+ }
+
+ if (read_only) {
+ // If we are opening as read-only, we need to update options_file_number_
+ // to reflect the most recent OPTIONS file. It does not matter for regular
+ // read-write db instance because options_file_number_ will later be
+ // updated to versions_->NewFileNumber() in RenameTempFileToOptionsFile.
+ std::vector<std::string> filenames;
+ if (s.ok()) {
+ const std::string normalized_dbname = NormalizePath(dbname_);
+ const std::string normalized_wal_dir =
+ NormalizePath(immutable_db_options_.GetWalDir());
+ if (immutable_db_options_.best_efforts_recovery) {
+ filenames = std::move(files_in_dbname);
+ } else if (normalized_dbname == normalized_wal_dir) {
+ filenames = std::move(files_in_wal_dir);
+ } else {
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ s = immutable_db_options_.fs->GetChildren(
+ GetName(), io_opts, &filenames, /*IODebugContext*=*/nullptr);
+ }
+ }
+ if (s.ok()) {
+ uint64_t number = 0;
+ uint64_t options_file_number = 0;
+ FileType type;
+ for (const auto& fname : filenames) {
+ if (ParseFileName(fname, &number, &type) && type == kOptionsFile) {
+ options_file_number = std::max(number, options_file_number);
+ }
+ }
+ versions_->options_file_number_ = options_file_number;
+ uint64_t options_file_size = 0;
+ if (options_file_number > 0) {
+ s = env_->GetFileSize(OptionsFileName(GetName(), options_file_number),
+ &options_file_size);
+ }
+ versions_->options_file_size_ = options_file_size;
+ }
+ }
+ return s;
+}
+
+Status DBImpl::PersistentStatsProcessFormatVersion() {
+ mutex_.AssertHeld();
+ Status s;
+ // persist version when stats CF doesn't exist
+ bool should_persist_format_version = !persistent_stats_cfd_exists_;
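+ // The DB mutex is released while reading/writing the stats CF below and
+ // re-acquired before returning.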
+ mutex_.Unlock();
+ if (persistent_stats_cfd_exists_) {
+ // Check persistent stats format version compatibility. Drop and recreate
+ // persistent stats CF if format version is incompatible
+ uint64_t format_version_recovered = 0;
+ Status s_format = DecodePersistentStatsVersionNumber(
+ this, StatsVersionKeyType::kFormatVersion, &format_version_recovered);
+ uint64_t compatible_version_recovered = 0;
+ Status s_compatible = DecodePersistentStatsVersionNumber(
+ this, StatsVersionKeyType::kCompatibleVersion,
+ &compatible_version_recovered);
+ // Abort reading from the existing stats CF if any of the following is true:
+ // 1. we failed to read the format version or compatible version from disk
+ // 2. the sst's format version is greater than the current format version,
+ // meaning the sst is encoded by a newer RocksDB release, and the current
+ // compatible version is below the sst's compatible version
+ if (!s_format.ok() || !s_compatible.ok() ||
+ (kStatsCFCurrentFormatVersion < format_version_recovered &&
+ kStatsCFCompatibleFormatVersion < compatible_version_recovered)) {
+ if (!s_format.ok() || !s_compatible.ok()) {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Recreating persistent stats column family since reading "
+ "persistent stats version key failed. Format key: %s, compatible "
+ "key: %s",
+ s_format.ToString().c_str(), s_compatible.ToString().c_str());
+ } else {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Recreating persistent stats column family due to corrupted or "
+ "incompatible format version. Recovered format: %" PRIu64
+ "; recovered format compatible since: %" PRIu64 "\n",
+ format_version_recovered, compatible_version_recovered);
+ }
+ s = DropColumnFamily(persist_stats_cf_handle_);
+ if (s.ok()) {
+ s = DestroyColumnFamilyHandle(persist_stats_cf_handle_);
+ }
+ ColumnFamilyHandle* handle = nullptr;
+ if (s.ok()) {
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+ }
+ if (s.ok()) {
+ persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+ // should also persist version here because old stats CF is discarded
+ should_persist_format_version = true;
+ }
+ }
+ }
+ if (should_persist_format_version) {
+ // The persistent stats CF is being created for the first time, so we need
+ // to write the format version key.
+ WriteBatch batch;
+ if (s.ok()) {
+ s = batch.Put(persist_stats_cf_handle_, kFormatVersionKeyString,
+ std::to_string(kStatsCFCurrentFormatVersion));
+ }
+ if (s.ok()) {
+ s = batch.Put(persist_stats_cf_handle_, kCompatibleVersionKeyString,
+ std::to_string(kStatsCFCompatibleFormatVersion));
+ }
+ if (s.ok()) {
+ WriteOptions wo;
+ wo.low_pri = true;
+ wo.no_slowdown = true;
+ wo.sync = false;
+ s = Write(wo, &batch);
+ }
+ }
+ mutex_.Lock();
+ return s;
+}
+
+Status DBImpl::InitPersistStatsColumnFamily() {
+ mutex_.AssertHeld();
+ assert(!persist_stats_cf_handle_);
+ ColumnFamilyData* persistent_stats_cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ persistent_stats_cfd_exists_ = persistent_stats_cfd != nullptr;
+
+ Status s;
+ if (persistent_stats_cfd != nullptr) {
+ // We are recovering from a DB which already contains the persistent stats
+ // CF. The CF is already created in VersionSet::ApplyOneVersionEdit, but the
+ // column family handle was not. We need to explicitly create the handle here.
+ persist_stats_cf_handle_ =
+ new ColumnFamilyHandleImpl(persistent_stats_cfd, this, &mutex_);
+ } else {
+ mutex_.Unlock();
+ ColumnFamilyHandle* handle = nullptr;
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ s = CreateColumnFamily(cfo, kPersistentStatsColumnFamilyName, &handle);
+ persist_stats_cf_handle_ = static_cast<ColumnFamilyHandleImpl*>(handle);
+ mutex_.Lock();
+ }
+ return s;
+}
+
+Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) {
+ mutex_.AssertHeld();
+ assert(versions_->descriptor_log_ == nullptr);
+ Status s = versions_->LogAndApply(
+ recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_,
+ recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir());
+ if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) {
+ mutex_.Unlock();
+ for (const auto& fname : recovery_ctx.files_to_delete_) {
+ s = env_->DeleteFile(fname);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ mutex_.Lock();
+ }
+ return s;
+}
+
+void DBImpl::InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap() {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.wal_filter == nullptr) {
+ return;
+ }
+ assert(immutable_db_options_.wal_filter != nullptr);
+ WalFilter& wal_filter = *(immutable_db_options_.wal_filter);
+
+ std::map<std::string, uint32_t> cf_name_id_map;
+ std::map<uint32_t, uint64_t> cf_lognumber_map;
+ assert(versions_);
+ assert(versions_->GetColumnFamilySet());
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ assert(cfd);
+ cf_name_id_map.insert(std::make_pair(cfd->GetName(), cfd->GetID()));
+ cf_lognumber_map.insert(std::make_pair(cfd->GetID(), cfd->GetLogNumber()));
+ }
+
+ wal_filter.ColumnFamilyLogNumberMap(cf_lognumber_map, cf_name_id_map);
+#endif // !ROCKSDB_LITE
+}
+
+bool DBImpl::InvokeWalFilterIfNeededOnWalRecord(uint64_t wal_number,
+ const std::string& wal_fname,
+ log::Reader::Reporter& reporter,
+ Status& status,
+ bool& stop_replay,
+ WriteBatch& batch) {
+#ifndef ROCKSDB_LITE
+ if (immutable_db_options_.wal_filter == nullptr) {
+ return true;
+ }
+ assert(immutable_db_options_.wal_filter != nullptr);
+ WalFilter& wal_filter = *(immutable_db_options_.wal_filter);
+
+ WriteBatch new_batch;
+ bool batch_changed = false;
+
+ bool process_current_record = true;
+
+ WalFilter::WalProcessingOption wal_processing_option =
+ wal_filter.LogRecordFound(wal_number, wal_fname, batch, &new_batch,
+ &batch_changed);
+
+ switch (wal_processing_option) {
+ case WalFilter::WalProcessingOption::kContinueProcessing:
+ // do nothing, proceed normally
+ break;
+ case WalFilter::WalProcessingOption::kIgnoreCurrentRecord:
+ // skip current record
+ process_current_record = false;
+ break;
+ case WalFilter::WalProcessingOption::kStopReplay:
+ // skip current record and stop replay
+ process_current_record = false;
+ stop_replay = true;
+ break;
+ case WalFilter::WalProcessingOption::kCorruptedRecord: {
+ status = Status::Corruption("Corruption reported by Wal Filter ",
+ wal_filter.Name());
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ process_current_record = false;
+ reporter.Corruption(batch.GetDataSize(), status);
+ }
+ break;
+ }
+ default: {
+ // Logical error which should not happen. If RocksDB used exceptions, we
+ // would just `throw std::logic_error` here.
+ assert(false);
+ status = Status::NotSupported(
+ "Unknown WalProcessingOption returned by Wal Filter ",
+ wal_filter.Name());
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ // Ignore the error with current record processing.
+ stop_replay = true;
+ }
+ break;
+ }
+ }
+
+ if (!process_current_record) {
+ return false;
+ }
+
+ if (batch_changed) {
+ // Make sure that the count in the new batch is
+ // within the original count.
+ int new_count = WriteBatchInternal::Count(&new_batch);
+ int original_count = WriteBatchInternal::Count(&batch);
+ if (new_count > original_count) {
+ ROCKS_LOG_FATAL(
+ immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64
+ " mode %d log filter %s returned "
+ "more records (%d) than original (%d) which is not allowed. "
+ "Aborting recovery.",
+ wal_number, static_cast<int>(immutable_db_options_.wal_recovery_mode),
+ wal_filter.Name(), new_count, original_count);
+ status = Status::NotSupported(
+ "More than original # of records "
+ "returned by Wal Filter ",
+ wal_filter.Name());
+ return false;
+ }
+ // Set the same sequence number in the new_batch
+ // as the original batch.
+ WriteBatchInternal::SetSequence(&new_batch,
+ WriteBatchInternal::Sequence(&batch));
+ batch = new_batch;
+ }
+ return true;
+#else // !ROCKSDB_LITE
+ (void)wal_number;
+ (void)wal_fname;
+ (void)reporter;
+ (void)status;
+ (void)stop_replay;
+ (void)batch;
+ return true;
+#endif // ROCKSDB_LITE
+}
+
+// REQUIRES: wal_numbers are sorted in ascending order
+Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
+ SequenceNumber* next_sequence, bool read_only,
+ bool* corrupted_wal_found,
+ RecoveryContext* recovery_ctx) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+ (status == nullptr ? "(ignoring error) " : ""), fname,
+ static_cast<int>(bytes), s.ToString().c_str());
+ if (status != nullptr && status->ok()) {
+ *status = s;
+ }
+ }
+ };
+
+ mutex_.AssertHeld();
+ Status status;
+ std::unordered_map<int, VersionEdit> version_edits;
+ // no need to refcount because iteration is under mutex
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+ version_edits.insert({cfd->GetID(), edit});
+ }
+ int job_id = next_job_id_.fetch_add(1);
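+ // Emit a structured "recovery_started" event listing the WAL files that are
+ // about to be replayed.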
+ {
+ auto stream = event_logger_.Log();
+ stream << "job" << job_id << "event"
+ << "recovery_started";
+ stream << "wal_files";
+ stream.StartArray();
+ for (auto wal_number : wal_numbers) {
+ stream << wal_number;
+ }
+ stream.EndArray();
+ }
+
+ // No-op for immutable_db_options_.wal_filter == nullptr.
+ InvokeWalFilterIfNeededOnColumnFamilyToWalNumberMap();
+
+ bool stop_replay_by_wal_filter = false;
+ bool stop_replay_for_corruption = false;
+ bool flushed = false;
+ uint64_t corrupted_wal_number = kMaxSequenceNumber;
+ uint64_t min_wal_number = MinLogNumberToKeep();
+ if (!allow_2pc()) {
+ // In non-2pc mode, we skip WALs that do not back unflushed data.
+ min_wal_number =
+ std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData());
+ }
+ for (auto wal_number : wal_numbers) {
+ if (wal_number < min_wal_number) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Skipping log #%" PRIu64
+ " since it is older than min log to keep #%" PRIu64,
+ wal_number, min_wal_number);
+ continue;
+ }
+ // The previous incarnation may not have written any MANIFEST
+ // records after allocating this log number. So we manually
+ // update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(wal_number);
+ // Open the log file
+ std::string fname =
+ LogFileName(immutable_db_options_.GetWalDir(), wal_number);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64 " mode %d", wal_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode));
+ auto logFileDropped = [this, &fname]() {
+ uint64_t bytes;
+ if (env_->GetFileSize(fname, &bytes).ok()) {
+ auto info_log = immutable_db_options_.info_log.get();
+ ROCKS_LOG_WARN(info_log, "%s: dropping %d bytes", fname.c_str(),
+ static_cast<int>(bytes));
+ }
+ };
+ if (stop_replay_by_wal_filter) {
+ logFileDropped();
+ continue;
+ }
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ status = fs_->NewSequentialFile(
+ fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+ if (!status.ok()) {
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ return status;
+ } else {
+ // Failed on one log file, but that's ok. Try the next one.
+ continue;
+ }
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), fname, immutable_db_options_.log_readahead_size,
+ io_tracer_));
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = immutable_db_options_.info_log.get();
+ reporter.fname = fname.c_str();
+ if (!immutable_db_options_.paranoid_checks ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ reporter.status = nullptr;
+ } else {
+ reporter.status = &status;
+ }
+ // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ log::Reader reader(immutable_db_options_.info_log, std::move(file_reader),
+ &reporter, true /*checksum*/, wal_number);
+
+ // Read all the records and add them to a memtable. Whether incomplete
+ // records at the tail end of the log are tolerated is governed by the
+ // WAL recovery mode passed to ReadRecord() below.
+ std::string scratch;
+ Slice record;
+
+ TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:BeforeReadWal",
+ /*arg=*/nullptr);
+ uint64_t record_checksum;
+ while (!stop_replay_by_wal_filter &&
+ reader.ReadRecord(&record, &scratch,
+ immutable_db_options_.wal_recovery_mode,
+ &record_checksum) &&
+ status.ok()) {
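+ // WriteBatchInternal::kHeader is the minimum size of a valid encoded batch
+ // (the 8-byte sequence number plus the 4-byte count).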
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small"));
+ continue;
+ }
+
+ // We create a new batch and initialize with a valid prot_info_ to store
+ // the data checksums
+ WriteBatch batch;
+
+ status = WriteBatchInternal::SetContents(&batch, record);
+ if (!status.ok()) {
+ return status;
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch", &batch);
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
+ &record_checksum);
+ status = WriteBatchInternal::UpdateProtectionInfo(
+ &batch, 8 /* bytes_per_key */, &record_checksum);
+ if (!status.ok()) {
+ return status;
+ }
+
+ SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
+
+ if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery) {
+ // In point-in-time recovery mode, if the sequence ids of the log files
+ // are consecutive, we continue recovery despite corruption. This could
+ // happen when we open and write to a corrupted DB, where the sequence id
+ // will start from the last sequence id we recovered.
+ if (sequence == *next_sequence) {
+ stop_replay_for_corruption = false;
+ }
+ if (stop_replay_for_corruption) {
+ logFileDropped();
+ break;
+ }
+ }
+
+ // For the default case of wal_filter == nullptr, always performs no-op
+ // and returns true.
+ if (!InvokeWalFilterIfNeededOnWalRecord(wal_number, fname, reporter,
+ status, stop_replay_by_wal_filter,
+ batch)) {
+ continue;
+ }
+
+ // If the column family was not found, it might mean that the WAL write
+ // batch references a column family that was dropped after the
+ // insert. We don't want to fail the whole write batch in that case --
+ // we just ignore the update.
+ // That's why we set ignore_missing_column_families to true.
+ bool has_valid_writes = false;
+ status = WriteBatchInternal::InsertInto(
+ &batch, column_family_memtables_.get(), &flush_scheduler_,
+ &trim_history_scheduler_, true, wal_number, this,
+ false /* concurrent_memtable_writes */, next_sequence,
+ &has_valid_writes, seq_per_batch_, batch_per_txn_);
+ MaybeIgnoreError(&status);
+ if (!status.ok()) {
+ // We are treating this as a failure while reading since we read valid
+ // blocks that do not form coherent data
+ reporter.Corruption(record.size(), status);
+ continue;
+ }
+
+ if (has_valid_writes && !read_only) {
+ // We can do this because this is called before the client has access to
+ // the DB and there is only a single thread operating on the DB.
+ ColumnFamilyData* cfd;
+
+ while ((cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ // If this asserts, it means that InsertInto failed in
+ // filtering updates to already-flushed column families
+ assert(cfd->GetLogNumber() <= wal_number);
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+ status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+ if (!status.ok()) {
+ // Reflect errors immediately so that conditions like full
+ // file-systems cause the DB::Open() to fail.
+ return status;
+ }
+ flushed = true;
+
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ *next_sequence);
+ }
+ }
+ }
+
+ if (!status.ok()) {
+ if (status.IsNotSupported()) {
+ // We should not treat NotSupported as corruption. It is rather a clear
+ // sign that we are processing a WAL that is produced by an incompatible
+ // version of the code.
+ return status;
+ }
+ if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ // We should ignore all errors unconditionally
+ status = Status::OK();
+ } else if (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery) {
+ if (status.IsIOError()) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "IOError during point-in-time reading log #%" PRIu64
+ " seq #%" PRIu64
+ ". %s. This likely means loss of synced WAL, "
+ "thus recovery fails.",
+ wal_number, *next_sequence,
+ status.ToString().c_str());
+ return status;
+ }
+ // We should ignore the error but not continue replaying
+ status = Status::OK();
+ stop_replay_for_corruption = true;
+ corrupted_wal_number = wal_number;
+ if (corrupted_wal_found != nullptr) {
+ *corrupted_wal_found = true;
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Point in time recovered to log #%" PRIu64
+ " seq #%" PRIu64,
+ wal_number, *next_sequence);
+ } else {
+ assert(immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kAbsoluteConsistency);
+ return status;
+ }
+ }
+
+ flush_scheduler_.Clear();
+ trim_history_scheduler_.Clear();
+ auto last_sequence = *next_sequence - 1;
+ if ((*next_sequence != kMaxSequenceNumber) &&
+ (versions_->LastSequence() <= last_sequence)) {
+ versions_->SetLastAllocatedSequence(last_sequence);
+ versions_->SetLastPublishedSequence(last_sequence);
+ versions_->SetLastSequence(last_sequence);
+ }
+ }
+ // Compare the corrupted log number to every column family's current log
+ // number. Abort Open() if any column family's log number is greater than
+ // the corrupted log number, which means the CF contains data beyond the
+ // point of corruption. This could happen during PIT recovery when the WAL
+ // is corrupted and some (but not all) CFs are flushed.
+ // Exclude the PIT case where no log is dropped after the corruption point.
+ // This is to cover the case of empty WALs after a corrupted log, in which
+ // we don't reset stop_replay_for_corruption.
+ if (stop_replay_for_corruption == true &&
+ (immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kPointInTimeRecovery ||
+ immutable_db_options_.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords)) {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ // One special case causes cfd->GetLogNumber() > corrupted_wal_number while
+ // the CF is still consistent: if a new column family is created during
+ // the flush and the WAL sync fails at the same time, the new CF points to
+ // the new WAL but the old WAL is corrupted. Since the new CF is empty, it
+ // is still consistent. We add the check of CF sst file size to avoid this
+ // false positive alert.
+
+ // Note that the check of (cfd->GetLiveSstFilesSize() > 0) may lead to
+ // ignoring a very rare inconsistency case caused by data cancellation:
+ // one CF is empty due to KV deletions, but those operations are in the
+ // WAL. If the WAL is corrupted, the state of this CF might not be
+ // consistent with the others. However, the consistency check will be
+ // bypassed because the CF is empty.
+ // TODO: a better and complete implementation is needed to ensure a strict
+ // consistency check in WAL recovery, including handling the tailing
+ // issues.
+ if (cfd->GetLogNumber() > corrupted_wal_number &&
+ cfd->GetLiveSstFilesSize() > 0) {
+ ROCKS_LOG_ERROR(immutable_db_options_.info_log,
+ "Column family inconsistency: SST file contains data"
+ " beyond the point of corruption.");
+ return Status::Corruption("SST file is ahead of WALs in CF " +
+ cfd->GetName());
+ }
+ }
+ }
+
+ // True if there's any data in the WALs; if not, we can skip re-processing
+ // them later
+ bool data_seen = false;
+ if (!read_only) {
+ // no need to refcount since client still doesn't have access
+ // to the DB and cannot drop column families while we iterate
+ const WalNumber max_wal_number = wal_numbers.back();
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ VersionEdit* edit = &iter->second;
+
+ if (cfd->GetLogNumber() > max_wal_number) {
+ // Column family cfd has already flushed the data
+ // from all wals. Memtable has to be empty because
+ // we filter the updates based on wal_number
+ // (in WriteBatchInternal::InsertInto)
+ assert(cfd->mem()->GetFirstSequenceNumber() == 0);
+ assert(edit->NumEntries() == 0);
+ continue;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", /*arg=*/nullptr);
+
+ // flush the final memtable (if non-empty)
+ if (cfd->mem()->GetFirstSequenceNumber() != 0) {
+ // If a flush happened in the middle of recovery (e.g. due to the memtable
+ // being full), we flush at the end. Otherwise we'd need to record
+ // where we were on the last flush, which would make the logic complicated.
+ if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
+ status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
+ if (!status.ok()) {
+ // Recovery failed
+ break;
+ }
+ flushed = true;
+
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ versions_->LastSequence());
+ }
+ data_seen = true;
+ }
+
+ // Update the log number info in the version edit corresponding to this
+ // column family. Note that the version edits will be written to MANIFEST
+ // together later.
+ // Writing wal_number in the manifest means that any log file
+ // with a number strictly less than (wal_number + 1) is already
+ // recovered and should be ignored on next reincarnation.
+ // Since we already recovered max_wal_number, we want all wals
+ // with numbers `<= max_wal_number` (including this one) to be ignored.
+ if (flushed || cfd->mem()->GetFirstSequenceNumber() == 0) {
+ edit->SetLogNumber(max_wal_number + 1);
+ }
+ }
+ if (status.ok()) {
+ // We must mark the next log number as used, even though it's
+ // not actually used. That is because VersionSet assumes
+ // VersionSet::next_file_number_ is always strictly greater than any
+ // log number.
+ versions_->MarkFileNumberUsed(max_wal_number + 1);
+ assert(recovery_ctx != nullptr);
+
+ for (auto* cfd : *versions_->GetColumnFamilySet()) {
+ auto iter = version_edits.find(cfd->GetID());
+ assert(iter != version_edits.end());
+ recovery_ctx->UpdateVersionEdits(cfd, iter->second);
+ }
+
+ if (flushed) {
+ VersionEdit wal_deletion;
+ if (immutable_db_options_.track_and_verify_wals_in_manifest) {
+ wal_deletion.DeleteWalsBefore(max_wal_number + 1);
+ }
+ if (!allow_2pc()) {
+ // In non-2pc mode, flushing the memtables of the column families
+ // means we can advance min_log_number_to_keep.
+ wal_deletion.SetMinLogNumberToKeep(max_wal_number + 1);
+ }
+ assert(versions_->GetColumnFamilySet() != nullptr);
+ recovery_ctx->UpdateVersionEdits(
+ versions_->GetColumnFamilySet()->GetDefault(), wal_deletion);
+ }
+ }
+ }
+
+ if (status.ok()) {
+ if (data_seen && !flushed) {
+ status = RestoreAliveLogFiles(wal_numbers);
+ } else if (!wal_numbers.empty()) {
+ // If there's no data in the WAL, or we flushed all the data, still
+ // truncate the log file. If the process goes into a crash loop before
+ // the file is deleted, the preallocated space will never get freed.
+ const bool truncate = !read_only;
+ GetLogSizeAndMaybeTruncate(wal_numbers.back(), truncate, nullptr)
+ .PermitUncheckedError();
+ }
+ }
+
+ event_logger_.Log() << "job" << job_id << "event"
+ << "recovery_finished";
+
+ return status;
+}
+
+Status DBImpl::GetLogSizeAndMaybeTruncate(uint64_t wal_number, bool truncate,
+ LogFileNumberSize* log_ptr) {
+ LogFileNumberSize log(wal_number);
+ std::string fname =
+ LogFileName(immutable_db_options_.GetWalDir(), wal_number);
+ Status s;
+ // This gets the apparent size of the wals, not including preallocated space.
+ s = env_->GetFileSize(fname, &log.size);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::GetLogSizeAndMaybeTruncate:0", /*arg=*/&s);
+ if (s.ok() && truncate) {
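+ // Reopen the WAL for append and truncate it to its current data size so
+ // any preallocated-but-unused space at the end is released.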
+ std::unique_ptr<FSWritableFile> last_log;
+ Status truncate_status = fs_->ReopenWritableFile(
+ fname,
+ fs_->OptimizeForLogWrite(
+ file_options_,
+ BuildDBOptions(immutable_db_options_, mutable_db_options_)),
+ &last_log, nullptr);
+ if (truncate_status.ok()) {
+ truncate_status = last_log->Truncate(log.size, IOOptions(), nullptr);
+ }
+ if (truncate_status.ok()) {
+ truncate_status = last_log->Close(IOOptions(), nullptr);
+ }
+ // Not a critical error if fail to truncate.
+ if (!truncate_status.ok() && !truncate_status.IsNotSupported()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "Failed to truncate log #%" PRIu64 ": %s", wal_number,
+ truncate_status.ToString().c_str());
+ }
+ }
+ if (log_ptr) {
+ *log_ptr = log;
+ }
+ return s;
+}
+
+Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
+ if (wal_numbers.empty()) {
+ return Status::OK();
+ }
+ Status s;
+ mutex_.AssertHeld();
+ assert(immutable_db_options_.avoid_flush_during_recovery);
+ // Mark these as alive so they'll be considered for deletion later by
+ // FindObsoleteFiles()
+ total_log_size_ = 0;
+ log_empty_ = false;
+ uint64_t min_wal_with_unflushed_data =
+ versions_->MinLogNumberWithUnflushedData();
+ for (auto wal_number : wal_numbers) {
+ if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) {
+ // In non-2pc mode, the WAL files not backing unflushed data are not
+ // alive, thus should not be added to the alive_log_files_.
+ continue;
+ }
+ // We preallocate space for WALs, but after a crash and restart that
+ // preallocated space is not needed anymore. It is likely that only the
+ // last log has such preallocated space, so we only truncate the last log.
+ LogFileNumberSize log;
+ s = GetLogSizeAndMaybeTruncate(
+ wal_number, /*truncate=*/(wal_number == wal_numbers.back()), &log);
+ if (!s.ok()) {
+ break;
+ }
+ total_log_size_ += log.size;
+ alive_log_files_.push_back(log);
+ }
+ return s;
+}
+
+Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
+ MemTable* mem, VersionEdit* edit) {
+ mutex_.AssertHeld();
+ assert(cfd);
+ assert(cfd->imm());
+ // The immutable memtable list must be empty.
+ assert(std::numeric_limits<uint64_t>::max() ==
+ cfd->imm()->GetEarliestMemTableID());
+
+ const uint64_t start_micros = immutable_db_options_.clock->NowMicros();
+
+ FileMetaData meta;
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem(
+ new std::list<uint64_t>::iterator(
+ CaptureCurrentFileNumberInPendingOutputs()));
+ meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ Status s;
+ TableProperties table_properties;
+ {
+ ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] [WriteLevel0TableForRecovery]"
+ " Level-0 table #%" PRIu64 ": started",
+ cfd->GetName().c_str(), meta.fd.GetNumber());
+
+ // Get the latest mutable cf options while the mutex is still locked
+ const MutableCFOptions mutable_cf_options =
+ *cfd->GetLatestMutableCFOptions();
+ bool paranoid_file_checks =
+ cfd->GetLatestMutableCFOptions()->paranoid_file_checks;
+
+ int64_t _current_time = 0;
+ immutable_db_options_.clock->GetCurrentTime(&_current_time)
+ .PermitUncheckedError(); // ignore error
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ meta.oldest_ancester_time = current_time;
+
+ {
+ auto write_hint = cfd->CalculateSSTWriteHint(0);
+ mutex_.Unlock();
+
+ SequenceNumber earliest_write_conflict_snapshot;
+ std::vector<SequenceNumber> snapshot_seqs =
+ snapshots_.GetAll(&earliest_write_conflict_snapshot);
+ auto snapshot_checker = snapshot_checker_.get();
+ if (use_custom_gc_ && snapshot_checker == nullptr) {
+ snapshot_checker = DisableGCSnapshotChecker::Instance();
+ }
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ auto range_del_iter =
+ // This is called during recovery, where a live memtable is flushed
+ // directly. In this case, no fragmented tombstone list is cached in
+ // this memtable yet.
+ mem->NewRangeTombstoneIterator(ro, kMaxSequenceNumber,
+ false /* immutable_memtable */);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+
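+ // Build a level-0 SST (and possibly blob files) directly from the memtable
+ // iterator; the resulting file metadata is recorded in `meta` and
+ // `blob_file_additions`, which are added to the version edit further below.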
+ IOStatus io_s;
+ TableBuilderOptions tboptions(
+ *cfd->ioptions(), mutable_cf_options, cfd->internal_comparator(),
+ cfd->int_tbl_prop_collector_factories(),
+ GetCompressionFlush(*cfd->ioptions(), mutable_cf_options),
+ mutable_cf_options.compression_opts, cfd->GetID(), cfd->GetName(),
+ 0 /* level */, false /* is_bottommost */,
+ TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
+ 0 /* file_creation_time */, db_id_, db_session_id_,
+ 0 /* target_file_size */, meta.fd.GetNumber());
+ SeqnoToTimeMapping empty_seqno_time_mapping;
+ s = BuildTable(
+ dbname_, versions_.get(), immutable_db_options_, tboptions,
+ file_options_for_compaction_, cfd->table_cache(), iter.get(),
+ std::move(range_del_iters), &meta, &blob_file_additions,
+ snapshot_seqs, earliest_write_conflict_snapshot, kMaxSequenceNumber,
+ snapshot_checker, paranoid_file_checks, cfd->internal_stats(), &io_s,
+ io_tracer_, BlobFileCreationReason::kRecovery,
+ empty_seqno_time_mapping, &event_logger_, job_id, Env::IO_HIGH,
+ nullptr /* table_properties */, write_hint,
+ nullptr /*full_history_ts_low*/, &blob_callback_);
+ LogFlush(immutable_db_options_.info_log);
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] [WriteLevel0TableForRecovery]"
+ " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
+ cfd->GetName().c_str(), meta.fd.GetNumber(),
+ meta.fd.GetFileSize(), s.ToString().c_str());
+ mutex_.Lock();
+
+ // TODO(AR) is this ok?
+ if (!io_s.ok() && s.ok()) {
+ s = io_s;
+ }
+ }
+ }
+ ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ const bool has_output = meta.fd.GetFileSize() > 0;
+
+ constexpr int level = 0;
+
+ if (s.ok() && has_output) {
+ edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
+ meta.fd.GetFileSize(), meta.smallest, meta.largest,
+ meta.fd.smallest_seqno, meta.fd.largest_seqno,
+ meta.marked_for_compaction, meta.temperature,
+ meta.oldest_blob_file_number, meta.oldest_ancester_time,
+ meta.file_creation_time, meta.file_checksum,
+ meta.file_checksum_func_name, meta.unique_id);
+
+ for (const auto& blob : blob_file_additions) {
+ edit->AddBlobFile(blob);
+ }
+ }
+
+ InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+ stats.micros = immutable_db_options_.clock->NowMicros() - start_micros;
+
+ if (has_output) {
+ stats.bytes_written = meta.fd.GetFileSize();
+ stats.num_output_files = 1;
+ }
+
+ const auto& blobs = edit->GetBlobFileAdditions();
+ for (const auto& blob : blobs) {
+ stats.bytes_written_blob += blob.GetTotalBlobBytes();
+ }
+
+ stats.num_output_files_blob = static_cast<int>(blobs.size());
+
+ cfd->internal_stats()->AddCompactionStats(level, Env::Priority::USER, stats);
+ cfd->internal_stats()->AddCFStats(
+ InternalStats::BYTES_FLUSHED,
+ stats.bytes_written + stats.bytes_written_blob);
+ RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
+ return s;
+}
+
+Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ if (db_options.persist_stats_to_disk) {
+ column_families.push_back(
+ ColumnFamilyDescriptor(kPersistentStatsColumnFamilyName, cf_options));
+ }
+ std::vector<ColumnFamilyHandle*> handles;
+ Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
+ if (s.ok()) {
+ if (db_options.persist_stats_to_disk) {
+ assert(handles.size() == 2);
+ } else {
+ assert(handles.size() == 1);
+ }
+ // We can delete the handle since DBImpl always holds a reference to the
+ // default column family.
+ if (db_options.persist_stats_to_disk && handles[1] != nullptr) {
+ delete handles[1];
+ }
+ delete handles[0];
+ }
+ return s;
+}
+
+Status DB::Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+ const bool kSeqPerBatch = true;
+ const bool kBatchPerTxn = true;
+ return DBImpl::Open(db_options, dbname, column_families, handles, dbptr,
+ !kSeqPerBatch, kBatchPerTxn);
+}
+
+// TODO: Implement the trimming in flush code path.
+// TODO: Perform trimming before inserting into memtable during recovery.
+// TODO: Pick files with max_timestamp > trim_ts by each file's timestamp meta
+// info, and handle only these files to reduce io.
+Status DB::OpenAndTrimHistory(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ std::string trim_ts) {
+ assert(dbptr != nullptr);
+ assert(handles != nullptr);
+ auto validate_options = [&db_options] {
+ if (db_options.avoid_flush_during_recovery) {
+ return Status::InvalidArgument(
+ "avoid_flush_during_recovery incompatible with "
+ "OpenAndTrimHistory");
+ }
+ return Status::OK();
+ };
+ auto s = validate_options();
+ if (!s.ok()) {
+ return s;
+ }
+
+ DB* db = nullptr;
+ s = DB::Open(db_options, dbname, column_families, handles, &db);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(db);
+ CompactRangeOptions options;
+ options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
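+ // The compaction below is what performs the trimming: column families with
+ // user-defined timestamps are compacted down to the bottommost level with
+ // trim_ts so that data newer than trim_ts is removed.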
+ auto db_impl = static_cast_with_check<DBImpl>(db);
+ for (auto handle : *handles) {
+ assert(handle != nullptr);
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(handle);
+ auto cfd = cfh->cfd();
+ assert(cfd != nullptr);
+ // Only compact column families with timestamp enabled
+ if (cfd->user_comparator() != nullptr &&
+ cfd->user_comparator()->timestamp_size() > 0) {
+ s = db_impl->CompactRangeInternal(options, handle, nullptr, nullptr,
+ trim_ts);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ auto clean_op = [&handles, &db] {
+ for (auto handle : *handles) {
+ auto temp_s = db->DestroyColumnFamilyHandle(handle);
+ assert(temp_s.ok());
+ }
+ handles->clear();
+ delete db;
+ };
+ if (!s.ok()) {
+ clean_op();
+ return s;
+ }
+
+ *dbptr = db;
+ return s;
+}
+
+IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
+ size_t preallocate_block_size,
+ log::Writer** new_log) {
+ IOStatus io_s;
+ std::unique_ptr<FSWritableFile> lfile;
+
+ DBOptions db_options =
+ BuildDBOptions(immutable_db_options_, mutable_db_options_);
+ FileOptions opt_file_options =
+ fs_->OptimizeForLogWrite(file_options_, db_options);
+ std::string wal_dir = immutable_db_options_.GetWalDir();
+ std::string log_fname = LogFileName(wal_dir, log_file_num);
+
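+ // Either reuse a WAL from the recycle list (renaming it in place) or create
+ // a brand new file for the next log number.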
+ if (recycle_log_number) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "reusing log %" PRIu64 " from recycle list\n",
+ recycle_log_number);
+ std::string old_log_fname = LogFileName(wal_dir, recycle_log_number);
+ TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile1");
+ TEST_SYNC_POINT("DBImpl::CreateWAL:BeforeReuseWritableFile2");
+ io_s = fs_->ReuseWritableFile(log_fname, old_log_fname, opt_file_options,
+ &lfile, /*dbg=*/nullptr);
+ } else {
+ io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options);
+ }
+
+ if (io_s.ok()) {
+ lfile->SetWriteLifeTimeHint(CalculateWALWriteHint());
+ lfile->SetPreallocationBlockSize(preallocate_block_size);
+
+ const auto& listeners = immutable_db_options_.listeners;
+ FileTypeSet tmp_set = immutable_db_options_.checksum_handoff_file_types;
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(lfile), log_fname, opt_file_options,
+ immutable_db_options_.clock, io_tracer_, nullptr /* stats */, listeners,
+ nullptr, tmp_set.Contains(FileType::kWalFile),
+ tmp_set.Contains(FileType::kWalFile)));
+ *new_log = new log::Writer(std::move(file_writer), log_file_num,
+ immutable_db_options_.recycle_log_file_num > 0,
+ immutable_db_options_.manual_wal_flush,
+ immutable_db_options_.wal_compression);
+ io_s = (*new_log)->AddCompressionTypeRecord();
+ }
+ return io_s;
+}
+
+Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ const bool seq_per_batch, const bool batch_per_txn) {
+ Status s = ValidateOptionsByTable(db_options, column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = ValidateOptions(db_options, column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ *dbptr = nullptr;
+ assert(handles);
+ handles->clear();
+
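+ // The WAL preallocation size is derived from the largest write_buffer_size
+ // across the column families (passed to GetWalPreallocateBlockSize below).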
+ size_t max_write_buffer_size = 0;
+ for (auto cf : column_families) {
+ max_write_buffer_size =
+ std::max(max_write_buffer_size, cf.options.write_buffer_size);
+ }
+
+ DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn);
+ if (!impl->immutable_db_options_.info_log) {
+ s = impl->init_logger_creation_s_;
+ delete impl;
+ return s;
+ } else {
+ assert(impl->init_logger_creation_s_.ok());
+ }
+ s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.GetWalDir());
+ if (s.ok()) {
+ std::vector<std::string> paths;
+ for (auto& db_path : impl->immutable_db_options_.db_paths) {
+ paths.emplace_back(db_path.path);
+ }
+ for (auto& cf : column_families) {
+ for (auto& cf_path : cf.options.cf_paths) {
+ paths.emplace_back(cf_path.path);
+ }
+ }
+ for (auto& path : paths) {
+ s = impl->env_->CreateDirIfMissing(path);
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ // For recovery from NoSpace() error, we can only handle
+ // the case where the database is stored in a single path
+ if (paths.size() <= 1) {
+ impl->error_handler_.EnableAutoRecovery();
+ }
+ }
+ if (s.ok()) {
+ s = impl->CreateArchivalDirectory();
+ }
+ if (!s.ok()) {
+ delete impl;
+ return s;
+ }
+
+ impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
+ RecoveryContext recovery_ctx;
+ impl->mutex_.Lock();
+
+ // Handles create_if_missing, error_if_exists
+ uint64_t recovered_seq(kMaxSequenceNumber);
+ s = impl->Recover(column_families, false, false, false, &recovered_seq,
+ &recovery_ctx);
+ if (s.ok()) {
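+ // Recovery succeeded: allocate a new file number and create the WAL
+ // that will receive writes made after this open.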
+ uint64_t new_log_number = impl->versions_->NewFileNumber();
+ log::Writer* new_log = nullptr;
+ const size_t preallocate_block_size =
+ impl->GetWalPreallocateBlockSize(max_write_buffer_size);
+ s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/,
+ preallocate_block_size, &new_log);
+ if (s.ok()) {
+ InstrumentedMutexLock wl(&impl->log_write_mutex_);
+ impl->logfile_number_ = new_log_number;
+ assert(new_log != nullptr);
+ assert(impl->logs_.empty());
+ impl->logs_.emplace_back(new_log_number, new_log);
+ }
+
+ if (s.ok()) {
+ impl->alive_log_files_.push_back(
+ DBImpl::LogFileNumberSize(impl->logfile_number_));
+ // In WritePrepared mode there could be gaps in the sequence numbers. This
+ // breaks the trick we use in kPointInTimeRecovery, which assumes that the
+ // first seq in the log right after the corrupted log is one larger than
+ // the last seq we read from the WALs. To keep this trick working, we add
+ // a dummy entry with the expected sequence to the first log right after
+ // recovery. In the non-WritePrepared case the new log after recovery
+ // could also be empty, and thus be missing the consecutive-seq hint that
+ // distinguishes middle-of-log corruption from a corrupted log left over
+ // after recovery. That case is also addressed by the dummy write.
+ if (recovered_seq != kMaxSequenceNumber) {
+ WriteBatch empty_batch;
+ WriteBatchInternal::SetSequence(&empty_batch, recovered_seq);
+ WriteOptions write_options;
+ uint64_t log_used, log_size;
+ log::Writer* log_writer = impl->logs_.back().writer;
+ LogFileNumberSize& log_file_number_size = impl->alive_log_files_.back();
+
+ assert(log_writer->get_log_number() == log_file_number_size.number);
+ impl->mutex_.AssertHeld();
+ s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size,
+ Env::IO_TOTAL, log_file_number_size);
+ if (s.ok()) {
+ // Need to fsync, otherwise it might get lost after a power reset.
+ s = impl->FlushWAL(false);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::Open::BeforeSyncWAL", /*arg=*/&s);
+ if (s.ok()) {
+ s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
+ }
+ }
+ }
+ }
+ }
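+ // Persist the version edits accumulated during recovery to the MANIFEST.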
+ if (s.ok()) {
+ s = impl->LogAndApplyForRecovery(recovery_ctx);
+ }
+
+ if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+ impl->mutex_.AssertHeld();
+ s = impl->InitPersistStatsColumnFamily();
+ }
+
+ if (s.ok()) {
+ // set column family handles
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (cfd != nullptr) {
+ handles->push_back(
+ new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ impl->NewThreadStatusCfInfo(cfd);
+ } else {
+ if (db_options.create_missing_column_families) {
+ // missing column family, create it
+ ColumnFamilyHandle* handle = nullptr;
+ impl->mutex_.Unlock();
+ s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
+ impl->mutex_.Lock();
+ if (s.ok()) {
+ handles->push_back(handle);
+ } else {
+ break;
+ }
+ } else {
+ s = Status::InvalidArgument("Column family not found", cf.name);
+ break;
+ }
+ }
+ }
+ }
+
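+ // Install a super version for each column family so subsequent reads see
+ // the recovered state, and schedule any flush/compaction work it implies.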
+ if (s.ok()) {
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ impl->InstallSuperVersionAndScheduleWork(
+ cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
+ }
+ sv_context.Clean();
+ }
+
+ if (s.ok() && impl->immutable_db_options_.persist_stats_to_disk) {
+ // try to read format version
+ s = impl->PersistentStatsProcessFormatVersion();
+ }
+
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ if (!cfd->mem()->IsSnapshotSupported()) {
+ impl->is_snapshot_supported_ = false;
+ }
+ if (cfd->ioptions()->merge_operator != nullptr &&
+ !cfd->mem()->IsMergeOperatorSupported()) {
+ s = Status::InvalidArgument(
+ "The memtable of column family %s does not support merge operator, "
+ "but its options.merge_operator is non-null",
+ cfd->GetName().c_str());
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT("DBImpl::Open:Opened");
+ Status persist_options_status;
+ if (s.ok()) {
+ // Persist RocksDB Options before scheduling the compaction.
+ // WriteOptionsFile() will release and re-acquire the mutex internally.
+ persist_options_status = impl->WriteOptionsFile(
+ false /*need_mutex_lock*/, false /*need_enter_write_thread*/);
+
+ *dbptr = impl;
+ impl->opened_successfully_ = true;
+ impl->DeleteObsoleteFiles();
+ TEST_SYNC_POINT("DBImpl::Open:AfterDeleteFiles");
+ impl->MaybeScheduleFlushOrCompaction();
+ } else {
+ persist_options_status.PermitUncheckedError();
+ }
+ impl->mutex_.Unlock();
+
+#ifndef ROCKSDB_LITE
+ auto sfm = static_cast<SstFileManagerImpl*>(
+ impl->immutable_db_options_.sst_file_manager.get());
+ if (s.ok() && sfm) {
+ // Set Statistics ptr for SstFileManager to dump the stats of
+ // DeleteScheduler.
+ sfm->SetStatisticsPtr(impl->immutable_db_options_.statistics);
+ ROCKS_LOG_INFO(impl->immutable_db_options_.info_log,
+ "SstFileManager instance %p", sfm);
+
+ // Notify SstFileManager about all sst files that already exist in
+ // db_paths[0] and cf_paths[0] when the DB is opened.
+
+ // SstFileManagerImpl needs to know the sizes of the files. For files whose
+ // size we already know (sst files that appear in the manifest -- typically
+ // the vast majority of all files), we'll pass the size to SstFileManager.
+ // For all other files, SstFileManager will query the size from the
+ // filesystem.
+
+ std::vector<ColumnFamilyMetaData> metadata;
+ impl->GetAllColumnFamilyMetaData(&metadata);
+
+ std::unordered_map<std::string, uint64_t> known_file_sizes;
+ for (const auto& md : metadata) {
+ for (const auto& lmd : md.levels) {
+ for (const auto& fmd : lmd.files) {
+ known_file_sizes[fmd.relative_filename] = fmd.size;
+ }
+ }
+ for (const auto& bmd : md.blob_files) {
+ std::string name = bmd.blob_file_name;
+ // The BlobMetaData.blob_file_name may start with "/".
+ if (!name.empty() && name[0] == '/') {
+ name = name.substr(1);
+ }
+ known_file_sizes[name] = bmd.blob_file_size;
+ }
+ }
+
+ std::vector<std::string> paths;
+ paths.emplace_back(impl->immutable_db_options_.db_paths[0].path);
+ for (auto& cf : column_families) {
+ if (!cf.options.cf_paths.empty()) {
+ paths.emplace_back(cf.options.cf_paths[0].path);
+ }
+ }
+ // Remove duplicate paths.
+ std::sort(paths.begin(), paths.end());
+ paths.erase(std::unique(paths.begin(), paths.end()), paths.end());
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ for (auto& path : paths) {
+ std::vector<std::string> existing_files;
+ impl->immutable_db_options_.fs
+ ->GetChildren(path, io_opts, &existing_files,
+ /*IODebugContext*=*/nullptr)
+ .PermitUncheckedError(); // TODO: What to do on error?
+ for (auto& file_name : existing_files) {
+ uint64_t file_number;
+ FileType file_type;
+ std::string file_path = path + "/" + file_name;
+ if (ParseFileName(file_name, &file_number, &file_type) &&
+ (file_type == kTableFile || file_type == kBlobFile)) {
+ // TODO: Check for errors from OnAddFile?
+ if (known_file_sizes.count(file_name)) {
+ // We're assuming that each sst file name exists in at most one of
+ // the paths.
+ sfm->OnAddFile(file_path, known_file_sizes.at(file_name))
+ .PermitUncheckedError();
+ } else {
+ sfm->OnAddFile(file_path).PermitUncheckedError();
+ }
+ }
+ }
+ }
+
+ // Reserve some disk buffer space. This is a heuristic: when we run out
+ // of disk space, it ensures that there is at least write_buffer_size
+ // worth of free space before we resume DB writes. In low-disk-space
+ // conditions, we want to avoid a lot of small L0 files due to frequent
+ // WAL write failures and the resulting forced flushes.
+ sfm->ReserveDiskBuffer(max_write_buffer_size,
+ impl->immutable_db_options_.db_paths[0].path);
+ }
+
+#endif // !ROCKSDB_LITE
+
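+ // Final steps: flush/sync any WAL data buffered during open, surface a
+ // failure to persist the options file, and start the periodic task
+ // scheduler and seqno-time worker.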
+ if (s.ok()) {
+ ROCKS_LOG_HEADER(impl->immutable_db_options_.info_log, "DB pointer %p",
+ impl);
+ LogFlush(impl->immutable_db_options_.info_log);
+ if (!impl->WALBufferIsEmpty()) {
+ s = impl->FlushWAL(false);
+ if (s.ok()) {
+ // Sync is needed; otherwise buffered WAL data might get lost after a
+ // power reset.
+ log::Writer* log_writer = impl->logs_.back().writer;
+ s = log_writer->file()->Sync(impl->immutable_db_options_.use_fsync);
+ }
+ }
+ if (s.ok() && !persist_options_status.ok()) {
+ s = Status::IOError(
+ "DB::Open() failed --- Unable to persist Options file",
+ persist_options_status.ToString());
+ }
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(impl->immutable_db_options_.info_log,
+ "DB::Open() failed: %s", s.ToString().c_str());
+ }
+ if (s.ok()) {
+ s = impl->StartPeriodicTaskScheduler();
+ }
+
+ if (s.ok()) {
+ s = impl->RegisterRecordSeqnoTimeWorker();
+ }
+ if (!s.ok()) {
+ for (auto* h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ *dbptr = nullptr;
+ }
+ return s;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.cc b/src/rocksdb/db/db_impl/db_impl_readonly.cc
new file mode 100644
index 000000000..0f10baf24
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.cc
@@ -0,0 +1,341 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_readonly.h"
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_impl/compacted_db_impl.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/merge_context.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
+ const std::string& dbname)
+ : DBImpl(db_options, dbname, /*seq_per_batch*/ false,
+ /*batch_per_txn*/ true, /*read_only*/ true) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Opening the db in read only mode");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplReadOnly::~DBImplReadOnly() {}
+
+// Implementations of the DB interface
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* pinnable_val) {
+ return Get(read_options, column_family, key, pinnable_val,
+ /*timestamp*/ nullptr);
+}
+
+Status DBImplReadOnly::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* pinnable_val,
+ std::string* timestamp) {
+ assert(pinnable_val != nullptr);
+ // TODO: stopwatch DB_GET needed?, perf timer needed?
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ assert(column_family);
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Clear the timestamp for the returned result so that we can distinguish
+ // between a tombstone and a key that has never been written
+ if (timestamp) {
+ timestamp->clear();
+ }
+
+ const Comparator* ucmp = column_family->GetComparator();
+ assert(ucmp);
+ std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr;
+
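+ // Serve the read at the last sequence number known to this read-only
+ // instance; there are no concurrent writes to coordinate with.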
+ Status s;
+ SequenceNumber snapshot = versions_->LastSequence();
+ GetWithTimestampReadCallback read_cb(snapshot);
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(column_family, key);
+ }
+ }
+ SuperVersion* super_version = cfd->GetSuperVersion();
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ LookupKey lkey(key, snapshot, read_options.timestamp);
+ PERF_TIMER_STOP(get_snapshot_time);
+ if (super_version->mem->Get(lkey, pinnable_val->GetSelf(),
+ /*columns=*/nullptr, ts, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options,
+ false /* immutable_memtable */, &read_cb)) {
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ PinnedIteratorsManager pinned_iters_mgr;
+ super_version->current->Get(
+ read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s,
+ &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
+ /*value_found*/ nullptr,
+ /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb,
+ /*is_blob*/ nullptr,
+ /*do_merge*/ true);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = pinnable_val->size();
+ RecordTick(stats_, BYTES_READ, size);
+ RecordInHistogram(stats_, BYTES_PER_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ return s;
+}
+
+Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ assert(column_family);
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ } else {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ }
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
+ SequenceNumber latest_snapshot = versions_->LastSequence();
+ SequenceNumber read_seq =
+ read_options.snapshot != nullptr
+ ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_
+ : latest_snapshot;
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ auto db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+ super_version->current, read_seq,
+ super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+ super_version->version_number, read_callback);
+ auto internal_iter = NewInternalIterator(
+ db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(),
+ read_seq, /* allow_unprepared_value */ true, db_iter);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ return db_iter;
+}
+
+Status DBImplReadOnly::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.timestamp) {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp),
+ /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ } else {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfCfHasTs(cf);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (iterators == nullptr) {
+ return Status::InvalidArgument("iterators not allowed to be nullptr");
+ }
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ SequenceNumber latest_snapshot = versions_->LastSequence();
+ SequenceNumber read_seq =
+ read_options.snapshot != nullptr
+ ? reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)
+ ->number_
+ : latest_snapshot;
+
+ for (auto cfh : column_families) {
+ auto* cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+ auto* sv = cfd->GetSuperVersion()->Ref();
+ auto* db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), sv->mutable_cf_options,
+ sv->current, read_seq,
+ sv->mutable_cf_options.max_sequential_skip_in_iterations,
+ sv->version_number, read_callback);
+ auto* internal_iter = NewInternalIterator(
+ db_iter->GetReadOptions(), cfd, sv, db_iter->GetArena(), read_seq,
+ /* allow_unprepared_value */ true, db_iter);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ iterators->push_back(db_iter);
+ }
+
+ return Status::OK();
+}
+
+namespace {
+// Return OK if dbname exists in the file system, or create it if
+// create_if_missing is set
+Status OpenForReadOnlyCheckExistence(const DBOptions& db_options,
+ const std::string& dbname) {
+ Status s;
+ if (!db_options.create_if_missing) {
+ // Attempt to read "CURRENT" file
+ const std::shared_ptr<FileSystem>& fs = db_options.env->GetFileSystem();
+ std::string manifest_path;
+ uint64_t manifest_file_number;
+ s = VersionSet::GetCurrentManifestPath(dbname, fs.get(), &manifest_path,
+ &manifest_file_number);
+ } else {
+ // Historic behavior that doesn't necessarily make sense
+ s = db_options.env->CreateDirIfMissing(dbname);
+ }
+ return s;
+}
+} // namespace
+
+Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
+ DB** dbptr, bool /*error_if_wal_file_exists*/) {
+ Status s = OpenForReadOnlyCheckExistence(options, dbname);
+ if (!s.ok()) {
+ return s;
+ }
+
+ *dbptr = nullptr;
+
+ // Try to first open DB as fully compacted DB
+ s = CompactedDBImpl::Open(options, dbname, dbptr);
+ if (s.ok()) {
+ return s;
+ }
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+
+ s = DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+ db_options, dbname, column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+ // We can delete the handle since DBImpl always holds a reference to
+ // the default column family
+ delete handles[0];
+ }
+ return s;
+}
+
+Status DB::OpenForReadOnly(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ bool error_if_wal_file_exists) {
+ // If dbname does not exist in the file system, should not do anything
+ Status s = OpenForReadOnlyCheckExistence(db_options, dbname);
+ if (!s.ok()) {
+ return s;
+ }
+
+ return DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+ db_options, dbname, column_families, handles, dbptr,
+ error_if_wal_file_exists);
+}
+
+Status DBImplReadOnly::OpenForReadOnlyWithoutCheck(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ bool error_if_wal_file_exists) {
+ *dbptr = nullptr;
+ handles->clear();
+
+ SuperVersionContext sv_context(/* create_superversion */ true);
+ DBImplReadOnly* impl = new DBImplReadOnly(db_options, dbname);
+ impl->mutex_.Lock();
+ Status s = impl->Recover(column_families, true /* read only */,
+ error_if_wal_file_exists);
+ if (s.ok()) {
+ // set column family handles
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (cfd == nullptr) {
+ s = Status::InvalidArgument("Column family not found", cf.name);
+ break;
+ }
+ handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ }
+ }
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ sv_context.NewSuperVersion();
+ cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+ }
+ }
+ impl->mutex_.Unlock();
+ sv_context.Clean();
+ if (s.ok()) {
+ *dbptr = impl;
+ for (auto* h : *handles) {
+ impl->NewThreadStatusCfInfo(
+ static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
+ }
+ } else {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ }
+ return s;
+}
+
+#else // !ROCKSDB_LITE
+
+Status DB::OpenForReadOnly(const Options& /*options*/,
+ const std::string& /*dbname*/, DB** /*dbptr*/,
+ bool /*error_if_wal_file_exists*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenForReadOnly(
+ const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/,
+ bool /*error_if_wal_file_exists*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_readonly.h b/src/rocksdb/db/db_impl/db_impl_readonly.h
new file mode 100644
index 000000000..b876a0fda
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_readonly.h
@@ -0,0 +1,170 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO: Share common structure with CompactedDBImpl and DBImplSecondary
+class DBImplReadOnly : public DBImpl {
+ public:
+ DBImplReadOnly(const DBOptions& options, const std::string& dbname);
+ // No copying allowed
+ DBImplReadOnly(const DBImplReadOnly&) = delete;
+ void operator=(const DBImplReadOnly&) = delete;
+
+ virtual ~DBImplReadOnly();
+
+ // Implementations of the DB interface
+ using DB::Get;
+ virtual Status Get(const ReadOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) override;
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value,
+ std::string* timestamp) override;
+
+ // TODO: Implement ReadOnly MultiGet?
+
+ using DBImpl::NewIterator;
+ virtual Iterator* NewIterator(const ReadOptions&,
+ ColumnFamilyHandle* column_family) override;
+
+ virtual Status NewIterators(
+ const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ using DBImpl::Put;
+ virtual Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::PutEntity;
+ Status PutEntity(const WriteOptions& /* options */,
+ ColumnFamilyHandle* /* column_family */,
+ const Slice& /* key */,
+ const WideColumns& /* columns */) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::Merge;
+ virtual Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/, const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::Delete;
+ virtual Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::SingleDelete;
+ virtual Status SingleDelete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ using DBImpl::CompactRange;
+ virtual Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/,
+ const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::CompactFiles;
+ virtual Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ virtual Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ virtual Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+ virtual Status GetLiveFiles(std::vector<std::string>& ret,
+ uint64_t* manifest_file_size,
+ bool /*flush_memtable*/) override {
+ return DBImpl::GetLiveFiles(ret, manifest_file_size,
+ false /* flush_memtable */);
+ }
+
+ using DBImpl::Flush;
+ virtual Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DBImpl::SyncWAL;
+ virtual Status SyncWAL() override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DB::IngestExternalFile;
+ virtual Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not supported operation in read only mode.");
+ }
+
+ // FIXME: some missing overrides for more "write" functions
+
+ protected:
+#ifndef ROCKSDB_LITE
+ Status FlushForGetLiveFiles() override {
+ // No-op for read-only DB
+ return Status::OK();
+ }
+#endif // !ROCKSDB_LITE
+
+ private:
+ // A "helper" function for DB::OpenForReadOnly without column families
+ // to reduce unnecessary I/O. It has the same functionality as
+ // DB::OpenForReadOnly with column families, but does not check the
+ // existence of dbname in the file system.
+ static Status OpenForReadOnlyWithoutCheck(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
+ bool error_if_wal_file_exists = false);
+ friend class DB;
+};
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.cc b/src/rocksdb/db/db_impl/db_impl_secondary.cc
new file mode 100644
index 000000000..5189d17d9
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.cc
@@ -0,0 +1,967 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_impl/db_impl_secondary.h"
+
+#include <cinttypes>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/merge_context.h"
+#include "logging/auto_roll_logger.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/configurable.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+DBImplSecondary::DBImplSecondary(const DBOptions& db_options,
+ const std::string& dbname,
+ std::string secondary_path)
+ : DBImpl(db_options, dbname, false, true, true),
+ secondary_path_(std::move(secondary_path)) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Opening the db in secondary mode");
+ LogFlush(immutable_db_options_.info_log);
+}
+
+DBImplSecondary::~DBImplSecondary() {}
+
+Status DBImplSecondary::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool /*readonly*/, bool /*error_if_wal_file_exists*/,
+ bool /*error_if_data_exists_in_wals*/, uint64_t*,
+ RecoveryContext* /*recovery_ctx*/) {
+ mutex_.AssertHeld();
+
+ JobContext job_context(0);
+ Status s;
+ s = static_cast<ReactiveVersionSet*>(versions_.get())
+ ->Recover(column_families, &manifest_reader_, &manifest_reporter_,
+ &manifest_reader_status_);
+ if (!s.ok()) {
+ if (manifest_reader_status_) {
+ manifest_reader_status_->PermitUncheckedError();
+ }
+ return s;
+ }
+ if (immutable_db_options_.paranoid_checks && s.ok()) {
+ s = CheckConsistency();
+ }
+ // Initialize max_total_in_memory_state_ before recovering the logs.
+ max_total_in_memory_state_ = 0;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
+ mutable_cf_options->max_write_buffer_number;
+ }
+ if (s.ok()) {
+ default_cf_handle_ = new ColumnFamilyHandleImpl(
+ versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
+ default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
+
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+ }
+
+ if (s.IsPathNotFound()) {
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Secondary tries to read WAL, but WAL file(s) have already "
+ "been purged by primary.");
+ s = Status::OK();
+ }
+ // TODO: update options_file_number_ needed?
+
+ job_context.Clean();
+ return s;
+}
+
+// Find new WALs and apply them, in order, to the secondary instance
+Status DBImplSecondary::FindAndRecoverLogFiles(
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context) {
+ assert(nullptr != cfds_changed);
+ assert(nullptr != job_context);
+ Status s;
+ std::vector<uint64_t> logs;
+ s = FindNewLogNumbers(&logs);
+ if (s.ok() && !logs.empty()) {
+ SequenceNumber next_sequence(kMaxSequenceNumber);
+ s = RecoverLogFiles(logs, &next_sequence, cfds_changed, job_context);
+ }
+ return s;
+}
+
+// List wal_dir, find all new WALs, and return their log numbers
+Status DBImplSecondary::FindNewLogNumbers(std::vector<uint64_t>* logs) {
+ assert(logs != nullptr);
+ std::vector<std::string> filenames;
+ Status s;
+ IOOptions io_opts;
+ io_opts.do_not_recurse = true;
+ s = immutable_db_options_.fs->GetChildren(immutable_db_options_.GetWalDir(),
+ io_opts, &filenames,
+ /*IODebugContext*=*/nullptr);
+ if (s.IsNotFound()) {
+ return Status::InvalidArgument("Failed to open wal_dir",
+ immutable_db_options_.GetWalDir());
+ } else if (!s.ok()) {
+ return s;
+ }
+
+ // if log_readers_ is non-empty, it means we have applied all logs with log
+ // numbers smaller than the smallest log in log_readers_, so there is no
+ // need to pass these logs to RecoverLogFiles
+ uint64_t log_number_min = 0;
+ if (!log_readers_.empty()) {
+ log_number_min = log_readers_.begin()->first;
+ }
+ for (size_t i = 0; i < filenames.size(); i++) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(filenames[i], &number, &type) && type == kWalFile &&
+ number >= log_number_min) {
+ logs->push_back(number);
+ }
+ }
+ // Recover logs in the order that they were generated
+ if (!logs->empty()) {
+ std::sort(logs->begin(), logs->end());
+ }
+ return s;
+}
+
+Status DBImplSecondary::MaybeInitLogReader(
+ uint64_t log_number, log::FragmentBufferedReader** log_reader) {
+ auto iter = log_readers_.find(log_number);
+ // make sure the log file is still present
+ if (iter == log_readers_.end() ||
+ iter->second->reader_->GetLogNumber() != log_number) {
+ // delete the obsolete log reader if the log number does not match
+ if (iter != log_readers_.end()) {
+ log_readers_.erase(iter);
+ }
+ // initialize log reader from log_number
+ // TODO: min_log_number_to_keep_2pc check needed?
+ // Open the log file
+ std::string fname =
+ LogFileName(immutable_db_options_.GetWalDir(), log_number);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Recovering log #%" PRIu64 " mode %d", log_number,
+ static_cast<int>(immutable_db_options_.wal_recovery_mode));
+
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ Status status = fs_->NewSequentialFile(
+ fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+ if (!status.ok()) {
+ *log_reader = nullptr;
+ return status;
+ }
+ file_reader.reset(new SequentialFileReader(
+ std::move(file), fname, immutable_db_options_.log_readahead_size,
+ io_tracer_));
+ }
+
+ // Create the log reader.
+ LogReaderContainer* log_reader_container = new LogReaderContainer(
+ env_, immutable_db_options_.info_log, std::move(fname),
+ std::move(file_reader), log_number);
+ log_readers_.insert(std::make_pair(
+ log_number, std::unique_ptr<LogReaderContainer>(log_reader_container)));
+ }
+ iter = log_readers_.find(log_number);
+ assert(iter != log_readers_.end());
+ *log_reader = iter->second->reader_;
+ return Status::OK();
+}
+
+// After manifest recovery, replay WALs and refresh log_readers_ if necessary
+// REQUIRES: log_numbers are sorted in ascending order
+Status DBImplSecondary::RecoverLogFiles(
+ const std::vector<uint64_t>& log_numbers, SequenceNumber* next_sequence,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context) {
+ assert(nullptr != cfds_changed);
+ assert(nullptr != job_context);
+ mutex_.AssertHeld();
+ Status status;
+ for (auto log_number : log_numbers) {
+ log::FragmentBufferedReader* reader = nullptr;
+ status = MaybeInitLogReader(log_number, &reader);
+ if (!status.ok()) {
+ return status;
+ }
+ assert(reader != nullptr);
+ }
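+ // Replay the WALs in order, inserting each write batch into the
+ // memtables of the column families it touches.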
+ for (auto log_number : log_numbers) {
+ auto it = log_readers_.find(log_number);
+ assert(it != log_readers_.end());
+ log::FragmentBufferedReader* reader = it->second->reader_;
+ Status* wal_read_status = it->second->status_;
+ assert(wal_read_status);
+ // Manually update the file number allocation counter in VersionSet.
+ versions_->MarkFileNumberUsed(log_number);
+
+ // Determine if we should tolerate incomplete records at the tail end of
+ // the log.
+ // Read all the records and add them to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+
+ while (reader->ReadRecord(&record, &scratch,
+ immutable_db_options_.wal_recovery_mode) &&
+ wal_read_status->ok() && status.ok()) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reader->GetReporter()->Corruption(
+ record.size(), Status::Corruption("log record too small"));
+ continue;
+ }
+ status = WriteBatchInternal::SetContents(&batch, record);
+ if (!status.ok()) {
+ break;
+ }
+ SequenceNumber seq_of_batch = WriteBatchInternal::Sequence(&batch);
+ std::vector<uint32_t> column_family_ids;
+ status = CollectColumnFamilyIdsFromWriteBatch(batch, &column_family_ids);
+ if (status.ok()) {
+ for (const auto id : column_family_ids) {
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(id);
+ if (cfd == nullptr) {
+ continue;
+ }
+ if (cfds_changed->count(cfd) == 0) {
+ cfds_changed->insert(cfd);
+ }
+ const std::vector<FileMetaData*>& l0_files =
+ cfd->current()->storage_info()->LevelFiles(0);
+ SequenceNumber seq =
+ l0_files.empty() ? 0 : l0_files.back()->fd.largest_seqno;
+ // If the write batch's sequence number is no larger than the largest
+ // sequence number persisted for this column family, then its data must
+ // reside in an SST that has already been added in the prior MANIFEST
+ // replay.
+ if (seq_of_batch <= seq) {
+ continue;
+ }
+ auto curr_log_num = std::numeric_limits<uint64_t>::max();
+ if (cfd_to_current_log_.count(cfd) > 0) {
+ curr_log_num = cfd_to_current_log_[cfd];
+ }
+ // If the active memtable contains records added by replaying an
+ // earlier WAL, then we need to seal the memtable, add it to the
+ // immutable memtable list and create a new active memtable.
+ if (!cfd->mem()->IsEmpty() &&
+ (curr_log_num == std::numeric_limits<uint64_t>::max() ||
+ curr_log_num != log_number)) {
+ const MutableCFOptions mutable_cf_options =
+ *cfd->GetLatestMutableCFOptions();
+ MemTable* new_mem =
+ cfd->ConstructNewMemtable(mutable_cf_options, seq_of_batch);
+ cfd->mem()->SetNextLogNumber(log_number);
+ cfd->mem()->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(cfd->mem(), &job_context->memtables_to_free);
+ new_mem->Ref();
+ cfd->SetMemtable(new_mem);
+ }
+ }
+ bool has_valid_writes = false;
+ status = WriteBatchInternal::InsertInto(
+ &batch, column_family_memtables_.get(),
+ nullptr /* flush_scheduler */, nullptr /* trim_history_scheduler*/,
+ true, log_number, this, false /* concurrent_memtable_writes */,
+ next_sequence, &has_valid_writes, seq_per_batch_, batch_per_txn_);
+ }
+ // If a column family was not found, it might mean that the WAL write
+ // batch references a column family that was dropped after the insert.
+ // We don't want to fail the whole write batch in that case -- we just
+ // ignore the update. That's why we set ignore missing column families
+ // to true. Passing a null flush_scheduler disables memtable flushing,
+ // which is needed for secondary instances.
+ if (status.ok()) {
+ for (const auto id : column_family_ids) {
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(id);
+ if (cfd == nullptr) {
+ continue;
+ }
+ std::unordered_map<ColumnFamilyData*, uint64_t>::iterator iter =
+ cfd_to_current_log_.find(cfd);
+ if (iter == cfd_to_current_log_.end()) {
+ cfd_to_current_log_.insert({cfd, log_number});
+ } else if (log_number > iter->second) {
+ iter->second = log_number;
+ }
+ }
+ auto last_sequence = *next_sequence - 1;
+ if ((*next_sequence != kMaxSequenceNumber) &&
+ (versions_->LastSequence() <= last_sequence)) {
+ versions_->SetLastAllocatedSequence(last_sequence);
+ versions_->SetLastPublishedSequence(last_sequence);
+ versions_->SetLastSequence(last_sequence);
+ }
+ } else {
+ // We are treating this as a failure while reading since we read valid
+ // blocks that do not form coherent data
+ reader->GetReporter()->Corruption(record.size(), status);
+ }
+ }
+ if (status.ok() && !wal_read_status->ok()) {
+ status = *wal_read_status;
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ }
+ // Remove log readers from the map after successfully recovering the WALs
+ if (log_readers_.size() > 1) {
+ auto erase_iter = log_readers_.begin();
+ std::advance(erase_iter, log_readers_.size() - 1);
+ log_readers_.erase(log_readers_.begin(), erase_iter);
+ }
+ return status;
+}
+
+// Implementation of the DB interface
+Status DBImplSecondary::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value) {
+ return GetImpl(read_options, column_family, key, value,
+ /*timestamp*/ nullptr);
+}
+
+Status DBImplSecondary::Get(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ PinnableSlice* value, std::string* timestamp) {
+ return GetImpl(read_options, column_family, key, value, timestamp);
+}
+
+Status DBImplSecondary::GetImpl(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* pinnable_val,
+ std::string* timestamp) {
+ assert(pinnable_val != nullptr);
+ PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock);
+ StopWatch sw(immutable_db_options_.clock, stats_, DB_GET);
+ PERF_TIMER_GUARD(get_snapshot_time);
+
+ assert(column_family);
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Clear the timestamp for the returned result so that we can later
+ // distinguish between a tombstone and a key that has never been written.
+ if (timestamp) {
+ timestamp->clear();
+ }
+
+ auto cfh = static_cast<ColumnFamilyHandleImpl*>(column_family);
+ ColumnFamilyData* cfd = cfh->cfd();
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_) {
+ tracer_->Get(column_family, key);
+ }
+ }
+ // Acquire SuperVersion
+ SuperVersion* super_version = GetAndRefSuperVersion(cfd);
+ SequenceNumber snapshot = versions_->LastSequence();
+ GetWithTimestampReadCallback read_cb(snapshot);
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ Status s;
+ LookupKey lkey(key, snapshot, read_options.timestamp);
+ PERF_TIMER_STOP(get_snapshot_time);
+
+ bool done = false;
+ const Comparator* ucmp = column_family->GetComparator();
+ assert(ucmp);
+ std::string* ts = ucmp->timestamp_size() > 0 ? timestamp : nullptr;
+ if (super_version->mem->Get(lkey, pinnable_val->GetSelf(),
+ /*columns=*/nullptr, ts, &s, &merge_context,
+ &max_covering_tombstone_seq, read_options,
+ false /* immutable_memtable */, &read_cb)) {
+ done = true;
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ } else if ((s.ok() || s.IsMergeInProgress()) &&
+ super_version->imm->Get(
+ lkey, pinnable_val->GetSelf(), /*columns=*/nullptr, ts, &s,
+ &merge_context, &max_covering_tombstone_seq, read_options,
+ &read_cb)) {
+ done = true;
+ pinnable_val->PinSelf();
+ RecordTick(stats_, MEMTABLE_HIT);
+ }
+ if (!done && !s.ok() && !s.IsMergeInProgress()) {
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ return s;
+ }
+ if (!done) {
+ PERF_TIMER_GUARD(get_from_output_files_time);
+ PinnedIteratorsManager pinned_iters_mgr;
+ super_version->current->Get(
+ read_options, lkey, pinnable_val, /*columns=*/nullptr, ts, &s,
+ &merge_context, &max_covering_tombstone_seq, &pinned_iters_mgr,
+ /*value_found*/ nullptr,
+ /*key_exists*/ nullptr, /*seq*/ nullptr, &read_cb, /*is_blob*/ nullptr,
+ /*do_merge*/ true);
+ RecordTick(stats_, MEMTABLE_MISS);
+ }
+ {
+ PERF_TIMER_GUARD(get_post_process_time);
+ ReturnAndCleanupSuperVersion(cfd, super_version);
+ RecordTick(stats_, NUMBER_KEYS_READ);
+ size_t size = pinnable_val->size();
+ RecordTick(stats_, BYTES_READ, size);
+ RecordTimeToHistogram(stats_, BYTES_PER_READ, size);
+ PERF_COUNTER_ADD(get_read_bytes, size);
+ }
+ return s;
+}
+
+Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family) {
+ if (read_options.managed) {
+ return NewErrorIterator(
+ Status::NotSupported("Managed iterator is not supported anymore."));
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return NewErrorIterator(Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators."));
+ }
+
+ assert(column_family);
+ if (read_options.timestamp) {
+ const Status s = FailIfTsMismatchCf(
+ column_family, *(read_options.timestamp), /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ } else {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return NewErrorIterator(s);
+ }
+ }
+
+ Iterator* result = nullptr;
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ auto cfd = cfh->cfd();
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (read_options.tailing) {
+ return NewErrorIterator(Status::NotSupported(
+ "tailing iterator not supported in secondary mode"));
+ } else if (read_options.snapshot != nullptr) {
+ // TODO (yanqin) support snapshot.
+ return NewErrorIterator(
+ Status::NotSupported("snapshot not supported in secondary mode"));
+ } else {
+ SequenceNumber snapshot(kMaxSequenceNumber);
+ result = NewIteratorImpl(read_options, cfd, snapshot, read_callback);
+ }
+ return result;
+}
+
+ArenaWrappedDBIter* DBImplSecondary::NewIteratorImpl(
+ const ReadOptions& read_options, ColumnFamilyData* cfd,
+ SequenceNumber snapshot, ReadCallback* read_callback,
+ bool expose_blob_index, bool allow_refresh) {
+ assert(nullptr != cfd);
+ SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
+ assert(snapshot == kMaxSequenceNumber);
+ snapshot = versions_->LastSequence();
+ assert(snapshot != kMaxSequenceNumber);
+ auto db_iter = NewArenaWrappedDbIterator(
+ env_, read_options, *cfd->ioptions(), super_version->mutable_cf_options,
+ super_version->current, snapshot,
+ super_version->mutable_cf_options.max_sequential_skip_in_iterations,
+ super_version->version_number, read_callback, this, cfd,
+ expose_blob_index, read_options.snapshot ? false : allow_refresh);
+ auto internal_iter = NewInternalIterator(
+ db_iter->GetReadOptions(), cfd, super_version, db_iter->GetArena(),
+ snapshot, /* allow_unprepared_value */ true, db_iter);
+ db_iter->SetIterUnderDBIter(internal_iter);
+ return db_iter;
+}
+
+Status DBImplSecondary::NewIterators(
+ const ReadOptions& read_options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) {
+ if (read_options.managed) {
+ return Status::NotSupported("Managed iterator is not supported anymore.");
+ }
+ if (read_options.read_tier == kPersistedTier) {
+ return Status::NotSupported(
+ "ReadTier::kPersistedData is not yet supported in iterators.");
+ }
+ ReadCallback* read_callback = nullptr; // No read callback provided.
+ if (iterators == nullptr) {
+ return Status::InvalidArgument("iterators not allowed to be nullptr");
+ }
+
+ if (read_options.timestamp) {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfTsMismatchCf(cf, *(read_options.timestamp),
+ /*ts_for_read=*/true);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ } else {
+ for (auto* cf : column_families) {
+ assert(cf);
+ const Status s = FailIfCfHasTs(cf);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+ iterators->clear();
+ iterators->reserve(column_families.size());
+ if (read_options.tailing) {
+ return Status::NotSupported(
+ "tailing iterator not supported in secondary mode");
+ } else if (read_options.snapshot != nullptr) {
+ // TODO (yanqin) support snapshot.
+ return Status::NotSupported("snapshot not supported in secondary mode");
+ } else {
+ SequenceNumber read_seq(kMaxSequenceNumber);
+ for (auto cfh : column_families) {
+ ColumnFamilyData* cfd = static_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
+ iterators->push_back(
+ NewIteratorImpl(read_options, cfd, read_seq, read_callback));
+ }
+ }
+ return Status::OK();
+}
+
+Status DBImplSecondary::CheckConsistency() {
+ mutex_.AssertHeld();
+ Status s = DBImpl::CheckConsistency();
+ // If DBImpl::CheckConsistency(), which is stricter, returns success, then
+ // we do not need to give it a second chance.
+ if (s.ok()) {
+ return s;
+ }
+ // It's possible for DBImpl::CheckConsistency() to fail because the primary
+ // may have removed certain files, causing the GetFileSize(name) call to
+ // fail and return PathNotFound. In this case, we take a best-effort
+ // approach and just proceed.
+ TEST_SYNC_POINT_CALLBACK(
+ "DBImplSecondary::CheckConsistency:AfterFirstAttempt", &s);
+
+ if (immutable_db_options_.skip_checking_sst_file_sizes_on_db_open) {
+ return Status::OK();
+ }
+
+ std::vector<LiveFileMetaData> metadata;
+ versions_->GetLiveFilesMetaData(&metadata);
+
+ std::string corruption_messages;
+ for (const auto& md : metadata) {
+ // md.name has a leading "/".
+ std::string file_path = md.db_path + md.name;
+
+ uint64_t fsize = 0;
+ s = env_->GetFileSize(file_path, &fsize);
+ if (!s.ok() &&
+ (env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok() ||
+ s.IsPathNotFound())) {
+ s = Status::OK();
+ }
+ if (!s.ok()) {
+ corruption_messages +=
+ "Can't access " + md.name + ": " + s.ToString() + "\n";
+ }
+ }
+ return corruption_messages.empty() ? Status::OK()
+ : Status::Corruption(corruption_messages);
+}
+
+Status DBImplSecondary::TryCatchUpWithPrimary() {
+ assert(versions_.get() != nullptr);
+ assert(manifest_reader_.get() != nullptr);
+ Status s;
+ // read the manifest and apply new changes to the secondary instance
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ JobContext job_context(0, true /*create_superversion*/);
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
+ ->ReadAndApply(&mutex_, &manifest_reader_,
+ manifest_reader_status_.get(), &cfds_changed);
+
+ ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
+ static_cast<uint64_t>(versions_->LastSequence()));
+ for (ColumnFamilyData* cfd : cfds_changed) {
+ if (cfd->IsDropped()) {
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
+ cfd->GetName().c_str());
+ continue;
+ }
+ VersionStorageInfo::LevelSummaryStorage tmp;
+ ROCKS_LOG_DEBUG(immutable_db_options_.info_log,
+ "[%s] Level summary: %s\n", cfd->GetName().c_str(),
+ cfd->current()->storage_info()->LevelSummary(&tmp));
+ }
+
+ // list wal_dir to discover new WALs and apply new changes to the secondary
+ // instance
+ if (s.ok()) {
+ s = FindAndRecoverLogFiles(&cfds_changed, &job_context);
+ }
+ if (s.IsPathNotFound()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Secondary tries to read WAL, but WAL file(s) have already "
+ "been purged by primary.");
+ s = Status::OK();
+ }
+ if (s.ok()) {
+ for (auto cfd : cfds_changed) {
+ cfd->imm()->RemoveOldMemTables(cfd->GetLogNumber(),
+ &job_context.memtables_to_free);
+ auto& sv_context = job_context.superversion_contexts.back();
+ cfd->InstallSuperVersion(&sv_context, &mutex_);
+ sv_context.NewSuperVersion();
+ }
+ }
+ }
+ job_context.Clean();
+
+ // Cleanup unused, obsolete files.
+ JobContext purge_files_job_context(0);
+ {
+ InstrumentedMutexLock lock_guard(&mutex_);
+ // Currently, the secondary instance does not own the database files, so
+ // it is unnecessary for the secondary to force a full scan.
+ FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
+ }
+ if (purge_files_job_context.HaveSomethingToDelete()) {
+ PurgeObsoleteFiles(purge_files_job_context);
+ }
+ purge_files_job_context.Clean();
+ return s;
+}
+
+Status DB::OpenAsSecondary(const Options& options, const std::string& dbname,
+ const std::string& secondary_path, DB** dbptr) {
+ *dbptr = nullptr;
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options);
+ std::vector<ColumnFamilyHandle*> handles;
+
+ Status s = DB::OpenAsSecondary(db_options, dbname, secondary_path,
+ column_families, &handles, dbptr);
+ if (s.ok()) {
+ assert(handles.size() == 1);
+ delete handles[0];
+ }
+ return s;
+}
+
+Status DB::OpenAsSecondary(
+ const DBOptions& db_options, const std::string& dbname,
+ const std::string& secondary_path,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
+ *dbptr = nullptr;
+
+ DBOptions tmp_opts(db_options);
+ Status s;
+ if (nullptr == tmp_opts.info_log) {
+ s = CreateLoggerFromOptions(secondary_path, tmp_opts, &tmp_opts.info_log);
+ if (!s.ok()) {
+ tmp_opts.info_log = nullptr;
+ return s;
+ }
+ }
+
+ assert(tmp_opts.info_log != nullptr);
+ if (db_options.max_open_files != -1) {
+ std::ostringstream oss;
+ oss << "The primary instance may delete all types of files after they "
+ "become obsolete. The application can coordinate the primary and "
+ "secondary so that primary does not delete/rename files that are "
+ "currently being used by the secondary. Alternatively, a custom "
+ "Env/FS can be provided such that files become inaccessible only "
+ "after all primary and secondaries indicate that they are obsolete "
+ "and deleted. If the above two are not possible, you can open the "
+ "secondary instance with `max_open_files==-1` so that secondary "
+ "will eagerly keep all table files open. Even if a file is deleted, "
+ "its content can still be accessed via a prior open file "
+ "descriptor. This is a hacky workaround for only table files. If "
+ "none of the above is done, then point lookup or "
+ "range scan via the secondary instance can result in IOError: file "
+ "not found. This can be resolved by retrying "
+ "TryCatchUpWithPrimary().";
+ ROCKS_LOG_WARN(tmp_opts.info_log, "%s", oss.str().c_str());
+ }
+
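+ // The secondary uses a ReactiveVersionSet, which tails the primary's
+ // MANIFEST instead of owning it.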
+ handles->clear();
+ DBImplSecondary* impl = new DBImplSecondary(tmp_opts, dbname, secondary_path);
+ impl->versions_.reset(new ReactiveVersionSet(
+ dbname, &impl->immutable_db_options_, impl->file_options_,
+ impl->table_cache_.get(), impl->write_buffer_manager_,
+ &impl->write_controller_, impl->io_tracer_));
+ impl->column_family_memtables_.reset(
+ new ColumnFamilyMemTablesImpl(impl->versions_->GetColumnFamilySet()));
+ impl->wal_in_db_path_ = impl->immutable_db_options_.IsWalDirSameAsDBPath();
+
+ impl->mutex_.Lock();
+ s = impl->Recover(column_families, true, false, false);
+ if (s.ok()) {
+ for (auto cf : column_families) {
+ auto cfd =
+ impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
+ if (nullptr == cfd) {
+ s = Status::InvalidArgument("Column family not found", cf.name);
+ break;
+ }
+ handles->push_back(new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
+ }
+ }
+ SuperVersionContext sv_context(true /* create_superversion */);
+ if (s.ok()) {
+ for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
+ sv_context.NewSuperVersion();
+ cfd->InstallSuperVersion(&sv_context, &impl->mutex_);
+ }
+ }
+ impl->mutex_.Unlock();
+ sv_context.Clean();
+ if (s.ok()) {
+ *dbptr = impl;
+ for (auto h : *handles) {
+ impl->NewThreadStatusCfInfo(
+ static_cast_with_check<ColumnFamilyHandleImpl>(h)->cfd());
+ }
+ } else {
+ for (auto h : *handles) {
+ delete h;
+ }
+ handles->clear();
+ delete impl;
+ }
+ return s;
+}
+
+Status DBImplSecondary::CompactWithoutInstallation(
+ const OpenAndCompactOptions& options, ColumnFamilyHandle* cfh,
+ const CompactionServiceInput& input, CompactionServiceResult* result) {
+ if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ InstrumentedMutexLock l(&mutex_);
+ auto cfd = static_cast_with_check<ColumnFamilyHandleImpl>(cfh)->cfd();
+ if (!cfd) {
+ return Status::InvalidArgument("Cannot find column family " +
+ cfh->GetName());
+ }
+
+ std::unordered_set<uint64_t> input_set;
+ for (const auto& file_name : input.input_files) {
+ input_set.insert(TableFileNameToNumber(file_name));
+ }
+
+ auto* version = cfd->current();
+
+ ColumnFamilyMetaData cf_meta;
+ version->GetColumnFamilyMetaData(&cf_meta);
+
+ const MutableCFOptions* mutable_cf_options = cfd->GetLatestMutableCFOptions();
+ ColumnFamilyOptions cf_options = cfd->GetLatestCFOptions();
+ VersionStorageInfo* vstorage = version->storage_info();
+
+ // Use comp_options to reuse some CompactFiles functions
+ CompactionOptions comp_options;
+ comp_options.compression = kDisableCompressionOption;
+ comp_options.output_file_size_limit = MaxFileSizeForLevel(
+ *mutable_cf_options, input.output_level, cf_options.compaction_style,
+ vstorage->base_level(), cf_options.level_compaction_dynamic_level_bytes);
+
+ std::vector<CompactionInputFiles> input_files;
+ Status s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers(
+ &input_files, &input_set, vstorage, comp_options);
+ if (!s.ok()) {
+ return s;
+ }
+
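+ // Assemble a Compaction over the requested input files; its output will be
+ // written under secondary_path_ and described in `result` without being
+ // installed into this instance's MANIFEST.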
+ std::unique_ptr<Compaction> c;
+ assert(cfd->compaction_picker());
+ c.reset(cfd->compaction_picker()->CompactFiles(
+ comp_options, input_files, input.output_level, vstorage,
+ *mutable_cf_options, mutable_db_options_, 0));
+ assert(c != nullptr);
+
+ c->SetInputVersion(version);
+
+ // Create the output directory if it does not exist yet
+ std::unique_ptr<FSDirectory> output_dir;
+ s = CreateAndNewDirectory(fs_.get(), secondary_path_, &output_dir);
+ if (!s.ok()) {
+ return s;
+ }
+
+ LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
+ immutable_db_options_.info_log.get());
+
+ const int job_id = next_job_id_.fetch_add(1);
+
+ // Use the primary host's db_id for running the compaction, but use the
+ // local db_session_id, to make sure the unique id differs from those of
+ // the remote compactors. The id is generated from db_id, db_session_id
+ // and orig_file_number; unlike local compaction, remote compaction cannot
+ // guarantee the uniqueness of orig_file_number, because the file number
+ // is only assigned when the compaction is done.
+ CompactionServiceCompactionJob compaction_job(
+ job_id, c.get(), immutable_db_options_, mutable_db_options_,
+ file_options_for_compaction_, versions_.get(), &shutting_down_,
+ &log_buffer, output_dir.get(), stats_, &mutex_, &error_handler_,
+ input.snapshots, table_cache_, &event_logger_, dbname_, io_tracer_,
+ options.canceled ? *options.canceled : kManualCompactionCanceledFalse_,
+ input.db_id, db_session_id_, secondary_path_, input, result);
+
+ mutex_.Unlock();
+ s = compaction_job.Run();
+ mutex_.Lock();
+
+ // clean up
+ compaction_job.io_status().PermitUncheckedError();
+ compaction_job.CleanupCompaction();
+ c->ReleaseCompactionFiles(s);
+ c.reset();
+
+ TEST_SYNC_POINT_CALLBACK("DBImplSecondary::CompactWithoutInstallation::End",
+ &s);
+ result->status = s;
+ return s;
+}
+
+Status DB::OpenAndCompact(
+ const OpenAndCompactOptions& options, const std::string& name,
+ const std::string& output_directory, const std::string& input,
+ std::string* output,
+ const CompactionServiceOptionsOverride& override_options) {
+ if (options.canceled && options.canceled->load(std::memory_order_acquire)) {
+ return Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+ }
+ CompactionServiceInput compaction_input;
+ Status s = CompactionServiceInput::Read(input, &compaction_input);
+ if (!s.ok()) {
+ return s;
+ }
+
+ compaction_input.db_options.max_open_files = -1;
+ compaction_input.db_options.compaction_service = nullptr;
+ if (compaction_input.db_options.statistics) {
+ compaction_input.db_options.statistics.reset();
+ }
+ compaction_input.db_options.env = override_options.env;
+ compaction_input.db_options.file_checksum_gen_factory =
+ override_options.file_checksum_gen_factory;
+ compaction_input.db_options.statistics = override_options.statistics;
+ compaction_input.column_family.options.comparator =
+ override_options.comparator;
+ compaction_input.column_family.options.merge_operator =
+ override_options.merge_operator;
+ compaction_input.column_family.options.compaction_filter =
+ override_options.compaction_filter;
+ compaction_input.column_family.options.compaction_filter_factory =
+ override_options.compaction_filter_factory;
+ compaction_input.column_family.options.prefix_extractor =
+ override_options.prefix_extractor;
+ compaction_input.column_family.options.table_factory =
+ override_options.table_factory;
+ compaction_input.column_family.options.sst_partitioner_factory =
+ override_options.sst_partitioner_factory;
+ compaction_input.column_family.options.table_properties_collector_factories =
+ override_options.table_properties_collector_factories;
+ compaction_input.db_options.listeners = override_options.listeners;
+
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(compaction_input.column_family);
+  // TODO: we have to open the default CF because of an implementation
+  // limitation; currently we just reuse the CF options from the input, which
+  // are not correct for the default CF, so the open may fail.
+ if (compaction_input.column_family.name != kDefaultColumnFamilyName) {
+ column_families.emplace_back(kDefaultColumnFamilyName,
+ compaction_input.column_family.options);
+ }
+
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+
+ s = DB::OpenAsSecondary(compaction_input.db_options, name, output_directory,
+ column_families, &handles, &db);
+ if (!s.ok()) {
+ return s;
+ }
+
+ CompactionServiceResult compaction_result;
+ DBImplSecondary* db_secondary = static_cast_with_check<DBImplSecondary>(db);
+ assert(handles.size() > 0);
+ s = db_secondary->CompactWithoutInstallation(
+ options, handles[0], compaction_input, &compaction_result);
+
+ Status serialization_status = compaction_result.Write(output);
+
+ for (auto& handle : handles) {
+ delete handle;
+ }
+ delete db;
+ if (s.ok()) {
+ return serialization_status;
+ }
+ return s;
+}
+
+Status DB::OpenAndCompact(
+ const std::string& name, const std::string& output_directory,
+ const std::string& input, std::string* output,
+ const CompactionServiceOptionsOverride& override_options) {
+ return OpenAndCompact(OpenAndCompactOptions(), name, output_directory, input,
+ output, override_options);
+}
+
+#else // !ROCKSDB_LITE
+
+Status DB::OpenAsSecondary(const Options& /*options*/,
+ const std::string& /*name*/,
+ const std::string& /*secondary_path*/,
+ DB** /*dbptr*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+
+Status DB::OpenAsSecondary(
+ const DBOptions& /*db_options*/, const std::string& /*dbname*/,
+ const std::string& /*secondary_path*/,
+ const std::vector<ColumnFamilyDescriptor>& /*column_families*/,
+ std::vector<ColumnFamilyHandle*>* /*handles*/, DB** /*dbptr*/) {
+ return Status::NotSupported("Not supported in ROCKSDB_LITE.");
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_impl/db_impl_secondary.h b/src/rocksdb/db/db_impl/db_impl_secondary.h
new file mode 100644
index 000000000..eb9361875
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_secondary.h
@@ -0,0 +1,410 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A wrapper class to hold the log reader, log reporter and log status.
+class LogReaderContainer {
+ public:
+ LogReaderContainer()
+ : reader_(nullptr), reporter_(nullptr), status_(nullptr) {}
+ LogReaderContainer(Env* env, std::shared_ptr<Logger> info_log,
+ std::string fname,
+ std::unique_ptr<SequentialFileReader>&& file_reader,
+ uint64_t log_number) {
+ LogReporter* reporter = new LogReporter();
+ status_ = new Status();
+ reporter->env = env;
+ reporter->info_log = info_log.get();
+ reporter->fname = std::move(fname);
+ reporter->status = status_;
+ reporter_ = reporter;
+    // We intentionally make log::Reader do checksumming even if
+ // paranoid_checks==false so that corruptions cause entire commits
+ // to be skipped instead of propagating bad information (like overly
+ // large sequence numbers).
+ reader_ = new log::FragmentBufferedReader(info_log, std::move(file_reader),
+ reporter, true /*checksum*/,
+ log_number);
+ }
+ log::FragmentBufferedReader* reader_;
+ log::Reader::Reporter* reporter_;
+ Status* status_;
+ ~LogReaderContainer() {
+ delete reader_;
+ delete reporter_;
+ delete status_;
+ }
+
+ private:
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ std::string fname;
+ Status* status; // nullptr if immutable_db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "%s%s: dropping %d bytes; %s",
+ (this->status == nullptr ? "(ignoring error) " : ""),
+ fname.c_str(), static_cast<int>(bytes),
+ s.ToString().c_str());
+ if (this->status != nullptr && this->status->ok()) {
+ *this->status = s;
+ }
+ }
+ };
+};
+
+// The secondary instance shares access to the same storage as the primary.
+// The secondary is able to read and replay changes described in both the
+// MANIFEST and the WAL files without coordination with the primary.
+// The secondary instance can be opened using `DB::OpenAsSecondary`. After
+// that, it can call `DBImplSecondary::TryCatchUpWithPrimary` to make
+// best-effort attempts to catch up with the primary.
+// TODO: Share common structure with CompactedDBImpl and DBImplReadOnly
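+//
+// A minimal, illustrative usage sketch (hypothetical paths, error handling
+// elided; not part of the API surface of this file):
+//
+//   DB* db = nullptr;
+//   Options options;
+//   options.max_open_files = -1;  // optional workaround, see Get()/NewIterator()
+//   Status s = DB::OpenAsSecondary(options, "/path/to/primary_db",
+//                                  "/path/to/secondary_path", &db);
+//   if (s.ok()) {
+//     s = db->TryCatchUpWithPrimary();  // replay newly written MANIFEST/WAL
+//   }
+//   // ... serve reads via db->Get(...) / db->NewIterator(...) ...
+//   delete db;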
+class DBImplSecondary : public DBImpl {
+ public:
+ DBImplSecondary(const DBOptions& options, const std::string& dbname,
+ std::string secondary_path);
+ ~DBImplSecondary() override;
+
+ // Recover by replaying MANIFEST and WAL. Also initialize manifest_reader_
+ // and log_readers_ to facilitate future operations.
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only, bool error_if_wal_file_exists,
+ bool error_if_data_exists_in_wals, uint64_t* = nullptr,
+ RecoveryContext* recovery_ctx = nullptr) override;
+
+ // Implementations of the DB interface.
+ using DB::Get;
+  // Can return IOError due to files being deleted by the primary. To avoid
+  // IOError in this case, the application can coordinate between the primary
+  // and the secondaries so that the primary will not delete files that are
+  // currently being used by the secondaries. The application can also provide
+  // a custom FS/Env implementation so that files will remain present until
+  // all primary and secondaries indicate that they can be deleted. As a
+  // partial hacky workaround, the secondaries can be opened with
+  // `max_open_files=-1` so that they eagerly keep all table files open and
+  // are able to access the contents of deleted files via previously open fds.
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value) override;
+
+ Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value,
+ std::string* timestamp) override;
+
+ Status GetImpl(const ReadOptions& options, ColumnFamilyHandle* column_family,
+ const Slice& key, PinnableSlice* value,
+ std::string* timestamp);
+
+ using DBImpl::NewIterator;
+  // Operations on the created iterators can return IOError due to files being
+  // deleted by the primary. To avoid IOError in this case, the application
+  // can coordinate between the primary and the secondaries so that the
+  // primary will not delete files that are currently being used by the
+  // secondaries. The application can also provide a custom FS/Env
+  // implementation so that files will remain present until all primary and
+  // secondaries indicate that they can be deleted. As a partial hacky
+  // workaround, the secondaries can be opened with `max_open_files=-1` so
+  // that they eagerly keep all table files open and are able to access the
+  // contents of deleted files via previously open fds.
+ Iterator* NewIterator(const ReadOptions&,
+ ColumnFamilyHandle* column_family) override;
+
+ ArenaWrappedDBIter* NewIteratorImpl(const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SequenceNumber snapshot,
+ ReadCallback* read_callback,
+ bool expose_blob_index = false,
+ bool allow_refresh = true);
+
+ Status NewIterators(const ReadOptions& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ std::vector<Iterator*>* iterators) override;
+
+ using DBImpl::Put;
+ Status Put(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::PutEntity;
+ Status PutEntity(const WriteOptions& /* options */,
+ ColumnFamilyHandle* /* column_family */,
+ const Slice& /* key */,
+ const WideColumns& /* columns */) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Merge;
+ Status Merge(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Delete;
+ Status Delete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SingleDelete;
+ Status SingleDelete(const WriteOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*key*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status Write(const WriteOptions& /*options*/,
+ WriteBatch* /*updates*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::CompactRange;
+ Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/, const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::CompactFiles;
+ Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status DisableFileDeletions() override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status EnableFileDeletions(bool /*force*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ Status GetLiveFiles(std::vector<std::string>&,
+ uint64_t* /*manifest_file_size*/,
+ bool /*flush_memtable*/ = true) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::Flush;
+ Status Flush(const FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SetDBOptions;
+ Status SetDBOptions(const std::unordered_map<std::string, std::string>&
+ /*options_map*/) override {
+ // Currently not supported because changing certain options may cause
+ // flush/compaction.
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SetOptions;
+ Status SetOptions(
+ ColumnFamilyHandle* /*cfd*/,
+ const std::unordered_map<std::string, std::string>& /*options_map*/)
+ override {
+ // Currently not supported because changing certain options may cause
+ // flush/compaction and/or write to MANIFEST.
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DBImpl::SyncWAL;
+ Status SyncWAL() override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+ using DB::IngestExternalFile;
+ Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*ingestion_options*/) override {
+ return Status::NotSupported("Not supported operation in secondary mode.");
+ }
+
+  // Try to catch up with the primary by reading as much as possible from the
+  // log files until there is nothing more to read or an error is encountered.
+  // If the amount of information in the log files to process is huge, this
+  // method can take a long time due to all the I/O and CPU costs.
+ Status TryCatchUpWithPrimary() override;
+
+  // Try to find the log reader for log_number in the log_readers_ map;
+  // initialize one if it doesn't exist.
+ Status MaybeInitLogReader(uint64_t log_number,
+ log::FragmentBufferedReader** log_reader);
+
+  // Check that all live files exist on the file system and that their file
+  // sizes match the in-memory records. It is possible that some live files
+  // may have been deleted by the primary. In this case, CheckConsistency()
+  // does not flag the missing file as an inconsistency.
+ Status CheckConsistency() override;
+
+#ifndef NDEBUG
+ Status TEST_CompactWithoutInstallation(const OpenAndCompactOptions& options,
+ ColumnFamilyHandle* cfh,
+ const CompactionServiceInput& input,
+ CompactionServiceResult* result) {
+ return CompactWithoutInstallation(options, cfh, input, result);
+ }
+#endif // NDEBUG
+
+ protected:
+#ifndef ROCKSDB_LITE
+ Status FlushForGetLiveFiles() override {
+ // No-op for read-only DB
+ return Status::OK();
+ }
+#endif // !ROCKSDB_LITE
+
+ // ColumnFamilyCollector is a write batch handler which does nothing
+ // except recording unique column family IDs
+ class ColumnFamilyCollector : public WriteBatch::Handler {
+ std::unordered_set<uint32_t> column_family_ids_;
+
+ Status AddColumnFamilyId(uint32_t column_family_id) {
+ if (column_family_ids_.find(column_family_id) ==
+ column_family_ids_.end()) {
+ column_family_ids_.insert(column_family_id);
+ }
+ return Status::OK();
+ }
+
+ public:
+ explicit ColumnFamilyCollector() {}
+
+ ~ColumnFamilyCollector() override {}
+
+ Status PutCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status DeleteCF(uint32_t column_family_id, const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status SingleDeleteCF(uint32_t column_family_id, const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status MergeCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status PutBlobIndexCF(uint32_t column_family_id, const Slice&,
+ const Slice&) override {
+ return AddColumnFamilyId(column_family_id);
+ }
+
+ Status MarkBeginPrepare(bool) override { return Status::OK(); }
+
+ Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+
+ Status MarkRollback(const Slice&) override { return Status::OK(); }
+
+ Status MarkCommit(const Slice&) override { return Status::OK(); }
+
+ Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+ return Status::OK();
+ }
+
+ Status MarkNoop(bool) override { return Status::OK(); }
+
+ const std::unordered_set<uint32_t>& column_families() const {
+ return column_family_ids_;
+ }
+ };
+
+ Status CollectColumnFamilyIdsFromWriteBatch(
+ const WriteBatch& batch, std::vector<uint32_t>* column_family_ids) {
+ assert(column_family_ids != nullptr);
+ column_family_ids->clear();
+ ColumnFamilyCollector handler;
+ Status s = batch.Iterate(&handler);
+ if (s.ok()) {
+ for (const auto& cf : handler.column_families()) {
+ column_family_ids->push_back(cf);
+ }
+ }
+ return s;
+ }
+
+ bool OwnTablesAndLogs() const override {
+ // Currently, the secondary instance does not own the database files. It
+ // simply opens the files of the primary instance and tracks their file
+ // descriptors until they become obsolete. In the future, the secondary may
+ // create links to database files. OwnTablesAndLogs will return true then.
+ return false;
+ }
+
+ private:
+ friend class DB;
+
+ // No copying allowed
+ DBImplSecondary(const DBImplSecondary&);
+ void operator=(const DBImplSecondary&);
+
+ using DBImpl::Recover;
+
+ Status FindAndRecoverLogFiles(
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context);
+ Status FindNewLogNumbers(std::vector<uint64_t>* logs);
+ // After manifest recovery, replay WALs and refresh log_readers_ if necessary
+ // REQUIRES: log_numbers are sorted in ascending order
+ Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
+ SequenceNumber* next_sequence,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ JobContext* job_context);
+
+  // Run a compaction without installation; the output files will be placed in
+  // the secondary DB path. The LSM tree won't be changed and the secondary DB
+  // stays in read-only mode.
+ Status CompactWithoutInstallation(const OpenAndCompactOptions& options,
+ ColumnFamilyHandle* cfh,
+ const CompactionServiceInput& input,
+ CompactionServiceResult* result);
+
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader_;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter_;
+ std::unique_ptr<Status> manifest_reader_status_;
+
+  // Cache log readers for each log number, used to continue WAL replay
+  // after recovery
+ std::map<uint64_t, std::unique_ptr<LogReaderContainer>> log_readers_;
+
+ // Current WAL number replayed for each column family.
+ std::unordered_map<ColumnFamilyData*, uint64_t> cfd_to_current_log_;
+
+ const std::string secondary_path_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/db_impl/db_impl_write.cc b/src/rocksdb/db/db_impl/db_impl_write.cc
new file mode 100644
index 000000000..a597c168d
--- /dev/null
+++ b/src/rocksdb/db/db_impl/db_impl_write.cc
@@ -0,0 +1,2435 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <cinttypes>
+
+#include "db/db_impl/db_impl.h"
+#include "db/error_handler.h"
+#include "db/event_helpers.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "options/options_helper.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+// Convenience methods
+Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& val) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::Put(o, column_family, key, val);
+}
+
+Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& val) {
+ const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::Put(o, column_family, key, ts, val);
+}
+
+Status DBImpl::PutEntity(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const WideColumns& columns) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+
+ return DB::PutEntity(options, column_family, key, columns);
+}
+
+Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& val) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
+ if (!cfh->cfd()->ioptions()->merge_operator) {
+ return Status::NotSupported("Provide a merge_operator when opening DB");
+ } else {
+ return DB::Merge(o, column_family, key, val);
+ }
+}
+
+Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& val) {
+ const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::Merge(o, column_family, key, ts, val);
+}
+
+Status DBImpl::Delete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::Delete(write_options, column_family, key);
+}
+
+Status DBImpl::Delete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) {
+ const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::Delete(write_options, column_family, key, ts);
+}
+
+Status DBImpl::SingleDelete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::SingleDelete(write_options, column_family, key);
+}
+
+Status DBImpl::SingleDelete(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) {
+ const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::SingleDelete(write_options, column_family, key, ts);
+}
+
+Status DBImpl::DeleteRange(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ const Status s = FailIfCfHasTs(column_family);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::DeleteRange(write_options, column_family, begin_key, end_key);
+}
+
+Status DBImpl::DeleteRange(const WriteOptions& write_options,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key,
+ const Slice& ts) {
+ const Status s = FailIfTsMismatchCf(column_family, ts, /*ts_for_read=*/false);
+ if (!s.ok()) {
+ return s;
+ }
+ return DB::DeleteRange(write_options, column_family, begin_key, end_key, ts);
+}
+
+void DBImpl::SetRecoverableStatePreReleaseCallback(
+ PreReleaseCallback* callback) {
+ recoverable_state_pre_release_callback_.reset(callback);
+}
+
+Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
+ Status s;
+ if (write_options.protection_bytes_per_key > 0) {
+ s = WriteBatchInternal::UpdateProtectionInfo(
+ my_batch, write_options.protection_bytes_per_key);
+ }
+ if (s.ok()) {
+ s = WriteImpl(write_options, my_batch, /*callback=*/nullptr,
+ /*log_used=*/nullptr);
+ }
+ return s;
+}
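+
+// Illustrative only (assumes an already-open DB* `db`; not part of this
+// file): a caller-side sketch of using WriteOptions::protection_bytes_per_key
+// with the write entry point above.
+//
+//   WriteBatch batch;
+//   batch.Put("key", "value");
+//   WriteOptions wo;
+//   wo.protection_bytes_per_key = 8;  // must be 0 or 8 (see WriteImpl below)
+//   Status s = db->Write(wo, &batch);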
+
+#ifndef ROCKSDB_LITE
+Status DBImpl::WriteWithCallback(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback) {
+ Status s;
+ if (write_options.protection_bytes_per_key > 0) {
+ s = WriteBatchInternal::UpdateProtectionInfo(
+ my_batch, write_options.protection_bytes_per_key);
+ }
+ if (s.ok()) {
+ s = WriteImpl(write_options, my_batch, callback, nullptr);
+ }
+ return s;
+}
+#endif // ROCKSDB_LITE
+
+// The main write queue. This is the only write queue that updates LastSequence.
+// When using one write queue, the same sequence also indicates the last
+// published sequence.
+Status DBImpl::WriteImpl(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t* log_used, uint64_t log_ref,
+ bool disable_memtable, uint64_t* seq_used,
+ size_t batch_cnt,
+ PreReleaseCallback* pre_release_callback,
+ PostMemTableCallback* post_memtable_callback) {
+ assert(!seq_per_batch_ || batch_cnt != 0);
+ assert(my_batch == nullptr || my_batch->Count() == 0 ||
+ write_options.protection_bytes_per_key == 0 ||
+ write_options.protection_bytes_per_key ==
+ my_batch->GetProtectionBytesPerKey());
+ if (my_batch == nullptr) {
+ return Status::InvalidArgument("Batch is nullptr!");
+ } else if (!disable_memtable &&
+ WriteBatchInternal::TimestampsUpdateNeeded(*my_batch)) {
+ // If writing to memtable, then we require the caller to set/update the
+ // timestamps for the keys in the write batch.
+ // Otherwise, it means we are just writing to the WAL, and we allow
+ // timestamps unset for the keys in the write batch. This can happen if we
+ // use TransactionDB with write-committed policy, and we currently do not
+ // support user-defined timestamp with other policies.
+ // In the prepare phase, a transaction can write the batch to the WAL
+ // without inserting to memtable. The keys in the batch do not have to be
+ // assigned timestamps because they will be used only during recovery if
+ // there is a commit marker which includes their commit timestamp.
+ return Status::InvalidArgument("write batch must have timestamp(s) set");
+ } else if (write_options.rate_limiter_priority != Env::IO_TOTAL &&
+ write_options.rate_limiter_priority != Env::IO_USER) {
+ return Status::InvalidArgument(
+ "WriteOptions::rate_limiter_priority only allows "
+ "Env::IO_TOTAL and Env::IO_USER due to implementation constraints");
+ } else if (write_options.rate_limiter_priority != Env::IO_TOTAL &&
+ (write_options.disableWAL || manual_wal_flush_)) {
+ return Status::InvalidArgument(
+ "WriteOptions::rate_limiter_priority currently only supports "
+ "rate-limiting automatic WAL flush, which requires "
+ "`WriteOptions::disableWAL` and "
+ "`DBOptions::manual_wal_flush` both set to false");
+ } else if (write_options.protection_bytes_per_key != 0 &&
+ write_options.protection_bytes_per_key != 8) {
+ return Status::InvalidArgument(
+ "`WriteOptions::protection_bytes_per_key` must be zero or eight");
+ }
+ // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+ // grabs but does not seem thread-safe.
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_ && !tracer_->IsWriteOrderPreserved()) {
+ // We don't have to preserve write order so can trace anywhere. It's more
+ // efficient to trace here than to add latency to a phase of the log/apply
+ // pipeline.
+ // TODO: maybe handle the tracing status?
+ tracer_->Write(my_batch).PermitUncheckedError();
+ }
+ }
+ if (write_options.sync && write_options.disableWAL) {
+ return Status::InvalidArgument("Sync writes has to enable WAL.");
+ }
+ if (two_write_queues_ && immutable_db_options_.enable_pipelined_write) {
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with concurrent prepares");
+ }
+ if (seq_per_batch_ && immutable_db_options_.enable_pipelined_write) {
+ // TODO(yiwu): update pipeline write with seq_per_batch and batch_cnt
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with seq_per_batch");
+ }
+ if (immutable_db_options_.unordered_write &&
+ immutable_db_options_.enable_pipelined_write) {
+ return Status::NotSupported(
+ "pipelined_writes is not compatible with unordered_write");
+ }
+ if (immutable_db_options_.enable_pipelined_write &&
+ post_memtable_callback != nullptr) {
+ return Status::NotSupported(
+ "pipelined write currently does not honor post_memtable_callback");
+ }
+ if (seq_per_batch_ && post_memtable_callback != nullptr) {
+ return Status::NotSupported(
+ "seq_per_batch currently does not honor post_memtable_callback");
+ }
+ // Otherwise IsLatestPersistentState optimization does not make sense
+ assert(!WriteBatchInternal::IsLatestPersistentState(my_batch) ||
+ disable_memtable);
+
+ if (write_options.low_pri) {
+ Status s = ThrottleLowPriWritesIfNeeded(write_options, my_batch);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (two_write_queues_ && disable_memtable) {
+ AssignOrder assign_order =
+ seq_per_batch_ ? kDoAssignOrder : kDontAssignOrder;
+    // Otherwise these are WAL-only Prepare batches under the WriteCommitted
+    // policy and they don't consume a sequence number.
+ return WriteImplWALOnly(&nonmem_write_thread_, write_options, my_batch,
+ callback, log_used, log_ref, seq_used, batch_cnt,
+ pre_release_callback, assign_order,
+ kDontPublishLastSeq, disable_memtable);
+ }
+
+ if (immutable_db_options_.unordered_write) {
+ const size_t sub_batch_cnt = batch_cnt != 0
+ ? batch_cnt
+ // every key is a sub-batch consuming a seq
+ : WriteBatchInternal::Count(my_batch);
+ uint64_t seq = 0;
+    // Use a write thread to i) optimize for WAL write, ii) publish last
+    // sequence in increasing order, iii) call pre_release_callback serially
+ Status status = WriteImplWALOnly(
+ &write_thread_, write_options, my_batch, callback, log_used, log_ref,
+ &seq, sub_batch_cnt, pre_release_callback, kDoAssignOrder,
+ kDoPublishLastSeq, disable_memtable);
+ TEST_SYNC_POINT("DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL");
+ if (!status.ok()) {
+ return status;
+ }
+ if (seq_used) {
+ *seq_used = seq;
+ }
+ if (!disable_memtable) {
+ TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeUnorderedWriteMemtable");
+ status = UnorderedWriteMemtable(write_options, my_batch, callback,
+ log_ref, seq, sub_batch_cnt);
+ }
+ return status;
+ }
+
+ if (immutable_db_options_.enable_pipelined_write) {
+ return PipelinedWriteImpl(write_options, my_batch, callback, log_used,
+ log_ref, disable_memtable, seq_used);
+ }
+
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, batch_cnt, pre_release_callback,
+ post_memtable_callback);
+ StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+ write_thread_.JoinBatchGroup(&w);
+ if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+ // we are a non-leader in a parallel group
+
+ if (w.ShouldWriteToMemtable()) {
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ PERF_TIMER_GUARD(write_memtable_time);
+
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+ true /*concurrent_memtable_writes*/, seq_per_batch_, w.batch_cnt,
+ batch_per_txn_, write_options.memtable_insert_hint_per_batch);
+
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+ if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+ // we're responsible for exit batch group
+ // TODO(myabandeh): propagate status to write_group
+ auto last_sequence = w.write_group->last_sequence;
+ for (auto* tmp_w : *(w.write_group)) {
+ assert(tmp_w);
+ if (tmp_w->post_memtable_callback) {
+ Status tmp_s =
+ (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
+ // TODO: propagate the execution status of post_memtable_callback to
+ // caller.
+ assert(tmp_s.ok());
+ }
+ }
+ versions_->SetLastSequence(last_sequence);
+ MemTableInsertStatusCheck(w.status);
+ write_thread_.ExitAsBatchGroupFollower(&w);
+ }
+ assert(w.state == WriteThread::STATE_COMPLETED);
+ // STATE_COMPLETED conditional below handles exit
+ }
+ if (w.state == WriteThread::STATE_COMPLETED) {
+ if (log_used != nullptr) {
+ *log_used = w.log_used;
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ // write is complete and leader has updated sequence
+ return w.FinalStatus();
+ }
+ // else we are the leader of the write batch group
+ assert(w.state == WriteThread::STATE_GROUP_LEADER);
+ Status status;
+  // Once it reaches this point, the current writer "w" will try to do its
+  // write job. It may also pick up some of the remaining writers in the
+  // "writers_" queue when it finds it suitable, and finish them in the same
+  // write batch. This is how a write job could be done by another writer.
+ WriteContext write_context;
+ LogContext log_context(write_options.sync);
+ WriteThread::WriteGroup write_group;
+ bool in_parallel_group = false;
+ uint64_t last_sequence = kMaxSequenceNumber;
+
+ assert(!two_write_queues_ || !disable_memtable);
+ {
+    // With concurrent writes we preprocess only in the write thread that also
+    // writes to the memtable, to avoid synchronization issues on shared data
+    // structures with the other thread
+
+ // PreprocessWrite does its own perf timing.
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ status = PreprocessWrite(write_options, &log_context, &write_context);
+ if (!two_write_queues_) {
+ // Assign it after ::PreprocessWrite since the sequence might advance
+ // inside it by WriteRecoverableState
+ last_sequence = versions_->LastSequence();
+ }
+
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+ // Add to log and apply to memtable. We can release the lock
+ // during this phase since &w is currently responsible for logging
+ // and protects against concurrent loggers and concurrent writes
+ // into memtables
+
+ TEST_SYNC_POINT("DBImpl::WriteImpl:BeforeLeaderEnters");
+ last_batch_group_size_ =
+ write_thread_.EnterAsBatchGroupLeader(&w, &write_group);
+
+ IOStatus io_s;
+ Status pre_release_cb_status;
+ if (status.ok()) {
+ // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+ // grabs but does not seem thread-safe.
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_ && tracer_->IsWriteOrderPreserved()) {
+ for (auto* writer : write_group) {
+ // TODO: maybe handle the tracing status?
+ tracer_->Write(writer->batch).PermitUncheckedError();
+ }
+ }
+ }
+ // Rules for when we can update the memtable concurrently
+ // 1. supported by memtable
+ // 2. Puts are not okay if inplace_update_support
+ // 3. Merges are not okay
+ //
+ // Rules 1..2 are enforced by checking the options
+ // during startup (CheckConcurrentWritesSupported), so if
+ // options.allow_concurrent_memtable_write is true then they can be
+ // assumed to be true. Rule 3 is checked for each batch. We could
+ // relax rules 2 if we could prevent write batches from referring
+ // more than once to a particular key.
+ bool parallel = immutable_db_options_.allow_concurrent_memtable_write &&
+ write_group.size > 1;
+ size_t total_count = 0;
+ size_t valid_batches = 0;
+ size_t total_byte_size = 0;
+ size_t pre_release_callback_cnt = 0;
+ for (auto* writer : write_group) {
+ assert(writer);
+ if (writer->CheckCallback(this)) {
+ valid_batches += writer->batch_cnt;
+ if (writer->ShouldWriteToMemtable()) {
+ total_count += WriteBatchInternal::Count(writer->batch);
+ parallel = parallel && !writer->batch->HasMerge();
+ }
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ if (writer->pre_release_callback) {
+ pre_release_callback_cnt++;
+ }
+ }
+ }
+ // Note about seq_per_batch_: either disableWAL is set for the entire write
+ // group or not. In either case we inc seq for each write batch with no
+ // failed callback. This means that there could be a batch with
+    // disable_memtable in between; although we do not write this batch to
+ // memtable it still consumes a seq. Otherwise, if !seq_per_batch_, we inc
+ // the seq per valid written key to mem.
+ size_t seq_inc = seq_per_batch_ ? valid_batches : total_count;
+
+ const bool concurrent_update = two_write_queues_;
+ // Update stats while we are an exclusive group leader, so we know
+ // that nobody else can be writing to these particular stats.
+ // We're optimistic, updating the stats before we successfully
+ // commit. That lets us release our leader status early.
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count,
+ concurrent_update);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+ concurrent_update);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+ concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_SELF);
+ auto write_done_by_other = write_group.size - 1;
+ if (write_done_by_other > 0) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ write_done_by_other, concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+ }
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ if (write_options.disableWAL) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ if (!two_write_queues_) {
+ if (status.ok() && !write_options.disableWAL) {
+ assert(log_context.log_file_number_size);
+ LogFileNumberSize& log_file_number_size =
+ *(log_context.log_file_number_size);
+ PERF_TIMER_GUARD(write_wal_time);
+ io_s =
+ WriteToWAL(write_group, log_context.writer, log_used,
+ log_context.need_log_sync, log_context.need_log_dir_sync,
+ last_sequence + 1, log_file_number_size);
+ }
+ } else {
+ if (status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ // LastAllocatedSequence is increased inside WriteToWAL under
+ // wal_write_mutex_ to ensure ordered events in WAL
+ io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
+ seq_inc);
+ } else {
+ // Otherwise we inc seq number for memtable writes
+ last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ }
+ }
+ status = io_s;
+ assert(last_sequence != kMaxSequenceNumber);
+ const SequenceNumber current_sequence = last_sequence + 1;
+ last_sequence += seq_inc;
+
+ // PreReleaseCallback is called after WAL write and before memtable write
+ if (status.ok()) {
+ SequenceNumber next_sequence = current_sequence;
+ size_t index = 0;
+ // Note: the logic for advancing seq here must be consistent with the
+ // logic in WriteBatchInternal::InsertInto(write_group...) as well as
+ // with WriteBatchInternal::InsertInto(write_batch...) that is called on
+ // the merged batch during recovery from the WAL.
+ for (auto* writer : write_group) {
+ if (writer->CallbackFailed()) {
+ continue;
+ }
+ writer->sequence = next_sequence;
+ if (writer->pre_release_callback) {
+ Status ws = writer->pre_release_callback->Callback(
+ writer->sequence, disable_memtable, writer->log_used, index++,
+ pre_release_callback_cnt);
+ if (!ws.ok()) {
+ status = pre_release_cb_status = ws;
+ break;
+ }
+ }
+ if (seq_per_batch_) {
+ assert(writer->batch_cnt);
+ next_sequence += writer->batch_cnt;
+ } else if (writer->ShouldWriteToMemtable()) {
+ next_sequence += WriteBatchInternal::Count(writer->batch);
+ }
+ }
+ }
+
+ if (status.ok()) {
+ PERF_TIMER_GUARD(write_memtable_time);
+
+ if (!parallel) {
+ // w.sequence will be set inside InsertInto
+ w.status = WriteBatchInternal::InsertInto(
+ write_group, current_sequence, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_,
+ write_options.ignore_missing_column_families,
+ 0 /*recovery_log_number*/, this, parallel, seq_per_batch_,
+ batch_per_txn_);
+ } else {
+ write_group.last_sequence = last_sequence;
+ write_thread_.LaunchParallelMemTableWriters(&write_group);
+ in_parallel_group = true;
+
+        // Each parallel follower is doing its own writes. The leader should
+        // also do its own.
+ if (w.ShouldWriteToMemtable()) {
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ assert(w.sequence == current_sequence);
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/,
+ this, true /*concurrent_memtable_writes*/, seq_per_batch_,
+ w.batch_cnt, batch_per_txn_,
+ write_options.memtable_insert_hint_per_batch);
+ }
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ }
+ }
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+ if (!io_s.ok()) {
+ // Check WriteToWAL status
+ IOStatusCheck(io_s);
+ }
+ if (!w.CallbackFailed()) {
+ if (!io_s.ok()) {
+ assert(pre_release_cb_status.ok());
+ } else {
+ WriteStatusCheck(pre_release_cb_status);
+ }
+ } else {
+ assert(pre_release_cb_status.ok());
+ }
+
+ if (log_context.need_log_sync) {
+ VersionEdit synced_wals;
+ log_write_mutex_.Lock();
+ if (status.ok()) {
+ MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+ &synced_wals);
+ } else {
+ MarkLogsNotSynced(logfile_number_);
+ }
+ log_write_mutex_.Unlock();
+ if (status.ok() && synced_wals.IsWalAddition()) {
+ InstrumentedMutexLock l(&mutex_);
+ status = ApplyWALToManifest(&synced_wals);
+ }
+
+ // Requesting sync with two_write_queues_ is expected to be very rare. We
+ // hence provide a simple implementation that is not necessarily efficient.
+ if (two_write_queues_) {
+ if (manual_wal_flush_) {
+ status = FlushWAL(true);
+ } else {
+ status = SyncWAL();
+ }
+ }
+ }
+
+ bool should_exit_batch_group = true;
+ if (in_parallel_group) {
+ // CompleteParallelWorker returns true if this thread should
+ // handle exit, false means somebody else did
+ should_exit_batch_group = write_thread_.CompleteParallelMemTableWriter(&w);
+ }
+ if (should_exit_batch_group) {
+ if (status.ok()) {
+ for (auto* tmp_w : write_group) {
+ assert(tmp_w);
+ if (tmp_w->post_memtable_callback) {
+ Status tmp_s =
+ (*tmp_w->post_memtable_callback)(last_sequence, disable_memtable);
+ // TODO: propagate the execution status of post_memtable_callback to
+ // caller.
+ assert(tmp_s.ok());
+ }
+ }
+      // Note: if we are to resume after non-OK statuses we need to revisit how
+      // we react to non-OK statuses here.
+ versions_->SetLastSequence(last_sequence);
+ }
+ MemTableInsertStatusCheck(w.status);
+ write_thread_.ExitAsBatchGroupLeader(write_group, status);
+ }
+
+ if (status.ok()) {
+ status = w.FinalStatus();
+ }
+ return status;
+}
+
+Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback,
+ uint64_t* log_used, uint64_t log_ref,
+ bool disable_memtable, uint64_t* seq_used) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+ WriteContext write_context;
+
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, /*_batch_cnt=*/0,
+ /*_pre_release_callback=*/nullptr);
+ write_thread_.JoinBatchGroup(&w);
+ TEST_SYNC_POINT("DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup");
+ if (w.state == WriteThread::STATE_GROUP_LEADER) {
+ WriteThread::WriteGroup wal_write_group;
+ if (w.callback && !w.callback->AllowWriteBatching()) {
+ write_thread_.WaitForMemTableWriters();
+ }
+ LogContext log_context(!write_options.disableWAL && write_options.sync);
+ // PreprocessWrite does its own perf timing.
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ w.status = PreprocessWrite(write_options, &log_context, &write_context);
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+    // This can set a non-OK status if a callback fails.
+ last_batch_group_size_ =
+ write_thread_.EnterAsBatchGroupLeader(&w, &wal_write_group);
+ const SequenceNumber current_sequence =
+ write_thread_.UpdateLastSequence(versions_->LastSequence()) + 1;
+ size_t total_count = 0;
+ size_t total_byte_size = 0;
+
+ if (w.status.ok()) {
+ // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+ // grabs but does not seem thread-safe.
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+ for (auto* writer : wal_write_group) {
+ // TODO: maybe handle the tracing status?
+ tracer_->Write(writer->batch).PermitUncheckedError();
+ }
+ }
+ }
+ SequenceNumber next_sequence = current_sequence;
+ for (auto* writer : wal_write_group) {
+ assert(writer);
+ if (writer->CheckCallback(this)) {
+ if (writer->ShouldWriteToMemtable()) {
+ writer->sequence = next_sequence;
+ size_t count = WriteBatchInternal::Count(writer->batch);
+ next_sequence += count;
+ total_count += count;
+ }
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ }
+ }
+ if (w.disable_wal) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+ write_thread_.UpdateLastSequence(current_sequence + total_count - 1);
+ }
+
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ IOStatus io_s;
+ io_s.PermitUncheckedError(); // Allow io_s to be uninitialized
+
+ if (w.status.ok() && !write_options.disableWAL) {
+ PERF_TIMER_GUARD(write_wal_time);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1);
+ RecordTick(stats_, WRITE_DONE_BY_SELF, 1);
+ if (wal_write_group.size > 1) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ wal_write_group.size - 1);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
+ }
+ assert(log_context.log_file_number_size);
+ LogFileNumberSize& log_file_number_size =
+ *(log_context.log_file_number_size);
+ io_s =
+ WriteToWAL(wal_write_group, log_context.writer, log_used,
+ log_context.need_log_sync, log_context.need_log_dir_sync,
+ current_sequence, log_file_number_size);
+ w.status = io_s;
+ }
+
+ if (!io_s.ok()) {
+ // Check WriteToWAL status
+ IOStatusCheck(io_s);
+ } else if (!w.CallbackFailed()) {
+ WriteStatusCheck(w.status);
+ }
+
+ VersionEdit synced_wals;
+ if (log_context.need_log_sync) {
+ InstrumentedMutexLock l(&log_write_mutex_);
+ if (w.status.ok()) {
+ MarkLogsSynced(logfile_number_, log_context.need_log_dir_sync,
+ &synced_wals);
+ } else {
+ MarkLogsNotSynced(logfile_number_);
+ }
+ }
+ if (w.status.ok() && synced_wals.IsWalAddition()) {
+ InstrumentedMutexLock l(&mutex_);
+ w.status = ApplyWALToManifest(&synced_wals);
+ }
+ write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status);
+ }
+
+  // NOTE: the memtable_write_group is declared before the following
+  // `if` statement because its lifetime needs to be longer
+  // than the inner scope of the `if`, as a reference to it
+  // may be used further below within the outer write_thread_
+ WriteThread::WriteGroup memtable_write_group;
+
+ if (w.state == WriteThread::STATE_MEMTABLE_WRITER_LEADER) {
+ PERF_TIMER_GUARD(write_memtable_time);
+ assert(w.ShouldWriteToMemtable());
+ write_thread_.EnterAsMemTableWriter(&w, &memtable_write_group);
+ if (memtable_write_group.size > 1 &&
+ immutable_db_options_.allow_concurrent_memtable_write) {
+ write_thread_.LaunchParallelMemTableWriters(&memtable_write_group);
+ } else {
+ memtable_write_group.status = WriteBatchInternal::InsertInto(
+ memtable_write_group, w.sequence, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_,
+ write_options.ignore_missing_column_families, 0 /*log_number*/, this,
+ false /*concurrent_memtable_writes*/, seq_per_batch_, batch_per_txn_);
+ versions_->SetLastSequence(memtable_write_group.last_sequence);
+ write_thread_.ExitAsMemTableWriter(&w, memtable_write_group);
+ }
+ } else {
+ // NOTE: the memtable_write_group is never really used,
+ // so we need to set its status to pass ASSERT_STATUS_CHECKED
+ memtable_write_group.status.PermitUncheckedError();
+ }
+
+ if (w.state == WriteThread::STATE_PARALLEL_MEMTABLE_WRITER) {
+ assert(w.ShouldWriteToMemtable());
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_, write_options.ignore_missing_column_families,
+ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+ false /*seq_per_batch*/, 0 /*batch_cnt*/, true /*batch_per_txn*/,
+ write_options.memtable_insert_hint_per_batch);
+ if (write_thread_.CompleteParallelMemTableWriter(&w)) {
+ MemTableInsertStatusCheck(w.status);
+ versions_->SetLastSequence(w.write_group->last_sequence);
+ write_thread_.ExitAsMemTableWriter(&w, *w.write_group);
+ }
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+
+ assert(w.state == WriteThread::STATE_COMPLETED);
+ return w.FinalStatus();
+}
+
+Status DBImpl::UnorderedWriteMemtable(const WriteOptions& write_options,
+ WriteBatch* my_batch,
+ WriteCallback* callback, uint64_t log_ref,
+ SequenceNumber seq,
+ const size_t sub_batch_cnt) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ false /*disable_memtable*/);
+
+ if (w.CheckCallback(this) && w.ShouldWriteToMemtable()) {
+ w.sequence = seq;
+ size_t total_count = WriteBatchInternal::Count(my_batch);
+ InternalStats* stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsNumKeysWritten, total_count);
+ RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count);
+
+ ColumnFamilyMemTablesImpl column_family_memtables(
+ versions_->GetColumnFamilySet());
+ w.status = WriteBatchInternal::InsertInto(
+ &w, w.sequence, &column_family_memtables, &flush_scheduler_,
+ &trim_history_scheduler_, write_options.ignore_missing_column_families,
+ 0 /*log_number*/, this, true /*concurrent_memtable_writes*/,
+ seq_per_batch_, sub_batch_cnt, true /*batch_per_txn*/,
+ write_options.memtable_insert_hint_per_batch);
+ if (write_options.disableWAL) {
+ has_unpersisted_data_.store(true, std::memory_order_relaxed);
+ }
+ }
+
+ size_t pending_cnt = pending_memtable_writes_.fetch_sub(1) - 1;
+ if (pending_cnt == 0) {
+    // switch_cv_ waits until pending_memtable_writes_ = 0. Locking its mutex
+    // before notifying ensures that the cv is in the waiting state when it is
+    // notified, thus not missing the update to pending_memtable_writes_ even
+    // though it is not modified under the mutex.
+ std::lock_guard<std::mutex> lck(switch_mutex_);
+ switch_cv_.notify_all();
+ }
+ WriteStatusCheck(w.status);
+
+ if (!w.FinalStatus().ok()) {
+ return w.FinalStatus();
+ }
+ return Status::OK();
+}
+
+// The 2nd write queue. If enabled it will be used only for WAL-only writes.
+// This is the only queue that updates LastPublishedSequence, which is only
+// applicable in a two-queue setting.
+Status DBImpl::WriteImplWALOnly(
+ WriteThread* write_thread, const WriteOptions& write_options,
+ WriteBatch* my_batch, WriteCallback* callback, uint64_t* log_used,
+ const uint64_t log_ref, uint64_t* seq_used, const size_t sub_batch_cnt,
+ PreReleaseCallback* pre_release_callback, const AssignOrder assign_order,
+ const PublishLastSeq publish_last_seq, const bool disable_memtable) {
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+ WriteThread::Writer w(write_options, my_batch, callback, log_ref,
+ disable_memtable, sub_batch_cnt, pre_release_callback);
+ StopWatch write_sw(immutable_db_options_.clock, stats_, DB_WRITE);
+
+ write_thread->JoinBatchGroup(&w);
+ assert(w.state != WriteThread::STATE_PARALLEL_MEMTABLE_WRITER);
+ if (w.state == WriteThread::STATE_COMPLETED) {
+ if (log_used != nullptr) {
+ *log_used = w.log_used;
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ return w.FinalStatus();
+ }
+ // else we are the leader of the write batch group
+ assert(w.state == WriteThread::STATE_GROUP_LEADER);
+
+ if (publish_last_seq == kDoPublishLastSeq) {
+ Status status;
+
+ // Currently we only use kDoPublishLastSeq in unordered_write
+ assert(immutable_db_options_.unordered_write);
+ WriteContext write_context;
+ if (error_handler_.IsDBStopped()) {
+ status = error_handler_.GetBGError();
+ }
+ // TODO(myabandeh): Make preliminary checks thread-safe so we could do them
+ // without paying the cost of obtaining the mutex.
+ if (status.ok()) {
+ LogContext log_context;
+ status = PreprocessWrite(write_options, &log_context, &write_context);
+ WriteStatusCheckOnLocked(status);
+ }
+ if (!status.ok()) {
+ WriteThread::WriteGroup write_group;
+ write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ return status;
+ }
+ }
+
+ WriteThread::WriteGroup write_group;
+ uint64_t last_sequence;
+ write_thread->EnterAsBatchGroupLeader(&w, &write_group);
+ // Note: no need to update last_batch_group_size_ here since the batch writes
+ // to WAL only
+ // TODO: this use of operator bool on `tracer_` can avoid unnecessary lock
+ // grabs but does not seem thread-safe.
+ if (tracer_) {
+ InstrumentedMutexLock lock(&trace_mutex_);
+ if (tracer_ != nullptr && tracer_->IsWriteOrderPreserved()) {
+ for (auto* writer : write_group) {
+ // TODO: maybe handle the tracing status?
+ tracer_->Write(writer->batch).PermitUncheckedError();
+ }
+ }
+ }
+
+ size_t pre_release_callback_cnt = 0;
+ size_t total_byte_size = 0;
+ for (auto* writer : write_group) {
+ assert(writer);
+ if (writer->CheckCallback(this)) {
+ total_byte_size = WriteBatchInternal::AppendedByteSize(
+ total_byte_size, WriteBatchInternal::ByteSize(writer->batch));
+ if (writer->pre_release_callback) {
+ pre_release_callback_cnt++;
+ }
+ }
+ }
+
+ const bool concurrent_update = true;
+ // Update stats while we are an exclusive group leader, so we know
+ // that nobody else can be writing to these particular stats.
+ // We're optimistic, updating the stats before we successfully
+ // commit. That lets us release our leader status early.
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsBytesWritten, total_byte_size,
+ concurrent_update);
+ RecordTick(stats_, BYTES_WRITTEN, total_byte_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1,
+ concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_SELF);
+ auto write_done_by_other = write_group.size - 1;
+ if (write_done_by_other > 0) {
+ stats->AddDBStats(InternalStats::kIntStatsWriteDoneByOther,
+ write_done_by_other, concurrent_update);
+ RecordTick(stats_, WRITE_DONE_BY_OTHER, write_done_by_other);
+ }
+ RecordInHistogram(stats_, BYTES_PER_WRITE, total_byte_size);
+
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+
+ PERF_TIMER_GUARD(write_wal_time);
+ // LastAllocatedSequence is increased inside WriteToWAL under
+ // wal_write_mutex_ to ensure ordered events in WAL
+ size_t seq_inc = 0 /* total_count */;
+ if (assign_order == kDoAssignOrder) {
+ size_t total_batch_cnt = 0;
+ for (auto* writer : write_group) {
+ assert(writer->batch_cnt || !seq_per_batch_);
+ if (!writer->CallbackFailed()) {
+ total_batch_cnt += writer->batch_cnt;
+ }
+ }
+ seq_inc = total_batch_cnt;
+ }
+ Status status;
+ if (!write_options.disableWAL) {
+ IOStatus io_s =
+ ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+ status = io_s;
+ // last_sequence may not be set if there is an error
+ // This error checking and return is moved up to avoid using uninitialized
+ // last_sequence.
+ if (!io_s.ok()) {
+ IOStatusCheck(io_s);
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ return status;
+ }
+ } else {
+ // Otherwise we inc seq number to do solely the seq allocation
+ last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ }
+
+ size_t memtable_write_cnt = 0;
+ auto curr_seq = last_sequence + 1;
+ for (auto* writer : write_group) {
+ if (writer->CallbackFailed()) {
+ continue;
+ }
+ writer->sequence = curr_seq;
+ if (assign_order == kDoAssignOrder) {
+ assert(writer->batch_cnt || !seq_per_batch_);
+ curr_seq += writer->batch_cnt;
+ }
+ if (!writer->disable_memtable) {
+ memtable_write_cnt++;
+ }
+ // else seq advances only by memtable writes
+ }
+ if (status.ok() && write_options.sync) {
+ assert(!write_options.disableWAL);
+ // Requesting sync with two_write_queues_ is expected to be very rare. We
+    // hence provide a simple implementation that is not necessarily efficient.
+ if (manual_wal_flush_) {
+ status = FlushWAL(true);
+ } else {
+ status = SyncWAL();
+ }
+ }
+ PERF_TIMER_START(write_pre_and_post_process_time);
+
+ if (!w.CallbackFailed()) {
+ WriteStatusCheck(status);
+ }
+ if (status.ok()) {
+ size_t index = 0;
+ for (auto* writer : write_group) {
+ if (!writer->CallbackFailed() && writer->pre_release_callback) {
+ assert(writer->sequence != kMaxSequenceNumber);
+ Status ws = writer->pre_release_callback->Callback(
+ writer->sequence, disable_memtable, writer->log_used, index++,
+ pre_release_callback_cnt);
+ if (!ws.ok()) {
+ status = ws;
+ break;
+ }
+ }
+ }
+ }
+ if (publish_last_seq == kDoPublishLastSeq) {
+ versions_->SetLastSequence(last_sequence + seq_inc);
+ // Currently we only use kDoPublishLastSeq in unordered_write
+ assert(immutable_db_options_.unordered_write);
+ }
+ if (immutable_db_options_.unordered_write && status.ok()) {
+ pending_memtable_writes_ += memtable_write_cnt;
+ }
+ write_thread->ExitAsBatchGroupLeader(write_group, status);
+ if (status.ok()) {
+ status = w.FinalStatus();
+ }
+ if (seq_used != nullptr) {
+ *seq_used = w.sequence;
+ }
+ return status;
+}
+
+void DBImpl::WriteStatusCheckOnLocked(const Status& status) {
+ // Is setting bg_error_ enough here? This will at least stop
+ // compaction and fail any further writes.
+ InstrumentedMutexLock l(&mutex_);
+ assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
+ if (immutable_db_options_.paranoid_checks && !status.ok() &&
+ !status.IsBusy() && !status.IsIncomplete()) {
+ // Maybe change the return status to void?
+ error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+ }
+}
+
+void DBImpl::WriteStatusCheck(const Status& status) {
+ // Is setting bg_error_ enough here? This will at least stop
+ // compaction and fail any further writes.
+ assert(!status.IsIOFenced() || !error_handler_.GetBGError().ok());
+ if (immutable_db_options_.paranoid_checks && !status.ok() &&
+ !status.IsBusy() && !status.IsIncomplete()) {
+ mutex_.Lock();
+ // Maybe change the return status to void?
+ error_handler_.SetBGError(status, BackgroundErrorReason::kWriteCallback);
+ mutex_.Unlock();
+ }
+}
+
+void DBImpl::IOStatusCheck(const IOStatus& io_status) {
+ // Is setting bg_error_ enough here? This will at least stop
+ // compaction and fail any further writes.
+ if ((immutable_db_options_.paranoid_checks && !io_status.ok() &&
+ !io_status.IsBusy() && !io_status.IsIncomplete()) ||
+ io_status.IsIOFenced()) {
+ mutex_.Lock();
+ // Maybe change the return status to void?
+ error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback);
+ mutex_.Unlock();
+ } else {
+    // Force the writable file to continue to be writable.
+ logs_.back().writer->file()->reset_seen_error();
+ }
+}
+
+void DBImpl::MemTableInsertStatusCheck(const Status& status) {
+ // A non-OK status here indicates that the state implied by the
+ // WAL has diverged from the in-memory state. This could be
+ // because of a corrupt write_batch (very bad), or because the
+ // client specified an invalid column family and didn't specify
+ // ignore_missing_column_families.
+ if (!status.ok()) {
+ mutex_.Lock();
+ assert(!error_handler_.IsBGWorkStopped());
+ // Maybe change the return status to void?
+ error_handler_.SetBGError(status, BackgroundErrorReason::kMemTable)
+ .PermitUncheckedError();
+ mutex_.Unlock();
+ }
+}
+
+Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
+ LogContext* log_context,
+ WriteContext* write_context) {
+ assert(write_context != nullptr && log_context != nullptr);
+ Status status;
+
+ if (error_handler_.IsDBStopped()) {
+ InstrumentedMutexLock l(&mutex_);
+ status = error_handler_.GetBGError();
+ }
+
+ PERF_TIMER_GUARD(write_scheduling_flushes_compactions_time);
+
+ if (UNLIKELY(status.ok() && total_log_size_ > GetMaxTotalWalSize())) {
+ assert(versions_);
+ InstrumentedMutexLock l(&mutex_);
+ const ColumnFamilySet* const column_families =
+ versions_->GetColumnFamilySet();
+ assert(column_families);
+ size_t num_cfs = column_families->NumberOfColumnFamilies();
+ assert(num_cfs >= 1);
+ if (num_cfs > 1) {
+ WaitForPendingWrites();
+ status = SwitchWAL(write_context);
+ }
+ }
+
+ if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
+ // Before a new memtable is added in SwitchMemtable(),
+ // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+    // thread is writing to another DB with the same write buffer, that DB may
+    // also be flushed. We may end up flushing many more DBs than needed. It's
+    // suboptimal but still correct.
+ InstrumentedMutexLock l(&mutex_);
+ WaitForPendingWrites();
+ status = HandleWriteBufferManagerFlush(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && !trim_history_scheduler_.Empty())) {
+ InstrumentedMutexLock l(&mutex_);
+ status = TrimMemtableHistory(write_context);
+ }
+
+ if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
+ InstrumentedMutexLock l(&mutex_);
+ WaitForPendingWrites();
+ status = ScheduleFlushes(write_context);
+ }
+
+ PERF_TIMER_STOP(write_scheduling_flushes_compactions_time);
+ PERF_TIMER_GUARD(write_pre_and_post_process_time);
+
+ if (UNLIKELY(status.ok() && (write_controller_.IsStopped() ||
+ write_controller_.NeedsDelay()))) {
+ PERF_TIMER_STOP(write_pre_and_post_process_time);
+ PERF_TIMER_GUARD(write_delay_time);
+    // We don't know the size of the current batch, so we always use the size
+    // of the previous one. This might create a fairness issue in that
+    // expiration might happen for smaller writes while larger writes can go
+    // through. Can optimize it if it becomes an issue.
+ InstrumentedMutexLock l(&mutex_);
+ status = DelayWrite(last_batch_group_size_, write_options);
+ PERF_TIMER_START(write_pre_and_post_process_time);
+ }
+
+  // If memory usage has exceeded a certain threshold,
+  // write_buffer_manager_->ShouldStall() returns true for all threads writing
+  // to all DBs and writers will be stalled.
+  // This is a soft check because WriteBufferManager::buffer_limit_ has already
+  // been exceeded at this point, so no new write (including the current one)
+  // will go through until memory usage is decreased.
+ if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldStall())) {
+ if (write_options.no_slowdown) {
+ status = Status::Incomplete("Write stall");
+ } else {
+ InstrumentedMutexLock l(&mutex_);
+ WriteBufferManagerStallWrites();
+ }
+ }
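+  // From here on, prepare the WAL-related context under log_write_mutex_ so
+  // that logs_ and alive_log_files_ are read consistently.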
+ InstrumentedMutexLock l(&log_write_mutex_);
+ if (status.ok() && log_context->need_log_sync) {
+ // Wait until the parallel syncs are finished. Any sync process has to sync
+    // the front log too, so it is enough to check the status of front().
+    // We do a while loop since log_sync_cv_ is signalled whenever any sync
+    // finishes.
+ // Note: there does not seem to be a reason to wait for parallel sync at
+ // this early step but it is not important since parallel sync (SyncWAL) and
+ // need_log_sync are usually not used together.
+ while (logs_.front().IsSyncing()) {
+ log_sync_cv_.Wait();
+ }
+ for (auto& log : logs_) {
+      // This is just to prevent the logs from being synced by a parallel
+      // SyncWAL call. We will do the actual syncing later, after we write to
+      // the WAL.
+ // Note: there does not seem to be a reason to set this early before we
+ // actually write to the WAL
+ log.PrepareForSync();
+ }
+ } else {
+ log_context->need_log_sync = false;
+ }
+ log_context->writer = logs_.back().writer;
+ log_context->need_log_dir_sync =
+ log_context->need_log_dir_sync && !log_dir_synced_;
+ log_context->log_file_number_size = std::addressof(alive_log_files_.back());
+
+ return status;
+}
+
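+// Flattens the batches of a write group into a single batch destined for the
+// WAL. If the group contains a single eligible batch it is used directly;
+// otherwise the batches are appended into *tmp_batch. Also reports how many
+// batches go to the WAL and the last batch flagged as latest persistent state.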
+Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
+ WriteBatch* tmp_batch, WriteBatch** merged_batch,
+ size_t* write_with_wal,
+ WriteBatch** to_be_cached_state) {
+ assert(write_with_wal != nullptr);
+ assert(tmp_batch != nullptr);
+ assert(*to_be_cached_state == nullptr);
+ *write_with_wal = 0;
+ auto* leader = write_group.leader;
+ assert(!leader->disable_wal); // Same holds for all in the batch group
+ if (write_group.size == 1 && !leader->CallbackFailed() &&
+ leader->batch->GetWalTerminationPoint().is_cleared()) {
+    // We simply write the leader's WriteBatch to the WAL if the group
+    // contains only one batch and that batch is not to be truncated
+    // (its WAL termination point is cleared).
+ *merged_batch = leader->batch;
+ if (WriteBatchInternal::IsLatestPersistentState(*merged_batch)) {
+ *to_be_cached_state = *merged_batch;
+ }
+ *write_with_wal = 1;
+ } else {
+ // WAL needs all of the batches flattened into a single batch.
+ // We could avoid copying here with an iov-like AddRecord
+ // interface
+ *merged_batch = tmp_batch;
+ for (auto writer : write_group) {
+ if (!writer->CallbackFailed()) {
+ Status s = WriteBatchInternal::Append(*merged_batch, writer->batch,
+ /*WAL_only*/ true);
+ if (!s.ok()) {
+ tmp_batch->Clear();
+ return s;
+ }
+ if (WriteBatchInternal::IsLatestPersistentState(writer->batch)) {
+ // We only need to cache the last of such write batch
+ *to_be_cached_state = writer->batch;
+ }
+ (*write_with_wal)++;
+ }
+ }
+ }
+ // return merged_batch;
+ return Status::OK();
+}
+
+// When two_write_queues_ is disabled, this function is called from the only
+// write thread. Otherwise this must be called holding log_write_mutex_.
+IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
+ log::Writer* log_writer, uint64_t* log_used,
+ uint64_t* log_size,
+ Env::IOPriority rate_limiter_priority,
+ LogFileNumberSize& log_file_number_size) {
+ assert(log_size != nullptr);
+
+ Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
+ TEST_SYNC_POINT_CALLBACK("DBImpl::WriteToWAL:log_entry", &log_entry);
+ auto s = merged_batch.VerifyChecksum();
+ if (!s.ok()) {
+ return status_to_io_status(std::move(s));
+ }
+ *log_size = log_entry.size();
+  // When two_write_queues_ is enabled, WriteToWAL has to be protected from
+  // concurrent calls from the two queues anyway, and log_write_mutex_ is
+  // already held. Otherwise, if manual_wal_flush_ is enabled, we need to
+  // protect log_writer->AddRecord from possible concurrent calls via FlushWAL
+  // by the application.
+ const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+  // Due to performance concerns about missed branch prediction, penalize the
+  // new manual_wal_flush_ feature (via UNLIKELY) instead of the more common
+  // case where we do not need any locking.
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Lock();
+ }
+ IOStatus io_s = log_writer->AddRecord(log_entry, rate_limiter_priority);
+
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Unlock();
+ }
+ if (log_used != nullptr) {
+ *log_used = logfile_number_;
+ }
+ total_log_size_ += log_entry.size();
+ log_file_number_size.AddSize(*log_size);
+ log_empty_ = false;
+ return io_s;
+}
+
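+// Writes an entire write group to the WAL (used when two_write_queues_ is
+// disabled) and, if requested, syncs the log files and the WAL directory.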
+IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
+ log::Writer* log_writer, uint64_t* log_used,
+ bool need_log_sync, bool need_log_dir_sync,
+ SequenceNumber sequence,
+ LogFileNumberSize& log_file_number_size) {
+ IOStatus io_s;
+ assert(!two_write_queues_);
+ assert(!write_group.leader->disable_wal);
+ // Same holds for all in the batch group
+ size_t write_with_wal = 0;
+ WriteBatch* to_be_cached_state = nullptr;
+ WriteBatch* merged_batch;
+ io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch_, &merged_batch,
+ &write_with_wal, &to_be_cached_state));
+ if (UNLIKELY(!io_s.ok())) {
+ return io_s;
+ }
+
+ if (merged_batch == write_group.leader->batch) {
+ write_group.leader->log_used = logfile_number_;
+ } else if (write_with_wal > 1) {
+ for (auto writer : write_group) {
+ writer->log_used = logfile_number_;
+ }
+ }
+
+ WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+ uint64_t log_size;
+ io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
+ write_group.leader->rate_limiter_priority,
+ log_file_number_size);
+ if (to_be_cached_state) {
+ cached_recoverable_state_ = *to_be_cached_state;
+ cached_recoverable_state_empty_ = false;
+ }
+
+ if (io_s.ok() && need_log_sync) {
+ StopWatch sw(immutable_db_options_.clock, stats_, WAL_FILE_SYNC_MICROS);
+ // It's safe to access logs_ with unlocked mutex_ here because:
+ // - we've set getting_synced=true for all logs,
+ // so other threads won't pop from logs_ while we're here,
+ // - only writer thread can push to logs_, and we're in
+ // writer thread, so no one will push to logs_,
+ // - as long as other threads don't modify it, it's safe to read
+ // from std::deque from multiple threads concurrently.
+ //
+    // The sync operation should be done while holding log_write_mutex_,
+    // because when DBOptions.manual_wal_flush_ is set, FlushWAL may be
+    // invoked by another thread; without log_write_mutex_ held, the log
+    // file may get corrupted.
+
+ const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Lock();
+ }
+
+ for (auto& log : logs_) {
+ io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync);
+ if (!io_s.ok()) {
+ break;
+ }
+ }
+
+ if (UNLIKELY(needs_locking)) {
+ log_write_mutex_.Unlock();
+ }
+
+ if (io_s.ok() && need_log_dir_sync) {
+      // We only sync the WAL directory the first time WAL syncing is
+      // requested, so that if users never turn on WAL sync,
+      // we can avoid the disk I/O in the write code path.
+ io_s = directories_.GetWalDir()->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+ }
+
+ if (merged_batch == &tmp_batch_) {
+ tmp_batch_.Clear();
+ }
+ if (io_s.ok()) {
+ auto stats = default_cf_internal_stats_;
+ if (need_log_sync) {
+ stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
+ RecordTick(stats_, WAL_FILE_SYNCED);
+ }
+ stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size);
+ RecordTick(stats_, WAL_FILE_BYTES, log_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal);
+ RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+ }
+ return io_s;
+}
+
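+// WAL write path used when two_write_queues_ or unordered_write is enabled.
+// It may be called concurrently from both write queues, so logs_ and
+// alive_log_files_ are accessed under log_write_mutex_ below.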
+IOStatus DBImpl::ConcurrentWriteToWAL(
+ const WriteThread::WriteGroup& write_group, uint64_t* log_used,
+ SequenceNumber* last_sequence, size_t seq_inc) {
+ IOStatus io_s;
+
+ assert(two_write_queues_ || immutable_db_options_.unordered_write);
+ assert(!write_group.leader->disable_wal);
+ // Same holds for all in the batch group
+ WriteBatch tmp_batch;
+ size_t write_with_wal = 0;
+ WriteBatch* to_be_cached_state = nullptr;
+ WriteBatch* merged_batch;
+ io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch, &merged_batch,
+ &write_with_wal, &to_be_cached_state));
+ if (UNLIKELY(!io_s.ok())) {
+ return io_s;
+ }
+
+  // We need to lock log_write_mutex_ since logs_ and alive_log_files_ might
+  // be pushed back concurrently.
+ log_write_mutex_.Lock();
+ if (merged_batch == write_group.leader->batch) {
+ write_group.leader->log_used = logfile_number_;
+ } else if (write_with_wal > 1) {
+ for (auto writer : write_group) {
+ writer->log_used = logfile_number_;
+ }
+ }
+ *last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
+ auto sequence = *last_sequence + 1;
+ WriteBatchInternal::SetSequence(merged_batch, sequence);
+
+ log::Writer* log_writer = logs_.back().writer;
+ LogFileNumberSize& log_file_number_size = alive_log_files_.back();
+
+ assert(log_writer->get_log_number() == log_file_number_size.number);
+
+ uint64_t log_size;
+ io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
+ write_group.leader->rate_limiter_priority,
+ log_file_number_size);
+ if (to_be_cached_state) {
+ cached_recoverable_state_ = *to_be_cached_state;
+ cached_recoverable_state_empty_ = false;
+ }
+ log_write_mutex_.Unlock();
+
+ if (io_s.ok()) {
+ const bool concurrent = true;
+ auto stats = default_cf_internal_stats_;
+ stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size,
+ concurrent);
+ RecordTick(stats_, WAL_FILE_BYTES, log_size);
+ stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal,
+ concurrent);
+ RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
+ }
+ return io_s;
+}
+
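+// Persists the cached recoverable state (the most recent write batch flagged
+// as the latest persistent state, see MergeBatch) into the memtables so that
+// it is not lost when the WAL that carried it is deleted after a memtable
+// switch.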
+Status DBImpl::WriteRecoverableState() {
+ mutex_.AssertHeld();
+ if (!cached_recoverable_state_empty_) {
+ bool dont_care_bool;
+ SequenceNumber next_seq;
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ SequenceNumber seq;
+ if (two_write_queues_) {
+ seq = versions_->FetchAddLastAllocatedSequence(0);
+ } else {
+ seq = versions_->LastSequence();
+ }
+ WriteBatchInternal::SetSequence(&cached_recoverable_state_, seq + 1);
+ auto status = WriteBatchInternal::InsertInto(
+ &cached_recoverable_state_, column_family_memtables_.get(),
+ &flush_scheduler_, &trim_history_scheduler_, true,
+ 0 /*recovery_log_number*/, this, false /* concurrent_memtable_writes */,
+ &next_seq, &dont_care_bool, seq_per_batch_);
+ auto last_seq = next_seq - 1;
+ if (two_write_queues_) {
+ versions_->FetchAddLastAllocatedSequence(last_seq - seq);
+ versions_->SetLastPublishedSequence(last_seq);
+ }
+ versions_->SetLastSequence(last_seq);
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ if (status.ok() && recoverable_state_pre_release_callback_) {
+ const bool DISABLE_MEMTABLE = true;
+ for (uint64_t sub_batch_seq = seq + 1;
+ sub_batch_seq < next_seq && status.ok(); sub_batch_seq++) {
+ uint64_t const no_log_num = 0;
+ // Unlock it since the callback might end up locking mutex. e.g.,
+ // AddCommitted -> AdvanceMaxEvictedSeq -> GetSnapshotListFromDB
+ mutex_.Unlock();
+ status = recoverable_state_pre_release_callback_->Callback(
+ sub_batch_seq, !DISABLE_MEMTABLE, no_log_num, 0, 1);
+ mutex_.Lock();
+ }
+ }
+ if (status.ok()) {
+ cached_recoverable_state_.Clear();
+ cached_recoverable_state_empty_ = true;
+ }
+ return status;
+ }
+ return Status::OK();
+}
+
+void DBImpl::SelectColumnFamiliesForAtomicFlush(
+ autovector<ColumnFamilyData*>* cfds) {
+ for (ColumnFamilyData* cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->imm()->NumNotFlushed() != 0 || !cfd->mem()->IsEmpty() ||
+ !cached_recoverable_state_empty_.load()) {
+ cfds->push_back(cfd);
+ }
+ }
+}
+
+// Assign sequence number for atomic flush.
+void DBImpl::AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds) {
+ assert(immutable_db_options_.atomic_flush);
+ auto seq = versions_->LastSequence();
+ for (auto cfd : cfds) {
+ cfd->imm()->AssignAtomicFlushSeq(seq);
+ }
+}
+
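+// Called when the total WAL size exceeds GetMaxTotalWalSize(): marks the
+// oldest alive WAL as getting flushed and schedules flushes of the column
+// families that still depend on it so the WAL can eventually be released.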
+Status DBImpl::SwitchWAL(WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr);
+ Status status;
+
+ if (alive_log_files_.begin()->getting_flushed) {
+ return status;
+ }
+
+ auto oldest_alive_log = alive_log_files_.begin()->number;
+ bool flush_wont_release_oldest_log = false;
+ if (allow_2pc()) {
+ auto oldest_log_with_uncommitted_prep =
+ logs_with_prep_tracker_.FindMinLogContainingOutstandingPrep();
+
+ assert(oldest_log_with_uncommitted_prep == 0 ||
+ oldest_log_with_uncommitted_prep >= oldest_alive_log);
+ if (oldest_log_with_uncommitted_prep > 0 &&
+ oldest_log_with_uncommitted_prep == oldest_alive_log) {
+ if (unable_to_release_oldest_log_) {
+ // we already attempted to flush all column families dependent on
+ // the oldest alive log but the log still contained uncommitted
+ // transactions so there is still nothing that we can do.
+ return status;
+ } else {
+ ROCKS_LOG_WARN(
+ immutable_db_options_.info_log,
+ "Unable to release oldest log due to uncommitted transaction");
+ unable_to_release_oldest_log_ = true;
+ flush_wont_release_oldest_log = true;
+ }
+ }
+ }
+ if (!flush_wont_release_oldest_log) {
+ // we only mark this log as getting flushed if we have successfully
+ // flushed all data in this log. If this log contains outstanding prepared
+ // transactions then we cannot flush this log until those transactions are
+    // committed.
+ unable_to_release_oldest_log_ = false;
+ alive_log_files_.begin()->getting_flushed = true;
+ }
+
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Flushing all column families with data in WAL number %" PRIu64
+ ". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
+ oldest_alive_log, total_log_size_.load(), GetMaxTotalWalSize());
+  // No need to refcount because a column family drop happens in the write
+  // thread, so it can't happen while we're in the write thread.
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ } else {
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (cfd->OldestLogToKeep() <= oldest_alive_log) {
+ cfds.push_back(cfd);
+ }
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ for (const auto cfd : cfds) {
+ cfd->Ref();
+ status = SwitchMemtable(cfd, write_context);
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ for (auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ if (!immutable_db_options_.atomic_flush) {
+ FlushRequest flush_req;
+ GenerateFlushRequest({cfd}, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWalFull);
+ }
+ }
+ if (immutable_db_options_.atomic_flush) {
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWalFull);
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
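+// Called when the WriteBufferManager reports memory pressure: picks the
+// column family whose mutable memtable has the oldest creation sequence (or
+// all eligible CFs when atomic_flush is enabled), switches its memtable and
+// schedules a flush.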
+Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
+ mutex_.AssertHeld();
+ assert(write_context != nullptr);
+ Status status;
+
+ // Before a new memtable is added in SwitchMemtable(),
+ // write_buffer_manager_->ShouldFlush() will keep returning true. If another
+  // thread is writing to another DB with the same write buffer, that DB may
+  // also be flushed. We may end up flushing many more DBs than needed. It's
+  // suboptimal but still correct.
+  // No need to refcount because a column family drop happens in the write
+  // thread, so it can't happen while we're in the write thread.
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ } else {
+ ColumnFamilyData* cfd_picked = nullptr;
+ SequenceNumber seq_num_for_cf_picked = kMaxSequenceNumber;
+
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (!cfd->mem()->IsEmpty() && !cfd->imm()->IsFlushPendingOrRunning()) {
+ // We only consider flush on CFs with bytes in the mutable memtable,
+ // and no immutable memtables for which flush has yet to finish. If
+ // we triggered flush on CFs already trying to flush, we would risk
+ // creating too many immutable memtables leading to write stalls.
+ uint64_t seq = cfd->mem()->GetCreationSeq();
+ if (cfd_picked == nullptr || seq < seq_num_for_cf_picked) {
+ cfd_picked = cfd;
+ seq_num_for_cf_picked = seq;
+ }
+ }
+ }
+ if (cfd_picked != nullptr) {
+ cfds.push_back(cfd_picked);
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ if (!cfds.empty()) {
+ ROCKS_LOG_INFO(
+ immutable_db_options_.info_log,
+ "Flushing triggered to alleviate write buffer memory usage. Write "
+ "buffer is using %" ROCKSDB_PRIszt
+ " bytes out of a total of %" ROCKSDB_PRIszt ".",
+ write_buffer_manager_->memory_usage(),
+ write_buffer_manager_->buffer_size());
+ }
+
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+ for (const auto cfd : cfds) {
+ if (cfd->mem()->IsEmpty()) {
+ continue;
+ }
+ cfd->Ref();
+ status = SwitchMemtable(cfd, write_context);
+ cfd->UnrefAndTryDelete();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ }
+ for (const auto cfd : cfds) {
+ cfd->imm()->FlushRequested();
+ if (!immutable_db_options_.atomic_flush) {
+ FlushRequest flush_req;
+ GenerateFlushRequest({cfd}, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
+ }
+ }
+ if (immutable_db_options_.atomic_flush) {
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+uint64_t DBImpl::GetMaxTotalWalSize() const {
+ uint64_t max_total_wal_size =
+ max_total_wal_size_.load(std::memory_order_acquire);
+ if (max_total_wal_size > 0) {
+ return max_total_wal_size;
+ }
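+  // If max_total_wal_size is not explicitly set, default to four times the
+  // maximum total in-memory state (max_total_in_memory_state_).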
+ return 4 * max_total_in_memory_state_.load(std::memory_order_acquire);
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+Status DBImpl::DelayWrite(uint64_t num_bytes,
+ const WriteOptions& write_options) {
+ uint64_t time_delayed = 0;
+ bool delayed = false;
+ {
+ StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL,
+ &time_delayed);
+ uint64_t delay =
+ write_controller_.GetDelay(immutable_db_options_.clock, num_bytes);
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Start");
+ if (delay > 0) {
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Write stall");
+ }
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Sleep");
+
+ // Notify write_thread_ about the stall so it can setup a barrier and
+ // fail any pending writers with no_slowdown
+ write_thread_.BeginWriteStall();
+ mutex_.Unlock();
+ TEST_SYNC_POINT("DBImpl::DelayWrite:BeginWriteStallDone");
+ // We will delay the write until we have slept for `delay` microseconds
+ // or we don't need a delay anymore. We check for cancellation every 1ms
+ // (slightly longer because WriteController minimum delay is 1ms, in
+ // case of sleep imprecision, rounding, etc.)
+ const uint64_t kDelayInterval = 1001;
+ uint64_t stall_end = sw.start_time() + delay;
+ while (write_controller_.NeedsDelay()) {
+ if (immutable_db_options_.clock->NowMicros() >= stall_end) {
+ // We already delayed this write `delay` microseconds
+ break;
+ }
+
+ delayed = true;
+ // Sleep for 0.001 seconds
+ immutable_db_options_.clock->SleepForMicroseconds(kDelayInterval);
+ }
+ mutex_.Lock();
+ write_thread_.EndWriteStall();
+ }
+
+    // Don't wait if there's a background error, even if it's a soft error. We
+    // might wait here indefinitely as the background compaction may never
+    // finish successfully, resulting in the stall condition lasting
+    // indefinitely.
+ while (error_handler_.GetBGError().ok() && write_controller_.IsStopped() &&
+ !shutting_down_.load(std::memory_order_relaxed)) {
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Write stall");
+ }
+ delayed = true;
+
+ // Notify write_thread_ about the stall so it can setup a barrier and
+ // fail any pending writers with no_slowdown
+ write_thread_.BeginWriteStall();
+ TEST_SYNC_POINT("DBImpl::DelayWrite:Wait");
+ bg_cv_.Wait();
+ write_thread_.EndWriteStall();
+ }
+ }
+ assert(!delayed || !write_options.no_slowdown);
+ if (delayed) {
+ default_cf_internal_stats_->AddDBStats(
+ InternalStats::kIntStatsWriteStallMicros, time_delayed);
+ RecordTick(stats_, STALL_MICROS, time_delayed);
+ }
+
+ // If DB is not in read-only mode and write_controller is not stopping
+ // writes, we can ignore any background errors and allow the write to
+ // proceed
+ Status s;
+ if (write_controller_.IsStopped()) {
+ if (!shutting_down_.load(std::memory_order_relaxed)) {
+      // If writes are still stopped and the DB is not shutting down, it means
+      // we bailed due to a background error.
+ s = Status::Incomplete(error_handler_.GetBGError().ToString());
+ } else {
+ s = Status::ShutdownInProgress("stalled writes");
+ }
+ }
+ if (error_handler_.IsDBStopped()) {
+ s = error_handler_.GetBGError();
+ }
+ return s;
+}
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+void DBImpl::WriteBufferManagerStallWrites() {
+ mutex_.AssertHeld();
+ // First block future writer threads who want to add themselves to the queue
+ // of WriteThread.
+ write_thread_.BeginWriteStall();
+ mutex_.Unlock();
+
+ // Change the state to State::Blocked.
+ static_cast<WBMStallInterface*>(wbm_stall_.get())
+ ->SetState(WBMStallInterface::State::BLOCKED);
+ // Then WriteBufferManager will add DB instance to its queue
+ // and block this thread by calling WBMStallInterface::Block().
+ write_buffer_manager_->BeginWriteStall(wbm_stall_.get());
+ wbm_stall_->Block();
+
+ mutex_.Lock();
+ // Stall has ended. Signal writer threads so that they can add
+ // themselves to the WriteThread queue for writes.
+ write_thread_.EndWriteStall();
+}
+
+Status DBImpl::ThrottleLowPriWritesIfNeeded(const WriteOptions& write_options,
+ WriteBatch* my_batch) {
+ assert(write_options.low_pri);
+ // This is called outside the DB mutex. Although it is safe to make the call,
+ // the consistency condition is not guaranteed to hold. It's OK to live with
+ // it in this case.
+  // If we need to speed up compaction, it means compaction has fallen behind,
+  // so we start rate-limiting low-pri writes.
+ if (write_controller_.NeedSpeedupCompaction()) {
+ if (allow_2pc() && (my_batch->HasCommit() || my_batch->HasRollback())) {
+ // For 2PC, we only rate limit prepare, not commit.
+ return Status::OK();
+ }
+ if (write_options.no_slowdown) {
+ return Status::Incomplete("Low priority write stall");
+ } else {
+ assert(my_batch != nullptr);
+      // Rate limit those writes. The reason we don't wait completely is
+      // that under a heavy write load, low-pri writes may never get a
+      // chance to run. This way we guarantee they still slowly make
+      // progress.
+ PERF_TIMER_GUARD(write_delay_time);
+ write_controller_.low_pri_rate_limiter()->Request(
+ my_batch->GetDataSize(), Env::IO_HIGH, nullptr /* stats */,
+ RateLimiter::OpType::kWrite);
+ }
+ }
+ return Status::OK();
+}
+
+void DBImpl::MaybeFlushStatsCF(autovector<ColumnFamilyData*>* cfds) {
+ assert(cfds != nullptr);
+ if (!cfds->empty() && immutable_db_options_.persist_stats_to_disk) {
+ ColumnFamilyData* cfd_stats =
+ versions_->GetColumnFamilySet()->GetColumnFamily(
+ kPersistentStatsColumnFamilyName);
+ if (cfd_stats != nullptr && !cfd_stats->mem()->IsEmpty()) {
+ for (ColumnFamilyData* cfd : *cfds) {
+ if (cfd == cfd_stats) {
+ // stats CF already included in cfds
+ return;
+ }
+ }
+      // Force flush the stats CF when its log number is less than all other
+      // CFs' log numbers.
+ bool force_flush_stats_cf = true;
+ for (auto* loop_cfd : *versions_->GetColumnFamilySet()) {
+ if (loop_cfd == cfd_stats) {
+ continue;
+ }
+ if (loop_cfd->GetLogNumber() <= cfd_stats->GetLogNumber()) {
+ force_flush_stats_cf = false;
+ }
+ }
+ if (force_flush_stats_cf) {
+ cfds->push_back(cfd_stats);
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "Force flushing stats CF with automated flush "
+ "to avoid holding old logs");
+ }
+ }
+ }
+}
+
+Status DBImpl::TrimMemtableHistory(WriteContext* context) {
+ autovector<ColumnFamilyData*> cfds;
+ ColumnFamilyData* tmp_cfd;
+ while ((tmp_cfd = trim_history_scheduler_.TakeNextColumnFamily()) !=
+ nullptr) {
+ cfds.push_back(tmp_cfd);
+ }
+ for (auto& cfd : cfds) {
+ autovector<MemTable*> to_delete;
+ bool trimmed = cfd->imm()->TrimHistory(&context->memtables_to_free_,
+ cfd->mem()->MemoryAllocatedBytes());
+ if (trimmed) {
+ context->superversion_context.NewSuperVersion();
+ assert(context->superversion_context.new_superversion.get() != nullptr);
+ cfd->InstallSuperVersion(&context->superversion_context, &mutex_);
+ }
+
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ }
+ return Status::OK();
+}
+
+Status DBImpl::ScheduleFlushes(WriteContext* context) {
+ autovector<ColumnFamilyData*> cfds;
+ if (immutable_db_options_.atomic_flush) {
+ SelectColumnFamiliesForAtomicFlush(&cfds);
+ for (auto cfd : cfds) {
+ cfd->Ref();
+ }
+ flush_scheduler_.Clear();
+ } else {
+ ColumnFamilyData* tmp_cfd;
+ while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
+ cfds.push_back(tmp_cfd);
+ }
+ MaybeFlushStatsCF(&cfds);
+ }
+ Status status;
+ WriteThread::Writer nonmem_w;
+ if (two_write_queues_) {
+ nonmem_write_thread_.EnterUnbatched(&nonmem_w, &mutex_);
+ }
+
+ for (auto& cfd : cfds) {
+ if (!cfd->mem()->IsEmpty()) {
+ status = SwitchMemtable(cfd, context);
+ }
+ if (cfd->UnrefAndTryDelete()) {
+ cfd = nullptr;
+ }
+ if (!status.ok()) {
+ break;
+ }
+ }
+
+ if (two_write_queues_) {
+ nonmem_write_thread_.ExitUnbatched(&nonmem_w);
+ }
+
+ if (status.ok()) {
+ if (immutable_db_options_.atomic_flush) {
+ AssignAtomicFlushSeq(cfds);
+ FlushRequest flush_req;
+ GenerateFlushRequest(cfds, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+ } else {
+ for (auto* cfd : cfds) {
+ FlushRequest flush_req;
+ GenerateFlushRequest({cfd}, &flush_req);
+ SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
+ }
+ }
+ MaybeScheduleFlushOrCompaction();
+ }
+ return status;
+}
+
+#ifndef ROCKSDB_LITE
+void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/,
+ const MemTableInfo& mem_table_info) {
+ if (immutable_db_options_.listeners.size() == 0U) {
+ return;
+ }
+ if (shutting_down_.load(std::memory_order_acquire)) {
+ return;
+ }
+
+ mutex_.Unlock();
+ for (auto listener : immutable_db_options_.listeners) {
+ listener->OnMemTableSealed(mem_table_info);
+ }
+ mutex_.Lock();
+}
+#endif // ROCKSDB_LITE
+
+// REQUIRES: mutex_ is held
+// REQUIRES: this thread is currently at the front of the writer queue
+// REQUIRES: this thread is currently at the front of the 2nd writer queue if
+// two_write_queues_ is true (This is to simplify the reasoning.)
+Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
+ mutex_.AssertHeld();
+ log::Writer* new_log = nullptr;
+ MemTable* new_mem = nullptr;
+ IOStatus io_s;
+
+ // Recoverable state is persisted in WAL. After memtable switch, WAL might
+ // be deleted, so we write the state to memtable to be persisted as well.
+ Status s = WriteRecoverableState();
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Attempt to switch to a new memtable and trigger flush of old.
+  // Do this without holding the db mutex lock.
+ assert(versions_->prev_log_number() == 0);
+ if (two_write_queues_) {
+ log_write_mutex_.Lock();
+ }
+ bool creating_new_log = !log_empty_;
+ if (two_write_queues_) {
+ log_write_mutex_.Unlock();
+ }
+ uint64_t recycle_log_number = 0;
+ if (creating_new_log && immutable_db_options_.recycle_log_file_num &&
+ !log_recycle_files_.empty()) {
+ recycle_log_number = log_recycle_files_.front();
+ }
+ uint64_t new_log_number =
+ creating_new_log ? versions_->NewFileNumber() : logfile_number_;
+ const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+
+ // Set memtable_info for memtable sealed callback
+#ifndef ROCKSDB_LITE
+ MemTableInfo memtable_info;
+ memtable_info.cf_name = cfd->GetName();
+ memtable_info.first_seqno = cfd->mem()->GetFirstSequenceNumber();
+ memtable_info.earliest_seqno = cfd->mem()->GetEarliestSequenceNumber();
+ memtable_info.num_entries = cfd->mem()->num_entries();
+ memtable_info.num_deletes = cfd->mem()->num_deletes();
+#endif // ROCKSDB_LITE
+ // Log this later after lock release. It may be outdated, e.g., if background
+ // flush happens before logging, but that should be ok.
+ int num_imm_unflushed = cfd->imm()->NumNotFlushed();
+ const auto preallocate_block_size =
+ GetWalPreallocateBlockSize(mutable_cf_options.write_buffer_size);
+ mutex_.Unlock();
+ if (creating_new_log) {
+ // TODO: Write buffer size passed in should be max of all CF's instead
+ // of mutable_cf_options.write_buffer_size.
+ io_s = CreateWAL(new_log_number, recycle_log_number, preallocate_block_size,
+ &new_log);
+ if (s.ok()) {
+ s = io_s;
+ }
+ }
+ if (s.ok()) {
+ SequenceNumber seq = versions_->LastSequence();
+ new_mem = cfd->ConstructNewMemtable(mutable_cf_options, seq);
+ context->superversion_context.NewSuperVersion();
+ }
+ ROCKS_LOG_INFO(immutable_db_options_.info_log,
+ "[%s] New memtable created with log file: #%" PRIu64
+ ". Immutable memtables: %d.\n",
+ cfd->GetName().c_str(), new_log_number, num_imm_unflushed);
+ // There should be no concurrent write as the thread is at the front of
+ // writer queue
+ cfd->mem()->ConstructFragmentedRangeTombstones();
+
+ mutex_.Lock();
+ if (recycle_log_number != 0) {
+ // Since renaming the file is done outside DB mutex, we need to ensure
+ // concurrent full purges don't delete the file while we're recycling it.
+ // To achieve that we hold the old log number in the recyclable list until
+ // after it has been renamed.
+ assert(log_recycle_files_.front() == recycle_log_number);
+ log_recycle_files_.pop_front();
+ }
+ if (s.ok() && creating_new_log) {
+ InstrumentedMutexLock l(&log_write_mutex_);
+ assert(new_log != nullptr);
+ if (!logs_.empty()) {
+      // Always flush the buffer of the last log before switching to a new one.
+ log::Writer* cur_log_writer = logs_.back().writer;
+ if (error_handler_.IsRecoveryInProgress()) {
+ // In recovery path, we force another try of writing WAL buffer.
+ cur_log_writer->file()->reset_seen_error();
+ }
+ io_s = cur_log_writer->WriteBuffer();
+ if (s.ok()) {
+ s = io_s;
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(immutable_db_options_.info_log,
+ "[%s] Failed to switch from #%" PRIu64 " to #%" PRIu64
+ " WAL file\n",
+ cfd->GetName().c_str(), cur_log_writer->get_log_number(),
+ new_log_number);
+ }
+ }
+ if (s.ok()) {
+ logfile_number_ = new_log_number;
+ log_empty_ = true;
+ log_dir_synced_ = false;
+ logs_.emplace_back(logfile_number_, new_log);
+ alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+ }
+ }
+
+ if (!s.ok()) {
+ // how do we fail if we're not creating new log?
+ assert(creating_new_log);
+ delete new_mem;
+ delete new_log;
+ context->superversion_context.new_superversion.reset();
+ // We may have lost data from the WritableFileBuffer in-memory buffer for
+ // the current log, so treat it as a fatal error and set bg_error
+ if (!io_s.ok()) {
+ error_handler_.SetBGError(io_s, BackgroundErrorReason::kMemTable);
+ } else {
+ error_handler_.SetBGError(s, BackgroundErrorReason::kMemTable);
+ }
+ // Read back bg_error in order to get the right severity
+ s = error_handler_.GetBGError();
+ return s;
+ }
+
+ bool empty_cf_updated = false;
+ if (immutable_db_options_.track_and_verify_wals_in_manifest &&
+ !immutable_db_options_.allow_2pc && creating_new_log) {
+ // In non-2pc mode, WALs become obsolete if they do not contain unflushed
+ // data. Updating the empty CF's log number might cause some WALs to become
+ // obsolete. So we should track the WAL obsoletion event before actually
+ // updating the empty CF's log number.
+ uint64_t min_wal_number_to_keep =
+ versions_->PreComputeMinLogNumberWithUnflushedData(logfile_number_);
+ if (min_wal_number_to_keep >
+ versions_->GetWalSet().GetMinWalNumberToKeep()) {
+ // Get a snapshot of the empty column families.
+      // LogAndApply may release and reacquire the db mutex. During that
+      // period, a column family may become empty (e.g. its flush succeeds),
+      // which would affect the computed min_log_number_to_keep, so we take a
+      // snapshot for consistency of the column family data status. If a
+      // column family becomes non-empty afterwards, its active log
+ // should still be the created new log, so the min_log_number_to_keep is
+ // not affected.
+ autovector<ColumnFamilyData*> empty_cfs;
+ for (auto cf : *versions_->GetColumnFamilySet()) {
+ if (cf->IsEmpty()) {
+ empty_cfs.push_back(cf);
+ }
+ }
+
+ VersionEdit wal_deletion;
+ wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+ s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_,
+ directories_.GetDbDir());
+ if (!s.ok() && versions_->io_status().IsIOError()) {
+ s = error_handler_.SetBGError(versions_->io_status(),
+ BackgroundErrorReason::kManifestWrite);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ for (auto cf : empty_cfs) {
+ if (cf->IsEmpty()) {
+ cf->SetLogNumber(logfile_number_);
+ // MEMPURGE: No need to change this, because new adds
+ // should still receive new sequence numbers.
+ cf->mem()->SetCreationSeq(versions_->LastSequence());
+ } // cf may become non-empty.
+ }
+ empty_cf_updated = true;
+ }
+ }
+ if (!empty_cf_updated) {
+ for (auto cf : *versions_->GetColumnFamilySet()) {
+ // all this is just optimization to delete logs that
+ // are no longer needed -- if CF is empty, that means it
+ // doesn't need that particular log to stay alive, so we just
+ // advance the log number. no need to persist this in the manifest
+ if (cf->IsEmpty()) {
+ if (creating_new_log) {
+ cf->SetLogNumber(logfile_number_);
+ }
+ cf->mem()->SetCreationSeq(versions_->LastSequence());
+ }
+ }
+ }
+
+ cfd->mem()->SetNextLogNumber(logfile_number_);
+ assert(new_mem != nullptr);
+ cfd->imm()->Add(cfd->mem(), &context->memtables_to_free_);
+ new_mem->Ref();
+ cfd->SetMemtable(new_mem);
+ InstallSuperVersionAndScheduleWork(cfd, &context->superversion_context,
+ mutable_cf_options);
+
+#ifndef ROCKSDB_LITE
+ // Notify client that memtable is sealed, now that we have successfully
+ // installed a new memtable
+ NotifyOnMemTableSealed(cfd, memtable_info);
+#endif // ROCKSDB_LITE
+  // It is possible that we got here without checking the value of io_s, but
+  // that is okay. If we did, it most likely means that s was already an error.
+  // In any case, ignore any unchecked error for io_s here.
+ io_s.PermitUncheckedError();
+ return s;
+}
+
+size_t DBImpl::GetWalPreallocateBlockSize(uint64_t write_buffer_size) const {
+ mutex_.AssertHeld();
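+  // Preallocate roughly 1.1x the write buffer size, then clamp it by the
+  // WAL/DB-wide size limits checked below.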
+ size_t bsize =
+ static_cast<size_t>(write_buffer_size / 10 + write_buffer_size);
+ // Some users might set very high write_buffer_size and rely on
+ // max_total_wal_size or other parameters to control the WAL size.
+ if (mutable_db_options_.max_total_wal_size > 0) {
+ bsize = std::min<size_t>(
+ bsize, static_cast<size_t>(mutable_db_options_.max_total_wal_size));
+ }
+ if (immutable_db_options_.db_write_buffer_size > 0) {
+ bsize = std::min<size_t>(bsize, immutable_db_options_.db_write_buffer_size);
+ }
+ if (immutable_db_options_.write_buffer_manager &&
+ immutable_db_options_.write_buffer_manager->enabled()) {
+ bsize = std::min<size_t>(
+ bsize, immutable_db_options_.write_buffer_manager->buffer_size());
+ }
+
+ return bsize;
+}
+
+// Default implementations of convenience methods that subclasses of DB
+// can call if they wish
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ // Pre-allocate size of write batch conservatively.
+ // 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
+ // and we allocate 11 extra bytes for key length, as well as value length.
+ WriteBatch batch(key.size() + value.size() + 24, 0 /* max_bytes */,
+ opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ Status s = batch.Put(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& value) {
+ ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+ assert(default_cf);
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+ Status s = batch.Put(column_family, key, ts, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::PutEntity(const WriteOptions& options,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const WideColumns& columns) {
+ const ColumnFamilyHandle* const default_cf = DefaultColumnFamily();
+ assert(default_cf);
+
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+
+ WriteBatch batch(/* reserved_bytes */ 0, /* max_bytes */ 0,
+ options.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+
+ const Status s = batch.PutEntity(column_family, key, columns);
+ if (!s.ok()) {
+ return s;
+ }
+
+ return Write(options, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ Status s = batch.Delete(column_family, key);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts) {
+ ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+ assert(default_cf);
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+ Status s = batch.Delete(column_family, key, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family, const Slice& key) {
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ Status s = batch.SingleDelete(column_family, key);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::SingleDelete(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) {
+ ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+ assert(default_cf);
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+ Status s = batch.SingleDelete(column_family, key, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ Status s = batch.DeleteRange(column_family, begin_key, end_key);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::DeleteRange(const WriteOptions& opt,
+ ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key,
+ const Slice& ts) {
+ ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+ assert(default_cf);
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+ Status s = batch.DeleteRange(column_family, begin_key, end_key, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& value) {
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ Status s = batch.Merge(column_family, key, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts, const Slice& value) {
+ ColumnFamilyHandle* default_cf = DefaultColumnFamily();
+ assert(default_cf);
+ const Comparator* const default_cf_ucmp = default_cf->GetComparator();
+ assert(default_cf_ucmp);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ opt.protection_bytes_per_key,
+ default_cf_ucmp->timestamp_size());
+ Status s = batch.Merge(column_family, key, ts, value);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(opt, &batch);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_info_dumper.cc b/src/rocksdb/db/db_info_dumper.cc
new file mode 100644
index 000000000..be8d5bee1
--- /dev/null
+++ b/src/rocksdb/db/db_info_dumper.cc
@@ -0,0 +1,147 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_info_dumper.h"
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "file/filename.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+ const std::string& dbname,
+ const std::string& session_id) {
+ if (options.info_log == nullptr) {
+ return;
+ }
+
+ auto* env = options.env;
+ uint64_t number = 0;
+ FileType type = kInfoLogFile;
+
+ std::vector<std::string> files;
+ uint64_t file_num = 0;
+ uint64_t file_size;
+ std::string file_info, wal_info;
+
+ Header(options.info_log, "DB SUMMARY\n");
+ Header(options.info_log, "DB Session ID: %s\n", session_id.c_str());
+
+ Status s;
+ // Get files in dbname dir
+ s = env->GetChildren(dbname, &files);
+ if (!s.ok()) {
+ Error(options.info_log, "Error when reading %s dir %s\n", dbname.c_str(),
+ s.ToString().c_str());
+ }
+ std::sort(files.begin(), files.end());
+ for (const std::string& file : files) {
+ if (!ParseFileName(file, &number, &type)) {
+ continue;
+ }
+ switch (type) {
+ case kCurrentFile:
+ Header(options.info_log, "CURRENT file: %s\n", file.c_str());
+ break;
+ case kIdentityFile:
+ Header(options.info_log, "IDENTITY file: %s\n", file.c_str());
+ break;
+ case kDescriptorFile:
+ s = env->GetFileSize(dbname + "/" + file, &file_size);
+ if (s.ok()) {
+ Header(options.info_log,
+ "MANIFEST file: %s size: %" PRIu64 " Bytes\n", file.c_str(),
+ file_size);
+ } else {
+ Error(options.info_log,
+ "Error when reading MANIFEST file: %s/%s %s\n", dbname.c_str(),
+ file.c_str(), s.ToString().c_str());
+ }
+ break;
+ case kWalFile:
+ s = env->GetFileSize(dbname + "/" + file, &file_size);
+ if (s.ok()) {
+ wal_info.append(file)
+ .append(" size: ")
+ .append(std::to_string(file_size))
+ .append(" ; ");
+ } else {
+ Error(options.info_log, "Error when reading LOG file: %s/%s %s\n",
+ dbname.c_str(), file.c_str(), s.ToString().c_str());
+ }
+ break;
+ case kTableFile:
+ if (++file_num < 10) {
+ file_info.append(file).append(" ");
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ // Get sst files in db_path dir
+ for (auto& db_path : options.db_paths) {
+ if (dbname.compare(db_path.path) != 0) {
+ s = env->GetChildren(db_path.path, &files);
+ if (!s.ok()) {
+ Error(options.info_log, "Error when reading %s dir %s\n",
+ db_path.path.c_str(), s.ToString().c_str());
+ continue;
+ }
+ std::sort(files.begin(), files.end());
+ for (const std::string& file : files) {
+ if (ParseFileName(file, &number, &type)) {
+ if (type == kTableFile && ++file_num < 10) {
+ file_info.append(file).append(" ");
+ }
+ }
+ }
+ }
+ Header(options.info_log,
+ "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n",
+ db_path.path.c_str(), file_num, file_info.c_str());
+ file_num = 0;
+ file_info.clear();
+ }
+
+ // Get wal file in wal_dir
+ const auto& wal_dir = options.GetWalDir(dbname);
+ if (!options.IsWalDirSameAsDBPath(dbname)) {
+ s = env->GetChildren(wal_dir, &files);
+ if (!s.ok()) {
+ Error(options.info_log, "Error when reading %s dir %s\n", wal_dir.c_str(),
+ s.ToString().c_str());
+ return;
+ }
+ wal_info.clear();
+ for (const std::string& file : files) {
+ if (ParseFileName(file, &number, &type)) {
+ if (type == kWalFile) {
+ s = env->GetFileSize(wal_dir + "/" + file, &file_size);
+ if (s.ok()) {
+ wal_info.append(file)
+ .append(" size: ")
+ .append(std::to_string(file_size))
+ .append(" ; ");
+ } else {
+ Error(options.info_log, "Error when reading LOG file %s/%s %s\n",
+ wal_dir.c_str(), file.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+ }
+ }
+ Header(options.info_log, "Write Ahead Log file in %s: %s\n", wal_dir.c_str(),
+ wal_info.c_str());
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_info_dumper.h b/src/rocksdb/db/db_info_dumper.h
new file mode 100644
index 000000000..f518e840f
--- /dev/null
+++ b/src/rocksdb/db/db_info_dumper.h
@@ -0,0 +1,15 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+
+#include "options/db_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+void DumpDBFileSummary(const ImmutableDBOptions& options,
+ const std::string& dbname,
+ const std::string& session_id = "");
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_inplace_update_test.cc b/src/rocksdb/db/db_inplace_update_test.cc
new file mode 100644
index 000000000..3921a3b00
--- /dev/null
+++ b/src/rocksdb/db/db_inplace_update_test.cc
@@ -0,0 +1,262 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestInPlaceUpdate : public DBTestBase {
+ public:
+ DBTestInPlaceUpdate()
+ : DBTestBase("db_inplace_update_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdate) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size
+ int numValues = 10;
+ for (int i = numValues; i > 0; i--) {
+ std::string value = DummyString(i, 'a');
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_EQ(value, Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateLargeNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of larger size
+ int numValues = 10;
+ for (int i = 0; i < numValues; i++) {
+ std::string value = DummyString(i, 'a');
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_EQ(value, Get(1, "key"));
+ }
+
+ // All 10 updates exist in the internal iterator
+ validateNumberOfEntries(numValues, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateEntitySmallerNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.allow_concurrent_memtable_write = false;
+
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size
+ constexpr int num_values = 10;
+ for (int i = num_values; i > 0; --i) {
+ constexpr char key[] = "key";
+ const std::string value = DummyString(i, 'a');
+ WideColumns wide_columns{{"attr", value}};
+
+ ASSERT_OK(db_->PutEntity(WriteOptions(), handles_[1], key, wide_columns));
+ // TODO: use Get to check entity once it's supported
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateEntityLargerNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.allow_concurrent_memtable_write = false;
+
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of larger size
+ constexpr int num_values = 10;
+ for (int i = 0; i < num_values; ++i) {
+ constexpr char key[] = "key";
+ const std::string value = DummyString(i, 'a');
+ WideColumns wide_columns{{"attr", value}};
+
+ ASSERT_OK(db_->PutEntity(WriteOptions(), handles_[1], key, wide_columns));
+ // TODO: use Get to check entity once it's supported
+ }
+
+ // All 10 updates exist in the internal iterator
+ validateNumberOfEntries(num_values, 1);
+ } while (ChangeCompactOptions());
+}
+
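+// The remaining tests exercise Options::inplace_callback, which lets the
+// application decide per update whether to modify the value in place, replace
+// it, or do nothing; the updateInPlace* helpers referenced below come from the
+// test fixture.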
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerSize) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size
+ int numValues = 10;
+ ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+ ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+ for (int i = numValues; i > 0; i--) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(i - 1, 'b'), Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackSmallerVarintSize) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceSmallerVarintSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller varint size
+ int numValues = 265;
+ ASSERT_OK(Put(1, "key", DummyString(numValues, 'a')));
+ ASSERT_EQ(DummyString(numValues, 'c'), Get(1, "key"));
+
+ for (int i = numValues; i > 0; i--) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(1, 'b'), Get(1, "key"));
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackLargeNewValue) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceLargerSize;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of larger size
+ int numValues = 10;
+ for (int i = 0; i < numValues; i++) {
+ ASSERT_OK(Put(1, "key", DummyString(i, 'a')));
+ ASSERT_EQ(DummyString(i, 'c'), Get(1, "key"));
+ }
+
+    // No in-place updates; every update becomes a regular put with a new
+    // sequence number.
+ // All 10 updates exist in the internal iterator
+ validateNumberOfEntries(numValues, 1);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.inplace_callback =
+ ROCKSDB_NAMESPACE::DBTestInPlaceUpdate::updateInPlaceNoAction;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Callback function requests no actions from db
+ ASSERT_OK(Put(1, "key", DummyString(1, 'a')));
+ ASSERT_EQ(Get(1, "key"), "NOT_FOUND");
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestInPlaceUpdate, InPlaceUpdateAndSnapshot) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = true;
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Update key with values of smaller size, and
+ // run GetSnapshot and ReleaseSnapshot
+ int numValues = 2;
+ for (int i = numValues; i > 0; i--) {
+ const Snapshot* s = db_->GetSnapshot();
+ ASSERT_EQ(nullptr, s);
+ std::string value = DummyString(i, 'a');
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_EQ(value, Get(1, "key"));
+ // release s (nullptr)
+ db_->ReleaseSnapshot(s);
+ }
+
+ // Only 1 instance for that key.
+ validateNumberOfEntries(1, 1);
+ } while (ChangeCompactOptions());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_io_failure_test.cc b/src/rocksdb/db/db_io_failure_test.cc
new file mode 100644
index 000000000..2a405fd38
--- /dev/null
+++ b/src/rocksdb/db/db_io_failure_test.cc
@@ -0,0 +1,593 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBIOFailureTest : public DBTestBase {
+ public:
+ DBIOFailureTest() : DBTestBase("db_io_failure_test", /*env_do_fsync=*/true) {}
+};
+
+#ifndef ROCKSDB_LITE
+// Check that number of files does not grow when writes are dropped
+TEST_F(DBIOFailureTest, DropWrites) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.paranoid_checks = false;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ Compact("a", "z");
+ const size_t num_files = CountFiles();
+ // Force out-of-space errors
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->sleep_counter_.Reset();
+ env_->SetMockSleep();
+ for (int i = 0; i < 5; i++) {
+ if (option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ for (int level = 0; level < dbfull()->NumberLevels(); level++) {
+ if (level > 0 && level == dbfull()->NumberLevels() - 1) {
+ break;
+ }
+ Status s =
+ dbfull()->TEST_CompactRange(level, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */);
+ ASSERT_TRUE(s.ok() || s.IsCorruption());
+ }
+ } else {
+ Status s =
+ dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ ASSERT_TRUE(s.ok() || s.IsCorruption());
+ }
+ }
+
+ std::string property_value;
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("5", property_value);
+
+ env_->drop_writes_.store(false, std::memory_order_release);
+ const size_t count = CountFiles();
+ ASSERT_LT(count, num_files + 3);
+
+ // Check that compaction attempts slept after errors
+ // TODO @krad: Figure out why ASSERT_EQ 5 keeps failing in certain compiler
+ // versions
+ ASSERT_GE(env_->sleep_counter_.Read(), 4);
+ } while (ChangeCompactOptions());
+}
+
+// Check background error counter bumped on flush failures.
+TEST_F(DBIOFailureTest, DropWritesFlush) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.max_background_flushes = 1;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ // Force out-of-space errors
+ env_->drop_writes_.store(true, std::memory_order_release);
+
+ std::string property_value;
+ // Background error count is 0 now.
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("0", property_value);
+
+    // The flush fails with a Corruption status: with writes dropped, the
+    // resulting SST comes out truncated ("file is too short").
+ ASSERT_TRUE(dbfull()->TEST_FlushMemTable(true).IsCorruption());
+
+ ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value));
+ ASSERT_EQ("1", property_value);
+
+ env_->drop_writes_.store(false, std::memory_order_release);
+ } while (ChangeCompactOptions());
+}
+
+// Check that CompactRange() returns failure if there is not enough space left
+// on device
+TEST_F(DBIOFailureTest, NoSpaceCompactRange) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // generate 5 tables
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(Put(Key(i), Key(i) + "v"));
+ ASSERT_OK(Flush());
+ }
+
+ // Force out-of-space errors
+ env_->no_space_.store(true, std::memory_order_release);
+
+ Status s = dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_TRUE(s.IsNoSpace());
+
+ env_->no_space_.store(false, std::memory_order_release);
+ } while (ChangeCompactOptions());
+}
+#endif // ROCKSDB_LITE
+
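+// Puts should start failing, without crashing the process, once the env
+// refuses to create writable files.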
+TEST_F(DBIOFailureTest, NonWritableFileSystem) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 4096;
+ options.arena_block_size = 4096;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ env_->non_writeable_rate_.store(100);
+ std::string big(100000, 'x');
+ int errors = 0;
+ for (int i = 0; i < 20; i++) {
+ if (!Put("foo", big).ok()) {
+ errors++;
+ env_->SleepForMicroseconds(100000);
+ }
+ }
+ ASSERT_GT(errors, 0);
+ env_->non_writeable_rate_.store(0);
+ } while (ChangeCompactOptions());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBIOFailureTest, ManifestWriteError) {
+ // Test for the following problem:
+ // (a) Compaction produces file F
+ // (b) Log record containing F is written to MANIFEST file, but Sync() fails
+ // (c) GC deletes F
+ // (d) After reopening DB, reads fail since deleted F is named in log record
+
+ // We iterate twice. In the second iteration, everything is the
+ // same except the log record never makes it to the MANIFEST file.
+ for (int iter = 0; iter < 2; iter++) {
+ std::atomic<bool>* error_type = (iter == 0) ? &env_->manifest_sync_error_
+ : &env_->manifest_write_error_;
+
+ // Insert foo=>bar mapping
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Memtable compaction (will succeed)
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("foo"));
+ const int last = 2;
+ MoveFilesToLevel(2);
+ ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level
+
+ // Merging compaction (will fail)
+ error_type->store(true, std::memory_order_release);
+ ASSERT_NOK(
+ dbfull()->TEST_CompactRange(last, nullptr, nullptr)); // Should fail
+ ASSERT_EQ("bar", Get("foo"));
+
+ error_type->store(false, std::memory_order_release);
+
+ // Since paranoid_checks=true, writes should fail
+ ASSERT_NOK(Put("foo2", "bar2"));
+
+ // Recovery: should not lose data
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Try again with paranoid_checks=false
+ Close();
+ options.paranoid_checks = false;
+ Reopen(options);
+
+ // Merging compaction (will fail)
+ error_type->store(true, std::memory_order_release);
+ Status s =
+ dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail
+ if (iter == 0) {
+ ASSERT_OK(s);
+ } else {
+ ASSERT_TRUE(s.IsIOError());
+ }
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Recovery: should not lose data
+ error_type->store(false, std::memory_order_release);
+ Reopen(options);
+ ASSERT_EQ("bar", Get("foo"));
+
+ // Since paranoid_checks=false, writes should succeed
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ("bar2", Get("foo2"));
+ }
+}
+
+TEST_F(DBIOFailureTest, PutFailsParanoid) {
+ // Test the following:
+ // (a) A random put fails in paranoid mode (simulate by sync fail)
+ // (b) All other puts have to fail, even if writes would succeed
+ // (c) All of that should happen ONLY if paranoid_checks = true
+
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ // simulate error
+ env_->log_write_error_.store(true, std::memory_order_release);
+ ASSERT_NOK(Put(1, "foo2", "bar2"));
+ env_->log_write_error_.store(false, std::memory_order_release);
+ // the next put should fail, too
+ ASSERT_NOK(Put(1, "foo3", "bar3"));
+ // but we're still able to read
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ // do the same thing with paranoid checks off
+ options.paranoid_checks = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ // simulate error
+ env_->log_write_error_.store(true, std::memory_order_release);
+ ASSERT_NOK(Put(1, "foo2", "bar2"));
+ env_->log_write_error_.store(false, std::memory_order_release);
+ // the next put should NOT fail
+ ASSERT_OK(Put(1, "foo3", "bar3"));
+}
+#if !(defined NDEBUG) || !defined(OS_WIN)
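+// Inject an IOError into the first RangeSync issued while flushing an SST:
+// the flush must surface the error, subsequent writes must fail (paranoid
+// checks are on), and previously written data must stay readable after reopen.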
+TEST_F(DBIOFailureTest, FlushSstRangeSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 256 * 1024 * 1024;
+ options.writable_file_max_buffer_size = 128 * 1024;
+ options.bytes_per_sync = 128 * 1024;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(10));
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const char* io_error_msg = "range sync dummy error";
+ std::atomic<int> range_sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+ if (range_sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ std::string rnd_str =
+ rnd.RandomString(static_cast<int>(options.bytes_per_sync / 2));
+ std::string rnd_str_512kb = rnd.RandomString(512 * 1024);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ // First 1MB doesn't get range synced
+ ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo1_1", rnd_str));
+ ASSERT_OK(Put(1, "foo1_2", rnd_str));
+ ASSERT_OK(Put(1, "foo1_3", rnd_str));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Put(1, "foo3_1", rnd_str));
+ ASSERT_OK(Put(1, "foo3_2", rnd_str));
+ ASSERT_OK(Put(1, "foo3_3", rnd_str));
+ ASSERT_OK(Put(1, "foo4", "bar"));
+ Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_GE(1, range_sync_called.load());
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
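+// Same RangeSync error injection as above, but triggered while writing a
+// compaction output file instead of a flush output.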
+TEST_F(DBIOFailureTest, CompactSstRangeSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.write_buffer_size = 256 * 1024 * 1024;
+ options.writable_file_max_buffer_size = 128 * 1024;
+ options.bytes_per_sync = 128 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 256 * 1024 * 1024;
+ options.disable_auto_compactions = true;
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ std::string rnd_str =
+ rnd.RandomString(static_cast<int>(options.bytes_per_sync / 2));
+ std::string rnd_str_512kb = rnd.RandomString(512 * 1024);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ // First 1MB doesn't get range synced
+ ASSERT_OK(Put(1, "foo0_0", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo0_1", rnd_str_512kb));
+ ASSERT_OK(Put(1, "foo1_1", rnd_str));
+ ASSERT_OK(Put(1, "foo1_2", rnd_str));
+ ASSERT_OK(Put(1, "foo1_3", rnd_str));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo3_1", rnd_str));
+ ASSERT_OK(Put(1, "foo3_2", rnd_str));
+ ASSERT_OK(Put(1, "foo3_3", rnd_str));
+ ASSERT_OK(Put(1, "foo4", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+ const char* io_error_msg = "range sync dummy error";
+ std::atomic<int> range_sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::RangeSync", [&](void* arg) {
+ if (range_sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ Status s = dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+  // Following writes should fail as compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_GE(1, range_sync_called.load());
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
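+// Inject an IOError when closing the SST produced by a flush; the error must
+// propagate to the flush status and block further writes.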
+TEST_F(DBIOFailureTest, FlushSstCloseError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(2));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const char* io_error_msg = "close dummy error";
+ std::atomic<int> close_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Close", [&](void* arg) {
+ if (close_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+}
+
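+// Inject an IOError when closing an SST produced by compaction.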
+TEST_F(DBIOFailureTest, CompactionSstCloseError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar3"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ const char* io_error_msg = "close dummy error";
+ std::atomic<int> close_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Close", [&](void* arg) {
+ if (close_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ Status s = dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+ // Following writes should fail as compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar3", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar3", Get(1, "foo"));
+}
+
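+// Inject an IOError into the Sync of a flushed SST (use_fsync is off, so the
+// Sync path rather than Fsync is exercised).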
+TEST_F(DBIOFailureTest, FlushSstSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.use_fsync = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(2));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const char* io_error_msg = "sync dummy error";
+ std::atomic<int> sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Sync", [&](void* arg) {
+ if (sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo1", "bar1"));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ Status s = dbfull()->TEST_WaitForFlushMemTable(handles_[1]);
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+ // Following writes should fail as flush failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar2", Get(1, "foo"));
+ ASSERT_EQ("bar1", Get(1, "foo1"));
+}
+
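+// Same Sync error injection, this time on a compaction output file.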
+TEST_F(DBIOFailureTest, CompactionSstSyncError) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ options.paranoid_checks = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = true;
+ options.use_fsync = false;
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar2"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "bar3"));
+ ASSERT_OK(Put(1, "foo2", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ const char* io_error_msg = "sync dummy error";
+ std::atomic<int> sync_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SpecialEnv::SStableFile::Sync", [&](void* arg) {
+ if (sync_called.fetch_add(1) == 0) {
+ Status* st = static_cast<Status*>(arg);
+ *st = Status::IOError(io_error_msg);
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {
+ {"disable_auto_compactions", "false"},
+ }));
+ Status s = dbfull()->TEST_WaitForCompact();
+ ASSERT_TRUE(s.IsIOError());
+ ASSERT_STREQ(s.getState(), io_error_msg);
+
+ // Following writes should fail as compaction failed.
+ ASSERT_NOK(Put(1, "foo2", "bar3"));
+ ASSERT_EQ("bar3", Get(1, "foo"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("bar3", Get(1, "foo"));
+}
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iter.cc b/src/rocksdb/db/db_iter.cc
new file mode 100644
index 000000000..e1375deb7
--- /dev/null
+++ b/src/rocksdb/db/db_iter.cc
@@ -0,0 +1,1708 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_iter.h"
+
+#include <iostream>
+#include <limits>
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/wide/wide_column_serialization.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/system_clock.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "trace_replay/trace_replay.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+DBIter::DBIter(Env* _env, const ReadOptions& read_options,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* cmp, InternalIterator* iter,
+ const Version* version, SequenceNumber s, bool arena_mode,
+ uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool expose_blob_index)
+ : prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+ env_(_env),
+ clock_(ioptions.clock),
+ logger_(ioptions.logger),
+ user_comparator_(cmp),
+ merge_operator_(ioptions.merge_operator.get()),
+ iter_(iter),
+ version_(version),
+ read_callback_(read_callback),
+ sequence_(s),
+ statistics_(ioptions.stats),
+ max_skip_(max_sequential_skip_in_iterations),
+ max_skippable_internal_keys_(read_options.max_skippable_internal_keys),
+ num_internal_keys_skipped_(0),
+ iterate_lower_bound_(read_options.iterate_lower_bound),
+ iterate_upper_bound_(read_options.iterate_upper_bound),
+ direction_(kForward),
+ valid_(false),
+ current_entry_is_merged_(false),
+ is_key_seqnum_zero_(false),
+ prefix_same_as_start_(mutable_cf_options.prefix_extractor
+ ? read_options.prefix_same_as_start
+ : false),
+ pin_thru_lifetime_(read_options.pin_data),
+ expect_total_order_inner_iter_(prefix_extractor_ == nullptr ||
+ read_options.total_order_seek ||
+ read_options.auto_prefix_mode),
+ read_tier_(read_options.read_tier),
+ fill_cache_(read_options.fill_cache),
+ verify_checksums_(read_options.verify_checksums),
+ expose_blob_index_(expose_blob_index),
+ is_blob_(false),
+ arena_mode_(arena_mode),
+ db_impl_(db_impl),
+ cfd_(cfd),
+ timestamp_ub_(read_options.timestamp),
+ timestamp_lb_(read_options.iter_start_ts),
+ timestamp_size_(timestamp_ub_ ? timestamp_ub_->size() : 0) {
+ RecordTick(statistics_, NO_ITERATOR_CREATED);
+ if (pin_thru_lifetime_) {
+ pinned_iters_mgr_.StartPinning();
+ }
+ if (iter_.iter()) {
+ iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+ }
+ status_.PermitUncheckedError();
+ assert(timestamp_size_ ==
+ user_comparator_.user_comparator()->timestamp_size());
+}
+
+Status DBIter::GetProperty(std::string prop_name, std::string* prop) {
+ if (prop == nullptr) {
+ return Status::InvalidArgument("prop is nullptr");
+ }
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ // First try to pass the value returned from inner iterator.
+ return iter_.iter()->GetProperty(prop_name, prop);
+ } else if (prop_name == "rocksdb.iterator.is-key-pinned") {
+ if (valid_) {
+ *prop = (pin_thru_lifetime_ && saved_key_.IsKeyPinned()) ? "1" : "0";
+ } else {
+ *prop = "Iterator is not valid.";
+ }
+ return Status::OK();
+ } else if (prop_name == "rocksdb.iterator.internal-key") {
+ *prop = saved_key_.GetUserKey().ToString();
+ return Status::OK();
+ }
+ return Status::InvalidArgument("Unidentified property.");
+}
+
+bool DBIter::ParseKey(ParsedInternalKey* ikey) {
+ Status s = ParseInternalKey(iter_.key(), ikey, false /* log_err_key */);
+ if (!s.ok()) {
+ status_ = Status::Corruption("In DBIter: ", s.getState());
+ valid_ = false;
+ ROCKS_LOG_ERROR(logger_, "In DBIter: %s", status_.getState());
+ return false;
+ } else {
+ return true;
+ }
+}
+
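+// Advance to the next visible user key. Unless the current entry came from a
+// merge (in which case the inner iterator is typically already past it), step
+// the inner iterator once, then skip forward to the next entry visible at
+// sequence_.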
+void DBIter::Next() {
+ assert(valid_);
+ assert(status_.ok());
+
+ PERF_CPU_TIMER_GUARD(iter_next_cpu_nanos, clock_);
+ // Release temporarily pinned blocks from last operation
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ local_stats_.skip_count_ += num_internal_keys_skipped_;
+ local_stats_.skip_count_--;
+ num_internal_keys_skipped_ = 0;
+ bool ok = true;
+ if (direction_ == kReverse) {
+ is_key_seqnum_zero_ = false;
+ if (!ReverseToForward()) {
+ ok = false;
+ }
+ } else if (!current_entry_is_merged_) {
+ // If the current value is not a merge, the iter position is the
+ // current key, which is already returned. We can safely issue a
+ // Next() without checking the current key.
+ // If the current key is a merge, very likely iter already points
+ // to the next internal position.
+ assert(iter_.Valid());
+ iter_.Next();
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ }
+
+ local_stats_.next_count_++;
+ if (ok && iter_.Valid()) {
+ ClearSavedValue();
+
+ if (prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ const Slice prefix = prefix_.GetUserKey();
+ FindNextUserEntry(true /* skipping the current user key */, &prefix);
+ } else {
+ FindNextUserEntry(true /* skipping the current user key */, nullptr);
+ }
+ } else {
+ is_key_seqnum_zero_ = false;
+ valid_ = false;
+ }
+ if (statistics_ != nullptr && valid_) {
+ local_stats_.next_found_count_++;
+ local_stats_.bytes_read_ += (key().size() + value().size());
+ }
+}
+
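+// Resolve a blob index into blob_value_. With expose_blob_index_ (stacked
+// BlobDB) the index itself is surfaced and only is_blob_ is set; otherwise the
+// blob is fetched through Version::GetBlob using this iterator's read options.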
+bool DBIter::SetBlobValueIfNeeded(const Slice& user_key,
+ const Slice& blob_index) {
+ assert(!is_blob_);
+ assert(blob_value_.empty());
+
+ if (expose_blob_index_) { // Stacked BlobDB implementation
+ is_blob_ = true;
+ return true;
+ }
+
+ if (!version_) {
+ status_ = Status::Corruption("Encountered unexpected blob index.");
+ valid_ = false;
+ return false;
+ }
+
+ // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to
+ // avoid having to copy options back and forth.
+ ReadOptions read_options;
+ read_options.read_tier = read_tier_;
+ read_options.fill_cache = fill_cache_;
+ read_options.verify_checksums = verify_checksums_;
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr uint64_t* bytes_read = nullptr;
+
+ const Status s = version_->GetBlob(read_options, user_key, blob_index,
+ prefetch_buffer, &blob_value_, bytes_read);
+
+ if (!s.ok()) {
+ status_ = s;
+ valid_ = false;
+ return false;
+ }
+
+ is_blob_ = true;
+ return true;
+}
+
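+// Deserialize a wide-column entity into wide_columns_; if the entity has a
+// default column, its value is also exposed through value().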
+bool DBIter::SetValueAndColumnsFromEntity(Slice slice) {
+ assert(value_.empty());
+ assert(wide_columns_.empty());
+
+ const Status s = WideColumnSerialization::Deserialize(slice, wide_columns_);
+
+ if (!s.ok()) {
+ status_ = s;
+ valid_ = false;
+ return false;
+ }
+
+ if (!wide_columns_.empty() &&
+ wide_columns_[0].name() == kDefaultWideColumnName) {
+ value_ = wide_columns_[0].value();
+ }
+
+ return true;
+}
+
+// PRE: saved_key_ has the current user key if skipping_saved_key
+// POST: saved_key_ should have the next user key if valid_,
+// if the current entry is a result of merge
+// current_entry_is_merged_ => true
+// saved_value_ => the merged value
+//
+// NOTE: In between, saved_key_ can point to a user key that has
+// a delete marker or a sequence number higher than sequence_
+// saved_key_ MUST have a proper user_key before calling this function
+//
+// The prefix parameter, if not null, indicates that we need to iterate
+// within the prefix, and the iterator needs to be made invalid, if no
+// more entry for the prefix can be found.
+bool DBIter::FindNextUserEntry(bool skipping_saved_key, const Slice* prefix) {
+ PERF_TIMER_GUARD(find_next_user_entry_time);
+ return FindNextUserEntryInternal(skipping_saved_key, prefix);
+}
+
+// Actual implementation of DBIter::FindNextUserEntry()
+bool DBIter::FindNextUserEntryInternal(bool skipping_saved_key,
+ const Slice* prefix) {
+ // Loop until we hit an acceptable entry to yield
+ assert(iter_.Valid());
+ assert(status_.ok());
+ assert(direction_ == kForward);
+ current_entry_is_merged_ = false;
+
+ // How many times in a row we have skipped an entry with user key less than
+ // or equal to saved_key_. We could skip these entries either because
+ // sequence numbers were too high or because skipping_saved_key = true.
+ // What saved_key_ contains throughout this method:
+ // - if skipping_saved_key : saved_key_ contains the key that we need
+ // to skip, and we haven't seen any keys greater
+ // than that,
+ // - if num_skipped > 0 : saved_key_ contains the key that we have skipped
+ // num_skipped times, and we haven't seen any keys
+ // greater than that,
+ // - none of the above : saved_key_ can contain anything, it doesn't
+ // matter.
+ uint64_t num_skipped = 0;
+ // For write unprepared, the target sequence number in reseek could be larger
+ // than the snapshot, and thus needs to be skipped again. This could result in
+ // an infinite loop of reseeks. To avoid that, we limit the number of reseeks
+ // to one.
+ bool reseek_done = false;
+
+ do {
+ // Will update is_key_seqnum_zero_ as soon as we parsed the current key
+ // but we need to save the previous value to be used in the loop.
+ bool is_prev_key_seqnum_zero = is_key_seqnum_zero_;
+ if (!ParseKey(&ikey_)) {
+ is_key_seqnum_zero_ = false;
+ return false;
+ }
+ Slice user_key_without_ts =
+ StripTimestampFromUserKey(ikey_.user_key, timestamp_size_);
+
+ is_key_seqnum_zero_ = (ikey_.sequence == 0);
+
+ assert(iterate_upper_bound_ == nullptr ||
+ iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound ||
+ user_comparator_.CompareWithoutTimestamp(
+ user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
+ /*b_has_ts=*/false) < 0);
+ if (iterate_upper_bound_ != nullptr &&
+ iter_.UpperBoundCheckResult() != IterBoundCheck::kInbound &&
+ user_comparator_.CompareWithoutTimestamp(
+ user_key_without_ts, /*a_has_ts=*/false, *iterate_upper_bound_,
+ /*b_has_ts=*/false) >= 0) {
+ break;
+ }
+
+ assert(prefix == nullptr || prefix_extractor_ != nullptr);
+ if (prefix != nullptr &&
+ prefix_extractor_->Transform(user_key_without_ts).compare(*prefix) !=
+ 0) {
+ assert(prefix_same_as_start_);
+ break;
+ }
+
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ assert(ikey_.user_key.size() >= timestamp_size_);
+ Slice ts = timestamp_size_ > 0 ? ExtractTimestampFromUserKey(
+ ikey_.user_key, timestamp_size_)
+ : Slice();
+ bool more_recent = false;
+ if (IsVisible(ikey_.sequence, ts, &more_recent)) {
+      // If the previous entry has a sequence number of 0, the current entry
+      // cannot possibly be skipped. This condition could potentially be
+      // relaxed to prev_key.seq <= ikey_.sequence, but we stay cautious since
+      // that is more prone to bugs where the same user key appears with the
+      // same sequence number.
+ // Note that with current timestamp implementation, the same user key can
+ // have different timestamps and zero sequence number on the bottommost
+ // level. This may change in the future.
+ if ((!is_prev_key_seqnum_zero || timestamp_size_ > 0) &&
+ skipping_saved_key &&
+ CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) <= 0) {
+ num_skipped++; // skip this entry
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ } else {
+ assert(!skipping_saved_key ||
+ CompareKeyForSkip(ikey_.user_key, saved_key_.GetUserKey()) > 0);
+ if (!iter_.PrepareValue()) {
+ assert(!iter_.status().ok());
+ valid_ = false;
+ return false;
+ }
+ num_skipped = 0;
+ reseek_done = false;
+ switch (ikey_.type) {
+ case kTypeDeletion:
+ case kTypeDeletionWithTimestamp:
+ case kTypeSingleDeletion:
+ // Arrange to skip all upcoming entries for this key since
+ // they are hidden by this deletion.
+ if (timestamp_lb_) {
+ saved_key_.SetInternalKey(ikey_);
+ valid_ = true;
+ return true;
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key, !pin_thru_lifetime_ ||
+ !iter_.iter()->IsKeyPinned() /* copy */);
+ skipping_saved_key = true;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ }
+ break;
+ case kTypeValue:
+ case kTypeBlobIndex:
+ case kTypeWideColumnEntity:
+ if (timestamp_lb_) {
+ saved_key_.SetInternalKey(ikey_);
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key, !pin_thru_lifetime_ ||
+ !iter_.iter()->IsKeyPinned() /* copy */);
+ }
+
+ if (ikey_.type == kTypeBlobIndex) {
+ if (!SetBlobValueIfNeeded(ikey_.user_key, iter_.value())) {
+ return false;
+ }
+
+ SetValueAndColumnsFromPlain(expose_blob_index_ ? iter_.value()
+ : blob_value_);
+ } else if (ikey_.type == kTypeWideColumnEntity) {
+ if (!SetValueAndColumnsFromEntity(iter_.value())) {
+ return false;
+ }
+ } else {
+ assert(ikey_.type == kTypeValue);
+ SetValueAndColumnsFromPlain(iter_.value());
+ }
+
+ valid_ = true;
+ return true;
+ break;
+ case kTypeMerge:
+ saved_key_.SetUserKey(
+ ikey_.user_key,
+ !pin_thru_lifetime_ || !iter_.iter()->IsKeyPinned() /* copy */);
+ // By now, we are sure the current ikey is going to yield a value
+ current_entry_is_merged_ = true;
+ valid_ = true;
+ return MergeValuesNewToOld(); // Go to a different state machine
+ break;
+ default:
+ valid_ = false;
+ status_ = Status::Corruption(
+ "Unknown value type: " +
+ std::to_string(static_cast<unsigned int>(ikey_.type)));
+ return false;
+ }
+ }
+ } else {
+ if (more_recent) {
+ PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+ }
+
+ // This key was inserted after our snapshot was taken or skipped by
+ // timestamp range. If this happens too many times in a row for the same
+ // user key, we want to seek to the target sequence number.
+ int cmp = user_comparator_.CompareWithoutTimestamp(
+ ikey_.user_key, saved_key_.GetUserKey());
+ if (cmp == 0 || (skipping_saved_key && cmp < 0)) {
+ num_skipped++;
+ } else {
+ saved_key_.SetUserKey(
+ ikey_.user_key,
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+ skipping_saved_key = false;
+ num_skipped = 0;
+ reseek_done = false;
+ }
+ }
+
+ // If we have sequentially iterated via numerous equal keys, then it's
+ // better to seek so that we can avoid too many key comparisons.
+ //
+ // To avoid infinite loops, do not reseek if we have already attempted to
+ // reseek previously.
+ //
+ // TODO(lth): If we reseek to sequence number greater than ikey_.sequence,
+ // then it does not make sense to reseek as we would actually land further
+ // away from the desired key. There is opportunity for optimization here.
+ if (num_skipped > max_skip_ && !reseek_done) {
+ is_key_seqnum_zero_ = false;
+ num_skipped = 0;
+ reseek_done = true;
+ std::string last_key;
+ if (skipping_saved_key) {
+ // We're looking for the next user-key but all we see are the same
+ // user-key with decreasing sequence numbers. Fast forward to
+ // sequence number 0 and type deletion (the smallest type).
+ if (timestamp_size_ == 0) {
+ AppendInternalKey(
+ &last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion));
+ } else {
+ const std::string kTsMin(timestamp_size_, '\0');
+ AppendInternalKeyWithDifferentTimestamp(
+ &last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), 0, kTypeDeletion),
+ kTsMin);
+ }
+ // Don't set skipping_saved_key = false because we may still see more
+ // user-keys equal to saved_key_.
+ } else {
+ // We saw multiple entries with this user key and sequence numbers
+ // higher than sequence_. Fast forward to sequence_.
+ // Note that this only covers a case when a higher key was overwritten
+ // many times since our snapshot was taken, not the case when a lot of
+ // different keys were inserted after our snapshot was taken.
+ if (timestamp_size_ == 0) {
+ AppendInternalKey(
+ &last_key, ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+ kValueTypeForSeek));
+ } else {
+ AppendInternalKeyWithDifferentTimestamp(
+ &last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+ kValueTypeForSeek),
+ *timestamp_ub_);
+ }
+ }
+ iter_.Seek(last_key);
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ } else {
+ iter_.Next();
+ }
+ } while (iter_.Valid());
+
+ valid_ = false;
+ return iter_.status().ok();
+}
+
+// Merge values of the same user key starting from the current iter_ position
+// Scan from the newer entries to older entries.
+// PRE: iter_.key() points to the first merge type entry
+// saved_key_ stores the user key
+// iter_.PrepareValue() has been called
+// POST: saved_value_ has the merged value for the user key
+// iter_ points to the next entry (or invalid)
+bool DBIter::MergeValuesNewToOld() {
+ if (!merge_operator_) {
+ ROCKS_LOG_ERROR(logger_, "Options::merge_operator is null.");
+ status_ = Status::InvalidArgument("merge_operator_ must be set.");
+ valid_ = false;
+ return false;
+ }
+
+ // Temporarily pin the blocks that hold merge operands
+ TempPinData();
+ merge_context_.Clear();
+ // Start the merge process by pushing the first operand
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:PushedFirstOperand");
+
+ ParsedInternalKey ikey;
+ for (iter_.Next(); iter_.Valid(); iter_.Next()) {
+ TEST_SYNC_POINT("DBIter::MergeValuesNewToOld:SteppedToNextOperand");
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+ saved_key_.GetUserKey())) {
+ // hit the next user key, stop right here
+ break;
+ }
+ if (kTypeDeletion == ikey.type || kTypeSingleDeletion == ikey.type ||
+ kTypeDeletionWithTimestamp == ikey.type) {
+ // hit a delete with the same user key, stop right here
+ // iter_ is positioned after delete
+ iter_.Next();
+ break;
+ }
+ if (!iter_.PrepareValue()) {
+ valid_ = false;
+ return false;
+ }
+
+ if (kTypeValue == ikey.type) {
+ // hit a put, merge the put value with operands and store the
+ // final result in saved_value_. We are done!
+ const Slice val = iter_.value();
+ if (!Merge(&val, ikey.user_key)) {
+ return false;
+ }
+ // iter_ is positioned after put
+ iter_.Next();
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+ return true;
+ } else if (kTypeMerge == ikey.type) {
+ // hit a merge, add the value as an operand and run associative merge.
+ // when complete, add result to operands and continue.
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ } else if (kTypeBlobIndex == ikey.type) {
+ if (expose_blob_index_) {
+ status_ =
+ Status::NotSupported("BlobDB does not support merge operator.");
+ valid_ = false;
+ return false;
+ }
+      // hit a blob index: resolve the blob, merge its value with the operands
+      // and store the final result in saved_value_. We are done!
+ if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
+ return false;
+ }
+ valid_ = true;
+ if (!Merge(&blob_value_, ikey.user_key)) {
+ return false;
+ }
+
+ ResetBlobValue();
+
+ // iter_ is positioned after put
+ iter_.Next();
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+ return true;
+ } else if (kTypeWideColumnEntity == ikey.type) {
+ if (!MergeEntity(iter_.value(), ikey.user_key)) {
+ return false;
+ }
+
+ // iter_ is positioned after put
+ iter_.Next();
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ return true;
+ } else {
+ valid_ = false;
+ status_ = Status::Corruption(
+ "Unrecognized value type: " +
+ std::to_string(static_cast<unsigned int>(ikey.type)));
+ return false;
+ }
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ // we either exhausted all internal keys under this user key, or hit
+ // a deletion marker.
+  // feed null as the existing value to the merge operator so that the client
+  // can recognize this scenario and act accordingly.
+ if (!Merge(nullptr, saved_key_.GetUserKey())) {
+ return false;
+ }
+ assert(status_.ok());
+ return true;
+}
+
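+// Move to the previous visible user key, first switching the iteration
+// direction if the last operation moved forward.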
+void DBIter::Prev() {
+ assert(valid_);
+ assert(status_.ok());
+
+ PERF_CPU_TIMER_GUARD(iter_prev_cpu_nanos, clock_);
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ ResetInternalKeysSkippedCounter();
+ bool ok = true;
+ if (direction_ == kForward) {
+ if (!ReverseToBackward()) {
+ ok = false;
+ }
+ }
+ if (ok) {
+ ClearSavedValue();
+
+ Slice prefix;
+ if (prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix = prefix_.GetUserKey();
+ }
+ PrevInternal(prefix_same_as_start_ ? &prefix : nullptr);
+ }
+
+ if (statistics_ != nullptr) {
+ local_stats_.prev_count_++;
+ if (valid_) {
+ local_stats_.prev_found_count_++;
+ local_stats_.bytes_read_ += (key().size() + value().size());
+ }
+ }
+}
+
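+// Switch from backward to forward iteration: reposition the inner iterator at
+// or after saved_key_ and skip any entries that precede it.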
+bool DBIter::ReverseToForward() {
+ assert(iter_.status().ok());
+
+ // When moving backwards, iter_ is positioned on _previous_ key, which may
+ // not exist or may have different prefix than the current key().
+ // If that's the case, seek iter_ to current key.
+ if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+ IterKey last_key;
+ ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber,
+ kValueTypeForSeek);
+ if (timestamp_size_ > 0) {
+ // TODO: pre-create kTsMax.
+ const std::string kTsMax(timestamp_size_, '\xff');
+ pikey.SetTimestamp(kTsMax);
+ }
+ last_key.SetInternalKey(pikey);
+ iter_.Seek(last_key.GetInternalKey());
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ }
+
+ direction_ = kForward;
+ // Skip keys less than the current key() (a.k.a. saved_key_).
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ if (user_comparator_.Compare(ikey.user_key, saved_key_.GetUserKey()) >= 0) {
+ return true;
+ }
+ iter_.Next();
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ return true;
+}
+
+// Move iter_ to the key before saved_key_.
+bool DBIter::ReverseToBackward() {
+ assert(iter_.status().ok());
+
+ // When current_entry_is_merged_ is true, iter_ may be positioned on the next
+ // key, which may not exist or may have prefix different from current.
+ // If that's the case, seek to saved_key_.
+ if (current_entry_is_merged_ &&
+ (!expect_total_order_inner_iter() || !iter_.Valid())) {
+ IterKey last_key;
+ // Using kMaxSequenceNumber and kValueTypeForSeek
+ // (not kValueTypeForSeekForPrev) to seek to a key strictly smaller
+ // than saved_key_.
+ last_key.SetInternalKey(ParsedInternalKey(
+ saved_key_.GetUserKey(), kMaxSequenceNumber, kValueTypeForSeek));
+ if (!expect_total_order_inner_iter()) {
+ iter_.SeekForPrev(last_key.GetInternalKey());
+ } else {
+ // Some iterators may not support SeekForPrev(), so we avoid using it
+ // when prefix seek mode is disabled. This is somewhat expensive
+ // (an extra Prev(), as well as an extra change of direction of iter_),
+ // so we may need to reconsider it later.
+ iter_.Seek(last_key.GetInternalKey());
+ if (!iter_.Valid() && iter_.status().ok()) {
+ iter_.SeekToLast();
+ }
+ }
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ }
+
+ direction_ = kReverse;
+ return FindUserKeyBeforeSavedKey();
+}
+
+void DBIter::PrevInternal(const Slice* prefix) {
+ while (iter_.Valid()) {
+ saved_key_.SetUserKey(
+ ExtractUserKey(iter_.key()),
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+
+ assert(prefix == nullptr || prefix_extractor_ != nullptr);
+ if (prefix != nullptr &&
+ prefix_extractor_
+ ->Transform(StripTimestampFromUserKey(saved_key_.GetUserKey(),
+ timestamp_size_))
+ .compare(*prefix) != 0) {
+ assert(prefix_same_as_start_);
+ // Current key does not have the same prefix as start
+ valid_ = false;
+ return;
+ }
+
+ assert(iterate_lower_bound_ == nullptr || iter_.MayBeOutOfLowerBound() ||
+ user_comparator_.CompareWithoutTimestamp(
+ saved_key_.GetUserKey(), /*a_has_ts=*/true,
+ *iterate_lower_bound_, /*b_has_ts=*/false) >= 0);
+ if (iterate_lower_bound_ != nullptr && iter_.MayBeOutOfLowerBound() &&
+ user_comparator_.CompareWithoutTimestamp(
+ saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
+ /*b_has_ts=*/false) < 0) {
+ // We've iterated earlier than the user-specified lower bound.
+ valid_ = false;
+ return;
+ }
+
+ if (!FindValueForCurrentKey()) { // assigns valid_
+ return;
+ }
+
+ // Whether or not we found a value for current key, we need iter_ to end up
+ // on a smaller key.
+ if (!FindUserKeyBeforeSavedKey()) {
+ return;
+ }
+
+ if (valid_) {
+ // Found the value.
+ return;
+ }
+
+ if (TooManyInternalKeysSkipped(false)) {
+ return;
+ }
+ }
+
+ // We haven't found any key - iterator is not valid
+ valid_ = false;
+}
+
+// Used for backwards iteration.
+// Looks at the entries with user key saved_key_ and finds the most up-to-date
+// value for it, or executes a merge, or determines that the value was deleted.
+// Sets valid_ to true if the value is found and is ready to be presented to
+// the user through value().
+// Sets valid_ to false if the value was deleted, and we should try another key.
+// Returns false if an error occurred, and !status().ok() and !valid_.
+//
+// PRE: iter_ is positioned on the last entry with user key equal to saved_key_.
+// POST: iter_ is positioned on one of the entries equal to saved_key_, or on
+// the entry just before them, or on the entry just after them.
+bool DBIter::FindValueForCurrentKey() {
+ assert(iter_.Valid());
+ merge_context_.Clear();
+ current_entry_is_merged_ = false;
+ // last entry before merge (could be kTypeDeletion,
+ // kTypeDeletionWithTimestamp, kTypeSingleDeletion, kTypeValue,
+ // kTypeBlobIndex, or kTypeWideColumnEntity)
+ ValueType last_not_merge_type = kTypeDeletion;
+ ValueType last_key_entry_type = kTypeDeletion;
+
+ // If false, it indicates that we have not seen any valid entry, even though
+ // last_key_entry_type is initialized to kTypeDeletion.
+ bool valid_entry_seen = false;
+
+ // Temporarily pin blocks that hold (merge operands / the value)
+ ReleaseTempPinnedData();
+ TempPinData();
+ size_t num_skipped = 0;
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+ saved_key_.GetUserKey())) {
+ // Found a smaller user key, thus we are done with current user key.
+ break;
+ }
+
+ assert(ikey.user_key.size() >= timestamp_size_);
+ Slice ts;
+ if (timestamp_size_ > 0) {
+ ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
+ timestamp_size_);
+ }
+
+ bool visible = IsVisible(ikey.sequence, ts);
+ if (!visible &&
+ (timestamp_lb_ == nullptr ||
+ user_comparator_.CompareTimestamp(ts, *timestamp_ub_) > 0)) {
+ // Found an invisible version of the current user key, and it must have
+ // a higher sequence number or timestamp. Therefore, we are done with the
+ // current user key.
+ break;
+ }
+
+ if (!ts.empty()) {
+ saved_timestamp_.assign(ts.data(), ts.size());
+ }
+
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ // This user key has lots of entries.
+ // We're going from old to new, and it's taking too long. Let's do a Seek()
+ // and go from new to old. This helps when a key was overwritten many times.
+ if (num_skipped >= max_skip_) {
+ return FindValueForCurrentKeyUsingSeek();
+ }
+
+ if (!iter_.PrepareValue()) {
+ valid_ = false;
+ return false;
+ }
+
+ if (timestamp_lb_ != nullptr) {
+ // Only needed when timestamp_lb_ is not null
+ [[maybe_unused]] const bool ret = ParseKey(&ikey_);
+ saved_ikey_.assign(iter_.key().data(), iter_.key().size());
+ // Since the preceding ParseKey(&ikey) succeeds, so must this.
+ assert(ret);
+ }
+
+ valid_entry_seen = true;
+ last_key_entry_type = ikey.type;
+ switch (last_key_entry_type) {
+ case kTypeValue:
+ case kTypeBlobIndex:
+ case kTypeWideColumnEntity:
+ if (iter_.iter()->IsValuePinned()) {
+ pinned_value_ = iter_.value();
+ } else {
+ valid_ = false;
+ status_ = Status::NotSupported(
+ "Backward iteration not supported if underlying iterator's value "
+ "cannot be pinned.");
+ }
+ merge_context_.Clear();
+ last_not_merge_type = last_key_entry_type;
+ if (!status_.ok()) {
+ return false;
+ }
+ break;
+ case kTypeDeletion:
+ case kTypeDeletionWithTimestamp:
+ case kTypeSingleDeletion:
+ merge_context_.Clear();
+ last_not_merge_type = last_key_entry_type;
+ PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
+ break;
+ case kTypeMerge: {
+ assert(merge_operator_ != nullptr);
+ merge_context_.PushOperandBack(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ } break;
+ default:
+ valid_ = false;
+ status_ = Status::Corruption(
+ "Unknown value type: " +
+ std::to_string(static_cast<unsigned int>(last_key_entry_type)));
+ return false;
+ }
+
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ iter_.Prev();
+ ++num_skipped;
+
+ if (visible && timestamp_lb_ != nullptr) {
+ // If timestamp_lb_ is not nullptr, we do not have to look further for
+ // another internal key. We can return this current internal key. Yet we
+ // still keep the invariant that iter_ is positioned before the returned
+ // key.
+ break;
+ }
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ if (!valid_entry_seen) {
+ // Since we haven't seen any valid entry, last_key_entry_type remains
+ // unchanged and the same as its initial value.
+ assert(last_key_entry_type == kTypeDeletion);
+ assert(last_not_merge_type == kTypeDeletion);
+ valid_ = false;
+ return true;
+ }
+
+ if (timestamp_lb_ != nullptr) {
+ assert(last_key_entry_type == ikey_.type);
+ }
+
+ Status s;
+ s.PermitUncheckedError();
+
+ switch (last_key_entry_type) {
+ case kTypeDeletion:
+ case kTypeDeletionWithTimestamp:
+ case kTypeSingleDeletion:
+ if (timestamp_lb_ == nullptr) {
+ valid_ = false;
+ } else {
+ saved_key_.SetInternalKey(saved_ikey_);
+ valid_ = true;
+ }
+ return true;
+ case kTypeMerge:
+ current_entry_is_merged_ = true;
+ if (last_not_merge_type == kTypeDeletion ||
+ last_not_merge_type == kTypeSingleDeletion ||
+ last_not_merge_type == kTypeDeletionWithTimestamp) {
+ if (!Merge(nullptr, saved_key_.GetUserKey())) {
+ return false;
+ }
+ return true;
+ } else if (last_not_merge_type == kTypeBlobIndex) {
+ if (expose_blob_index_) {
+ status_ =
+ Status::NotSupported("BlobDB does not support merge operator.");
+ valid_ = false;
+ return false;
+ }
+ if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
+ return false;
+ }
+ valid_ = true;
+ if (!Merge(&blob_value_, saved_key_.GetUserKey())) {
+ return false;
+ }
+
+ ResetBlobValue();
+
+ return true;
+ } else if (last_not_merge_type == kTypeWideColumnEntity) {
+ if (!MergeEntity(pinned_value_, saved_key_.GetUserKey())) {
+ return false;
+ }
+
+ return true;
+ } else {
+ assert(last_not_merge_type == kTypeValue);
+ if (!Merge(&pinned_value_, saved_key_.GetUserKey())) {
+ return false;
+ }
+ return true;
+ }
+ break;
+ case kTypeValue:
+ if (timestamp_lb_ != nullptr) {
+ saved_key_.SetInternalKey(saved_ikey_);
+ }
+
+ SetValueAndColumnsFromPlain(pinned_value_);
+
+ break;
+ case kTypeBlobIndex:
+ if (!SetBlobValueIfNeeded(saved_key_.GetUserKey(), pinned_value_)) {
+ return false;
+ }
+
+ SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
+ : blob_value_);
+
+ break;
+ case kTypeWideColumnEntity:
+ if (!SetValueAndColumnsFromEntity(pinned_value_)) {
+ return false;
+ }
+ break;
+ default:
+ valid_ = false;
+ status_ = Status::Corruption(
+ "Unknown value type: " +
+ std::to_string(static_cast<unsigned int>(last_key_entry_type)));
+ return false;
+ }
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+ valid_ = true;
+ return true;
+}
+
+// This function is used in FindValueForCurrentKey.
+// We use Seek() function instead of Prev() to find necessary value
+// TODO: This is very similar to FindNextUserEntry() and MergeValuesNewToOld().
+// Would be nice to reuse some code.
+bool DBIter::FindValueForCurrentKeyUsingSeek() {
+ // FindValueForCurrentKey will enable pinning before calling
+ // FindValueForCurrentKeyUsingSeek()
+ assert(pinned_iters_mgr_.PinningEnabled());
+ std::string last_key;
+ if (0 == timestamp_size_) {
+ AppendInternalKey(&last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+ kValueTypeForSeek));
+ } else {
+ AppendInternalKeyWithDifferentTimestamp(
+ &last_key,
+ ParsedInternalKey(saved_key_.GetUserKey(), sequence_,
+ kValueTypeForSeek),
+ timestamp_lb_ == nullptr ? *timestamp_ub_ : *timestamp_lb_);
+ }
+ iter_.Seek(last_key);
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+
+ // In case a read_callback is present, the value we seek to may not be visible.
+ // Find the next value that's visible.
+ ParsedInternalKey ikey;
+
+ while (true) {
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return iter_.status().ok();
+ }
+
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ assert(ikey.user_key.size() >= timestamp_size_);
+ Slice ts;
+ if (timestamp_size_ > 0) {
+ ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
+ timestamp_size_);
+ }
+
+ if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+ saved_key_.GetUserKey())) {
+ // No visible values for this key, even though FindValueForCurrentKey()
+ // has seen some. This is possible if we're using a tailing iterator, and
+ // the entries were discarded in a compaction.
+ valid_ = false;
+ return true;
+ }
+
+ if (IsVisible(ikey.sequence, ts)) {
+ break;
+ }
+
+ iter_.Next();
+ }
+
+ if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+ kTypeDeletionWithTimestamp == ikey.type) {
+ if (timestamp_lb_ == nullptr) {
+ valid_ = false;
+ } else {
+ valid_ = true;
+ saved_key_.SetInternalKey(ikey);
+ }
+ return true;
+ }
+ if (!iter_.PrepareValue()) {
+ valid_ = false;
+ return false;
+ }
+ if (timestamp_size_ > 0) {
+ Slice ts = ExtractTimestampFromUserKey(ikey.user_key, timestamp_size_);
+ saved_timestamp_.assign(ts.data(), ts.size());
+ }
+ if (ikey.type == kTypeValue || ikey.type == kTypeBlobIndex ||
+ ikey.type == kTypeWideColumnEntity) {
+ assert(iter_.iter()->IsValuePinned());
+ pinned_value_ = iter_.value();
+ if (ikey.type == kTypeBlobIndex) {
+ if (!SetBlobValueIfNeeded(ikey.user_key, pinned_value_)) {
+ return false;
+ }
+
+ SetValueAndColumnsFromPlain(expose_blob_index_ ? pinned_value_
+ : blob_value_);
+ } else if (ikey.type == kTypeWideColumnEntity) {
+ if (!SetValueAndColumnsFromEntity(pinned_value_)) {
+ return false;
+ }
+ } else {
+ assert(ikey.type == kTypeValue);
+ SetValueAndColumnsFromPlain(pinned_value_);
+ }
+
+ if (timestamp_lb_ != nullptr) {
+ saved_key_.SetInternalKey(ikey);
+ }
+
+ valid_ = true;
+ return true;
+ }
+
+ // kTypeMerge. We need to collect all kTypeMerge values and save them
+ // in operands
+ assert(ikey.type == kTypeMerge);
+ current_entry_is_merged_ = true;
+ merge_context_.Clear();
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ while (true) {
+ iter_.Next();
+
+ if (!iter_.Valid()) {
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+ break;
+ }
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+ if (!user_comparator_.EqualWithoutTimestamp(ikey.user_key,
+ saved_key_.GetUserKey())) {
+ break;
+ }
+ if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion ||
+ ikey.type == kTypeDeletionWithTimestamp) {
+ break;
+ }
+ if (!iter_.PrepareValue()) {
+ valid_ = false;
+ return false;
+ }
+
+ if (ikey.type == kTypeValue) {
+ const Slice val = iter_.value();
+ if (!Merge(&val, saved_key_.GetUserKey())) {
+ return false;
+ }
+ return true;
+ } else if (ikey.type == kTypeMerge) {
+ merge_context_.PushOperand(
+ iter_.value(), iter_.iter()->IsValuePinned() /* operand_pinned */);
+ PERF_COUNTER_ADD(internal_merge_count, 1);
+ } else if (ikey.type == kTypeBlobIndex) {
+ if (expose_blob_index_) {
+ status_ =
+ Status::NotSupported("BlobDB does not support merge operator.");
+ valid_ = false;
+ return false;
+ }
+ if (!SetBlobValueIfNeeded(ikey.user_key, iter_.value())) {
+ return false;
+ }
+ valid_ = true;
+ if (!Merge(&blob_value_, saved_key_.GetUserKey())) {
+ return false;
+ }
+
+ ResetBlobValue();
+
+ return true;
+ } else if (ikey.type == kTypeWideColumnEntity) {
+ if (!MergeEntity(iter_.value(), saved_key_.GetUserKey())) {
+ return false;
+ }
+
+ return true;
+ } else {
+ valid_ = false;
+ status_ = Status::Corruption(
+ "Unknown value type: " +
+ std::to_string(static_cast<unsigned int>(ikey.type)));
+ return false;
+ }
+ }
+
+ if (!Merge(nullptr, saved_key_.GetUserKey())) {
+ return false;
+ }
+
+ // Make sure we leave iter_ in a good state. If it's valid and we don't care
+ // about prefixes, that's already good enough. Otherwise it needs to be
+ // seeked to the current key.
+ if (!expect_total_order_inner_iter() || !iter_.Valid()) {
+ if (!expect_total_order_inner_iter()) {
+ iter_.SeekForPrev(last_key);
+ } else {
+ iter_.Seek(last_key);
+ if (!iter_.Valid() && iter_.status().ok()) {
+ iter_.SeekToLast();
+ }
+ }
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ }
+
+ valid_ = true;
+ return true;
+}
+
+bool DBIter::Merge(const Slice* val, const Slice& user_key) {
+ Status s = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key, val, merge_context_.GetOperands(),
+ &saved_value_, logger_, statistics_, clock_, &pinned_value_,
+ /* update_num_ops_stats */ true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+
+ SetValueAndColumnsFromPlain(pinned_value_.data() ? pinned_value_
+ : saved_value_);
+
+ valid_ = true;
+ return true;
+}
+
+bool DBIter::MergeEntity(const Slice& entity, const Slice& user_key) {
+ Status s = MergeHelper::TimedFullMergeWithEntity(
+ merge_operator_, user_key, entity, merge_context_.GetOperands(),
+ &saved_value_, logger_, statistics_, clock_,
+ /* update_num_ops_stats */ true);
+ if (!s.ok()) {
+ valid_ = false;
+ status_ = s;
+ return false;
+ }
+
+ if (!SetValueAndColumnsFromEntity(saved_value_)) {
+ return false;
+ }
+
+ valid_ = true;
+ return true;
+}
+
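The operands accumulated in merge_context_ by the routines above are ultimately resolved through MergeHelper::TimedFullMerge, which calls into the column family's MergeOperator. Below is a minimal sketch of the kind of associative merge operator a user might plug in; the 8-byte little-endian counter encoding and the class name are assumptions for illustration, not part of this patch.

// Illustrative sketch only: a minimal associative merge operator of the kind
// that DBIter::Merge() ultimately invokes through MergeHelper::TimedFullMerge.
#include <cstdint>
#include <cstring>
#include <string>

#include "rocksdb/merge_operator.h"
#include "rocksdb/slice.h"

class UInt64AddOperator : public rocksdb::AssociativeMergeOperator {
 public:
  bool Merge(const rocksdb::Slice& /*key*/,
             const rocksdb::Slice* existing_value,
             const rocksdb::Slice& value, std::string* new_value,
             rocksdb::Logger* /*logger*/) const override {
    uint64_t base = 0;
    if (existing_value != nullptr && existing_value->size() == sizeof(base)) {
      std::memcpy(&base, existing_value->data(), sizeof(base));
    }
    uint64_t operand = 0;
    if (value.size() == sizeof(operand)) {
      std::memcpy(&operand, value.data(), sizeof(operand));
    }
    const uint64_t sum = base + operand;
    new_value->assign(reinterpret_cast<const char*>(&sum), sizeof(sum));
    return true;  // Returning false would be treated as a merge failure.
  }

  const char* Name() const override { return "UInt64AddOperator"; }
};

With such an operator registered via options.merge_operator, db->Merge() calls are what produce the kTypeMerge entries that the code above collects and combines.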
+// Move backwards until the key is smaller than saved_key_.
+// Changes valid_ only if return value is false.
+bool DBIter::FindUserKeyBeforeSavedKey() {
+ assert(status_.ok());
+ size_t num_skipped = 0;
+ while (iter_.Valid()) {
+ ParsedInternalKey ikey;
+ if (!ParseKey(&ikey)) {
+ return false;
+ }
+
+ if (CompareKeyForSkip(ikey.user_key, saved_key_.GetUserKey()) < 0) {
+ return true;
+ }
+
+ if (TooManyInternalKeysSkipped()) {
+ return false;
+ }
+
+ assert(ikey.sequence != kMaxSequenceNumber);
+ assert(ikey.user_key.size() >= timestamp_size_);
+ Slice ts;
+ if (timestamp_size_ > 0) {
+ ts = Slice(ikey.user_key.data() + ikey.user_key.size() - timestamp_size_,
+ timestamp_size_);
+ }
+ if (!IsVisible(ikey.sequence, ts)) {
+ PERF_COUNTER_ADD(internal_recent_skipped_count, 1);
+ } else {
+ PERF_COUNTER_ADD(internal_key_skipped_count, 1);
+ }
+
+ if (num_skipped >= max_skip_) {
+ num_skipped = 0;
+ IterKey last_key;
+ ParsedInternalKey pikey(saved_key_.GetUserKey(), kMaxSequenceNumber,
+ kValueTypeForSeek);
+ if (timestamp_size_ > 0) {
+ // TODO: pre-create kTsMax.
+ const std::string kTsMax(timestamp_size_, '\xff');
+ pikey.SetTimestamp(kTsMax);
+ }
+ last_key.SetInternalKey(pikey);
+ // It would be more efficient to use SeekForPrev() here, but some
+ // iterators may not support it.
+ iter_.Seek(last_key.GetInternalKey());
+ RecordTick(statistics_, NUMBER_OF_RESEEKS_IN_ITERATION);
+ if (!iter_.Valid()) {
+ break;
+ }
+ } else {
+ ++num_skipped;
+ }
+
+ iter_.Prev();
+ }
+
+ if (!iter_.status().ok()) {
+ valid_ = false;
+ return false;
+ }
+
+ return true;
+}
+
+bool DBIter::TooManyInternalKeysSkipped(bool increment) {
+ if ((max_skippable_internal_keys_ > 0) &&
+ (num_internal_keys_skipped_ > max_skippable_internal_keys_)) {
+ valid_ = false;
+ status_ = Status::Incomplete("Too many internal keys skipped.");
+ return true;
+ } else if (increment) {
+ num_internal_keys_skipped_++;
+ }
+ return false;
+}
+
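TooManyInternalKeysSkipped() backs ReadOptions::max_skippable_internal_keys: once the limit is exceeded, the iterator invalidates itself with Status::Incomplete instead of scanning an unbounded run of invisible keys. A minimal sketch of how a caller might opt in and react follows; the limit value and the function name are illustrative.

// Illustrative sketch only: bounding tombstone skipping from the caller side.
#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void ScanWithSkipLimit(rocksdb::DB* db) {
  rocksdb::ReadOptions ro;
  // Give up after skipping this many invisible or deleted internal keys.
  ro.max_skippable_internal_keys = 10000;

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // Consume it->key() / it->value().
  }
  if (it->status().IsIncomplete()) {
    // The iterator hit the skip limit; the caller may retry later or fall
    // back to an unbounded scan.
  } else {
    assert(it->status().ok());
  }
}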
+bool DBIter::IsVisible(SequenceNumber sequence, const Slice& ts,
+ bool* more_recent) {
+ // Remember that the comparator orders entries with larger timestamps first.
+ // TODO(yanqin): support timestamp in read_callback_.
+ bool visible_by_seq = (read_callback_ == nullptr)
+ ? sequence <= sequence_
+ : read_callback_->IsVisible(sequence);
+
+ bool visible_by_ts =
+ (timestamp_ub_ == nullptr ||
+ user_comparator_.CompareTimestamp(ts, *timestamp_ub_) <= 0) &&
+ (timestamp_lb_ == nullptr ||
+ user_comparator_.CompareTimestamp(ts, *timestamp_lb_) >= 0);
+
+ if (more_recent) {
+ *more_recent = !visible_by_seq;
+ }
+ return visible_by_seq && visible_by_ts;
+}
+
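IsVisible() combines two independent filters: sequence-number visibility (snapshot or read callback) and timestamp-range visibility. The same predicate on plain types, assuming bytewise-comparable timestamps and no read callback, looks roughly like this sketch (not the production code path):

// Illustrative sketch only: the visibility predicate on plain types.
#include <cstdint>
#include <string>

bool IsVisibleForRead(uint64_t entry_seq, const std::string& entry_ts,
                      uint64_t snapshot_seq, const std::string* ts_upper_bound,
                      const std::string* ts_lower_bound) {
  const bool visible_by_seq = entry_seq <= snapshot_seq;
  const bool visible_by_ts =
      (ts_upper_bound == nullptr || entry_ts <= *ts_upper_bound) &&
      (ts_lower_bound == nullptr || entry_ts >= *ts_lower_bound);
  return visible_by_seq && visible_by_ts;
}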
+void DBIter::SetSavedKeyToSeekTarget(const Slice& target) {
+ is_key_seqnum_zero_ = false;
+ SequenceNumber seq = sequence_;
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(target, seq, kValueTypeForSeek, timestamp_ub_);
+
+ if (iterate_lower_bound_ != nullptr &&
+ user_comparator_.CompareWithoutTimestamp(
+ saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_lower_bound_,
+ /*b_has_ts=*/false) < 0) {
+ // Seek key is smaller than the lower bound.
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(*iterate_lower_bound_, seq, kValueTypeForSeek,
+ timestamp_ub_);
+ }
+}
+
+void DBIter::SetSavedKeyToSeekForPrevTarget(const Slice& target) {
+ is_key_seqnum_zero_ = false;
+ saved_key_.Clear();
+ // Now saved_key_ is used to store the internal key.
+ saved_key_.SetInternalKey(target, 0 /* sequence_number */,
+ kValueTypeForSeekForPrev, timestamp_ub_);
+
+ if (timestamp_size_ > 0) {
+ const std::string kTsMin(timestamp_size_, '\0');
+ Slice ts = kTsMin;
+ saved_key_.UpdateInternalKey(
+ /*seq=*/0, kValueTypeForSeekForPrev,
+ timestamp_lb_ == nullptr ? &ts : timestamp_lb_);
+ }
+
+ if (iterate_upper_bound_ != nullptr &&
+ user_comparator_.CompareWithoutTimestamp(
+ saved_key_.GetUserKey(), /*a_has_ts=*/true, *iterate_upper_bound_,
+ /*b_has_ts=*/false) >= 0) {
+ saved_key_.Clear();
+ saved_key_.SetInternalKey(*iterate_upper_bound_, kMaxSequenceNumber,
+ kValueTypeForSeekForPrev, timestamp_ub_);
+ if (timestamp_size_ > 0) {
+ const std::string kTsMax(timestamp_size_, '\xff');
+ Slice ts = kTsMax;
+ saved_key_.UpdateInternalKey(
+ kMaxSequenceNumber, kValueTypeForSeekForPrev,
+ timestamp_lb_ != nullptr ? timestamp_lb_ : &ts);
+ }
+ }
+}
+
+void DBIter::Seek(const Slice& target) {
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+ StopWatch sw(clock_, statistics_, DB_SEEK);
+
+#ifndef ROCKSDB_LITE
+ if (db_impl_ != nullptr && cfd_ != nullptr) {
+ // TODO: What do we do if this returns an error?
+ Slice lower_bound, upper_bound;
+ if (iterate_lower_bound_ != nullptr) {
+ lower_bound = *iterate_lower_bound_;
+ } else {
+ lower_bound = Slice("");
+ }
+ if (iterate_upper_bound_ != nullptr) {
+ upper_bound = *iterate_upper_bound_;
+ } else {
+ upper_bound = Slice("");
+ }
+ db_impl_->TraceIteratorSeek(cfd_->GetID(), target, lower_bound, upper_bound)
+ .PermitUncheckedError();
+ }
+#endif // ROCKSDB_LITE
+
+ status_ = Status::OK();
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ ResetInternalKeysSkippedCounter();
+
+ // Seek the inner iterator based on the target key.
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+
+ SetSavedKeyToSeekTarget(target);
+ iter_.Seek(saved_key_.GetInternalKey());
+
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ }
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return;
+ }
+ direction_ = kForward;
+
+ // Now the inner iterator is positioned at the target. From there,
+ // we need to find the next key that is visible to the user.
+ ClearSavedValue();
+ if (prefix_same_as_start_) {
+ // In this case the iterator needs to be invalidated once it has exhausted
+ // all keys sharing the seek key's prefix.
+ assert(prefix_extractor_ != nullptr);
+ Slice target_prefix = prefix_extractor_->Transform(target);
+ FindNextUserEntry(false /* not skipping saved_key */,
+ &target_prefix /* prefix */);
+ if (valid_) {
+ // Remember the prefix of the seek key for the future Next() call to
+ // check.
+ prefix_.SetUserKey(target_prefix);
+ }
+ } else {
+ FindNextUserEntry(false /* not skipping saved_key */, nullptr);
+ }
+ if (!valid_) {
+ return;
+ }
+
+ // Updating stats and perf context counters.
+ if (statistics_ != nullptr) {
+ // Decrement since we don't want to count this key as skipped
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ }
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+}
+
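The prefix bookkeeping in Seek() is only active when ReadOptions::prefix_same_as_start is set and the column family has a prefix extractor. A rough sketch of the corresponding user-side configuration follows; the 4-byte fixed prefix and the function name are assumptions for illustration.

// Illustrative sketch only: enabling prefix_same_as_start_ from the user side.
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/slice_transform.h"

void ScanOnePrefix(rocksdb::DB* db, const rocksdb::Slice& seek_key) {
  // The column family must have been opened with a prefix extractor, e.g.:
  //   options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
  rocksdb::ReadOptions ro;
  ro.prefix_same_as_start = true;  // iterator becomes !Valid() past the prefix

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->Seek(seek_key); it->Valid(); it->Next()) {
    // All keys seen here share seek_key's prefix.
  }
}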
+void DBIter::SeekForPrev(const Slice& target) {
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+ StopWatch sw(clock_, statistics_, DB_SEEK);
+
+#ifndef ROCKSDB_LITE
+ if (db_impl_ != nullptr && cfd_ != nullptr) {
+ // TODO: What do we do if this returns an error?
+ Slice lower_bound, upper_bound;
+ if (iterate_lower_bound_ != nullptr) {
+ lower_bound = *iterate_lower_bound_;
+ } else {
+ lower_bound = Slice("");
+ }
+ if (iterate_upper_bound_ != nullptr) {
+ upper_bound = *iterate_upper_bound_;
+ } else {
+ upper_bound = Slice("");
+ }
+ db_impl_
+ ->TraceIteratorSeekForPrev(cfd_->GetID(), target, lower_bound,
+ upper_bound)
+ .PermitUncheckedError();
+ }
+#endif // ROCKSDB_LITE
+
+ status_ = Status::OK();
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ ResetInternalKeysSkippedCounter();
+
+ // Seek the inner iterator based on the target key.
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ SetSavedKeyToSeekForPrevTarget(target);
+ iter_.SeekForPrev(saved_key_.GetInternalKey());
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ }
+ if (!iter_.Valid()) {
+ valid_ = false;
+ return;
+ }
+ direction_ = kReverse;
+
+ // Now the inner iterator is positioned at the target. From there,
+ // we need to find the first key that is visible to the user in the
+ // backward direction.
+ ClearSavedValue();
+ if (prefix_same_as_start_) {
+ // In this case the iterator needs to be invalidated once it has exhausted
+ // all keys sharing the seek key's prefix.
+ assert(prefix_extractor_ != nullptr);
+ Slice target_prefix = prefix_extractor_->Transform(target);
+ PrevInternal(&target_prefix);
+ if (valid_) {
+ // Remember the prefix of the seek key for the future Prev() call to
+ // check.
+ prefix_.SetUserKey(target_prefix);
+ }
+ } else {
+ PrevInternal(nullptr);
+ }
+
+ // Report stats and perf context.
+ if (statistics_ != nullptr && valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+}
+
+void DBIter::SeekToFirst() {
+ if (iterate_lower_bound_ != nullptr) {
+ Seek(*iterate_lower_bound_);
+ return;
+ }
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+ // Don't use iter_.Seek() if we set a prefix extractor,
+ // because prefix seek will be used.
+ if (!expect_total_order_inner_iter()) {
+ max_skip_ = std::numeric_limits<uint64_t>::max();
+ }
+ status_ = Status::OK();
+ // if iterator is empty, this status_ could be unchecked.
+ status_.PermitUncheckedError();
+ direction_ = kForward;
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ ResetInternalKeysSkippedCounter();
+ ClearSavedValue();
+ is_key_seqnum_zero_ = false;
+
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ iter_.SeekToFirst();
+ }
+
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ if (iter_.Valid()) {
+ saved_key_.SetUserKey(
+ ExtractUserKey(iter_.key()),
+ !iter_.iter()->IsKeyPinned() || !pin_thru_lifetime_ /* copy */);
+ FindNextUserEntry(false /* not skipping saved_key */,
+ nullptr /* no prefix check */);
+ if (statistics_ != nullptr) {
+ if (valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+ }
+ } else {
+ valid_ = false;
+ }
+ if (valid_ && prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix_.SetUserKey(prefix_extractor_->Transform(
+ StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
+ }
+}
+
+void DBIter::SeekToLast() {
+ if (iterate_upper_bound_ != nullptr) {
+ // Seek to last key strictly less than ReadOptions.iterate_upper_bound.
+ SeekForPrev(*iterate_upper_bound_);
+ const bool is_ikey = (timestamp_size_ > 0 && timestamp_lb_ != nullptr);
+ Slice k = Valid() ? key() : Slice();
+ if (is_ikey && Valid()) {
+ k.remove_suffix(kNumInternalBytes + timestamp_size_);
+ }
+ while (Valid() && 0 == user_comparator_.CompareWithoutTimestamp(
+ *iterate_upper_bound_, /*a_has_ts=*/false, k,
+ /*b_has_ts=*/false)) {
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ PrevInternal(nullptr);
+
+ k = key();
+ if (is_ikey) {
+ k.remove_suffix(kNumInternalBytes + timestamp_size_);
+ }
+ }
+ return;
+ }
+
+ PERF_CPU_TIMER_GUARD(iter_seek_cpu_nanos, clock_);
+ // Don't use iter_.Seek() if we set a prefix extractor,
+ // because prefix seek will be used.
+ if (!expect_total_order_inner_iter()) {
+ max_skip_ = std::numeric_limits<uint64_t>::max();
+ }
+ status_ = Status::OK();
+ // if iterator is empty, this status_ could be unchecked.
+ status_.PermitUncheckedError();
+ direction_ = kReverse;
+ ReleaseTempPinnedData();
+ ResetBlobValue();
+ ResetValueAndColumns();
+ ResetInternalKeysSkippedCounter();
+ ClearSavedValue();
+ is_key_seqnum_zero_ = false;
+
+ {
+ PERF_TIMER_GUARD(seek_internal_seek_time);
+ iter_.SeekToLast();
+ }
+ PrevInternal(nullptr);
+ if (statistics_ != nullptr) {
+ RecordTick(statistics_, NUMBER_DB_SEEK);
+ if (valid_) {
+ RecordTick(statistics_, NUMBER_DB_SEEK_FOUND);
+ RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size());
+ PERF_COUNTER_ADD(iter_read_bytes, key().size() + value().size());
+ }
+ }
+ if (valid_ && prefix_same_as_start_) {
+ assert(prefix_extractor_ != nullptr);
+ prefix_.SetUserKey(prefix_extractor_->Transform(
+ StripTimestampFromUserKey(saved_key_.GetUserKey(), timestamp_size_)));
+ }
+}
+
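As the code above shows, SeekToLast() with ReadOptions::iterate_upper_bound positions the iterator on the last key strictly below the bound. A small user-side sketch of that effect, with an illustrative bound:

// Illustrative sketch only: SeekToLast() with an exclusive upper bound.
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void LastKeyBelowBound(rocksdb::DB* db) {
  rocksdb::Slice upper("k50");      // exclusive upper bound (illustrative key)
  rocksdb::ReadOptions ro;
  ro.iterate_upper_bound = &upper;  // must outlive the iterator

  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  it->SeekToLast();
  // If it->Valid(), it->key() is the largest key strictly less than "k50".
}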
+Iterator* NewDBIterator(Env* env, const ReadOptions& read_options,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* user_key_comparator,
+ InternalIterator* internal_iter, const Version* version,
+ const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl,
+ ColumnFamilyData* cfd, bool expose_blob_index) {
+ DBIter* db_iter =
+ new DBIter(env, read_options, ioptions, mutable_cf_options,
+ user_key_comparator, internal_iter, version, sequence, false,
+ max_sequential_skip_in_iterations, read_callback, db_impl, cfd,
+ expose_blob_index);
+ return db_iter;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter.h b/src/rocksdb/db/db_iter.h
new file mode 100644
index 000000000..e87c2b4c9
--- /dev/null
+++ b/src/rocksdb/db/db_iter.h
@@ -0,0 +1,420 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/range_del_aggregator.h"
+#include "memory/arena.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/wide_columns.h"
+#include "table/iterator_wrapper.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Version;
+
+// This file declares the factory function NewDBIterator for DBIter; an
+// arena-allocated wrapper, ArenaWrappedDBIter, is defined in
+// arena_wrapped_db_iter.h.
+// Class DBIter, declared below and implemented in db_iter.cc, is an iterator
+// that converts internal keys (yielded by an InternalIterator) that were live
+// at the specified sequence number into appropriate user keys.
+// Each internal key consists of a user key, a sequence number, and a value
+// type. DBIter deals with multiple key versions, tombstones, merge operands,
+// etc., and exposes an Iterator.
+// For example, DBIter may wrap the following InternalIterator:
+// user key: AAA value: v3 seqno: 100 type: Put
+// user key: AAA value: v2 seqno: 97 type: Put
+// user key: AAA value: v1 seqno: 95 type: Put
+// user key: BBB value: v1 seqno: 90 type: Put
+// user key: BBC value: N/A seqno: 98 type: Delete
+// user key: BBC value: v1 seqno: 95 type: Put
+// If the snapshot passed in is 102, then the DBIter is expected to
+// expose the following iterator:
+// key: AAA value: v3
+// key: BBB value: v1
+// If the snapshot passed in is 96, then it should expose:
+// key: AAA value: v1
+// key: BBB value: v1
+// key: BBC value: v1
+//
+
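The sequence-number example above can be reproduced through the public API: an iterator created with ReadOptions::snapshot only sees versions that were live at that snapshot. A rough sketch follows; the keys, values, and DB path are illustrative, and error handling is reduced to an assert.

// Illustrative sketch only: observing the snapshot behavior described above.
#include <cassert>
#include <memory>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

void SnapshotExample() {
  rocksdb::DB* raw = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/dbiter_example", &raw);
  assert(s.ok());
  std::unique_ptr<rocksdb::DB> db(raw);

  rocksdb::WriteOptions wo;
  db->Put(wo, "AAA", "v1");
  db->Put(wo, "BBB", "v1");
  db->Put(wo, "BBC", "v1");
  const rocksdb::Snapshot* snap = db->GetSnapshot();  // like "seqno 96" above
  db->Delete(wo, "BBC");
  db->Put(wo, "AAA", "v2");
  db->Put(wo, "AAA", "v3");

  rocksdb::ReadOptions ro;
  ro.snapshot = snap;
  std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // Yields AAA=>v1, BBB=>v1, BBC=>v1: the versions live at the snapshot.
  }
  db->ReleaseSnapshot(snap);
}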
+// Memtables and sstables that make up the DB representation contain
+// (userkey,seq,type) => uservalue entries. DBIter
+// combines multiple entries for the same userkey found in the DB
+// representation into a single entry while accounting for sequence
+// numbers, deletion markers, overwrites, etc.
+class DBIter final : public Iterator {
+ public:
+ // The following is grossly complicated. TODO: clean it up
+ // Which direction is the iterator currently moving?
+ // (1) When moving forward:
+ // (1a) if current_entry_is_merged_ = false, the internal iterator is
+ // positioned at the exact entry that yields this->key(), this->value()
+ // (1b) if current_entry_is_merged_ = true, the internal iterator is
+ // positioned immediately after the last entry that contributed to the
+ // current this->value(). That entry may or may not have key equal to
+ // this->key().
+ // (2) When moving backwards, the internal iterator is positioned
+ // just before all entries whose user key == this->key().
+ enum Direction : uint8_t { kForward, kReverse };
+
+ // LocalStatistics contains Statistics counters that are aggregated per
+ // iterator instance and then sent to the global statistics when the
+ // iterator is destroyed.
+ //
+ // The purpose of this approach is to avoid the perf regression that happens
+ // when multiple threads bump the atomic counters from a DBIter::Next().
+ struct LocalStatistics {
+ explicit LocalStatistics() { ResetCounters(); }
+
+ void ResetCounters() {
+ next_count_ = 0;
+ next_found_count_ = 0;
+ prev_count_ = 0;
+ prev_found_count_ = 0;
+ bytes_read_ = 0;
+ skip_count_ = 0;
+ }
+
+ void BumpGlobalStatistics(Statistics* global_statistics) {
+ RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_);
+ RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_);
+ RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_);
+ RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_);
+ RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_);
+ RecordTick(global_statistics, NUMBER_ITER_SKIP, skip_count_);
+ PERF_COUNTER_ADD(iter_read_bytes, bytes_read_);
+ ResetCounters();
+ }
+
+ // Map to Tickers::NUMBER_DB_NEXT
+ uint64_t next_count_;
+ // Map to Tickers::NUMBER_DB_NEXT_FOUND
+ uint64_t next_found_count_;
+ // Map to Tickers::NUMBER_DB_PREV
+ uint64_t prev_count_;
+ // Map to Tickers::NUMBER_DB_PREV_FOUND
+ uint64_t prev_found_count_;
+ // Map to Tickers::ITER_BYTES_READ
+ uint64_t bytes_read_;
+ // Map to Tickers::NUMBER_ITER_SKIP
+ uint64_t skip_count_;
+ };
+
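The same "accumulate locally, flush once" idea, stripped of the RocksDB types, looks roughly like the sketch below; the global atomic counter and the class name are illustrative stand-ins for Statistics tickers.

// Illustrative sketch only: batch counters locally, flush once on destruction.
#include <atomic>
#include <cstdint>

std::atomic<uint64_t> g_next_calls{0};

class LocalCounters {
 public:
  void BumpNext() { ++next_calls_; }  // no atomic traffic on the hot path
  ~LocalCounters() {
    // One atomic add per iterator lifetime instead of one per operation.
    g_next_calls.fetch_add(next_calls_, std::memory_order_relaxed);
  }

 private:
  uint64_t next_calls_ = 0;
};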
+ DBIter(Env* _env, const ReadOptions& read_options,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options, const Comparator* cmp,
+ InternalIterator* iter, const Version* version, SequenceNumber s,
+ bool arena_mode, uint64_t max_sequential_skip_in_iterations,
+ ReadCallback* read_callback, DBImpl* db_impl, ColumnFamilyData* cfd,
+ bool expose_blob_index);
+
+ // No copying allowed
+ DBIter(const DBIter&) = delete;
+ void operator=(const DBIter&) = delete;
+
+ ~DBIter() override {
+ // Release pinned data if any
+ if (pinned_iters_mgr_.PinningEnabled()) {
+ pinned_iters_mgr_.ReleasePinnedData();
+ }
+ RecordTick(statistics_, NO_ITERATOR_DELETED);
+ ResetInternalKeysSkippedCounter();
+ local_stats_.BumpGlobalStatistics(statistics_);
+ iter_.DeleteIter(arena_mode_);
+ }
+ void SetIter(InternalIterator* iter) {
+ assert(iter_.iter() == nullptr);
+ iter_.Set(iter);
+ iter_.iter()->SetPinnedItersMgr(&pinned_iters_mgr_);
+ }
+
+ bool Valid() const override {
+#ifdef ROCKSDB_ASSERT_STATUS_CHECKED
+ if (valid_) {
+ status_.PermitUncheckedError();
+ }
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+ return valid_;
+ }
+ Slice key() const override {
+ assert(valid_);
+ if (timestamp_lb_) {
+ return saved_key_.GetInternalKey();
+ } else {
+ const Slice ukey_and_ts = saved_key_.GetUserKey();
+ return Slice(ukey_and_ts.data(), ukey_and_ts.size() - timestamp_size_);
+ }
+ }
+ Slice value() const override {
+ assert(valid_);
+
+ return value_;
+ }
+
+ const WideColumns& columns() const override {
+ assert(valid_);
+
+ return wide_columns_;
+ }
+
+ Status status() const override {
+ if (status_.ok()) {
+ return iter_.status();
+ } else {
+ assert(!valid_);
+ return status_;
+ }
+ }
+ Slice timestamp() const override {
+ assert(valid_);
+ assert(timestamp_size_ > 0);
+ if (direction_ == kReverse) {
+ return saved_timestamp_;
+ }
+ const Slice ukey_and_ts = saved_key_.GetUserKey();
+ assert(timestamp_size_ < ukey_and_ts.size());
+ return ExtractTimestampFromUserKey(ukey_and_ts, timestamp_size_);
+ }
+ bool IsBlob() const {
+ assert(valid_);
+ return is_blob_;
+ }
+
+ Status GetProperty(std::string prop_name, std::string* prop) override;
+
+ void Next() final override;
+ void Prev() final override;
+ // 'target' does not contain timestamp, even if user timestamp feature is
+ // enabled.
+ void Seek(const Slice& target) final override;
+ void SeekForPrev(const Slice& target) final override;
+ void SeekToFirst() final override;
+ void SeekToLast() final override;
+ Env* env() const { return env_; }
+ void set_sequence(uint64_t s) {
+ sequence_ = s;
+ if (read_callback_) {
+ read_callback_->Refresh(s);
+ }
+ }
+ void set_valid(bool v) { valid_ = v; }
+
+ private:
+ // For all methods in this block:
+ // PRE: iter_.Valid() && status_.ok()
+ // Return false if there was an error; in that case status() is non-ok and
+ // valid_ is false, and callers would usually stop what they were doing and
+ // return.
+ bool ReverseToForward();
+ bool ReverseToBackward();
+ // Set saved_key_ to the seek target, with the proper sequence number set.
+ // It might get adjusted if the seek key is smaller than the iterator's lower
+ // bound. `target` does not contain a timestamp.
+ void SetSavedKeyToSeekTarget(const Slice& target);
+ // Set saved_key_ to the seek target, with the proper sequence number set.
+ // It might get adjusted if the seek key is larger than the iterator's upper
+ // bound. `target` does not contain a timestamp.
+ void SetSavedKeyToSeekForPrevTarget(const Slice& target);
+ bool FindValueForCurrentKey();
+ bool FindValueForCurrentKeyUsingSeek();
+ bool FindUserKeyBeforeSavedKey();
+ // If `skipping_saved_key` is true, the function will keep iterating until it
+ // finds a user key that is larger than `saved_key_`.
+ // If `prefix` is not null, the function stops once all keys for the prefix
+ // are exhausted, and the iterator is set to invalid.
+ bool FindNextUserEntry(bool skipping_saved_key, const Slice* prefix);
+ // Internal implementation of FindNextUserEntry().
+ bool FindNextUserEntryInternal(bool skipping_saved_key, const Slice* prefix);
+ bool ParseKey(ParsedInternalKey* key);
+ bool MergeValuesNewToOld();
+
+ // If prefix is not null, we need to set the iterator to invalid if no more
+ // entries can be found within the prefix.
+ void PrevInternal(const Slice* prefix);
+ bool TooManyInternalKeysSkipped(bool increment = true);
+ bool IsVisible(SequenceNumber sequence, const Slice& ts,
+ bool* more_recent = nullptr);
+
+ // Temporarily pin the blocks that we encounter until ReleaseTempPinnedData()
+ // is called
+ void TempPinData() {
+ if (!pin_thru_lifetime_) {
+ pinned_iters_mgr_.StartPinning();
+ }
+ }
+
+ // Release blocks pinned by TempPinData()
+ void ReleaseTempPinnedData() {
+ if (!pin_thru_lifetime_ && pinned_iters_mgr_.PinningEnabled()) {
+ pinned_iters_mgr_.ReleasePinnedData();
+ }
+ }
+
+ inline void ClearSavedValue() {
+ if (saved_value_.capacity() > 1048576) {
+ std::string empty;
+ swap(empty, saved_value_);
+ } else {
+ saved_value_.clear();
+ }
+ }
+
+ inline void ResetInternalKeysSkippedCounter() {
+ local_stats_.skip_count_ += num_internal_keys_skipped_;
+ if (valid_) {
+ local_stats_.skip_count_--;
+ }
+ num_internal_keys_skipped_ = 0;
+ }
+
+ bool expect_total_order_inner_iter() {
+ assert(expect_total_order_inner_iter_ || prefix_extractor_ != nullptr);
+ return expect_total_order_inner_iter_;
+ }
+
+ // If the lower bound of the timestamp is given by ReadOptions.iter_start_ts,
+ // we need to return multiple versions of the same key. We cannot simply skip
+ // entries whose user keys are equal when their timestamps differ but fall
+ // within the timestamp range.
+ inline int CompareKeyForSkip(const Slice& a, const Slice& b) {
+ return timestamp_lb_ != nullptr
+ ? user_comparator_.Compare(a, b)
+ : user_comparator_.CompareWithoutTimestamp(a, b);
+ }
+
+ // Retrieves the blob value for the specified user key using the given blob
+ // index when using the integrated BlobDB implementation.
+ bool SetBlobValueIfNeeded(const Slice& user_key, const Slice& blob_index);
+
+ void ResetBlobValue() {
+ is_blob_ = false;
+ blob_value_.Reset();
+ }
+
+ void SetValueAndColumnsFromPlain(const Slice& slice) {
+ assert(value_.empty());
+ assert(wide_columns_.empty());
+
+ value_ = slice;
+ wide_columns_.emplace_back(kDefaultWideColumnName, slice);
+ }
+
+ bool SetValueAndColumnsFromEntity(Slice slice);
+
+ void ResetValueAndColumns() {
+ value_.clear();
+ wide_columns_.clear();
+ }
+
+ // If user-defined timestamp is enabled, `user_key` includes timestamp.
+ bool Merge(const Slice* val, const Slice& user_key);
+ bool MergeEntity(const Slice& entity, const Slice& user_key);
+
+ const SliceTransform* prefix_extractor_;
+ Env* const env_;
+ SystemClock* clock_;
+ Logger* logger_;
+ UserComparatorWrapper user_comparator_;
+ const MergeOperator* const merge_operator_;
+ IteratorWrapper iter_;
+ const Version* version_;
+ ReadCallback* read_callback_;
+ // Max visible sequence number. It is normally the snapshot seq unless we have
+ // uncommitted data in db as in WriteUnCommitted.
+ SequenceNumber sequence_;
+
+ IterKey saved_key_;
+ // Reusable internal key data structure. This is only used inside one function
+ // and should not be used across functions. Reusing this object avoids the
+ // overhead of constructing it on every call.
+ ParsedInternalKey ikey_;
+ std::string saved_value_;
+ Slice pinned_value_;
+ // for prefix seek mode to support prev()
+ PinnableSlice blob_value_;
+ // Value of the default column
+ Slice value_;
+ // All columns (i.e. name-value pairs)
+ WideColumns wide_columns_;
+ Statistics* statistics_;
+ uint64_t max_skip_;
+ uint64_t max_skippable_internal_keys_;
+ uint64_t num_internal_keys_skipped_;
+ const Slice* iterate_lower_bound_;
+ const Slice* iterate_upper_bound_;
+
+ // The prefix of the seek key. It is only used when prefix_same_as_start_
+ // is true and the prefix extractor is not null. In Next() or Prev(), the
+ // current key will be checked against this prefix, so that the iterator can
+ // be invalidated once the keys in this prefix have been exhausted. Set it
+ // with SetUserKey() and read it with GetUserKey().
+ IterKey prefix_;
+
+ Status status_;
+ Direction direction_;
+ bool valid_;
+ bool current_entry_is_merged_;
+ // True if we know that the current entry's seqnum is 0.
+ // This information is used to infer that the next entry will be for another
+ // user key.
+ bool is_key_seqnum_zero_;
+ const bool prefix_same_as_start_;
+ // Means that we will pin all data blocks we read as long as the Iterator
+ // is not deleted; true if ReadOptions::pin_data is true.
+ const bool pin_thru_lifetime_;
+ // Expect the inner iterator to maintain a total order.
+ // prefix_extractor_ must be non-NULL if the value is false.
+ const bool expect_total_order_inner_iter_;
+ ReadTier read_tier_;
+ bool fill_cache_;
+ bool verify_checksums_;
+ // Whether the iterator is allowed to expose blob references. Set to true when
+ // the stacked BlobDB implementation is used, false otherwise.
+ bool expose_blob_index_;
+ bool is_blob_;
+ bool arena_mode_;
+ // List of operands for merge operator.
+ MergeContext merge_context_;
+ LocalStatistics local_stats_;
+ PinnedIteratorsManager pinned_iters_mgr_;
+#ifdef ROCKSDB_LITE
+ ROCKSDB_FIELD_UNUSED
+#endif
+ DBImpl* db_impl_;
+#ifdef ROCKSDB_LITE
+ ROCKSDB_FIELD_UNUSED
+#endif
+ ColumnFamilyData* cfd_;
+ const Slice* const timestamp_ub_;
+ const Slice* const timestamp_lb_;
+ const size_t timestamp_size_;
+ std::string saved_timestamp_;
+
+ // Used only if timestamp_lb_ is not nullptr.
+ std::string saved_ikey_;
+};
+
+// Return a new iterator that converts internal keys (yielded by
+// "*internal_iter") that were live at the specified `sequence` number
+// into appropriate user keys.
+extern Iterator* NewDBIterator(
+ Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ const Comparator* user_key_comparator, InternalIterator* internal_iter,
+ const Version* version, const SequenceNumber& sequence,
+ uint64_t max_sequential_skip_in_iterations, ReadCallback* read_callback,
+ DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
+ bool expose_blob_index = false);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_iter_stress_test.cc b/src/rocksdb/db/db_iter_stress_test.cc
new file mode 100644
index 000000000..872f7e6bd
--- /dev/null
+++ b/src/rocksdb/db/db_iter_stress_test.cc
@@ -0,0 +1,658 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#ifdef GFLAGS
+
+#include "util/gflags_compat.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_bool(verbose, false,
+ "Print huge, detailed trace. Intended for debugging failures.");
+
+#else
+
+void ParseCommandLineFlags(int*, char***, bool) {}
+bool FLAGS_verbose = false;
+
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBIteratorStressTest : public testing::Test {
+ public:
+ Env* env_;
+
+ DBIteratorStressTest() : env_(Env::Default()) {}
+};
+
+namespace {
+
+struct Entry {
+ std::string key;
+ ValueType type; // kTypeValue, kTypeDeletion, kTypeMerge
+ uint64_t sequence;
+ std::string ikey; // internal key, made from `key`, `sequence` and `type`
+ std::string value;
+ // If false, we'll pretend that this entry doesn't exist.
+ bool visible = true;
+
+ bool operator<(const Entry& e) const {
+ if (key != e.key) return key < e.key;
+ return std::tie(sequence, type) > std::tie(e.sequence, e.type);
+ }
+};
+
+struct Data {
+ std::vector<Entry> entries;
+
+ // Indices in `entries` with `visible` = false.
+ std::vector<size_t> hidden;
+ // Keys of entries whose `visible` changed since the last seek of iterators.
+ std::set<std::string> recently_touched_keys;
+};
+
+struct StressTestIterator : public InternalIterator {
+ Data* data;
+ Random64* rnd;
+ InternalKeyComparator cmp;
+
+ // Each operation will return an error with this probability...
+ double error_probability = 0;
+ // ... and add/remove entries with this probability.
+ double mutation_probability = 0;
+ // The probability of adding vs removing entries will be chosen so that the
+ // fraction of hidden entries stays somewhat close to this number.
+ double target_hidden_fraction = 0;
+ // If true, print all mutations to stdout for debugging.
+ bool trace = false;
+
+ int iter = -1;
+ Status status_;
+
+ StressTestIterator(Data* _data, Random64* _rnd, const Comparator* _cmp)
+ : data(_data), rnd(_rnd), cmp(_cmp) {}
+
+ bool Valid() const override {
+ if (iter >= 0 && iter < (int)data->entries.size()) {
+ assert(status_.ok());
+ return true;
+ }
+ return false;
+ }
+
+ Status status() const override { return status_; }
+
+ bool MaybeFail() {
+ if (rnd->Next() >=
+ static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+ error_probability) {
+ return false;
+ }
+ if (rnd->Next() % 2) {
+ status_ = Status::Incomplete("test");
+ } else {
+ status_ = Status::IOError("test");
+ }
+ if (trace) {
+ std::cout << "injecting " << status_.ToString() << std::endl;
+ }
+ iter = -1;
+ return true;
+ }
+
+ void MaybeMutate() {
+ if (rnd->Next() >=
+ static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+ mutation_probability) {
+ return;
+ }
+ do {
+ // If too many entries are hidden, hide less, otherwise hide more.
+ double hide_probability =
+ data->hidden.size() > data->entries.size() * target_hidden_fraction
+ ? 1. / 3
+ : 2. / 3;
+ if (data->hidden.empty()) {
+ hide_probability = 1;
+ }
+ bool do_hide = rnd->Next() <
+ static_cast<double>(std::numeric_limits<uint64_t>::max()) *
+ hide_probability;
+ if (do_hide) {
+ // Hide a random entry.
+ size_t idx = rnd->Next() % data->entries.size();
+ Entry& e = data->entries[idx];
+ if (e.visible) {
+ if (trace) {
+ std::cout << "hiding idx " << idx << std::endl;
+ }
+ e.visible = false;
+ data->hidden.push_back(idx);
+ data->recently_touched_keys.insert(e.key);
+ } else {
+ // Already hidden. Let's go unhide something instead, just because
+ // it's easy and it doesn't really matter what we do.
+ do_hide = false;
+ }
+ }
+ if (!do_hide) {
+ // Unhide a random entry.
+ size_t hi = rnd->Next() % data->hidden.size();
+ size_t idx = data->hidden[hi];
+ if (trace) {
+ std::cout << "unhiding idx " << idx << std::endl;
+ }
+ Entry& e = data->entries[idx];
+ assert(!e.visible);
+ e.visible = true;
+ data->hidden[hi] = data->hidden.back();
+ data->hidden.pop_back();
+ data->recently_touched_keys.insert(e.key);
+ }
+ } while (rnd->Next() % 3 != 0); // do 3 mutations on average
+ }
+
+ void SkipForward() {
+ while (iter < (int)data->entries.size() && !data->entries[iter].visible) {
+ ++iter;
+ }
+ }
+ void SkipBackward() {
+ while (iter >= 0 && !data->entries[iter].visible) {
+ --iter;
+ }
+ }
+
+ void SeekToFirst() override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ iter = 0;
+ SkipForward();
+ }
+ void SeekToLast() override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ iter = (int)data->entries.size() - 1;
+ SkipBackward();
+ }
+
+ void Seek(const Slice& target) override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ // Binary search.
+ auto it = std::partition_point(
+ data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) { return cmp.Compare(e.ikey, target) < 0; });
+ iter = (int)(it - data->entries.begin());
+ SkipForward();
+ }
+ void SeekForPrev(const Slice& target) override {
+ if (MaybeFail()) return;
+ MaybeMutate();
+
+ status_ = Status::OK();
+ // Binary search.
+ auto it = std::partition_point(
+ data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) { return cmp.Compare(e.ikey, target) <= 0; });
+ iter = (int)(it - data->entries.begin());
+ --iter;
+ SkipBackward();
+ }
+
+ void Next() override {
+ assert(Valid());
+ if (MaybeFail()) return;
+ MaybeMutate();
+ ++iter;
+ SkipForward();
+ }
+ void Prev() override {
+ assert(Valid());
+ if (MaybeFail()) return;
+ MaybeMutate();
+ --iter;
+ SkipBackward();
+ }
+
+ Slice key() const override {
+ assert(Valid());
+ return data->entries[iter].ikey;
+ }
+ Slice value() const override {
+ assert(Valid());
+ return data->entries[iter].value;
+ }
+
+ bool IsKeyPinned() const override { return true; }
+ bool IsValuePinned() const override { return true; }
+};
+
+// A small reimplementation of DBIter, supporting only some of the features,
+// and doing everything in O(log n).
+// Skips all keys that are in recently_touched_keys.
+struct ReferenceIterator {
+ Data* data;
+ uint64_t sequence; // ignore entries with sequence number below this
+
+ bool valid = false;
+ std::string key;
+ std::string value;
+
+ ReferenceIterator(Data* _data, uint64_t _sequence)
+ : data(_data), sequence(_sequence) {}
+
+ bool Valid() const { return valid; }
+
+ // Finds the first entry whose key is greater than / greater-or-equal to /
+ // less than / less-or-equal to `key`, depending on the arguments: if `skip`,
+ // the inequality is strict; if `forward`, it's greater / greater-or-equal,
+ // otherwise less / less-or-equal.
+ // Sets `key` to the result.
+ // If no such key exists, returns false. Doesn't check `visible`.
+ bool FindNextKey(bool skip, bool forward) {
+ valid = false;
+ auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) {
+ if (forward != skip) {
+ return e.key < key;
+ } else {
+ return e.key <= key;
+ }
+ });
+ if (forward) {
+ if (it != data->entries.end()) {
+ key = it->key;
+ return true;
+ }
+ } else {
+ if (it != data->entries.begin()) {
+ --it;
+ key = it->key;
+ return true;
+ }
+ }
+ return false;
+ }
+
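FindNextKey() and FindValueForCurrentKey() both rely on std::partition_point, which expects a predicate that is true for a prefix of the sorted range and returns the first element for which it is false. A tiny standalone demonstration with illustrative data:

// Illustrative sketch only: how std::partition_point locates the first key
// >= or > a target in a sorted range, as the binary searches above do.
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main() {
  const std::vector<std::string> keys = {"a", "b", "b", "d"};  // sorted

  // First element >= "b": predicate is "key < target".
  auto ge = std::partition_point(keys.begin(), keys.end(),
                                 [](const std::string& k) { return k < "b"; });
  assert(ge - keys.begin() == 1);

  // First element > "b": predicate is "key <= target".
  auto gt = std::partition_point(keys.begin(), keys.end(),
                                 [](const std::string& k) { return k <= "b"; });
  assert(gt - keys.begin() == 3);

  return 0;
}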
+ bool FindValueForCurrentKey() {
+ if (data->recently_touched_keys.count(key)) {
+ return false;
+ }
+
+ // Find the first entry for the key. The caller promises that it exists.
+ auto it = std::partition_point(data->entries.begin(), data->entries.end(),
+ [&](const Entry& e) {
+ if (e.key != key) {
+ return e.key < key;
+ }
+ return e.sequence > sequence;
+ });
+
+ // Find the first visible entry.
+ for (;; ++it) {
+ if (it == data->entries.end()) {
+ return false;
+ }
+ Entry& e = *it;
+ if (e.key != key) {
+ return false;
+ }
+ assert(e.sequence <= sequence);
+ if (!e.visible) continue;
+ if (e.type == kTypeDeletion) {
+ return false;
+ }
+ if (e.type == kTypeValue) {
+ value = e.value;
+ valid = true;
+ return true;
+ }
+ assert(e.type == kTypeMerge);
+ break;
+ }
+
+ // Collect merge operands.
+ std::vector<Slice> operands;
+ for (; it != data->entries.end(); ++it) {
+ Entry& e = *it;
+ if (e.key != key) {
+ break;
+ }
+ assert(e.sequence <= sequence);
+ if (!e.visible) continue;
+ if (e.type == kTypeDeletion) {
+ break;
+ }
+ operands.push_back(e.value);
+ if (e.type == kTypeValue) {
+ break;
+ }
+ }
+
+ // Do a merge.
+ value = operands.back().ToString();
+ for (int i = (int)operands.size() - 2; i >= 0; --i) {
+ value.append(",");
+ value.append(operands[i].data(), operands[i].size());
+ }
+
+ valid = true;
+ return true;
+ }
+
+ // Start at `key` and move until we encounter a valid value.
+ // `forward` defines the direction of movement.
+ // If `skip` is true, we're looking for a key not equal to `key`.
+ void DoTheThing(bool skip, bool forward) {
+ while (FindNextKey(skip, forward) && !FindValueForCurrentKey()) {
+ skip = true;
+ }
+ }
+
+ void Seek(const Slice& target) {
+ key = target.ToString();
+ DoTheThing(false, true);
+ }
+ void SeekForPrev(const Slice& target) {
+ key = target.ToString();
+ DoTheThing(false, false);
+ }
+ void SeekToFirst() { Seek(""); }
+ void SeekToLast() {
+ key = data->entries.back().key;
+ DoTheThing(false, false);
+ }
+ void Next() {
+ assert(Valid());
+ DoTheThing(true, true);
+ }
+ void Prev() {
+ assert(Valid());
+ DoTheThing(true, false);
+ }
+};
+
+} // anonymous namespace
+
+// Use an internal iterator that sometimes returns errors and sometimes
+// adds/removes entries on the fly. Do random operations on a DBIter and
+// check results.
+// TODO: can be improved for more coverage:
+// * Override IsKeyPinned() and IsValuePinned() to actually use
+// PinnedIteratorManager and check that there's no use-after free.
+// * Try different combinations of prefix_extractor, total_order_seek,
+// prefix_same_as_start, iterate_lower_bound, iterate_upper_bound.
+TEST_F(DBIteratorStressTest, StressTest) {
+ // We use a deterministic RNG, and everything happens in a single thread.
+ Random64 rnd(826909345792864532ll);
+
+ auto gen_key = [&](int max_key) {
+ assert(max_key > 0);
+ int len = 0;
+ int a = max_key;
+ while (a) {
+ a /= 10;
+ ++len;
+ }
+ std::string s = std::to_string(rnd.Next() % static_cast<uint64_t>(max_key));
+ s.insert(0, len - (int)s.size(), '0');
+ return s;
+ };
+
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ReadOptions ropt;
+
+ size_t num_matching = 0;
+ size_t num_at_end = 0;
+ size_t num_not_ok = 0;
+ size_t num_recently_removed = 0;
+
+ // Number of iterations for each combination of parameters
+ // (there are ~250 of those).
+ // Tweak this to change the test run time.
+ // As of the time of writing, the test takes ~4 seconds for value of 5000.
+ const int num_iterations = 5000;
+ // Enable this to print all the operations for debugging.
+ bool trace = FLAGS_verbose;
+
+ for (int num_entries : {5, 10, 100}) {
+ for (double key_space : {0.1, 1.0, 3.0}) {
+ for (ValueType prevalent_entry_type :
+ {kTypeValue, kTypeDeletion, kTypeMerge}) {
+ for (double error_probability : {0.01, 0.1}) {
+ for (double mutation_probability : {0.01, 0.5}) {
+ for (double target_hidden_fraction : {0.1, 0.5}) {
+ std::string trace_str =
+ "entries: " + std::to_string(num_entries) +
+ ", key_space: " + std::to_string(key_space) +
+ ", error_probability: " + std::to_string(error_probability) +
+ ", mutation_probability: " +
+ std::to_string(mutation_probability) +
+ ", target_hidden_fraction: " +
+ std::to_string(target_hidden_fraction);
+ SCOPED_TRACE(trace_str);
+ if (trace) {
+ std::cout << trace_str << std::endl;
+ }
+
+ // Generate data.
+ Data data;
+ int max_key = (int)(num_entries * key_space) + 1;
+ for (int i = 0; i < num_entries; ++i) {
+ Entry e;
+ e.key = gen_key(max_key);
+ if (rnd.Next() % 10 != 0) {
+ e.type = prevalent_entry_type;
+ } else {
+ const ValueType types[] = {kTypeValue, kTypeDeletion,
+ kTypeMerge};
+ e.type =
+ types[rnd.Next() % (sizeof(types) / sizeof(types[0]))];
+ }
+ e.sequence = i;
+ e.value = "v" + std::to_string(i);
+ ParsedInternalKey internal_key(e.key, e.sequence, e.type);
+ AppendInternalKey(&e.ikey, internal_key);
+
+ data.entries.push_back(e);
+ }
+ std::sort(data.entries.begin(), data.entries.end());
+ if (trace) {
+ std::cout << "entries:";
+ for (size_t i = 0; i < data.entries.size(); ++i) {
+ Entry& e = data.entries[i];
+ std::cout << "\n idx " << i << ": \"" << e.key << "\": \""
+ << e.value << "\" seq: " << e.sequence << " type: "
+ << (e.type == kTypeValue ? "val"
+ : e.type == kTypeDeletion ? "del"
+ : "merge");
+ }
+ std::cout << std::endl;
+ }
+
+ std::unique_ptr<Iterator> db_iter;
+ std::unique_ptr<ReferenceIterator> ref_iter;
+ for (int iteration = 0; iteration < num_iterations; ++iteration) {
+ SCOPED_TRACE(iteration);
+ // Create a new iterator every ~30 operations.
+ if (db_iter == nullptr || rnd.Next() % 30 == 0) {
+ uint64_t sequence = rnd.Next() % (data.entries.size() + 2);
+ ref_iter.reset(new ReferenceIterator(&data, sequence));
+ if (trace) {
+ std::cout << "new iterator, seq: " << sequence << std::endl;
+ }
+
+ auto internal_iter =
+ new StressTestIterator(&data, &rnd, BytewiseComparator());
+ internal_iter->error_probability = error_probability;
+ internal_iter->mutation_probability = mutation_probability;
+ internal_iter->target_hidden_fraction =
+ target_hidden_fraction;
+ internal_iter->trace = trace;
+ db_iter.reset(NewDBIterator(
+ env_, ropt, ImmutableOptions(options),
+ MutableCFOptions(options), BytewiseComparator(),
+ internal_iter, nullptr /* version */, sequence,
+ options.max_sequential_skip_in_iterations,
+ nullptr /*read_callback*/));
+ }
+
+ // Do a random operation. It's important to do it on ref_iter
+ // later than on db_iter to make sure ref_iter sees the correct
+ // recently_touched_keys.
+ std::string old_key;
+ bool forward = rnd.Next() % 2 > 0;
+ // Do Next()/Prev() ~90% of the time.
+ bool seek = !ref_iter->Valid() || rnd.Next() % 10 == 0;
+ if (trace) {
+ std::cout << iteration << ": ";
+ }
+
+ if (!seek) {
+ assert(db_iter->Valid());
+ old_key = ref_iter->key;
+ if (trace) {
+ std::cout << (forward ? "Next" : "Prev") << std::endl;
+ }
+
+ if (forward) {
+ db_iter->Next();
+ ref_iter->Next();
+ } else {
+ db_iter->Prev();
+ ref_iter->Prev();
+ }
+ } else {
+ data.recently_touched_keys.clear();
+ // Do SeekToFirst less often than Seek.
+ if (rnd.Next() % 4 == 0) {
+ if (trace) {
+ std::cout << (forward ? "SeekToFirst" : "SeekToLast")
+ << std::endl;
+ }
+
+ if (forward) {
+ old_key = "";
+ db_iter->SeekToFirst();
+ ref_iter->SeekToFirst();
+ } else {
+ old_key = data.entries.back().key;
+ db_iter->SeekToLast();
+ ref_iter->SeekToLast();
+ }
+ } else {
+ old_key = gen_key(max_key);
+ if (trace) {
+ std::cout << (forward ? "Seek" : "SeekForPrev") << " \""
+ << old_key << '"' << std::endl;
+ }
+ if (forward) {
+ db_iter->Seek(old_key);
+ ref_iter->Seek(old_key);
+ } else {
+ db_iter->SeekForPrev(old_key);
+ ref_iter->SeekForPrev(old_key);
+ }
+ }
+ }
+
+ // Check the result.
+ if (db_iter->Valid()) {
+ ASSERT_TRUE(db_iter->status().ok());
+ if (data.recently_touched_keys.count(
+ db_iter->key().ToString())) {
+ // Ended on a key that may have been mutated during the
+ // operation. Reference iterator skips such keys, so we
+ // can't check the exact result.
+
+ // Check that the key moved in the right direction.
+ if (forward) {
+ if (seek)
+ ASSERT_GE(db_iter->key().ToString(), old_key);
+ else
+ ASSERT_GT(db_iter->key().ToString(), old_key);
+ } else {
+ if (seek)
+ ASSERT_LE(db_iter->key().ToString(), old_key);
+ else
+ ASSERT_LT(db_iter->key().ToString(), old_key);
+ }
+
+ if (ref_iter->Valid()) {
+ // Check that DBIter didn't miss any non-mutated key.
+ if (forward) {
+ ASSERT_LT(db_iter->key().ToString(), ref_iter->key);
+ } else {
+ ASSERT_GT(db_iter->key().ToString(), ref_iter->key);
+ }
+ }
+ // Tell the next iteration of the loop to reseek the
+ // iterators.
+ ref_iter->valid = false;
+
+ ++num_recently_removed;
+ } else {
+ ASSERT_TRUE(ref_iter->Valid());
+ ASSERT_EQ(ref_iter->key, db_iter->key().ToString());
+ ASSERT_EQ(ref_iter->value, db_iter->value());
+ ++num_matching;
+ }
+ } else if (db_iter->status().ok()) {
+ ASSERT_FALSE(ref_iter->Valid());
+ ++num_at_end;
+ } else {
+ // Non-ok status. Nothing to check here.
+ // Tell the next iteration of the loop to reseek the
+ // iterators.
+ ref_iter->valid = false;
+ ++num_not_ok;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Check that all cases were hit many times.
+ EXPECT_GT(num_matching, 10000);
+ EXPECT_GT(num_at_end, 10000);
+ EXPECT_GT(num_not_ok, 10000);
+ EXPECT_GT(num_recently_removed, 10000);
+
+ std::cout << "stats:\n exact matches: " << num_matching
+ << "\n end reached: " << num_at_end
+ << "\n non-ok status: " << num_not_ok
+ << "\n mutated on the fly: " << num_recently_removed << std::endl;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ ParseCommandLineFlags(&argc, &argv, true);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iter_test.cc b/src/rocksdb/db/db_iter_test.cc
new file mode 100644
index 000000000..65290bfad
--- /dev/null
+++ b/src/rocksdb/db/db_iter_test.cc
@@ -0,0 +1,3195 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_iter.h"
+
+#include <algorithm>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/statistics.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static uint64_t TestGetTickerCount(const Options& options,
+ Tickers ticker_type) {
+ return options.statistics->getTickerCount(ticker_type);
+}
+
+class TestIterator : public InternalIterator {
+ public:
+ explicit TestIterator(const Comparator* comparator)
+ : initialized_(false),
+ valid_(false),
+ sequence_number_(0),
+ iter_(0),
+ cmp(comparator) {
+ data_.reserve(16);
+ }
+
+ void AddPut(std::string argkey, std::string argvalue) {
+ Add(argkey, kTypeValue, argvalue);
+ }
+
+ void AddDeletion(std::string argkey) {
+ Add(argkey, kTypeDeletion, std::string());
+ }
+
+ void AddSingleDeletion(std::string argkey) {
+ Add(argkey, kTypeSingleDeletion, std::string());
+ }
+
+ void AddMerge(std::string argkey, std::string argvalue) {
+ Add(argkey, kTypeMerge, argvalue);
+ }
+
+ void Add(std::string argkey, ValueType type, std::string argvalue) {
+ Add(argkey, type, argvalue, sequence_number_++);
+ }
+
+ void Add(std::string argkey, ValueType type, std::string argvalue,
+ size_t seq_num, bool update_iter = false) {
+ valid_ = true;
+ ParsedInternalKey internal_key(argkey, seq_num, type);
+ data_.push_back(
+ std::pair<std::string, std::string>(std::string(), argvalue));
+ AppendInternalKey(&data_.back().first, internal_key);
+ if (update_iter && valid_ && cmp.Compare(data_.back().first, key()) < 0) {
+ // A key smaller than the current key was inserted.
+ Finish();
+ // data_[iter_] is no longer the current element of the iterator.
+ // Increment iter_ to reposition it correctly.
+ iter_++;
+ }
+ }
+
+ // should be called before operations with iterator
+ void Finish() {
+ initialized_ = true;
+ std::sort(data_.begin(), data_.end(),
+ [this](std::pair<std::string, std::string> a,
+ std::pair<std::string, std::string> b) {
+ return (cmp.Compare(a.first, b.first) < 0);
+ });
+ }
+
+ // Removes the key from the set of keys over which this iterator iterates.
+ // Not to be confused with AddDeletion().
+ // If the iterator is currently positioned on this key, the deletion will
+ // apply next time the iterator moves.
+ // Used for simulating ForwardIterator updating to a new version that doesn't
+ // have some of the keys (e.g. after compaction with a filter).
+ void Vanish(std::string _key) {
+ if (valid_ && data_[iter_].first == _key) {
+ delete_current_ = true;
+ return;
+ }
+ for (auto it = data_.begin(); it != data_.end(); ++it) {
+ ParsedInternalKey ikey;
+ Status pik_status =
+ ParseInternalKey(it->first, &ikey, true /* log_err_key */);
+ pik_status.PermitUncheckedError();
+ assert(pik_status.ok());
+ if (!pik_status.ok() || ikey.user_key != _key) {
+ continue;
+ }
+ if (valid_ && data_.begin() + iter_ > it) {
+ --iter_;
+ }
+ data_.erase(it);
+ return;
+ }
+ assert(false);
+ }
+
+ // Number of operations done on this iterator since construction.
+ size_t steps() const { return steps_; }
+
+ bool Valid() const override {
+ assert(initialized_);
+ return valid_;
+ }
+
+ void SeekToFirst() override {
+ assert(initialized_);
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ valid_ = (data_.size() > 0);
+ iter_ = 0;
+ }
+
+ void SeekToLast() override {
+ assert(initialized_);
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ valid_ = (data_.size() > 0);
+ iter_ = data_.size() - 1;
+ }
+
+ void Seek(const Slice& target) override {
+ assert(initialized_);
+ SeekToFirst();
+ ++steps_;
+ if (!valid_) {
+ return;
+ }
+ while (iter_ < data_.size() &&
+ (cmp.Compare(data_[iter_].first, target) < 0)) {
+ ++iter_;
+ }
+
+ if (iter_ == data_.size()) {
+ valid_ = false;
+ }
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ assert(initialized_);
+ DeleteCurrentIfNeeded();
+ SeekForPrevImpl(target, &cmp);
+ }
+
+ void Next() override {
+ assert(initialized_);
+ assert(valid_);
+ assert(iter_ < data_.size());
+
+ ++steps_;
+ if (delete_current_) {
+ DeleteCurrentIfNeeded();
+ } else {
+ ++iter_;
+ }
+ valid_ = iter_ < data_.size();
+ }
+
+ void Prev() override {
+ assert(initialized_);
+ assert(valid_);
+ assert(iter_ < data_.size());
+
+ ++steps_;
+ DeleteCurrentIfNeeded();
+ if (iter_ == 0) {
+ valid_ = false;
+ } else {
+ --iter_;
+ }
+ }
+
+ Slice key() const override {
+ assert(initialized_);
+ return data_[iter_].first;
+ }
+
+ Slice value() const override {
+ assert(initialized_);
+ return data_[iter_].second;
+ }
+
+ Status status() const override {
+ assert(initialized_);
+ return Status::OK();
+ }
+
+ bool IsKeyPinned() const override { return true; }
+ bool IsValuePinned() const override { return true; }
+
+ private:
+ bool initialized_;
+ bool valid_;
+ size_t sequence_number_;
+ size_t iter_;
+ size_t steps_ = 0;
+
+ InternalKeyComparator cmp;
+ std::vector<std::pair<std::string, std::string>> data_;
+ bool delete_current_ = false;
+
+ void DeleteCurrentIfNeeded() {
+ if (!delete_current_) {
+ return;
+ }
+ data_.erase(data_.begin() + iter_);
+ delete_current_ = false;
+ }
+};
+
+class DBIteratorTest : public testing::Test {
+ public:
+ Env* env_;
+
+ DBIteratorTest() : env_(Env::Default()) {}
+};
+
+TEST_F(DBIteratorTest, DBIteratorPrevNext) {
+ Options options;
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound not set
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+
+ // Test to check the SeekToLast() with iterate_upper_bound set
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->AddPut("f", "val_f");
+ internal_iter->Finish();
+
+ Slice prefix("d");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound set to a key
+ // that has not been Put yet
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->Finish();
+
+ Slice prefix("z");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound set to the
+ // first key
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ Slice prefix("a");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ // Test case to check SeekToLast with iterate_upper_bound set
+ // (same key put many times - SeekToLast should start with the
+ // maximum sequence id of the upper bound)
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 7 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ SetPerfLevel(kEnableCount);
+ ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+ get_perf_context()->Reset();
+ db_iter->SeekToLast();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(static_cast<int>(get_perf_context()->internal_key_skipped_count),
+ 1);
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+
+ SetPerfLevel(kDisable);
+ }
+ // Test to check the SeekToLast() with the iterate_upper_bound set
+ // (Checking the value of the key which has sequence ids greater than
+ // and less than the iterator's sequence id)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+
+ internal_iter->AddPut("a", "val_a1");
+ internal_iter->AddPut("a", "val_a2");
+ internal_iter->AddPut("b", "val_b1");
+ internal_iter->AddPut("c", "val_c1");
+ internal_iter->AddPut("c", "val_c2");
+ internal_iter->AddPut("c", "val_c3");
+ internal_iter->AddPut("b", "val_b2");
+ internal_iter->AddPut("d", "val_d1");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 4 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b1");
+ }
+
+ // Test to check the SeekToLast() with the iterate_upper_bound set to a
+ // key that has been deleted
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("a");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ // Test to check the SeekToLast() with the iterate_upper_bound set
+ // (Deletion cases)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ }
+ // Test to check the SeekToLast() with iterate_upper_bound set
+ // (Deletion cases - many internal keys at and after the upper_bound
+ // are deleted)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("e");
+ internal_iter->AddDeletion("f");
+ internal_iter->AddDeletion("g");
+ internal_iter->AddDeletion("h");
+ internal_iter->Finish();
+
+ Slice prefix("c");
+
+ ReadOptions ro;
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 7 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ SetPerfLevel(kEnableCount);
+ ASSERT_TRUE(GetPerfLevel() == kEnableCount);
+
+ get_perf_context()->Reset();
+ db_iter->SeekToLast();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(
+ static_cast<int>(get_perf_context()->internal_delete_skipped_count), 0);
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+
+ SetPerfLevel(kDisable);
+ }
+
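+ // Forward and backward stepping (SeekToFirst/Next/Prev) when the newest
+ // visible entry for "a" is a put that supersedes several older deletions.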
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
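+ // With snapshot sequence 2 only the oldest versions of "a" and "b" are
+ // visible; SeekToLast() should land on "b" even after the iterator has
+ // walked off the end.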
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+ }
+
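+ // SeekToLast()/Prev()/Next() near the end of the data when an earlier key
+ // ("a") has many overwritten versions.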
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("a", "val_a");
+
+ internal_iter->AddPut("b", "val_b");
+
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val_b");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorEmpty) {
+ Options options;
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ ReadOptions ro;
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 0 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 0 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) {
+ ReadOptions ro;
+ Options options;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddPut("a", "a");
+ internal_iter->AddPut("b", "b");
+ internal_iter->AddPut("c", "c");
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 2 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "c");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "b");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "a");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3u);
+}
+
+TEST_F(DBIteratorTest, DBIteratorUseSkip) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
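+ // For each snapshot sequence number, SeekToLast() should land on the
+ // newest visible put of "c", and Prev() should then reach the merge
+ // entries for "b" and "a".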
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("c", std::to_string(k));
+ }
+ internal_iter->Finish();
+
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, i + 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), std::to_string(i));
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ }
+
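+ // While the final put of "c" is not yet visible, the deletions hide "c"
+ // entirely, so reverse iteration only sees the merges for "b" and "a".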
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, i + 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
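+ // With a sequence number high enough to see the final put of "c", the
+ // deletions underneath it are skipped and all three keys are visited.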
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 202 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ }
+
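+ // A key whose visible entries are all deletions yields an empty iterator;
+ // once the final put of "c" becomes visible, "c" is the only key in
+ // either direction.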
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, i /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (size_t i = 0; i < 200; ++i) {
+ internal_iter->AddDeletion("c");
+ }
+ internal_iter->AddPut("c", "200");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 200 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "200");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
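+ // A key whose every version is newer than the snapshot ("c" here) is
+ // invisible, so Prev() from "d" should skip straight to "b".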
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("d", std::to_string(k));
+ }
+
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddPut("c", std::to_string(k));
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, i + 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), std::to_string(i));
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_2");
+ db_iter->Prev();
+
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ }
+
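+ // Merge operands are combined up to the snapshot: the value of "c" is the
+ // string-append concatenation of all visible operands.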
+ {
+ for (size_t i = 0; i < 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("b", "b");
+ internal_iter->AddMerge("a", "a");
+ for (size_t k = 0; k < 200; ++k) {
+ internal_iter->AddMerge("c", std::to_string(k));
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, i + 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ std::string merge_result = "0";
+ for (size_t j = 1; j <= i; ++j) {
+ merge_result += "," + std::to_string(j);
+ }
+ ASSERT_EQ(db_iter->value().ToString(), merge_result);
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "b");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+ }
+}
+
+TEST_F(DBIteratorTest, DBIteratorSkipInternalKeys) {
+ Options options;
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+ ReadOptions ro;
+
+ // Basic test case ... Make sure explicitly passing the default value works.
+ // The limit on skippable internal keys is disabled by default, i.e. when
+ // the value is 0.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 0;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "val_d");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ }
+
+ // Test to make sure that the request will *not* fail as incomplete if
+ // num_internal_keys_skipped is *equal* to max_skippable_internal_keys
+ // threshold. (It will fail as incomplete only when the threshold is
+ // exceeded.)
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().ok());
+ }
+
+ // Fail the request as incomplete when num_internal_keys_skipped >
+ // max_skippable_internal_keys
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that the num_internal_keys_skipped counter resets after a successful
+ // read.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Next(); // num_internal_keys_skipped counter resets here.
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that the num_internal_keys_skipped counter resets after a successful
+ // read.
+ // Reverse direction
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev(); // num_internal_keys_skipped counter resets here.
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test that skipping deletions of several distinct keys is handled
+ // correctly
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test if alternating puts and deletes of the same key are handled correctly.
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddPut("d", "val_d");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddPut("e", "val_e");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = 2;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "e");
+ ASSERT_EQ(db_iter->value().ToString(), "val_e");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+
+ // Test for large number of skippable internal keys with *default*
+ // max_sequential_skip_in_iterations.
+ {
+ for (size_t i = 1; i <= 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ for (size_t j = 1; j <= i; ++j) {
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ }
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ ro.max_skippable_internal_keys = i;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ if ((options.max_sequential_skip_in_iterations + 1) >=
+ ro.max_skippable_internal_keys) {
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+ }
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ if ((options.max_sequential_skip_in_iterations + 1) >=
+ ro.max_skippable_internal_keys) {
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+ }
+ }
+ }
+
+ // Test for large number of skippable internal keys with a *non-default*
+ // max_sequential_skip_in_iterations.
+ {
+ for (size_t i = 1; i <= 200; ++i) {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ for (size_t j = 1; j <= i; ++j) {
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->AddDeletion("b");
+ }
+ internal_iter->AddPut("c", "val_c");
+ internal_iter->Finish();
+
+ options.max_sequential_skip_in_iterations = 1000;
+ ro.max_skippable_internal_keys = i;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 * i + 1 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "val_a");
+
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "val_c");
+
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ ASSERT_TRUE(db_iter->status().IsIncomplete());
+ }
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator1) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
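+ // DBIterator1 through DBIterator4 build identical internal data and differ
+ // only in the snapshot sequence number passed to NewDBIterator.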
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 1 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator2) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 0 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator3) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 2 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator4) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 4 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0,1");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+ db_iter->Next();
+ ASSERT_TRUE(!db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator5) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
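+ // The first seven blocks use the same merge operands around a single put
+ // of "a" and increase the snapshot sequence number from 0 to 6, checking
+ // how the merged value returned by SeekToLast() evolves.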
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 0 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 1 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 3 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 4 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 5 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddPut("a", "put_1");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 6 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ // put, singledelete, merge
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "val_a");
+ internal_iter->AddSingleDeletion("a");
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddPut("b", "val_b");
+ internal_iter->Finish();
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 10 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator6) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
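+ // Same structure as DBIterator5, but with a deletion of "a" in the middle
+ // of the merge operands; the snapshot sequence number again increases
+ // from 0 to 6.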
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 0 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 1 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 3 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 4 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 5 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("a", "merge_3");
+ internal_iter->AddDeletion("a");
+ internal_iter->AddMerge("a", "merge_4");
+ internal_iter->AddMerge("a", "merge_5");
+ internal_iter->AddMerge("a", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 6 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+}
+
+TEST_F(DBIteratorTest, DBIterator7) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ ImmutableOptions ioptions = ImmutableOptions(options);
+ MutableCFOptions mutable_cf_options = MutableCFOptions(options);
+
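+ // Each block builds the same mix of puts, merges and deletions across
+ // "a", "b" and "c" and checks reverse iteration at a different snapshot
+ // sequence number.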
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 0 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 2 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "val,merge_2");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 4 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 5 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4");
+ db_iter->Prev();
+
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 6 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 7 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 9 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 13 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5");
+ db_iter->Prev();
+    ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(),
+ "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddPut("b", "val");
+ internal_iter->AddMerge("b", "merge_2");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_3");
+
+ internal_iter->AddMerge("c", "merge_4");
+ internal_iter->AddMerge("c", "merge_5");
+
+ internal_iter->AddDeletion("b");
+ internal_iter->AddMerge("b", "merge_6");
+ internal_iter->AddMerge("b", "merge_7");
+ internal_iter->AddMerge("b", "merge_8");
+ internal_iter->AddMerge("b", "merge_9");
+ internal_iter->AddMerge("b", "merge_10");
+ internal_iter->AddMerge("b", "merge_11");
+
+ internal_iter->AddDeletion("c");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ioptions, mutable_cf_options, BytewiseComparator(),
+ internal_iter, nullptr /* version */, 14 /* sequence */,
+ options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(),
+ "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1");
+ db_iter->Prev();
+ ASSERT_TRUE(!db_iter->Valid());
+ }
+}
+
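+// DBIterator8: a deletion of "a" is followed by newer puts of "a" and "b", so
+// reverse iteration at sequence 10 should see both keys with their reinserted
+// values.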
+TEST_F(DBIteratorTest, DBIterator8) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("a");
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+}
+
+// TODO(3.13): fix the issue that Seek() followed by Prev() might not
+// necessarily return the largest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator9) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+ {
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddMerge("a", "merge_1");
+ internal_iter->AddMerge("a", "merge_2");
+ internal_iter->AddMerge("b", "merge_3");
+ internal_iter->AddMerge("b", "merge_4");
+ internal_iter->AddMerge("d", "merge_5");
+ internal_iter->AddMerge("d", "merge_6");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2");
+
+ db_iter->SeekForPrev("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+
+ db_iter->Seek("c");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+
+ db_iter->SeekForPrev("c");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_3,merge_4");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "merge_5,merge_6");
+ }
+}
+
+// TODO(3.13): fix the issue that Seek() followed by Prev() might not
+// necessarily return the largest element smaller than the seek key.
+TEST_F(DBIteratorTest, DBIterator10) {
+ ReadOptions ro;
+ Options options;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("c", "3");
+ internal_iter->AddPut("d", "4");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->Seek("c");
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+
+ db_iter->SeekForPrev("c");
+ ASSERT_TRUE(db_iter->Valid());
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "d");
+ ASSERT_EQ(db_iter->value().ToString(), "4");
+
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+}
+
+TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "1");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator11) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = MergeOperators::CreateFromStringId("stringappend");
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "0");
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddSingleDeletion("b");
+ internal_iter->AddMerge("a", "1");
+ internal_iter->AddMerge("b", "2");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 1 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+ db_iter->SeekToFirst();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "0");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ db_iter->Next();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
+TEST_F(DBIteratorTest, DBIterator12) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("c", "3");
+ internal_iter->AddSingleDeletion("b");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, 0 /* force seek */, nullptr /* read_callback */));
+ db_iter->SeekToLast();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "c");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+ db_iter->Prev();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "1");
+ db_iter->Prev();
+ ASSERT_FALSE(db_iter->Valid());
+}
+
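+// DBIterator13: nine versions of a single 9-byte key with
+// max_sequential_skip_in_iterations == 3, so Seek("b") has to reseek
+// internally and must still return the version visible at sequence 2.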
+TEST_F(DBIteratorTest, DBIterator13) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ std::string key;
+ key.resize(9);
+ key.assign(9, static_cast<char>(0));
+ key[0] = 'b';
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut(key, "0");
+ internal_iter->AddPut(key, "1");
+ internal_iter->AddPut(key, "2");
+ internal_iter->AddPut(key, "3");
+ internal_iter->AddPut(key, "4");
+ internal_iter->AddPut(key, "5");
+ internal_iter->AddPut(key, "6");
+ internal_iter->AddPut(key, "7");
+ internal_iter->AddPut(key, "8");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 2 /* sequence */, 3 /* max_sequential_skip_in_iterations */,
+ nullptr /* read_callback */));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), key);
+ ASSERT_EQ(db_iter->value().ToString(), "2");
+}
+
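+// DBIterator14: several versions each of "a", "b" and "c" with
+// max_sequential_skip_in_iterations == 1, so Seek("b") and SeekToFirst() both
+// force internal reseeks and must still land on the versions visible at
+// sequence 4.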
+TEST_F(DBIteratorTest, DBIterator14) {
+ ReadOptions ro;
+ Options options;
+ options.merge_operator = nullptr;
+
+ std::string key("b");
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("b", "0");
+ internal_iter->AddPut("b", "1");
+ internal_iter->AddPut("b", "2");
+ internal_iter->AddPut("b", "3");
+ internal_iter->AddPut("a", "4");
+ internal_iter->AddPut("a", "5");
+ internal_iter->AddPut("a", "6");
+ internal_iter->AddPut("c", "7");
+ internal_iter->AddPut("c", "8");
+ internal_iter->AddPut("c", "9");
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 4 /* sequence */, 1 /* max_sequential_skip_in_iterations */,
+ nullptr /* read_callback */));
+ db_iter->Seek("b");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(db_iter->key().ToString(), "b");
+ ASSERT_EQ(db_iter->value().ToString(), "3");
+ db_iter->SeekToFirst();
+ ASSERT_EQ(db_iter->key().ToString(), "a");
+ ASSERT_EQ(db_iter->value().ToString(), "4");
+}
+
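+// Fixture that drives DBIter through a MergingIterator built from two
+// TestIterators: internal_iter1_ holds "a", "f", "g" and internal_iter2_
+// holds "a", "b", "c", "d". Reads see data with sequence numbers below 8, and
+// the DataRace tests below use sync points to append keys to internal_iter2_
+// while DBIter is changing direction.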
+class DBIterWithMergeIterTest : public testing::Test {
+ public:
+ DBIterWithMergeIterTest()
+ : env_(Env::Default()), icomp_(BytewiseComparator()) {
+ options_.merge_operator = nullptr;
+
+ internal_iter1_ = new TestIterator(BytewiseComparator());
+ internal_iter1_->Add("a", kTypeValue, "1", 3u);
+ internal_iter1_->Add("f", kTypeValue, "2", 5u);
+ internal_iter1_->Add("g", kTypeValue, "3", 7u);
+ internal_iter1_->Finish();
+
+ internal_iter2_ = new TestIterator(BytewiseComparator());
+ internal_iter2_->Add("a", kTypeValue, "4", 6u);
+ internal_iter2_->Add("b", kTypeValue, "5", 1u);
+ internal_iter2_->Add("c", kTypeValue, "6", 2u);
+ internal_iter2_->Add("d", kTypeValue, "7", 3u);
+ internal_iter2_->Finish();
+
+ std::vector<InternalIterator*> child_iters;
+ child_iters.push_back(internal_iter1_);
+ child_iters.push_back(internal_iter2_);
+    InternalIterator* merge_iter =
+        NewMergingIterator(&icomp_, &child_iters[0], 2u);
+
+ db_iter_.reset(NewDBIterator(
+ env_, ro_, ImmutableOptions(options_), MutableCFOptions(options_),
+ BytewiseComparator(), merge_iter, nullptr /* version */,
+ 8 /* read data earlier than seqId 8 */,
+        3 /* max sequential skips before reseek */,
+        nullptr /* read_callback */));
+ }
+
+ Env* env_;
+ ReadOptions ro_;
+ Options options_;
+ TestIterator* internal_iter1_;
+ TestIterator* internal_iter2_;
+ InternalKeyComparator icomp_;
+ Iterator* merge_iter_;
+ std::unique_ptr<Iterator> db_iter_;
+};
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator1) {
+ db_iter_->SeekToFirst();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Next();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+ db_iter_->Next();
+ ASSERT_FALSE(db_iter_->Valid());
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) {
+ // Test Prev() when one child iterator is at its end.
+ db_iter_->SeekForPrev("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts a key at the end of the mem table after
+  // MergeIterator::Prev() has realized the mem table iterator is at its end
+  // and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev",
+ [&](void* /*arg*/) { internal_iter2_->Add("z", kTypeValue, "7", 12u); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace2) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts entries that update a key at the end of the
+  // mem table after MergeIterator::Prev() has realized the mem table iterator
+  // is at its end and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+ internal_iter2_->Add("z", kTypeValue, "7", 12u);
+ internal_iter2_->Add("z", kTypeValue, "7", 11u);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace3) {
+ // Test Prev() when one child iterator is at its end but more rows
+ // are added and max_skipped is triggered.
+ db_iter_->Seek("f");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+
+  // The test callback inserts entries that update a key at the end of the
+  // mem table after MergeIterator::Prev() has realized the mem table iterator
+  // is at its end and before SeekToLast() is called.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* /*arg*/) {
+ internal_iter2_->Add("z", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("z", kTypeValue, "7", 11u, true);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace4) {
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace5) {
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace6) {
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts an entry that updates a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace7) {
+ internal_iter1_->Add("u", kTypeValue, "10", 4u);
+ internal_iter1_->Add("v", kTypeValue, "11", 4u);
+ internal_iter1_->Add("w", kTypeValue, "12", 4u);
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts entries that update a key before "z" in the
+  // mem table after MergeIterator::Prev() calls the mem table iterator's
+  // Seek() and before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 15u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 14u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 13u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 12u, true);
+ internal_iter2_->Add("x", kTypeValue, "7", 11u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "c");
+ ASSERT_EQ(db_iter_->value().ToString(), "6");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "b");
+ ASSERT_EQ(db_iter_->value().ToString(), "5");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "a");
+ ASSERT_EQ(db_iter_->value().ToString(), "4");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) {
+ // internal_iter1_: a, f, g
+ // internal_iter2_: a, b, c, d, adding (z)
+ internal_iter2_->Add("z", kTypeValue, "9", 4u);
+
+ // Test Prev() when one child iterator has more rows inserted
+ // between Seek() and Prev() when changing directions.
+ db_iter_->Seek("g");
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "g");
+ ASSERT_EQ(db_iter_->value().ToString(), "3");
+
+  // The test callback inserts two keys before "z" in the mem table after
+  // MergeIterator::Prev() calls the mem table iterator's Seek() and
+  // before calling Prev().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "MergeIterator::Prev:BeforePrev", [&](void* arg) {
+ IteratorWrapper* it = reinterpret_cast<IteratorWrapper*>(arg);
+ if (it->key().starts_with("z")) {
+ internal_iter2_->Add("x", kTypeValue, "7", 16u, true);
+ internal_iter2_->Add("y", kTypeValue, "7", 17u, true);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "f");
+ ASSERT_EQ(db_iter_->value().ToString(), "2");
+ db_iter_->Prev();
+ ASSERT_TRUE(db_iter_->Valid());
+ ASSERT_EQ(db_iter_->key().ToString(), "d");
+ ASSERT_EQ(db_iter_->value().ToString(), "7");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
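+// With the noop prefix extractor every key is its own prefix, so with
+// prefix_same_as_start a Seek() or SeekForPrev() that lands outside the
+// target prefix should stop immediately rather than stepping over the run of
+// tombstones; the perf context must report zero skipped internal keys.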
+TEST_F(DBIteratorTest, SeekPrefixTombstones) {
+ ReadOptions ro;
+ Options options;
+ options.prefix_extractor.reset(NewNoopTransform());
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddDeletion("b");
+ internal_iter->AddDeletion("c");
+ internal_iter->AddDeletion("d");
+ internal_iter->AddDeletion("e");
+ internal_iter->AddDeletion("f");
+ internal_iter->AddDeletion("g");
+ internal_iter->Finish();
+
+ ro.prefix_same_as_start = true;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ int skipped_keys = 0;
+
+ get_perf_context()->Reset();
+ db_iter->SeekForPrev("z");
+ skipped_keys =
+ static_cast<int>(get_perf_context()->internal_key_skipped_count);
+ ASSERT_EQ(skipped_keys, 0);
+
+ get_perf_context()->Reset();
+ db_iter->Seek("a");
+ skipped_keys =
+ static_cast<int>(get_perf_context()->internal_key_skipped_count);
+ ASSERT_EQ(skipped_keys, 0);
+}
+
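+// SeekToFirst() with iterate_lower_bound set: the iterator should start at
+// the first key when the bound precedes all keys, at the bound itself when it
+// falls inside the key range, and become invalid when the bound is past the
+// last key.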
+TEST_F(DBIteratorTest, SeekToFirstLowerBound) {
+ const int kNumKeys = 3;
+ for (int i = 0; i < kNumKeys + 2; ++i) {
+ // + 2 for two special cases: lower bound before and lower bound after the
+ // internal iterator's keys
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(i);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToFirst();
+ if (i == kNumKeys + 1) {
+ // lower bound was beyond the last key
+ ASSERT_FALSE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ } else {
+ ASSERT_TRUE(db_iter->Valid());
+ int expected;
+ if (i == 0) {
+ // lower bound was before the first key
+ expected = 1;
+ } else {
+ // lower bound was at the ith key
+ expected = i;
+ }
+ ASSERT_EQ(std::to_string(expected), db_iter->key().ToString());
+ }
+ }
+}
+
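+// Reverse iteration should respect iterate_lower_bound: Prev() walks down to
+// the bound and then invalidates the iterator instead of returning keys
+// below it.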
+TEST_F(DBIteratorTest, PrevLowerBound) {
+ const int kNumKeys = 3;
+ const int kLowerBound = 2;
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(kLowerBound);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekToLast();
+ for (int i = kNumKeys; i >= kLowerBound; --i) {
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(std::to_string(i), db_iter->key().ToString());
+ db_iter->Prev();
+ }
+ ASSERT_FALSE(db_iter->Valid());
+}
+
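+// Seeking to a target below iterate_lower_bound should be clamped to the
+// bound, so the iterator lands on the lower bound key itself.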
+TEST_F(DBIteratorTest, SeekLessLowerBound) {
+ const int kNumKeys = 3;
+ const int kLowerBound = 2;
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ for (int j = 1; j <= kNumKeys; ++j) {
+ internal_iter->AddPut(std::to_string(j), "val");
+ }
+ internal_iter->Finish();
+
+ ReadOptions ro;
+ auto lower_bound_str = std::to_string(kLowerBound);
+ Slice lower_bound(lower_bound_str);
+ ro.iterate_lower_bound = &lower_bound;
+ Options options;
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ro, ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ auto before_lower_bound_str = std::to_string(kLowerBound - 1);
+  Slice before_lower_bound(before_lower_bound_str);
+
+ db_iter->Seek(before_lower_bound);
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_EQ(lower_bound_str, db_iter->key().ToString());
+}
+
+TEST_F(DBIteratorTest, ReverseToForwardWithDisappearingKeys) {
+ Options options;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(0));
+
+ TestIterator* internal_iter = new TestIterator(BytewiseComparator());
+ internal_iter->AddPut("a", "A");
+ internal_iter->AddPut("b", "B");
+ for (int i = 0; i < 100; ++i) {
+ internal_iter->AddPut("c" + std::to_string(i), "");
+ }
+ internal_iter->Finish();
+
+ std::unique_ptr<Iterator> db_iter(NewDBIterator(
+ env_, ReadOptions(), ImmutableOptions(options), MutableCFOptions(options),
+ BytewiseComparator(), internal_iter, nullptr /* version */,
+ 10 /* sequence */, options.max_sequential_skip_in_iterations,
+ nullptr /* read_callback */));
+
+ db_iter->SeekForPrev("a");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ("a", db_iter->key().ToString());
+
+ internal_iter->Vanish("a");
+ db_iter->Next();
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ("b", db_iter->key().ToString());
+
+ // A (sort of) bug used to cause DBIter to pointlessly drag the internal
+ // iterator all the way to the end. But this doesn't really matter at the time
+ // of writing because the only iterator that can see disappearing keys is
+ // ForwardIterator, which doesn't support SeekForPrev().
+ EXPECT_LT(internal_iter->steps(), 20);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_iterator_test.cc b/src/rocksdb/db/db_iterator_test.cc
new file mode 100644
index 000000000..aaf1408b4
--- /dev/null
+++ b/src/rocksdb/db/db_iterator_test.cc
@@ -0,0 +1,3265 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <functional>
+
+#include "db/arena_wrapped_db_iter.h"
+#include "db/db_iter.h"
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/perf_context.h"
+#include "table/block_based/flush_block_policy.h"
+#include "util/random.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A dummy ReadCallback that says every key is committed.
+class DummyReadCallback : public ReadCallback {
+ public:
+ DummyReadCallback() : ReadCallback(kMaxSequenceNumber) {}
+ bool IsVisibleFullCheck(SequenceNumber /*seq*/) override { return true; }
+ void SetSnapshot(SequenceNumber seq) { max_visible_seq_ = seq; }
+};
+
+// Test param:
+// bool: whether to pass read_callback to NewIterator().
+class DBIteratorTest : public DBTestBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBIteratorTest() : DBTestBase("db_iterator_test", /*env_do_fsync=*/true) {}
+
+ Iterator* NewIterator(const ReadOptions& read_options,
+ ColumnFamilyHandle* column_family = nullptr) {
+ if (column_family == nullptr) {
+ column_family = db_->DefaultColumnFamily();
+ }
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+ SequenceNumber seq = read_options.snapshot != nullptr
+ ? read_options.snapshot->GetSequenceNumber()
+ : db_->GetLatestSequenceNumber();
+ bool use_read_callback = GetParam();
+ DummyReadCallback* read_callback = nullptr;
+ if (use_read_callback) {
+ read_callback = new DummyReadCallback();
+ read_callback->SetSnapshot(seq);
+ InstrumentedMutexLock lock(&mutex_);
+ read_callbacks_.push_back(
+ std::unique_ptr<DummyReadCallback>(read_callback));
+ }
+ return dbfull()->NewIteratorImpl(read_options, cfd, seq, read_callback);
+ }
+
+ private:
+ InstrumentedMutex mutex_;
+ std::vector<std::unique_ptr<DummyReadCallback>> read_callbacks_;
+};
+
+TEST_P(DBIteratorTest, IteratorProperty) {
+ // The test needs to be changed if kPersistedTier is supported in iterator.
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "1", "2"));
+ ASSERT_OK(Delete(1, "2"));
+ ReadOptions ropt;
+ ropt.pin_data = false;
+ {
+ std::unique_ptr<Iterator> iter(NewIterator(ropt, handles_[1]));
+ iter->SeekToFirst();
+ std::string prop_value;
+ ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value));
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("0", prop_value);
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ iter->Next();
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("Iterator is not valid.", prop_value);
+
+ // Get internal key at which the iteration stopped (tombstone in this case).
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("2", prop_value);
+ }
+ Close();
+}
+
+TEST_P(DBIteratorTest, PersistedTierOnIterator) {
+ // The test needs to be changed if kPersistedTier is supported in iterator.
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ReadOptions ropt;
+ ropt.read_tier = kPersistedTier;
+
+ auto* iter = db_->NewIterator(ropt, handles_[1]);
+ ASSERT_TRUE(iter->status().IsNotSupported());
+ delete iter;
+
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported());
+ Close();
+}
+
+TEST_P(DBIteratorTest, NonBlockingIteration) {
+ do {
+ ReadOptions non_blocking_opts, regular_opts;
+ anon::OptionsOverride options_override;
+ options_override.full_block_cache = true;
+ Options options = CurrentOptions(options_override);
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ non_blocking_opts.read_tier = kBlockCacheTier;
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // write one kv to the database.
+ ASSERT_OK(Put(1, "a", "b"));
+
+    // Scan using a non-blocking iterator. We should find the key because
+    // it is in the memtable.
+ Iterator* iter = NewIterator(non_blocking_opts, handles_[1]);
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ delete iter;
+
+    // Flush the memtable to storage. Now the key should be neither in the
+    // memtable nor in the block cache.
+ ASSERT_OK(Flush(1));
+
+    // Verify that a non-blocking iterator does not find any kvs and does
+    // not do any IO to storage.
+ uint64_t numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ uint64_t cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ iter = NewIterator(non_blocking_opts, handles_[1]);
+ count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ count++;
+ }
+ ASSERT_EQ(count, 0);
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ delete iter;
+
+ // read in the specified block via a regular get
+ ASSERT_EQ(Get(1, "a"), "b");
+
+ // verify that we can find it via a non-blocking scan
+ numopen = TestGetTickerCount(options, NO_FILE_OPENS);
+ cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
+ iter = NewIterator(non_blocking_opts, handles_[1]);
+ count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
+ ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ delete iter;
+
+    // This test verifies block cache behavior, which the plain table format
+    // does not use.
+ } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipMmapReads));
+}
+
+TEST_P(DBIteratorTest, IterSeekBeforePrev) {
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("0", "f"));
+ ASSERT_OK(Put("1", "h"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("2", "j"));
+ auto iter = NewIterator(ReadOptions());
+ iter->Seek(Slice("c"));
+ iter->Prev();
+ iter->Seek(Slice("a"));
+ iter->Prev();
+ delete iter;
+}
+
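+// The iterator reads iterate_upper_bound through the pointer stored in
+// ReadOptions, so rebinding ub from "aa" to "b" before the second Seek()
+// widens the bound and lets the iterator reach "aaef".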
+TEST_P(DBIteratorTest, IterReseekNewUpperBound) {
+ Random rnd(301);
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ table_options.block_size_deviation = 50;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.compression = kNoCompression;
+ Reopen(options);
+
+ ASSERT_OK(Put("a", rnd.RandomString(400)));
+ ASSERT_OK(Put("aabb", rnd.RandomString(400)));
+ ASSERT_OK(Put("aaef", rnd.RandomString(400)));
+ ASSERT_OK(Put("b", rnd.RandomString(400)));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ReadOptions opts;
+ Slice ub = Slice("aa");
+ opts.iterate_upper_bound = &ub;
+ auto iter = NewIterator(opts);
+ iter->Seek(Slice("a"));
+ ub = Slice("b");
+ iter->Seek(Slice("aabc"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aaef");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevBeforeNext) {
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("0", "f"));
+ ASSERT_OK(Put("1", "h"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("2", "j"));
+ auto iter = NewIterator(ReadOptions());
+ iter->SeekForPrev(Slice("0"));
+ iter->Next();
+ iter->SeekForPrev(Slice("1"));
+ iter->Next();
+ delete iter;
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+ return std::string(length, c);
+}
+} // anonymous namespace
+
+TEST_P(DBIteratorTest, IterLongKeys) {
+ ASSERT_OK(Put(MakeLongKey(20, 0), "0"));
+ ASSERT_OK(Put(MakeLongKey(32, 2), "2"));
+ ASSERT_OK(Put("a", "b"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put(MakeLongKey(50, 1), "1"));
+ ASSERT_OK(Put(MakeLongKey(127, 3), "3"));
+ ASSERT_OK(Put(MakeLongKey(64, 4), "4"));
+ auto iter = NewIterator(ReadOptions());
+
+  // Seek to the first long key and walk forward through the remaining keys.
+ iter->Seek(MakeLongKey(20, 0));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(20, 0) + "->0");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(64, 4) + "->4");
+
+ iter->SeekForPrev(MakeLongKey(127, 3));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ delete iter;
+
+ iter = NewIterator(ReadOptions());
+ iter->Seek(MakeLongKey(50, 1));
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(50, 1) + "->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(32, 2) + "->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), MakeLongKey(127, 3) + "->3");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterNextWithNewerSeq) {
+ ASSERT_OK(Put("0", "0"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("d", "e"));
+ auto iter = NewIterator(ReadOptions());
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Seek(Slice("a"));
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->SeekForPrev(Slice("b"));
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterPrevWithNewerSeq) {
+ ASSERT_OK(Put("0", "0"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("d", "e"));
+ auto iter = NewIterator(ReadOptions());
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Seek(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "d->e");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ iter->SeekForPrev(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "d->e");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterPrevWithNewerSeq2) {
+ ASSERT_OK(Put("0", "0"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Put("e", "f"));
+ auto iter = NewIterator(ReadOptions());
+ auto iter2 = NewIterator(ReadOptions());
+ iter->Seek(Slice("c"));
+ iter2->SeekForPrev(Slice("d"));
+ ASSERT_EQ(IterStatus(iter), "c->d");
+ ASSERT_EQ(IterStatus(iter2), "c->d");
+
+ // Create a key that needs to be skipped for Seq too new
+ for (uint64_t i = 0; i < last_options_.max_sequential_skip_in_iterations + 1;
+ i++) {
+ ASSERT_OK(Put("b", "f"));
+ }
+
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->b");
+ iter->Prev();
+ iter2->Prev();
+ ASSERT_EQ(IterStatus(iter2), "a->b");
+ iter2->Prev();
+ delete iter;
+ delete iter2;
+}
+
+TEST_P(DBIteratorTest, IterEmpty) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("foo");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekForPrev("foo");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ ASSERT_OK(iter->status());
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterSingle) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("b");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("b");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterMulti) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ ASSERT_OK(Put(1, "b", "vb"));
+ ASSERT_OK(Put(1, "c", "vc"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("a");
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("ax");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->SeekForPrev("d");
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("c");
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("bx");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ iter->Seek("b");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Seek("z");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("b");
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->SeekForPrev("");
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ // Switch from reverse to forward
+ iter->SeekToLast();
+ iter->Prev();
+ iter->Prev();
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Switch from forward to reverse
+ iter->SeekToFirst();
+ iter->Next();
+ iter->Next();
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+
+ // Make sure iter stays at snapshot
+ ASSERT_OK(Put(1, "a", "va2"));
+ ASSERT_OK(Put(1, "a2", "va3"));
+ ASSERT_OK(Put(1, "b", "vb2"));
+ ASSERT_OK(Put(1, "c", "vc2"));
+ ASSERT_OK(Delete(1, "b"));
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->vb");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+// Check that we can skip over a run of user keys
+// by using reseek rather than sequential scan
+TEST_P(DBIteratorTest, IterReseek) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ options.max_sequential_skip_in_iterations = 3;
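+ // When an iterator has to skip more than max_sequential_skip_in_iterations
+ // internal entries for the same user key, it is expected to give up linear
+ // scanning and reseek instead, which is counted by
+ // NUMBER_OF_RESEEKS_IN_ITERATION. A limit of 3 keeps the test cases small.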
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Insert three versions of user key "a" (plus one "b") and verify that
+ // reseek is not invoked. For each of these test cases, verify that we
+ // can find the next key "b".
+ ASSERT_OK(Put(1, "a", "zero"));
+ ASSERT_OK(Put(1, "a", "one"));
+ ASSERT_OK(Put(1, "a", "two"));
+ ASSERT_OK(Put(1, "b", "bone"));
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "a->two");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+ // Insert a fourth version of "a" and verify that reseek is still not
+ // invoked (the skip count does not yet exceed the limit of 3).
+ ASSERT_OK(Put(1, "a", "three"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->three");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+ // Insert a fifth version of "a"; Next() now has to skip more entries than
+ // max_sequential_skip_in_iterations allows, so reseek is invoked.
+ ASSERT_OK(Put(1, "a", "four"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "b->bone");
+ delete iter;
+
+ // Testing reverse iteration.
+ // At this point there are five versions of "a" and one version of "b",
+ // and the reseek statistic is already at 1.
+ int num_reseeks = static_cast<int>(
+ TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ // Insert another version of "b"; SeekToLast() should not invoke reseek,
+ // but the subsequent Prev() over the many versions of "a" should.
+ ASSERT_OK(Put(1, "b", "btwo"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "b->btwo");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks);
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 1);
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ delete iter;
+
+ // Insert two more versions of "b". This makes a total of four versions
+ // of "b" and five versions of "a".
+ ASSERT_OK(Put(1, "b", "bthree"));
+ ASSERT_OK(Put(1, "b", "bfour"));
+ iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "b->bfour");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 2);
+ iter->Prev();
+
+ // the previous Prev call should have invoked reseek
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
+ num_reseeks + 3);
+ ASSERT_EQ(IterStatus(iter), "a->four");
+ delete iter;
+}
+
+TEST_F(DBIteratorTest, ReseekUponDirectionChange) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator.reset(
+ new StringAppendTESTOperator(/*delim_char=*/' '));
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Put("bar", "value"));
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->SeekToLast();
+ it->Prev();
+ it->Next();
+ }
+ ASSERT_EQ(1,
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ const std::string merge_key("good");
+ ASSERT_OK(Put(merge_key, "orig"));
+ ASSERT_OK(Merge(merge_key, "suffix"));
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->Seek(merge_key);
+ ASSERT_TRUE(it->Valid());
+ const uint64_t prev_reseek_count =
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+ it->Prev();
+ ASSERT_EQ(prev_reseek_count + 1, options.statistics->getTickerCount(
+ NUMBER_OF_RESEEKS_IN_ITERATION));
+ }
+}
+
+TEST_P(DBIteratorTest, IterSmallAndLargeMix) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "a", "va"));
+ ASSERT_OK(Put(1, "b", std::string(100000, 'b')));
+ ASSERT_OK(Put(1, "c", "vc"));
+ ASSERT_OK(Put(1, "d", std::string(100000, 'd')));
+ ASSERT_OK(Put(1, "e", std::string(100000, 'e')));
+
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), "e->" + std::string(100000, 'e'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "d->" + std::string(100000, 'd'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "b->" + std::string(100000, 'b'));
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IterMultiWithDelete) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "ka", "va"));
+ ASSERT_OK(Put(1, "kb", "vb"));
+ ASSERT_OK(Put(1, "kc", "vc"));
+ ASSERT_OK(Delete(1, "kb"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "kb"));
+
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+ iter->Seek("kc");
+ ASSERT_EQ(IterStatus(iter), "kc->vc");
+ if (!CurrentOptions().merge_operator) {
+ // TODO: merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ &&
+ kHashSkipList != option_config_) { // doesn't support SeekToLast
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "ka->va");
+ }
+ }
+ delete iter;
+ } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IterPrevMaxSkip) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(Put(1, "key1", "v1"));
+ ASSERT_OK(Put(1, "key2", "v2"));
+ ASSERT_OK(Put(1, "key3", "v3"));
+ ASSERT_OK(Put(1, "key4", "v4"));
+ ASSERT_OK(Put(1, "key5", "v5"));
+ }
+
+ VerifyIterLast("key5->v5", 1);
+
+ ASSERT_OK(Delete(1, "key5"));
+ VerifyIterLast("key4->v4", 1);
+
+ ASSERT_OK(Delete(1, "key4"));
+ VerifyIterLast("key3->v3", 1);
+
+ ASSERT_OK(Delete(1, "key3"));
+ VerifyIterLast("key2->v2", 1);
+
+ ASSERT_OK(Delete(1, "key2"));
+ VerifyIterLast("key1->v1", 1);
+
+ ASSERT_OK(Delete(1, "key1"));
+ VerifyIterLast("(invalid)", 1);
+ } while (ChangeOptions(kSkipMergePut | kSkipNoSeekToLast));
+}
+
+TEST_P(DBIteratorTest, IterWithSnapshot) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ ASSERT_OK(Put(1, "key1", "val1"));
+ ASSERT_OK(Put(1, "key2", "val2"));
+ ASSERT_OK(Put(1, "key3", "val3"));
+ ASSERT_OK(Put(1, "key4", "val4"));
+ ASSERT_OK(Put(1, "key5", "val5"));
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ReadOptions options;
+ options.snapshot = snapshot;
+ Iterator* iter = NewIterator(options, handles_[1]);
+
+ ASSERT_OK(Put(1, "key0", "val0"));
+ // Put more values after the snapshot
+ ASSERT_OK(Put(1, "key100", "val100"));
+ ASSERT_OK(Put(1, "key101", "val101"));
+
+ iter->Seek("key5");
+ ASSERT_EQ(IterStatus(iter), "key5->val5");
+ if (!CurrentOptions().merge_operator) {
+ // TODO: merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key4->val4");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key3->val3");
+
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key4->val4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key5->val5");
+ }
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ if (!CurrentOptions().merge_operator) {
+ // TODO(gzh): merge operator does not support backward iteration yet
+ if (kPlainTableAllBytesPrefix != option_config_ &&
+ kBlockBasedTableWithWholeKeyHashIndex != option_config_ &&
+ kHashLinkList != option_config_ && kHashSkipList != option_config_) {
+ iter->SeekForPrev("key1");
+ ASSERT_EQ(IterStatus(iter), "key1->val1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key2->val2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "key3->val3");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key2->val2");
+ iter->Prev();
+ ASSERT_EQ(IterStatus(iter), "key1->val1");
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid());
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+ delete iter;
+ } while (ChangeOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorPinsRef) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "hello"));
+
+ // Get iterator that will yield the current contents of the DB.
+ Iterator* iter = NewIterator(ReadOptions(), handles_[1]);
+
+ // Write to force compactions
+ ASSERT_OK(Put(1, "foo", "newvalue1"));
+ for (int i = 0; i < 100; i++) {
+ // ~100KB values
+ ASSERT_OK(Put(1, Key(i), Key(i) + std::string(100000, 'v')));
+ }
+ ASSERT_OK(Put(1, "foo", "newvalue2"));
+
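+ // The iterator was created before the writes above, so it must still see
+ // only the original "foo" -> "hello" entry.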
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("hello", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ delete iter;
+ } while (ChangeCompactOptions());
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDelete) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ ASSERT_OK(Put(1, "foo", "delete-cf-then-delete-iter"));
+ ASSERT_OK(Put(1, "hello", "value2"));
+
+ ColumnFamilyHandle* cf = handles_[1];
+ ReadOptions ro;
+
+ auto* iter = db_->NewIterator(ro, cf);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "foo->delete-cf-then-delete-iter");
+
+ // delete CF handle
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(cf));
+ handles_.erase(std::begin(handles_) + 1);
+
+ // delete Iterator after CF handle is deleted
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "hello->value2");
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IteratorDeleteAfterCfDrop) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ ASSERT_OK(Put(1, "foo", "drop-cf-then-delete-iter"));
+
+ ReadOptions ro;
+ ColumnFamilyHandle* cf = handles_[1];
+
+ auto* iter = db_->NewIterator(ro, cf);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "foo->drop-cf-then-delete-iter");
+
+ // drop and delete CF
+ EXPECT_OK(db_->DropColumnFamily(cf));
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(cf));
+ handles_.erase(std::begin(handles_) + 1);
+
+ // delete Iterator after CF handle is dropped
+ delete iter;
+}
+
+// SetOptions() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, DBIteratorBoundTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("g1", "0"));
+
+ // testing basic case with no iterate_upper_bound and no prefix_extractor
+ {
+ ReadOptions ro;
+ ro.iterate_upper_bound = nullptr;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+ iter->SeekForPrev("g1");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("g1")), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+ }
+
+ // testing iterate_upper_bound and forward iterator
+ // to make sure it stops at bound
+ {
+ ReadOptions ro;
+ // iterate_upper_bound points beyond the last expected entry
+ Slice prefix("foo2");
+ ro.iterate_upper_bound = &prefix;
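+ // The upper bound is exclusive: keys >= "foo2" must never be returned.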
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ iter->Next();
+ // should stop here...
+ ASSERT_TRUE(!iter->Valid());
+ }
+ // Testing SeekToLast with iterate_upper_bound set
+ {
+ ReadOptions ro;
+
+ Slice prefix("foo");
+ ro.iterate_upper_bound = &prefix;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("a")), 0);
+ }
+
+ // prefix is the first letter of the key
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("g1", "0"));
+
+ // testing with iterate_upper_bound and prefix_extractor
+ // Seek target and iterate_upper_bound are not in the same prefix
+ // This should be an error
+ {
+ ReadOptions ro;
+ Slice upper_bound("g");
+ ro.iterate_upper_bound = &upper_bound;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo1", iter->key().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ }
+
+ // testing that iterate_upper_bound prevents iterating over deleted items
+ // once the bound has been reached
+ {
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("b", "0"));
+ ASSERT_OK(Put("b1", "0"));
+ ASSERT_OK(Put("c", "0"));
+ ASSERT_OK(Put("d", "0"));
+ ASSERT_OK(Put("e", "0"));
+ ASSERT_OK(Delete("c"));
+ ASSERT_OK(Delete("d"));
+
+ // base case with no bound
+ ReadOptions ro;
+ ro.iterate_upper_bound = nullptr;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b1")), 0);
+
+ get_perf_context()->Reset();
+ iter->Next();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(
+ static_cast<int>(get_perf_context()->internal_delete_skipped_count), 2);
+
+ // now testing with iterate_bound
+ Slice prefix("c");
+ ro.iterate_upper_bound = &prefix;
+
+ iter.reset(NewIterator(ro));
+
+ get_perf_context()->Reset();
+
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b")), 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("b1")), 0);
+
+ iter->Next();
+ // the iteration should stop as soon as the bound key is reached
+ // even though the key is deleted
+ // hence internal_delete_skipped_count should be 0
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ(
+ static_cast<int>(get_perf_context()->internal_delete_skipped_count), 0);
+ }
+}
+
+TEST_P(DBIteratorTest, DBIteratorBoundMultiSeek) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.prefix_extractor = nullptr;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("a", "0"));
+ ASSERT_OK(Put("z", "0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo3", "bar3"));
+ ASSERT_OK(Put("foo4", "bar4"));
+
+ {
+ std::string up_str = "foo5";
+ Slice up(up_str);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &up;
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+
+ uint64_t prev_block_cache_hit =
+ TestGetTickerCount(options, BLOCK_CACHE_HIT);
+ uint64_t prev_block_cache_miss =
+ TestGetTickerCount(options, BLOCK_CACHE_MISS);
+
+ ASSERT_GT(prev_block_cache_hit + prev_block_cache_miss, 0);
+
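+ // The following Seeks should not cause any additional block cache
+ // activity; the hit and miss tickers are expected to stay at their
+ // previous values.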
+ iter->Seek("foo4");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo4")), 0);
+ ASSERT_EQ(prev_block_cache_hit,
+ TestGetTickerCount(options, BLOCK_CACHE_HIT));
+ ASSERT_EQ(prev_block_cache_miss,
+ TestGetTickerCount(options, BLOCK_CACHE_MISS));
+
+ iter->Seek("foo2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo3")), 0);
+ ASSERT_EQ(prev_block_cache_hit,
+ TestGetTickerCount(options, BLOCK_CACHE_HIT));
+ ASSERT_EQ(prev_block_cache_miss,
+ TestGetTickerCount(options, BLOCK_CACHE_MISS));
+ }
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_P(DBIteratorTest, DBIteratorBoundOptimizationTest) {
+ for (auto format_version : {2, 3, 4}) {
+ int upper_bound_hits = 0;
+ Options options = CurrentOptions();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableIterator:out_of_bound",
+ [&upper_bound_hits](void*) { upper_bound_hits++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ BlockBasedTableOptions table_options;
+ table_options.format_version = format_version;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
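+ // FlushBlockEveryKeyPolicyFactory puts each key-value pair in its own data
+ // block, making the out_of_bound sync point fire at a predictable place.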
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo4", "bar4"));
+ ASSERT_OK(Flush());
+
+ Slice ub("foo3");
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo1")), 0);
+ ASSERT_EQ(upper_bound_hits, 0);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("foo2")), 0);
+ ASSERT_EQ(upper_bound_hits, 0);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(upper_bound_hits, 1);
+ }
+}
+
+// Enable kBinarySearchWithFirstKey, do some iterator operations and check that
+// they don't do unnecessary block reads.
+TEST_P(DBIteratorTest, IndexWithFirstKey) {
+ for (int tailing = 0; tailing < 2; ++tailing) {
+ SCOPED_TRACE("tailing = " + std::to_string(tailing));
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Statistics* stats = options.statistics.get();
+ BlockBasedTableOptions table_options;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
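+ // kBinarySearchWithFirstKey stores each data block's first key in the
+ // index, which lets seeks position on a block without reading it until the
+ // value is actually needed; the test counts data block reads to verify
+ // that.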
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ table_options.block_cache =
+ NewLRUCache(8000); // fits all blocks and their cache metadata overhead
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("a1", "x1"));
+ ASSERT_OK(Merge("b1", "y1"));
+ ASSERT_OK(Merge("c0", "z1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("a2", "x2"));
+ ASSERT_OK(Merge("b2", "y2"));
+ ASSERT_OK(Merge("c0", "z2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("a3", "x3"));
+ ASSERT_OK(Merge("b3", "y3"));
+ ASSERT_OK(Merge("c3", "z3"));
+ ASSERT_OK(Flush());
+
+ // Block cache is not important for this test.
+ // We use BLOCK_CACHE_DATA_* counters just because they're the most readily
+ // available way of counting block accesses.
+
+ ReadOptions ropt;
+ ropt.tailing = tailing;
+ std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
+ ropt.read_tier = ReadTier::kBlockCacheTier;
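+ // kBlockCacheTier restricts reads to the block cache: any access that
+ // would have to go to the file system is expected to fail with
+ // Status::Incomplete instead.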
+ std::unique_ptr<Iterator> nonblocking_iter(NewIterator(ropt));
+
+ iter->Seek("b10");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b2", iter->key().ToString());
+ EXPECT_EQ("y2", iter->value().ToString());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ // The cache-only iterator should succeed too, using the blocks pulled into
+ // the cache by the previous iterator.
+ nonblocking_iter->Seek("b10");
+ ASSERT_TRUE(nonblocking_iter->Valid());
+ EXPECT_EQ("b2", nonblocking_iter->key().ToString());
+ EXPECT_EQ("y2", nonblocking_iter->value().ToString());
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // ... but it shouldn't be able to step forward since the next block is
+ // not in cache yet.
+ nonblocking_iter->Next();
+ ASSERT_FALSE(nonblocking_iter->Valid());
+ ASSERT_TRUE(nonblocking_iter->status().IsIncomplete());
+
+ // ... nor should a seek to the next key succeed.
+ nonblocking_iter->Seek("b20");
+ ASSERT_FALSE(nonblocking_iter->Valid());
+ ASSERT_TRUE(nonblocking_iter->status().IsIncomplete());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b3", iter->key().ToString());
+ EXPECT_EQ("y3", iter->value().ToString());
+ EXPECT_EQ(4, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(1, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // After the blocking iterator loaded the next block, the nonblocking
+ // iterator's seek should succeed.
+ nonblocking_iter->Seek("b20");
+ ASSERT_TRUE(nonblocking_iter->Valid());
+ EXPECT_EQ("b3", nonblocking_iter->key().ToString());
+ EXPECT_EQ("y3", nonblocking_iter->value().ToString());
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ iter->Seek("c0");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("c0", iter->key().ToString());
+ EXPECT_EQ("z1,z2", iter->value().ToString());
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(6, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("c3", iter->key().ToString());
+ EXPECT_EQ("z3", iter->value().ToString());
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter.reset();
+
+ // Enable iterate_upper_bound and check that iterator is not trying to read
+ // blocks that are fully above upper bound.
+ std::string ub = "b3";
+ Slice ub_slice(ub);
+ ropt.iterate_upper_bound = &ub_slice;
+ iter.reset(NewIterator(ropt));
+
+ iter->Seek("b2");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b2", iter->key().ToString());
+ EXPECT_EQ("y2", iter->value().ToString());
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+ EXPECT_EQ(7, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ }
+}
+
+TEST_P(DBIteratorTest, IndexWithFirstKeyGet) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor = nullptr;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Statistics* stats = options.statistics.get();
+ BlockBasedTableOptions table_options;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ table_options.block_cache = NewLRUCache(1000); // fits all blocks
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("a", "x1"));
+ ASSERT_OK(Merge("c", "y1"));
+ ASSERT_OK(Merge("e", "z1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("c", "y2"));
+ ASSERT_OK(Merge("e", "z2"));
+ ASSERT_OK(Flush());
+
+ // Get() between blocks shouldn't read any blocks.
+ ASSERT_EQ("NOT_FOUND", Get("b"));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ // Get() of an existing key shouldn't read any unnecessary blocks when there's
+ // only one key per block.
+
+ ASSERT_EQ("y1,y2", Get("c"));
+ EXPECT_EQ(2, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ ASSERT_EQ("x1", Get("a"));
+ EXPECT_EQ(3, stats->getTickerCount(BLOCK_CACHE_DATA_MISS));
+ EXPECT_EQ(0, stats->getTickerCount(BLOCK_CACHE_DATA_HIT));
+
+ EXPECT_EQ(std::vector<std::string>({"NOT_FOUND", "z1,z2"}),
+ MultiGet({"b", "e"}));
+}
+
+// TODO(3.13): fix the issue of Seek() + Prev() which might not necessarily
+// return the biggest key that is smaller than the seek key.
+TEST_P(DBIteratorTest, PrevAfterAndNextAfterMerge) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.env = env_;
+ DestroyAndReopen(options);
+
+ // write three entries with different keys using Merge()
+ WriteOptions wopts;
+ ASSERT_OK(db_->Merge(wopts, "1", "data1"));
+ ASSERT_OK(db_->Merge(wopts, "2", "data2"));
+ ASSERT_OK(db_->Merge(wopts, "3", "data3"));
+
+ std::unique_ptr<Iterator> it(NewIterator(ReadOptions()));
+
+ it->Seek("2");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("2", it->key().ToString());
+
+ it->Prev();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("1", it->key().ToString());
+
+ it->SeekForPrev("1");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("1", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("2", it->key().ToString());
+}
+
+class DBIteratorTestForPinnedData : public DBIteratorTest {
+ public:
+ enum TestConfig {
+ NORMAL,
+ CLOSE_AND_OPEN,
+ COMPACT_BEFORE_READ,
+ FLUSH_EVERY_1000,
+ MAX
+ };
+ DBIteratorTestForPinnedData() : DBIteratorTest() {}
+ void PinnedDataIteratorRandomized(TestConfig run_config) {
+ // Generate Random data
+ Random rnd(301);
+
+ int puts = 100000;
+ int key_pool = static_cast<int>(puts * 0.7);
+ int key_size = 100;
+ int val_size = 1000;
+ int seeks_percentage = 20; // 20% of keys will be used to test seek()
+ int delete_percentage = 20; // 20% of keys will be deleted
+ int merge_percentage = 20; // 20% of keys will be added using Merge()
+
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
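+ // Delta encoding is disabled so that block iterators can return key slices
+ // that point directly into the block, which is what allows
+ // "rocksdb.iterator.is-key-pinned" to report 1 below.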
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ DestroyAndReopen(options);
+
+ std::vector<std::string> generated_keys(key_pool);
+ for (int i = 0; i < key_pool; i++) {
+ generated_keys[i] = rnd.RandomString(key_size);
+ }
+
+ std::map<std::string, std::string> true_data;
+ std::vector<std::string> random_keys;
+ std::vector<std::string> deleted_keys;
+ for (int i = 0; i < puts; i++) {
+ auto& k = generated_keys[rnd.Next() % key_pool];
+ auto v = rnd.RandomString(val_size);
+
+ // Insert data to true_data map and to DB
+ true_data[k] = v;
+ if (rnd.PercentTrue(merge_percentage)) {
+ ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+ } else {
+ ASSERT_OK(Put(k, v));
+ }
+
+ // Pick random keys to be used to test Seek()
+ if (rnd.PercentTrue(seeks_percentage)) {
+ random_keys.push_back(k);
+ }
+
+ // Delete some random keys
+ if (rnd.PercentTrue(delete_percentage)) {
+ deleted_keys.push_back(k);
+ true_data.erase(k);
+ ASSERT_OK(Delete(k));
+ }
+
+ if (run_config == TestConfig::FLUSH_EVERY_1000) {
+ if (i && i % 1000 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+ }
+
+ if (run_config == TestConfig::CLOSE_AND_OPEN) {
+ Close();
+ Reopen(options);
+ } else if (run_config == TestConfig::COMPACT_BEFORE_READ) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ ReadOptions ro;
+ ro.pin_data = true;
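+ // pin_data asks the iterator to keep the data its returned Slices point
+ // into alive for the iterator's lifetime; the is-key-pinned property below
+ // reports whether the current key is indeed pinned rather than copied.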
+ auto iter = NewIterator(ro);
+
+ {
+ // Test Seek to random keys
+ std::vector<Slice> keys_slices;
+ std::vector<std::string> true_keys;
+ for (auto& k : random_keys) {
+ iter->Seek(k);
+ if (!iter->Valid()) {
+ ASSERT_EQ(true_data.lower_bound(k), true_data.end());
+ continue;
+ }
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ keys_slices.push_back(iter->key());
+ true_keys.push_back(true_data.lower_bound(k)->first);
+ }
+
+ for (size_t i = 0; i < keys_slices.size(); i++) {
+ ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+ }
+ }
+
+ {
+ // Test SeekForPrev to random keys
+ std::vector<Slice> keys_slices;
+ std::vector<std::string> true_keys;
+ for (auto& k : random_keys) {
+ iter->SeekForPrev(k);
+ if (!iter->Valid()) {
+ ASSERT_EQ(true_data.upper_bound(k), true_data.begin());
+ continue;
+ }
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ keys_slices.push_back(iter->key());
+ true_keys.push_back((--true_data.upper_bound(k))->first);
+ }
+
+ for (size_t i = 0; i < keys_slices.size(); i++) {
+ ASSERT_EQ(keys_slices[i].ToString(), true_keys[i]);
+ }
+ }
+
+ {
+ // Test iterating all data forward
+ std::vector<Slice> all_keys;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ all_keys.push_back(iter->key());
+ }
+ ASSERT_EQ(all_keys.size(), true_data.size());
+
+ // Verify that all key slices are valid
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < all_keys.size(); i++) {
+ ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+ data_iter++;
+ }
+ }
+
+ {
+ // Test iterating all data backward
+ std::vector<Slice> all_keys;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ std::string prop_value;
+ ASSERT_OK(
+ iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ all_keys.push_back(iter->key());
+ }
+ ASSERT_EQ(all_keys.size(), true_data.size());
+
+ // Verify that all key slices are valid (backward)
+ auto data_iter = true_data.rbegin();
+ for (size_t i = 0; i < all_keys.size(); i++) {
+ ASSERT_EQ(all_keys[i].ToString(), data_iter->first);
+ data_iter++;
+ }
+ }
+
+ delete iter;
+ }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedNormal) {
+ PinnedDataIteratorRandomized(TestConfig::NORMAL);
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedCLoseAndOpen) {
+ PinnedDataIteratorRandomized(TestConfig::CLOSE_AND_OPEN);
+}
+
+TEST_P(DBIteratorTestForPinnedData,
+ PinnedDataIteratorRandomizedCompactBeforeRead) {
+ PinnedDataIteratorRandomized(TestConfig::COMPACT_BEFORE_READ);
+}
+
+TEST_P(DBIteratorTestForPinnedData, PinnedDataIteratorRandomizedFlush) {
+ PinnedDataIteratorRandomized(TestConfig::FLUSH_EVERY_1000);
+}
+
+INSTANTIATE_TEST_CASE_P(DBIteratorTestForPinnedDataInstance,
+ DBIteratorTestForPinnedData,
+ testing::Values(true, false));
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBIteratorTest, PinnedDataIteratorMultipleFiles) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 1024 * 1024 * 10; // 10 Mb
+ DestroyAndReopen(options);
+
+ std::map<std::string, std::string> true_data;
+
+ // Generate 4 sst files and compact them into L1
+ Random rnd(301);
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i * 3);
+ std::string v = rnd.RandomString(100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ if (i % 250 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(FilesPerLevel(0), "0,4");
+
+ // Generate 4 sst files in L0
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i * 2);
+ std::string v = rnd.RandomString(100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ if (i % 250 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+ // Add some keys/values in memtables
+ for (int i = 1; i <= 1000; i++) {
+ std::string k = Key(i);
+ std::string v = rnd.RandomString(100);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+ ASSERT_EQ(FilesPerLevel(0), "4,4");
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ ASSERT_EQ(results.size(), true_data.size());
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < results.size(); i++, data_iter++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, data_iter->first);
+ ASSERT_EQ(kv.second, data_iter->second);
+ }
+
+ delete iter;
+}
+#endif  // !ROCKSDB_LITE
+
+TEST_P(DBIteratorTest, PinnedDataIteratorMergeOperator) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ DestroyAndReopen(options);
+
+ std::string numbers[7];
+ for (int val = 0; val <= 6; val++) {
+ PutFixed64(numbers + val, val);
+ }
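+ // numbers[v] now holds the fixed 8-byte encoding of v, the operand format
+ // that the UInt64Add merge operator sums.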
+
+ // +1 all keys in range [0, 999]
+ for (int i = 0; i < 1000; i++) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[1]));
+ }
+
+ // +2 all keys divisible by 2 in range [0, 999]
+ for (int i = 0; i < 1000; i += 2) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[2]));
+ }
+
+ // +3 all keys divisible by 5 in range [0, 999]
+ for (int i = 0; i < 1000; i += 5) {
+ WriteOptions wo;
+ ASSERT_OK(db_->Merge(wo, Key(i), numbers[3]));
+ }
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ ASSERT_EQ(results.size(), 1000);
+ for (size_t i = 0; i < results.size(); i++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, Key(static_cast<int>(i)));
+ int expected_val = 1;
+ if (i % 2 == 0) {
+ expected_val += 2;
+ }
+ if (i % 5 == 0) {
+ expected_val += 3;
+ }
+ ASSERT_EQ(kv.second, numbers[expected_val]);
+ }
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, PinnedDataIteratorReadAfterUpdate) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.use_delta_encoding = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.write_buffer_size = 100000;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < 1000; i++) {
+ std::string k = rnd.RandomString(10);
+ std::string v = rnd.RandomString(1000);
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+
+ ReadOptions ro;
+ ro.pin_data = true;
+ auto iter = NewIterator(ro);
+
+ // Delete 50% of the keys and update the other 50%
+ for (auto& kv : true_data) {
+ if (rnd.OneIn(2)) {
+ ASSERT_OK(Delete(kv.first));
+ } else {
+ std::string new_val = rnd.RandomString(1000);
+ ASSERT_OK(Put(kv.first, new_val));
+ }
+ }
+
+ std::vector<std::pair<Slice, std::string>> results;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value));
+ ASSERT_EQ("1", prop_value);
+ results.emplace_back(iter->key(), iter->value().ToString());
+ }
+
+ auto data_iter = true_data.begin();
+ for (size_t i = 0; i < results.size(); i++, data_iter++) {
+ auto& kv = results[i];
+ ASSERT_EQ(kv.first, data_iter->first);
+ ASSERT_EQ(kv.second, data_iter->second);
+ }
+
+ delete iter;
+}
+
+class SliceTransformLimitedDomainGeneric : public SliceTransform {
+ const char* Name() const override {
+ return "SliceTransformLimitedDomainGeneric";
+ }
+
+ Slice Transform(const Slice& src) const override {
+ return Slice(src.data(), 1);
+ }
+
+ bool InDomain(const Slice& src) const override {
+ // Any non-empty key is in the domain
+ return src.size() >= 1;
+ }
+
+ bool InRange(const Slice& dst) const override {
+ // A prefix produced by this transform is exactly one byte
+ return dst.size() == 1;
+ }
+};
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFiles) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.disable_auto_compactions = true;
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a1", "va1"));
+ ASSERT_OK(Put("a2", "va2"));
+ ASSERT_OK(Put("a3", "va3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b1", "vb1"));
+ ASSERT_OK(Put("b2", "vb2"));
+ ASSERT_OK(Put("b3", "vb3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b4", "vb4"));
+ ASSERT_OK(Put("d1", "vd1"));
+ ASSERT_OK(Put("d2", "vd2"));
+ ASSERT_OK(Put("d4", "vd4"));
+ ASSERT_OK(Flush());
+
+ MoveFilesToLevel(1);
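+ // Each Flush() above produced one file; moving them to L1 makes
+ // SeekForPrev cross several non-overlapping files with prefix bloom
+ // filters.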
+ {
+ ReadOptions ro;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekForPrev("a4");
+ ASSERT_EQ(iter->key().ToString(), "a3");
+ ASSERT_EQ(iter->value().ToString(), "va3");
+
+ iter->SeekForPrev("c2");
+ ASSERT_EQ(iter->key().ToString(), "b3");
+ iter->SeekForPrev("d3");
+ ASSERT_EQ(iter->key().ToString(), "d2");
+ iter->SeekForPrev("b5");
+ ASSERT_EQ(iter->key().ToString(), "b4");
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.prefix_same_as_start = true;
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("c2");
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterSeekForPrevCrossingFilesCustomPrefixExtractor) {
+ Options options = CurrentOptions();
+ options.prefix_extractor =
+ std::make_shared<SliceTransformLimitedDomainGeneric>();
+ options.disable_auto_compactions = true;
+ // Enable prefix bloom for SST files
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a1", "va1"));
+ ASSERT_OK(Put("a2", "va2"));
+ ASSERT_OK(Put("a3", "va3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b1", "vb1"));
+ ASSERT_OK(Put("b2", "vb2"));
+ ASSERT_OK(Put("b3", "vb3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b4", "vb4"));
+ ASSERT_OK(Put("d1", "vd1"));
+ ASSERT_OK(Put("d2", "vd2"));
+ ASSERT_OK(Put("d4", "vd4"));
+ ASSERT_OK(Flush());
+
+ MoveFilesToLevel(1);
+ {
+ ReadOptions ro;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekForPrev("a4");
+ ASSERT_EQ(iter->key().ToString(), "a3");
+ ASSERT_EQ(iter->value().ToString(), "va3");
+
+ iter->SeekForPrev("c2");
+ ASSERT_EQ(iter->key().ToString(), "b3");
+ iter->SeekForPrev("d3");
+ ASSERT_EQ(iter->key().ToString(), "d2");
+ iter->SeekForPrev("b5");
+ ASSERT_EQ(iter->key().ToString(), "b4");
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.prefix_same_as_start = true;
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("c2");
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocks) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1; // every block will contain one entry
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.disable_auto_compactions = true;
+ options.max_sequential_skip_in_iterations = 8;
+
+ DestroyAndReopen(options);
+
+ // Putting such deletes will force DBIter::Prev() to fall back to a Seek
+ for (int file_num = 0; file_num < 10; file_num++) {
+ ASSERT_OK(Delete("key4"));
+ ASSERT_OK(Flush());
+ }
+
+ // First file contains 5 blocks of puts
+ ASSERT_OK(Put("key1", "val1.0"));
+ ASSERT_OK(Put("key2", "val2.0"));
+ ASSERT_OK(Put("key3", "val3.0"));
+ ASSERT_OK(Put("key4", "val4.0"));
+ ASSERT_OK(Put("key5", "val5.0"));
+ ASSERT_OK(Flush());
+
+ // Second file contains 9 blocks of merge operands
+ ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key1", "val1.2"));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key2", "val2.3"));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.1"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.3"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key3", "val3.4"));
+ ASSERT_OK(Flush());
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+
+ iter->SeekToLast();
+ ASSERT_EQ(iter->key().ToString(), "key5");
+ ASSERT_EQ(iter->value().ToString(), "val5.0");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key4");
+ ASSERT_EQ(iter->value().ToString(), "val4.0");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key3");
+ ASSERT_EQ(iter->value().ToString(), "val3.0,val3.1,val3.2,val3.3,val3.4");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key2");
+ ASSERT_EQ(iter->value().ToString(), "val2.0,val2.1,val2.2,val2.3");
+
+ iter->Prev();
+ ASSERT_EQ(iter->key().ToString(), "key1");
+ ASSERT_EQ(iter->value().ToString(), "val1.0,val1.1,val1.2");
+
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IterPrevKeyCrossingBlocksRandomized) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.disable_auto_compactions = true;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.max_sequential_skip_in_iterations = 8;
+ DestroyAndReopen(options);
+
+ const int kNumKeys = 500;
+ // Small number of merge operands to make sure that DBIter::Prev() doesn't
+ // fall back to Seek()
+ const int kNumMergeOperands = 3;
+ // Use a value size that ensures every block contains one key
+ const int kValSize =
+ static_cast<int>(BlockBasedTableOptions().block_size) * 4;
+ // Percentage of keys that won't get merge operations
+ const int kNoMergeOpPercentage = 20;
+ // Percentage of keys that will be deleted
+ const int kDeletePercentage = 10;
+
+ // For half of the key range we will write multiple deletes first to
+ // force DBIter::Prev() to fall back to Seek()
+ for (int file_num = 0; file_num < 10; file_num++) {
+ for (int i = 0; i < kNumKeys; i += 2) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ std::string gen_key;
+ std::string gen_val;
+
+ for (int i = 0; i < kNumKeys; i++) {
+ gen_key = Key(i);
+ gen_val = rnd.RandomString(kValSize);
+
+ ASSERT_OK(Put(gen_key, gen_val));
+ true_data[gen_key] = gen_val;
+ }
+ ASSERT_OK(Flush());
+
+ // Keep values and merge operands in separate files so that they are not
+ // merged during flush but are instead merged in the read path
+ for (int i = 0; i < kNumKeys; i++) {
+ if (rnd.PercentTrue(kNoMergeOpPercentage)) {
+ // Don't add merge operands for some keys
+ continue;
+ }
+
+ for (int j = 0; j < kNumMergeOperands; j++) {
+ gen_key = Key(i);
+ gen_val = rnd.RandomString(kValSize);
+
+ ASSERT_OK(db_->Merge(WriteOptions(), gen_key, gen_val));
+ true_data[gen_key] += "," + gen_val;
+ }
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < kNumKeys; i++) {
+ if (rnd.PercentTrue(kDeletePercentage)) {
+ gen_key = Key(i);
+
+ ASSERT_OK(Delete(gen_key));
+ true_data.erase(gen_key);
+ }
+ }
+ ASSERT_OK(Flush());
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+ auto data_iter = true_data.rbegin();
+
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ data_iter++;
+ }
+ ASSERT_EQ(data_iter, true_data.rend());
+
+ delete iter;
+ }
+
+ {
+ ReadOptions ro;
+ ro.fill_cache = false;
+ Iterator* iter = NewIterator(ro);
+ auto data_iter = true_data.rbegin();
+
+ int entries_right = 0;
+ std::string seek_key;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ // Verify key/value of current position
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+
+ bool restore_position_with_seek = rnd.Uniform(2);
+ if (restore_position_with_seek) {
+ seek_key = iter->key().ToString();
+ }
+
+ // Do some Next() operations, then restore the iterator to its original position
+ int next_count =
+ entries_right > 0 ? rnd.Uniform(std::min(entries_right, 10)) : 0;
+ for (int i = 0; i < next_count; i++) {
+ iter->Next();
+ data_iter--;
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+
+ if (restore_position_with_seek) {
+ // Restore original position using Seek()
+ iter->Seek(seek_key);
+ for (int i = 0; i < next_count; i++) {
+ data_iter++;
+ }
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ } else {
+ // Restore original position using Prev()
+ for (int i = 0; i < next_count; i++) {
+ iter->Prev();
+ data_iter++;
+
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+ }
+
+ entries_right++;
+ data_iter++;
+ }
+ ASSERT_EQ(data_iter, true_data.rend());
+
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, IteratorWithLocalStatistics) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 1000; i++) {
+ // Key 10 bytes / Value 10 bytes
+ ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+ }
+
+ std::atomic<uint64_t> total_next(0);
+ std::atomic<uint64_t> total_next_found(0);
+ std::atomic<uint64_t> total_prev(0);
+ std::atomic<uint64_t> total_prev_found(0);
+ std::atomic<uint64_t> total_bytes(0);
+
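+ // Each reader thread tallies its own counts via the perf context; at the
+ // end they must add up to the DB-wide tickers, verifying that
+ // iterator-local statistics are folded into the global statistics.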
+ std::vector<port::Thread> threads;
+ std::function<void()> reader_func_next = [&]() {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ Iterator* iter = NewIterator(ReadOptions());
+
+ iter->SeekToFirst();
+ // Seek will bump ITER_BYTES_READ
+ uint64_t bytes = 0;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ while (true) {
+ iter->Next();
+ total_next++;
+
+ if (!iter->Valid()) {
+ break;
+ }
+ total_next_found++;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ }
+
+ delete iter;
+ ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+ SetPerfLevel(kDisable);
+ total_bytes += bytes;
+ };
+
+ std::function<void()> reader_func_prev = [&]() {
+ SetPerfLevel(kEnableCount);
+ Iterator* iter = NewIterator(ReadOptions());
+
+ iter->SeekToLast();
+ // Seek will bump ITER_BYTES_READ
+ uint64_t bytes = 0;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ while (true) {
+ iter->Prev();
+ total_prev++;
+
+ if (!iter->Valid()) {
+ break;
+ }
+ total_prev_found++;
+ bytes += iter->key().size();
+ bytes += iter->value().size();
+ }
+
+ delete iter;
+ ASSERT_EQ(bytes, get_perf_context()->iter_read_bytes);
+ SetPerfLevel(kDisable);
+ total_bytes += bytes;
+ };
+
+ for (int i = 0; i < 10; i++) {
+ threads.emplace_back(reader_func_next);
+ }
+ for (int i = 0; i < 15; i++) {
+ threads.emplace_back(reader_func_prev);
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), (uint64_t)total_next);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND),
+ (uint64_t)total_next_found);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), (uint64_t)total_prev);
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND),
+ (uint64_t)total_prev_found);
+ ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ),
+ (uint64_t)total_bytes);
+}
+
+TEST_P(DBIteratorTest, ReadAhead) {
+ Options options;
+ env_->count_random_reads_ = true;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 4 << 20;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1024;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ std::string value(1024, 'a');
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), value));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), value));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), value));
+ }
+ ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("1,1,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ env_->random_read_bytes_counter_ = 0;
+ options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+ ReadOptions read_options;
+ auto* iter = NewIterator(read_options);
+ iter->SeekToFirst();
+ int64_t num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS);
+ size_t bytes_read = env_->random_read_bytes_counter_;
+ delete iter;
+
+ int64_t num_file_closes = TestGetTickerCount(options, NO_FILE_CLOSES);
+ env_->random_read_bytes_counter_ = 0;
+ options.statistics->setTickerCount(NO_FILE_OPENS, 0);
+ read_options.readahead_size = 1024 * 10;
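+ // With readahead enabled, the iterator is expected to open the same number
+ // of files but read noticeably more bytes from them, as checked below.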
+ iter = NewIterator(read_options);
+ iter->SeekToFirst();
+ int64_t num_file_opens_readahead = TestGetTickerCount(options, NO_FILE_OPENS);
+ size_t bytes_read_readahead = env_->random_read_bytes_counter_;
+ delete iter;
+ int64_t num_file_closes_readahead =
+ TestGetTickerCount(options, NO_FILE_CLOSES);
+ ASSERT_EQ(num_file_opens, num_file_opens_readahead);
+ ASSERT_EQ(num_file_closes, num_file_closes_readahead);
+ ASSERT_GT(bytes_read_readahead, bytes_read);
+ ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3);
+
+ // Verify correctness.
+ iter = NewIterator(read_options);
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(value, iter->value());
+ count++;
+ }
+ ASSERT_EQ(100, count);
+ for (int i = 0; i < 100; i++) {
+ iter->Seek(Key(i));
+ ASSERT_EQ(value, iter->value());
+ }
+ delete iter;
+}
+
+// Insert a key, create a snapshot iterator, overwrite key lots of times,
+// seek to a smaller key. Expect DBIter to fall back to a seek instead of
+// going through all the overwrites linearly.
+TEST_P(DBIteratorTest, DBIteratorSkipRecentDuplicatesTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_sequential_skip_in_iterations = 3;
+ options.prefix_extractor = nullptr;
+ options.write_buffer_size = 1 << 27; // big enough to avoid flush
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ // Insert.
+ ASSERT_OK(Put("b", "0"));
+
+ // Create iterator.
+ ReadOptions ro;
+ std::unique_ptr<Iterator> iter(NewIterator(ro));
+
+ // Insert a lot.
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put("b", std::to_string(i + 1).c_str()));
+ }
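+ // All of these overwrites are newer than the iterator's implicit snapshot,
+ // so they are invisible to it and must be skipped; with a skip limit of 3
+ // this should happen via a single reseek rather than a linear scan.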
+
+#ifndef ROCKSDB_LITE
+ // Check that memtable wasn't flushed.
+ std::string val;
+ ASSERT_TRUE(db_->GetProperty("rocksdb.num-files-at-level0", &val));
+ EXPECT_EQ("0", val);
+#endif  // !ROCKSDB_LITE
+
+ // Seek iterator to a smaller key.
+ get_perf_context()->Reset();
+ iter->Seek("a");
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ("b", iter->key().ToString());
+ EXPECT_EQ("0", iter->value().ToString());
+
+ // Check that the seek didn't do too much work.
+ // Checks are not tight, just make sure that everything is well below 100.
+ EXPECT_LT(get_perf_context()->internal_key_skipped_count, 4);
+ EXPECT_LT(get_perf_context()->internal_recent_skipped_count, 8);
+ EXPECT_LT(get_perf_context()->seek_on_memtable_count, 10);
+ EXPECT_LT(get_perf_context()->next_on_memtable_count, 10);
+ EXPECT_LT(get_perf_context()->prev_on_memtable_count, 10);
+
+ // Check that iterator did something like what we expect.
+ EXPECT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+ EXPECT_EQ(get_perf_context()->internal_merge_count, 0);
+ EXPECT_GE(get_perf_context()->internal_recent_skipped_count, 2);
+ EXPECT_GE(get_perf_context()->seek_on_memtable_count, 2);
+ EXPECT_EQ(1,
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+}
+
+TEST_P(DBIteratorTest, Refresh) {
+ ASSERT_OK(Put("x", "y"));
+
+ std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+ ASSERT_OK(iter->status());
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
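+ // A write made after the iterator was created is not visible until
+ // Refresh() is called.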
+ ASSERT_OK(Put("c", "d"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(iter->status());
+ ASSERT_OK(iter->Refresh());
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
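+ // Flush and write another key; the previously refreshed view stays fixed
+ // until the next Refresh().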
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+
+ ASSERT_OK(Put("m", "n"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(iter->status());
+ ASSERT_OK(iter->Refresh());
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("c")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("m")), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ iter.reset();
+}
+
+TEST_P(DBIteratorTest, RefreshWithSnapshot) {
+ ASSERT_OK(Put("x", "y"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ReadOptions options;
+ options.snapshot = snapshot;
+ Iterator* iter = NewIterator(options);
+ ASSERT_OK(iter->status());
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(Put("c", "d"));
+
+ iter->Seek(Slice("a"));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Slice("x")), 0);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ ASSERT_OK(iter->status());
+ Status s = iter->Refresh();
+ ASSERT_TRUE(s.IsNotSupported());
+ db_->ReleaseSnapshot(snapshot);
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, CreationFailure) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::NewInternalIterator:StatusCallback", [](void* arg) {
+ *(reinterpret_cast<Status*>(arg)) = Status::Corruption("test status");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Iterator* iter = NewIterator(ReadOptions());
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithChangeDirection) {
+ Options options = CurrentOptions();
+ options.max_sequential_skip_in_iterations = 3;
+ DestroyAndReopen(options);
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("y", "1"));
+ ASSERT_OK(Put("y1", "1"));
+ ASSERT_OK(Put("y2", "1"));
+ ASSERT_OK(Put("y3", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Put("bar", "1"));
+ ASSERT_OK(Put("foo", "1"));
+
+ std::string upper_bound = "x";
+ Slice ub_slice(upper_bound);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub_slice;
+ ro.max_skippable_internal_keys = 1000;
+
+ Iterator* iter = NewIterator(ro);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bar", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, TableFilter) {
+ ASSERT_OK(Put("a", "1"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("b", "2"));
+ ASSERT_OK(Put("c", "3"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("d", "4"));
+ ASSERT_OK(Put("e", "5"));
+ ASSERT_OK(Put("f", "6"));
+ EXPECT_OK(dbfull()->Flush(FlushOptions()));
+
+ // Ensure the table_filter callback is called once for each table.
+ {
+ std::set<uint64_t> unseen{1, 2, 3};
+ ReadOptions opts;
+ opts.table_filter = [&](const TableProperties& props) {
+ auto it = unseen.find(props.num_entries);
+ if (it == unseen.end()) {
+ ADD_FAILURE() << "saw table properties with an unexpected "
+ << props.num_entries << " entries";
+ } else {
+ unseen.erase(it);
+ }
+ return true;
+ };
+ auto iter = NewIterator(opts);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "b->2");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "c->3");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->5");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "f->6");
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(unseen.empty());
+ delete iter;
+ }
+
+ // Ensure returning false in the table_filter hides the keys from that table
+ // during iteration.
+ {
+ ReadOptions opts;
+ opts.table_filter = [](const TableProperties& props) {
+ return props.num_entries != 2;
+ };
+ auto iter = NewIterator(opts);
+ iter->SeekToFirst();
+ ASSERT_EQ(IterStatus(iter), "a->1");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "d->4");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "e->5");
+ iter->Next();
+ ASSERT_EQ(IterStatus(iter), "f->6");
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ delete iter;
+ }
+}
+
+TEST_P(DBIteratorTest, UpperBoundWithPrevReseek) {
+ Options options = CurrentOptions();
+ options.max_sequential_skip_in_iterations = 3;
+ DestroyAndReopen(options);
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("y", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("z", "1"));
+ ASSERT_OK(Put("bar", "1"));
+ ASSERT_OK(Put("foo", "1"));
+ ASSERT_OK(Put("foo", "2"));
+
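+ // Additional versions of "foo"; the one written after the snapshot below is
+ // invisible to the iterator and has to be skipped when iterating backward.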
+ ASSERT_OK(Put("foo", "3"));
+ ASSERT_OK(Put("foo", "4"));
+ ASSERT_OK(Put("foo", "5"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(Put("foo", "6"));
+
+ std::string upper_bound = "x";
+ Slice ub_slice(upper_bound);
+ ReadOptions ro;
+ ro.snapshot = snapshot;
+ ro.iterate_upper_bound = &ub_slice;
+
+ Iterator* iter = NewIterator(ro);
+ iter->SeekForPrev("goo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ iter->Prev();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_P(DBIteratorTest, SkipStatistics) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ int skip_count = 0;
+
+ // write a bunch of kvs to the database.
+ ASSERT_OK(Put("a", "1"));
+ ASSERT_OK(Put("b", "1"));
+ ASSERT_OK(Put("c", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("d", "1"));
+ ASSERT_OK(Put("e", "1"));
+ ASSERT_OK(Put("f", "1"));
+ ASSERT_OK(Put("a", "2"));
+ ASSERT_OK(Put("b", "2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("d"));
+ ASSERT_OK(Delete("e"));
+ ASSERT_OK(Delete("f"));
+
+ Iterator* iter = NewIterator(ReadOptions());
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 3);
+ delete iter;
+ skip_count += 8; // 3 deletes + 3 original keys + 2 lower in sequence
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ iter = NewIterator(ReadOptions());
+ count = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 3);
+ delete iter;
+ skip_count += 8; // Same as above, but in reverse order
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ ASSERT_OK(Put("aa", "1"));
+ ASSERT_OK(Put("ab", "1"));
+ ASSERT_OK(Put("ac", "1"));
+ ASSERT_OK(Put("ad", "1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("ab"));
+ ASSERT_OK(Delete("ac"));
+ ASSERT_OK(Delete("ad"));
+
+ ReadOptions ro;
+ Slice prefix("b");
+ ro.iterate_upper_bound = &prefix;
+
+ iter = NewIterator(ro);
+ count = 0;
+ for (iter->Seek("aa"); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 1);
+ delete iter;
+ skip_count += 6; // 3 deletes + 3 original keys
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+
+ iter = NewIterator(ro);
+ count = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ASSERT_OK(iter->status());
+ count++;
+ }
+ ASSERT_EQ(count, 2);
+ delete iter;
+ // 3 deletes + 3 original keys + lower sequence of "a"
+ skip_count += 7;
+ ASSERT_EQ(skip_count, TestGetTickerCount(options, NUMBER_ITER_SKIP));
+}
+
+TEST_P(DBIteratorTest, SeekAfterHittingManyInternalKeys) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ReadOptions ropts;
+ ropts.max_skippable_internal_keys = 2;
+
+ ASSERT_OK(Put("1", "val_1"));
+ // Add more tombstones than max_skippable_internal_keys so that Next() fails.
+ ASSERT_OK(Delete("2"));
+ ASSERT_OK(Delete("3"));
+ ASSERT_OK(Delete("4"));
+ ASSERT_OK(Delete("5"));
+ ASSERT_OK(Put("6", "val_6"));
+
+ std::unique_ptr<Iterator> iter(NewIterator(ropts));
+ iter->SeekToFirst();
+
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "1");
+ ASSERT_EQ(iter->value().ToString(), "val_1");
+
+ // This should fail as incomplete due to too many non-visible internal keys on
+ // the way to the next valid user key.
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_TRUE(iter->status().IsIncomplete());
+
+ // Get the internal key at which Next() failed.
+ std::string prop_value;
+ ASSERT_OK(iter->GetProperty("rocksdb.iterator.internal-key", &prop_value));
+ ASSERT_EQ("4", prop_value);
+
+ // Create a new iterator to seek to the internal key.
+ std::unique_ptr<Iterator> iter2(NewIterator(ropts));
+ iter2->Seek(prop_value);
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_OK(iter2->status());
+
+ ASSERT_EQ(iter2->key().ToString(), "6");
+ ASSERT_EQ(iter2->value().ToString(), "val_6");
+}
+
+// Reproduces a former bug where the iterator would skip some records when
+// DBIter re-seeks a subiterator with Incomplete status.
+TEST_P(DBIteratorTest, NonBlockingIterationBugRepro) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ // Make sure the sst file has more than one block.
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryKeyPolicyFactory>();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ // Two records in sst file, each in its own block.
+ ASSERT_OK(Put("b", ""));
+ ASSERT_OK(Put("d", ""));
+ ASSERT_OK(Flush());
+
+ // Create a nonblocking iterator before writing to memtable.
+ ReadOptions ropt;
+ ropt.read_tier = kBlockCacheTier;
+ std::unique_ptr<Iterator> iter(NewIterator(ropt));
+
+ // Overwrite a key in memtable many times to hit
+ // max_sequential_skip_in_iterations (which is 8 by default).
+ for (int i = 0; i < 20; ++i) {
+ ASSERT_OK(Put("c", ""));
+ }
+
+ // Load the second block in sst file into the block cache.
+ {
+ std::unique_ptr<Iterator> iter2(NewIterator(ReadOptions()));
+ iter2->Seek("d");
+ }
+
+ // Finally seek the nonblocking iterator.
+ iter->Seek("a");
+ // With the bug, the status used to be OK, and the iterator used to point to
+ // "d".
+ EXPECT_TRUE(iter->status().IsIncomplete());
+}
+
+TEST_P(DBIteratorTest, SeekBackwardAfterOutOfUpperBound) {
+ ASSERT_OK(Put("a", ""));
+ ASSERT_OK(Put("b", ""));
+ ASSERT_OK(Flush());
+
+ ReadOptions ropt;
+ Slice ub = "b";
+ ropt.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> it(dbfull()->NewIterator(ropt));
+ it->SeekForPrev("a");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ("a", it->key().ToString());
+ it->Next();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ it->SeekForPrev("a");
+ ASSERT_OK(it->status());
+
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("a", it->key().ToString());
+}
+
+TEST_P(DBIteratorTest, AvoidReseekLevelIterator) {
+ Options options = CurrentOptions();
+ options.compression = CompressionType::kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 800;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ Random rnd(301);
+ std::string random_str = rnd.RandomString(180);
+
+ ASSERT_OK(Put("1", random_str));
+ ASSERT_OK(Put("2", random_str));
+ ASSERT_OK(Put("3", random_str));
+ ASSERT_OK(Put("4", random_str));
+ // A new block
+ ASSERT_OK(Put("5", random_str));
+ ASSERT_OK(Put("6", random_str));
+ ASSERT_OK(Put("7", random_str));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("8", random_str));
+ ASSERT_OK(Put("9", random_str));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ int num_find_file_in_level = 0;
+ int num_idx_blk_seek = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelIterator::Seek:BeforeFindFile",
+ [&](void* /*arg*/) { num_find_file_in_level++; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "IndexBlockIter::Seek:0", [&](void* /*arg*/) { num_idx_blk_seek++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ {
+ std::unique_ptr<Iterator> iter(NewIterator(ReadOptions()));
+ iter->Seek("1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("3");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(1, num_idx_blk_seek);
+
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(2, num_idx_blk_seek);
+
+ iter->Seek("6");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(2, num_idx_blk_seek);
+
+ iter->Seek("7");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(1, num_find_file_in_level);
+ ASSERT_EQ(3, num_idx_blk_seek);
+
+ iter->Seek("8");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(2, num_find_file_in_level);
+ // Still re-seek because "8" is the boundary key, which has
+ // the same user key as the seek key.
+ ASSERT_EQ(4, num_idx_blk_seek);
+
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(5, num_idx_blk_seek);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(5, num_idx_blk_seek);
+
+ // Seeking backward never allows the index block seek to be skipped
+ iter->Seek("5");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(3, num_find_file_in_level);
+ ASSERT_EQ(6, num_idx_blk_seek);
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// MyRocks may change iterate bounds before seek. Simply test to make sure such
+// usage doesn't break the iterator.
+TEST_P(DBIteratorTest, IterateBoundChangedBeforeSeek) {
+ Options options = CurrentOptions();
+ options.compression = CompressionType::kNoCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 100;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ std::string value(50, 'v');
+ Reopen(options);
+ ASSERT_OK(Put("aaa", value));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("bbb", "v"));
+ ASSERT_OK(Put("ccc", "v"));
+ ASSERT_OK(Put("ddd", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("eee", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ std::string ub1 = "e";
+ std::string ub2 = "c";
+ Slice ub(ub1);
+ ReadOptions read_opts1;
+ read_opts1.iterate_upper_bound = &ub;
+ Iterator* iter = NewIterator(read_opts1);
+ // Seek and iterate across the block boundary.
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ ub = Slice(ub2);
+ iter->Seek("b");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+
+ std::string lb1 = "a";
+ std::string lb2 = "c";
+ Slice lb(lb1);
+ ReadOptions read_opts2;
+ read_opts2.iterate_lower_bound = &lb;
+ iter = NewIterator(read_opts2);
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ lb = Slice(lb2);
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+}
+
+TEST_P(DBIteratorTest, IterateWithLowerBoundAcrossFileBoundary) {
+ ASSERT_OK(Put("aaa", "v"));
+ ASSERT_OK(Put("bbb", "v"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ccc", "v"));
+ ASSERT_OK(Put("ddd", "v"));
+ ASSERT_OK(Flush());
+ // Move both files to bottom level.
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Slice lower_bound("b");
+ ReadOptions read_opts;
+ read_opts.iterate_lower_bound = &lower_bound;
+ std::unique_ptr<Iterator> iter(NewIterator(read_opts));
+ iter->SeekForPrev("d");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("ccc", iter->key());
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bbb", iter->key());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+}
+
+TEST_P(DBIteratorTest, Blob) {
+ Options options = CurrentOptions();
+ options.enable_blob_files = true;
+ options.max_sequential_skip_in_iterations = 2;
+ options.statistics = CreateDBStatistics();
+
+ Reopen(options);
+
+ // Note: we have 4 KVs (3 of which are hidden) for key "b" and
+ // max_sequential_skip_in_iterations is set to 2. Thus, we need to do a reseek
+ // anytime we move from "b" to "c" or vice versa.
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "vb0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "vb1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "vb2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "vb3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("c", "vc"));
+ ASSERT_OK(Flush());
+
+ std::unique_ptr<Iterator> iter_guard(NewIterator(ReadOptions()));
+ Iterator* const iter = iter_guard.get();
+
+ iter->SeekToFirst();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToFirst();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->SeekToLast();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekToLast();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ iter->Seek("");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("a");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "a->va");
+ iter->Seek("ax");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+
+ iter->SeekForPrev("d");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("c");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 2);
+ ASSERT_EQ(IterStatus(iter), "c->vc");
+ iter->SeekForPrev("bx");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+
+ iter->Seek("b");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+ iter->Seek("z");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 3);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+ iter->SeekForPrev("b");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+ iter->SeekForPrev("");
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4);
+ ASSERT_EQ(IterStatus(iter), "(invalid)");
+
+ // Switch from reverse to forward
+ iter->SeekToLast();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 4);
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5);
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 5);
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+
+ // Switch from forward to reverse
+ iter->SeekToFirst();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 6);
+ iter->Next();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 7);
+ iter->Prev();
+ ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 8);
+ ASSERT_EQ(IterStatus(iter), "b->vb3");
+}
+
+INSTANTIATE_TEST_CASE_P(DBIteratorTestInstance, DBIteratorTest,
+ testing::Values(true, false));
+
+// Tests how DBIter work with ReadCallback
+class DBIteratorWithReadCallbackTest : public DBIteratorTest {};
+
+TEST_F(DBIteratorWithReadCallbackTest, ReadCallback) {
+ class TestReadCallback : public ReadCallback {
+ public:
+ explicit TestReadCallback(SequenceNumber _max_visible_seq)
+ : ReadCallback(_max_visible_seq) {}
+
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return seq <= max_visible_seq_;
+ }
+ };
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_OK(Put("a", "va"));
+ ASSERT_OK(Put("z", "vz"));
+ SequenceNumber seq1 = db_->GetLatestSequenceNumber();
+ TestReadCallback callback1(seq1);
+ ASSERT_OK(Put("foo", "v4"));
+ ASSERT_OK(Put("foo", "v5"));
+ ASSERT_OK(Put("bar", "v7"));
+
+ SequenceNumber seq2 = db_->GetLatestSequenceNumber();
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+ ->cfd();
+ // The iterator is supposed to see data before seq1.
+ Iterator* iter =
+ dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq2, &callback1);
+
+ // Seek
+ // The latest value of "foo" before seq1 is "v3"
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key
+ // "foo".
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+
+ // Next
+ // Seek to "a"
+ iter->Seek("a");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("va", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key
+ // "foo".
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+
+ // Prev
+ // Seek to "z"
+ iter->Seek("z");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("vz", iter->value());
+ // The previous key is "foo", which is visible to the iterator.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key "a".
+ iter->Prev(); // skipping "bar"
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("a", iter->key());
+ ASSERT_EQ("va", iter->value());
+
+ // SeekForPrev
+ // The previous key is "foo", which is visible to the iterator.
+ iter->SeekForPrev("y");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v3", iter->value());
+ // "bar" is not visible to the iterator. It will move on to the next key "a".
+ iter->SeekForPrev("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("a", iter->key());
+ ASSERT_EQ("va", iter->value());
+
+ delete iter;
+
+ // Prev beyond max_sequential_skip_in_iterations
+ uint64_t num_versions =
+ CurrentOptions().max_sequential_skip_in_iterations + 10;
+ for (uint64_t i = 0; i < num_versions; i++) {
+ ASSERT_OK(Put("bar", std::to_string(i)));
+ }
+ SequenceNumber seq3 = db_->GetLatestSequenceNumber();
+ TestReadCallback callback2(seq3);
+ ASSERT_OK(Put("bar", "v8"));
+ SequenceNumber seq4 = db_->GetLatestSequenceNumber();
+
+ // The iterator is supposed to see data before seq3.
+ iter = dbfull()->NewIteratorImpl(ReadOptions(), cfd, seq4, &callback2);
+ // Seek to "z", which is visible.
+ iter->Seek("z");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("vz", iter->value());
+ // Previous key is "foo" and the last value "v5" is visible.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("foo", iter->key());
+ ASSERT_EQ("v5", iter->value());
+ // Since "bar" has more values than max_sequential_skip_in_iterations,
+ // Prev() will ultimately fall back to a seek in the forward direction.
+ // Here we test that the fallback seek is correct.
+ // The last visible value should be (num_versions - 1), as "v8" is not
+ // visible.
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("bar", iter->key());
+ ASSERT_EQ(std::to_string(num_versions - 1), iter->value());
+
+ delete iter;
+}
+
+TEST_F(DBIteratorTest, BackwardIterationOnInplaceUpdateMemtable) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.inplace_update_support = false;
+ options.env = env_;
+ DestroyAndReopen(options);
+ constexpr int kNumKeys = 10;
+
+ // Write kNumKeys to WAL.
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ int count = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ ++count;
+ }
+ ASSERT_EQ(kNumKeys, count);
+ }
+
+ // Reopen and rebuild the memtable from WAL.
+ options.create_if_missing = false;
+ options.avoid_flush_during_recovery = true;
+ options.inplace_update_support = true;
+ options.allow_concurrent_memtable_write = false;
+ Reopen(options);
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ iter->SeekToLast();
+ // Backward iteration not supported due to inplace_update_support = true.
+ ASSERT_TRUE(iter->status().IsNotSupported());
+ ASSERT_FALSE(iter->Valid());
+ }
+}
+
+TEST_F(DBIteratorTest, IteratorRefreshReturnSV) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ std::unique_ptr<Iterator> iter{db_->NewIterator(ReadOptions())};
+ SyncPoint::GetInstance()->SetCallBack(
+ "ArenaWrappedDBIter::Refresh:SV", [&](void*) {
+ ASSERT_OK(db_->Put(WriteOptions(), "dummy", "new SV"));
+ // This makes the local SV obsolete.
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(iter->Refresh());
+ iter.reset();
+ // iter used to not clean up its SV, so the Close() below would hit an
+ // assertion error.
+ Close();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_kv_checksum_test.cc b/src/rocksdb/db/db_kv_checksum_test.cc
new file mode 100644
index 000000000..614399243
--- /dev/null
+++ b/src/rocksdb/db/db_kv_checksum_test.cc
@@ -0,0 +1,885 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_index.h"
+#include "db/db_test_util.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class WriteBatchOpType {
+ kPut = 0,
+ kDelete,
+ kSingleDelete,
+ kMerge,
+ kPutEntity,
+ kDeleteRange,
+ kNum,
+};
+
+// Integer addition is needed for `::testing::Range()` to take the enum type.
+WriteBatchOpType operator+(WriteBatchOpType lhs, const int rhs) {
+ using T = std::underlying_type<WriteBatchOpType>::type;
+ return static_cast<WriteBatchOpType>(static_cast<T>(lhs) + rhs);
+}
+
+enum class WriteMode {
+ // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key = 0`
+ // and `WriteOptions::protection_bytes_per_key = 0`
+ kWriteUnprotectedBatch = 0,
+ // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key > 0`.
+ kWriteProtectedBatch,
+ // `Write()` a `WriteBatch` constructed with `protection_bytes_per_key == 0`.
+ // Protection is enabled via `WriteOptions::protection_bytes_per_key > 0`.
+ kWriteOptionProtectedBatch,
+ // TODO(ajkr): add a mode that uses `Write()` wrappers, e.g., `Put()`.
+ kNum,
+};
+
+// Integer addition is needed for `::testing::Range()` to take the enum type.
+WriteMode operator+(WriteMode lhs, const int rhs) {
+ using T = std::underlying_type<WriteMode>::type;
+ return static_cast<WriteMode>(static_cast<T>(lhs) + rhs);
+}
+
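+// Builds a WriteBatch containing a single entry of the given type, using the
+// requested per-key protection, and returns it together with the status of
+// the write operation.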
+std::pair<WriteBatch, Status> GetWriteBatch(ColumnFamilyHandle* cf_handle,
+ size_t protection_bytes_per_key,
+ WriteBatchOpType op_type) {
+ Status s;
+ WriteBatch wb(0 /* reserved_bytes */, 0 /* max_bytes */,
+ protection_bytes_per_key, 0 /* default_cf_ts_sz */);
+ switch (op_type) {
+ case WriteBatchOpType::kPut:
+ s = wb.Put(cf_handle, "key", "val");
+ break;
+ case WriteBatchOpType::kDelete:
+ s = wb.Delete(cf_handle, "key");
+ break;
+ case WriteBatchOpType::kSingleDelete:
+ s = wb.SingleDelete(cf_handle, "key");
+ break;
+ case WriteBatchOpType::kDeleteRange:
+ s = wb.DeleteRange(cf_handle, "begin", "end");
+ break;
+ case WriteBatchOpType::kMerge:
+ s = wb.Merge(cf_handle, "key", "val");
+ break;
+ case WriteBatchOpType::kPutEntity:
+ s = wb.PutEntity(cf_handle, "key",
+ {{"attr_name1", "foo"}, {"attr_name2", "bar"}});
+ break;
+ case WriteBatchOpType::kNum:
+ assert(false);
+ }
+ return {std::move(wb), std::move(s)};
+}
+
+class DbKvChecksumTestBase : public DBTestBase {
+ public:
+ DbKvChecksumTestBase(const std::string& path, bool env_do_fsync)
+ : DBTestBase(path, env_do_fsync) {}
+
+ ColumnFamilyHandle* GetCFHandleToUse(ColumnFamilyHandle* column_family,
+ WriteBatchOpType op_type) const {
+ // Note: PutEntity cannot be called without column family
+ if (op_type == WriteBatchOpType::kPutEntity && !column_family) {
+ return db_->DefaultColumnFamily();
+ }
+
+ return column_family;
+ }
+};
+
+class DbKvChecksumTest
+ : public DbKvChecksumTestBase,
+ public ::testing::WithParamInterface<
+ std::tuple<WriteBatchOpType, char, WriteMode,
+ uint32_t /* memtable_protection_bytes_per_key */>> {
+ public:
+ DbKvChecksumTest()
+ : DbKvChecksumTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
+ op_type_ = std::get<0>(GetParam());
+ corrupt_byte_addend_ = std::get<1>(GetParam());
+ write_mode_ = std::get<2>(GetParam());
+ memtable_protection_bytes_per_key_ = std::get<3>(GetParam());
+ }
+
+ Status ExecuteWrite(ColumnFamilyHandle* cf_handle) {
+ switch (write_mode_) {
+ case WriteMode::kWriteUnprotectedBatch: {
+ auto batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+ 0 /* protection_bytes_per_key */, op_type_);
+ assert(batch_and_status.second.ok());
+ // Default write option has protection_bytes_per_key = 0
+ return db_->Write(WriteOptions(), &batch_and_status.first);
+ }
+ case WriteMode::kWriteProtectedBatch: {
+ auto batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+ 8 /* protection_bytes_per_key */, op_type_);
+ assert(batch_and_status.second.ok());
+ return db_->Write(WriteOptions(), &batch_and_status.first);
+ }
+ case WriteMode::kWriteOptionProtectedBatch: {
+ auto batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(cf_handle, op_type_),
+ 0 /* protection_bytes_per_key */, op_type_);
+ assert(batch_and_status.second.ok());
+ WriteOptions write_opts;
+ write_opts.protection_bytes_per_key = 8;
+ return db_->Write(write_opts, &batch_and_status.first);
+ }
+ case WriteMode::kNum:
+ assert(false);
+ }
+ return Status::NotSupported("WriteMode " +
+ std::to_string(static_cast<int>(write_mode_)));
+ }
+
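+ // SyncPoint callback that corrupts one byte of the encoded entry per
+ // attempt and advances the offset so that successive attempts cover every
+ // byte.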
+ void CorruptNextByteCallBack(void* arg) {
+ Slice encoded = *static_cast<Slice*>(arg);
+ if (entry_len_ == std::numeric_limits<size_t>::max()) {
+ // We learn the entry size on the first attempt
+ entry_len_ = encoded.size();
+ }
+ char* buf = const_cast<char*>(encoded.data());
+ buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+ ++corrupt_byte_offset_;
+ }
+
+ bool MoreBytesToCorrupt() { return corrupt_byte_offset_ < entry_len_; }
+
+ protected:
+ WriteBatchOpType op_type_;
+ char corrupt_byte_addend_;
+ WriteMode write_mode_;
+ uint32_t memtable_protection_bytes_per_key_;
+ size_t corrupt_byte_offset_ = 0;
+ size_t entry_len_ = std::numeric_limits<size_t>::max();
+};
+
+std::string GetOpTypeString(const WriteBatchOpType& op_type) {
+ switch (op_type) {
+ case WriteBatchOpType::kPut:
+ return "Put";
+ case WriteBatchOpType::kDelete:
+ return "Delete";
+ case WriteBatchOpType::kSingleDelete:
+ return "SingleDelete";
+ case WriteBatchOpType::kDeleteRange:
+ return "DeleteRange";
+ case WriteBatchOpType::kMerge:
+ return "Merge";
+ case WriteBatchOpType::kPutEntity:
+ return "PutEntity";
+ case WriteBatchOpType::kNum:
+ assert(false);
+ }
+ assert(false);
+ return "";
+}
+
+std::string GetWriteModeString(const WriteMode& mode) {
+ switch (mode) {
+ case WriteMode::kWriteUnprotectedBatch:
+ return "WriteUnprotectedBatch";
+ case WriteMode::kWriteProtectedBatch:
+ return "WriteProtectedBatch";
+ case WriteMode::kWriteOptionProtectedBatch:
+ return "kWriteOptionProtectedBatch";
+ case WriteMode::kNum:
+ assert(false);
+ }
+ return "";
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DbKvChecksumTest, DbKvChecksumTest,
+ ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+ WriteBatchOpType::kNum),
+ ::testing::Values(2, 103, 251),
+ ::testing::Range(WriteMode::kWriteProtectedBatch,
+ WriteMode::kNum),
+ ::testing::Values(0)),
+ [](const testing::TestParamInfo<
+ std::tuple<WriteBatchOpType, char, WriteMode, uint32_t>>& args) {
+ std::ostringstream oss;
+ oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
+ << static_cast<int>(
+ static_cast<unsigned char>(std::get<1>(args.param)))
+ << GetWriteModeString(std::get<2>(args.param))
+ << static_cast<uint32_t>(std::get<3>(args.param));
+ return oss.str();
+ });
+
+// TODO(ajkr): add a test that corrupts the `WriteBatch` contents. Such
+// corruptions should only be detectable in `WriteMode::kWriteProtectedBatch`.
+
+TEST_P(DbKvChecksumTest, MemTableAddCorrupted) {
+ // This test repeatedly attempts to write `WriteBatch`es containing a single
+ // entry of type `op_type_`. Each attempt has one byte corrupted in its
+ // memtable entry by adding `corrupt_byte_addend_` to its original value. The
+ // test repeats until an attempt has been made on each byte in the encoded
+ // memtable entry. All attempts are expected to fail with `Status::Corruption`.
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:Encoded",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+
+ while (MoreBytesToCorrupt()) {
+ // Failed memtable insert always leads to read-only mode, so we have to
+ // reopen for every attempt.
+ Options options = CurrentOptions();
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+ Reopen(options);
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // In case the above callback is not invoked, this test would run
+ // numeric_limits<size_t>::max() times until it reports an error (or
+ // exhausts disk space). This assert reports the error early.
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+ }
+}
+
+TEST_P(DbKvChecksumTest, MemTableAddWithColumnFamilyCorrupted) {
+ // This test repeatedly attempts to write `WriteBatch`es containing a single
+ // entry of type `op_type_` to a non-default column family. Each attempt has
+ // one byte corrupted in its memtable entry by adding `corrupt_byte_addend_`
+ // to its original value. The test repeats until an attempt has been made on
+ // each byte in the encoded memtable entry. All attempts are expected to fail
+ // with `Status::Corruption`.
+ Options options = CurrentOptions();
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:Encoded",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+
+ while (MoreBytesToCorrupt()) {
+ // Failed memtable insert always leads to read-only mode, so we have to
+ // reopen for every attempt.
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(ExecuteWrite(handles_[1]).IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // In case the above callback is not invoked, this test would run
+ // numeric_limits<size_t>::max() times until it reports an error (or
+ // exhausts disk space). This assert reports the error early.
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+ }
+}
+
+TEST_P(DbKvChecksumTest, NoCorruptionCase) {
+ // If this test fails, we may have found a piece of malfunctioning hardware.
+ auto batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(nullptr, op_type_),
+ 8 /* protection_bytes_per_key */, op_type_);
+ ASSERT_OK(batch_and_status.second);
+ ASSERT_OK(batch_and_status.first.VerifyChecksum());
+}
+
+TEST_P(DbKvChecksumTest, WriteToWALCorrupted) {
+ // This test repeatedly attempts to write `WriteBatch`es containing a single
+ // entry of type `op_type_`. Each attempt has one byte corrupted by adding
+ // `corrupt_byte_addend_` to its original value. The test repeats until an
+ // attempt has been made on each byte in the encoded write batch. All attempts
+ // are expected to fail with `Status::Corruption`.
+ Options options = CurrentOptions();
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::WriteToWAL:log_entry",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+ // First 8 bytes are for sequence number which is not protected in write batch
+ corrupt_byte_offset_ = 8;
+
+ while (MoreBytesToCorrupt()) {
+ // Corrupted write batch leads to read-only mode, so we have to
+ // reopen for every attempt.
+ Reopen(options);
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+ // Confirm that nothing was written to WAL
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // In case the above callback is not invoked, this test would run
+ // numeric_limits<size_t>::max() times until it reports an error (or
+ // exhausts disk space). This assert reports the error early.
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+ }
+}
+
+TEST_P(DbKvChecksumTest, WriteToWALWithColumnFamilyCorrupted) {
+ // This test repeatedly attempts to write `WriteBatch`es containing a single
+ // entry of type `op_type_`. Each attempt has one byte corrupted by adding
+ // `corrupt_byte_addend_` to its original value. The test repeats until an
+ // attempt has been made on each byte in the encoded write batch. All attempts
+ // are expected to fail with `Status::Corruption`.
+ Options options = CurrentOptions();
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+ CreateAndReopenWithCF({"pikachu"}, options);
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::WriteToWAL:log_entry",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+ // First 8 bytes are for sequence number which is not protected in write batch
+ corrupt_byte_offset_ = 8;
+
+ while (MoreBytesToCorrupt()) {
+ // Corrupted write batch leads to read-only mode, so we have to
+ // reopen for every attempt.
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(ExecuteWrite(nullptr /* cf_handle */).IsCorruption());
+ // Confirm that nothing was written to WAL
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // In case the above callback is not invoked, this test would run
+ // numeric_limits<size_t>::max() times until it reports an error (or
+ // exhausts disk space). This assert reports the error early.
+ ASSERT_TRUE(entry_len_ < std::numeric_limits<size_t>::max());
+ }
+}
+
+class DbKvChecksumTestMergedBatch
+ : public DbKvChecksumTestBase,
+ public ::testing::WithParamInterface<
+ std::tuple<WriteBatchOpType, WriteBatchOpType, char>> {
+ public:
+ DbKvChecksumTestMergedBatch()
+ : DbKvChecksumTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {
+ op_type1_ = std::get<0>(GetParam());
+ op_type2_ = std::get<1>(GetParam());
+ corrupt_byte_addend_ = std::get<2>(GetParam());
+ }
+
+ protected:
+ WriteBatchOpType op_type1_;
+ WriteBatchOpType op_type2_;
+ char corrupt_byte_addend_;
+};
+
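+// Corrupts the serialized write batch in place by adding
+// `corrupt_byte_addend` to the byte at `offset`.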
+void CorruptWriteBatch(Slice* content, size_t offset,
+ char corrupt_byte_addend) {
+ ASSERT_TRUE(offset < content->size());
+ char* buf = const_cast<char*>(content->data());
+ buf[offset] += corrupt_byte_addend;
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, NoCorruptionCase) {
+ // Verify the write batch checksum after write batch append
+ auto batch1 = GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+ 8 /* protection_bytes_per_key */, op_type1_);
+ ASSERT_OK(batch1.second);
+ auto batch2 = GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+ 8 /* protection_bytes_per_key */, op_type2_);
+ ASSERT_OK(batch2.second);
+ ASSERT_OK(WriteBatchInternal::Append(&batch1.first, &batch2.first));
+ ASSERT_OK(batch1.first.VerifyChecksum());
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, WriteToWALCorrupted) {
+ // This test has two writers repeatedly attempt to write `WriteBatch`es
+ // containing a single entry of type op_type1_ and op_type2_ respectively. The
+// leader of the write group writes the batch containing the entry of type
+ // op_type1_. One byte of the pre-merged write batches is corrupted by adding
+ // `corrupt_byte_addend_` to the batch's original value during each attempt.
+ // The test repeats until an attempt has been made on each byte in both
+ // pre-merged write batches. All attempts are expected to fail with
+ // `Status::Corruption`.
+ Options options = CurrentOptions();
+ if (op_type1_ == WriteBatchOpType::kMerge ||
+ op_type2_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ auto leader_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+ 8 /* protection_bytes_per_key */, op_type1_);
+ ASSERT_OK(leader_batch_and_status.second);
+ auto follower_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+ 8 /* protection_bytes_per_key */, op_type2_);
+ size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
+ size_t total_bytes =
+ leader_batch_size + follower_batch_and_status.first.GetDataSize();
+ // First 8 bytes are for sequence number which is not protected in write batch
+ size_t corrupt_byte_offset = 8;
+
+ std::atomic<bool> follower_joined{false};
+ std::atomic<int> leader_count{0};
+ port::Thread follower_thread;
+ // This callback should only be called by the leader thread
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
+ auto* leader = reinterpret_cast<WriteThread::Writer*>(arg_leader);
+ ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
+
+ // This callback should only be called by the follower thread
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
+ auto* follower =
+ reinterpret_cast<WriteThread::Writer*>(arg_follower);
+ // The leader thread will wait on this bool and hence wait until
+ // this writer joins the write group
+ ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
+ if (corrupt_byte_offset >= leader_batch_size) {
+ Slice batch_content = follower->batch->Data();
+ CorruptWriteBatch(&batch_content,
+ corrupt_byte_offset - leader_batch_size,
+ corrupt_byte_addend_);
+ }
+ // Leader busy waits on this flag
+ follower_joined = true;
+ // So the follower does not enter the outer callback at
+ // WriteThread::JoinBatchGroup:Wait2
+ SyncPoint::GetInstance()->DisableProcessing();
+ });
+
+ // Start the other writer thread which will join the write group as
+ // follower
+ follower_thread = port::Thread([&]() {
+ follower_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(nullptr, op_type2_),
+ 8 /* protection_bytes_per_key */, op_type2_);
+ ASSERT_OK(follower_batch_and_status.second);
+ ASSERT_TRUE(
+ db_->Write(WriteOptions(), &follower_batch_and_status.first)
+ .IsCorruption());
+ });
+
+ ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
+ if (corrupt_byte_offset < leader_batch_size) {
+ Slice batch_content = leader->batch->Data();
+ CorruptWriteBatch(&batch_content, corrupt_byte_offset,
+ corrupt_byte_addend_);
+ }
+ leader_count++;
+ while (!follower_joined) {
+ // busy waiting
+ }
+ });
+ while (corrupt_byte_offset < total_bytes) {
+ // Reopen DB since the failed WAL write led to read-only mode
+ Reopen(options);
+ SyncPoint::GetInstance()->EnableProcessing();
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
+ leader_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(nullptr, op_type1_),
+ 8 /* protection_bytes_per_key */, op_type1_);
+ ASSERT_OK(leader_batch_and_status.second);
+ ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
+ .IsCorruption());
+ follower_thread.join();
+ // Prevent leader thread from entering this callback
+ SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
+ ASSERT_EQ(1, leader_count);
+ // Nothing should have been written to WAL
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+
+ corrupt_byte_offset++;
+ if (corrupt_byte_offset == leader_batch_size) {
+ // skip over the sequence number part of follower's write batch
+ corrupt_byte_offset += 8;
+ }
+ follower_joined = false;
+ leader_count = 0;
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DbKvChecksumTestMergedBatch, WriteToWALWithColumnFamilyCorrupted) {
+ // This test has two writers repeatedly attempt to write `WriteBatch`es
+ // containing a single entry of type op_type1_ and op_type2_ respectively. The
+// leader of the write group writes the batch containing the entry of type
+ // op_type1_. One byte of the pre-merged write batches is corrupted by adding
+ // `corrupt_byte_addend_` to the batch's original value during each attempt.
+ // The test repeats until an attempt has been made on each byte in both
+ // pre-merged write batches. All attempts are expected to fail with
+ // `Status::Corruption`.
+ Options options = CurrentOptions();
+ if (op_type1_ == WriteBatchOpType::kMerge ||
+ op_type2_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+ CreateAndReopenWithCF({"ramen"}, options);
+
+ auto leader_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
+ 8 /* protection_bytes_per_key */, op_type1_);
+ ASSERT_OK(leader_batch_and_status.second);
+ auto follower_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(handles_[1], op_type2_),
+ 8 /* protection_bytes_per_key */, op_type2_);
+ size_t leader_batch_size = leader_batch_and_status.first.GetDataSize();
+ size_t total_bytes =
+ leader_batch_size + follower_batch_and_status.first.GetDataSize();
+ // First 8 bytes are for sequence number which is not protected in write batch
+ size_t corrupt_byte_offset = 8;
+
+ std::atomic<bool> follower_joined{false};
+ std::atomic<int> leader_count{0};
+ port::Thread follower_thread;
+ // This callback should only be called by the leader thread
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait2", [&](void* arg_leader) {
+ auto* leader = reinterpret_cast<WriteThread::Writer*>(arg_leader);
+ ASSERT_EQ(leader->state, WriteThread::STATE_GROUP_LEADER);
+
+ // This callback should only be called by the follower thread
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg_follower) {
+ auto* follower =
+ reinterpret_cast<WriteThread::Writer*>(arg_follower);
+ // The leader thread will wait on this bool and hence wait until
+ // this writer joins the write group
+ ASSERT_NE(follower->state, WriteThread::STATE_GROUP_LEADER);
+ if (corrupt_byte_offset >= leader_batch_size) {
+ Slice batch_content =
+ WriteBatchInternal::Contents(follower->batch);
+ CorruptWriteBatch(&batch_content,
+ corrupt_byte_offset - leader_batch_size,
+ corrupt_byte_addend_);
+ }
+ follower_joined = true;
+ // So the follower does not enter the outer callback at
+ // WriteThread::JoinBatchGroup:Wait2
+ SyncPoint::GetInstance()->DisableProcessing();
+ });
+
+ // Start the other writer thread which will join the write group as
+ // follower
+ follower_thread = port::Thread([&]() {
+ follower_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(handles_[1], op_type2_),
+ 8 /* protection_bytes_per_key */, op_type2_);
+ ASSERT_OK(follower_batch_and_status.second);
+ ASSERT_TRUE(
+ db_->Write(WriteOptions(), &follower_batch_and_status.first)
+ .IsCorruption());
+ });
+
+ ASSERT_EQ(leader->batch->GetDataSize(), leader_batch_size);
+ if (corrupt_byte_offset < leader_batch_size) {
+ Slice batch_content = WriteBatchInternal::Contents(leader->batch);
+ CorruptWriteBatch(&batch_content, corrupt_byte_offset,
+ corrupt_byte_addend_);
+ }
+ leader_count++;
+ while (!follower_joined) {
+ // busy waiting
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ while (corrupt_byte_offset < total_bytes) {
+ // Reopen DB since the failed WAL write led to read-only mode
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "ramen"}, options);
+ SyncPoint::GetInstance()->EnableProcessing();
+ auto log_size_pre_write = dbfull()->TEST_total_log_size();
+ leader_batch_and_status =
+ GetWriteBatch(GetCFHandleToUse(handles_[1], op_type1_),
+ 8 /* protection_bytes_per_key */, op_type1_);
+ ASSERT_OK(leader_batch_and_status.second);
+ ASSERT_TRUE(db_->Write(WriteOptions(), &leader_batch_and_status.first)
+ .IsCorruption());
+ follower_thread.join();
+ // Prevent leader thread from entering this callback
+ SyncPoint::GetInstance()->ClearCallBack("WriteThread::JoinBatchGroup:Wait");
+
+ ASSERT_EQ(1, leader_count);
+ // Nothing should have been written to WAL
+ ASSERT_EQ(log_size_pre_write, dbfull()->TEST_total_log_size());
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+
+ corrupt_byte_offset++;
+ if (corrupt_byte_offset == leader_batch_size) {
+ // skip over the sequence number part of follower's write batch
+ corrupt_byte_offset += 8;
+ }
+ follower_joined = false;
+ leader_count = 0;
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DbKvChecksumTestMergedBatch, DbKvChecksumTestMergedBatch,
+ ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+ WriteBatchOpType::kNum),
+ ::testing::Range(static_cast<WriteBatchOpType>(0),
+ WriteBatchOpType::kNum),
+ ::testing::Values(2, 103, 251)),
+ [](const testing::TestParamInfo<
+ std::tuple<WriteBatchOpType, WriteBatchOpType, char>>& args) {
+ std::ostringstream oss;
+ oss << GetOpTypeString(std::get<0>(args.param))
+ << GetOpTypeString(std::get<1>(args.param)) << "Add"
+ << static_cast<int>(
+ static_cast<unsigned char>(std::get<2>(args.param)));
+ return oss.str();
+ });
+
+// TODO: add test for transactions
+// TODO: add test for corrupted write batch with WAL disabled
+
+class DbKVChecksumWALToWriteBatchTest : public DBTestBase {
+ public:
+ DbKVChecksumWALToWriteBatchTest()
+ : DBTestBase("db_kv_checksum_test", /*env_do_fsync=*/false) {}
+};
+
+TEST_F(DbKVChecksumWALToWriteBatchTest, WriteBatchChecksumHandoff) {
+ Options options = CurrentOptions();
+ Reopen(options);
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ std::string content = "";
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:batch",
+ [&](void* batch_ptr) {
+ WriteBatch* batch = reinterpret_cast<WriteBatch*>(batch_ptr);
+ content.assign(batch->Data().data(), batch->GetDataSize());
+ Slice batch_content = batch->Data();
+ // Corrupt first bit
+ CorruptWriteBatch(&batch_content, 0, 1);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RecoverLogFiles:BeforeUpdateProtectionInfo:checksum",
+ [&](void* checksum_ptr) {
+ // Verify that checksum is produced on the batch content
+ uint64_t checksum = *reinterpret_cast<uint64_t*>(checksum_ptr);
+ ASSERT_EQ(checksum, XXH3_64bits(content.data(), content.size()));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(TryReopen(options).IsCorruption());
+ SyncPoint::GetInstance()->DisableProcessing();
+};
+
+// TODO (cbi): add DeleteRange coverage once it is implemented
+class DbMemtableKVChecksumTest : public DbKvChecksumTest {
+ public:
+ DbMemtableKVChecksumTest() : DbKvChecksumTest() {}
+
+ protected:
+ // Indices in the memtable entry that we will not corrupt.
+ // For memtable entry format, see comments in MemTable::Add().
+ // We do not corrupt the key length and value length fields in this test
+ // case since doing so causes a segfault and ASAN will complain.
+ // For this test case, the key and value are both of length 3, so the
+ // key length field is at index 0 and the value length field is at index 12.
+ const std::set<size_t> index_not_to_corrupt{0, 12};
+
+ void SkipNotToCorruptEntry() {
+ if (index_not_to_corrupt.find(corrupt_byte_offset_) !=
+ index_not_to_corrupt.end()) {
+ corrupt_byte_offset_++;
+ }
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DbMemtableKVChecksumTest, DbMemtableKVChecksumTest,
+ ::testing::Combine(::testing::Range(static_cast<WriteBatchOpType>(0),
+ WriteBatchOpType::kDeleteRange),
+ ::testing::Values(2, 103, 251),
+ ::testing::Range(static_cast<WriteMode>(0),
+ WriteMode::kWriteOptionProtectedBatch),
+ // skip 1 byte checksum as it makes test flaky
+ ::testing::Values(2, 4, 8)),
+ [](const testing::TestParamInfo<
+ std::tuple<WriteBatchOpType, char, WriteMode, uint32_t>>& args) {
+ std::ostringstream oss;
+ oss << GetOpTypeString(std::get<0>(args.param)) << "Add"
+ << static_cast<int>(
+ static_cast<unsigned char>(std::get<1>(args.param)))
+ << GetWriteModeString(std::get<2>(args.param))
+ << static_cast<uint32_t>(std::get<3>(args.param));
+ return oss.str();
+ });
+
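+// The tests below corrupt bytes of the encoded memtable entry (skipping the
+// key/value length fields) and expect the protected read paths (Get,
+// iterators, flush) to report Corruption.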
+TEST_P(DbMemtableKVChecksumTest, GetWithCorruptAfterMemtableInsert) {
+  // Record the memtable entry size.
+  // We do not corrupt the memtable entry here since doing so would segfault
+  // or fail some asserts inside the memtablerep implementation,
+  // e.g., when key_len is corrupted.
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:BeforeReturn:Encoded", [&](void* arg) {
+ Slice encoded = *static_cast<Slice*>(arg);
+ entry_len_ = encoded.size();
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "Memtable::SaveValue:Begin:entry", [&](void* entry) {
+ char* buf = *static_cast<char**>(entry);
+ buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+ ++corrupt_byte_offset_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options = CurrentOptions();
+ options.memtable_protection_bytes_per_key =
+ memtable_protection_bytes_per_key_;
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ SkipNotToCorruptEntry();
+ while (MoreBytesToCorrupt()) {
+ Reopen(options);
+ ASSERT_OK(ExecuteWrite(nullptr));
+ std::string val;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &val).IsCorruption());
+ Destroy(options);
+ SkipNotToCorruptEntry();
+ }
+}
+
+TEST_P(DbMemtableKVChecksumTest,
+ GetWithColumnFamilyCorruptAfterMemtableInsert) {
+  // Record the memtable entry size.
+  // We do not corrupt the memtable entry here since doing so would segfault
+  // or fail some asserts inside the memtablerep implementation,
+  // e.g., when key_len is corrupted.
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:BeforeReturn:Encoded", [&](void* arg) {
+ Slice encoded = *static_cast<Slice*>(arg);
+ entry_len_ = encoded.size();
+ });
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "Memtable::SaveValue:Begin:entry", [&](void* entry) {
+ char* buf = *static_cast<char**>(entry);
+ buf[corrupt_byte_offset_] += corrupt_byte_addend_;
+ ++corrupt_byte_offset_;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options = CurrentOptions();
+ options.memtable_protection_bytes_per_key =
+ memtable_protection_bytes_per_key_;
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ SkipNotToCorruptEntry();
+ while (MoreBytesToCorrupt()) {
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(ExecuteWrite(handles_[1]));
+ std::string val;
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), handles_[1], "key", &val).IsCorruption());
+ Destroy(options);
+ SkipNotToCorruptEntry();
+ }
+}
+
+TEST_P(DbMemtableKVChecksumTest, IteratorWithCorruptAfterMemtableInsert) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:BeforeReturn:Encoded",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options = CurrentOptions();
+ options.memtable_protection_bytes_per_key =
+ memtable_protection_bytes_per_key_;
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ SkipNotToCorruptEntry();
+ while (MoreBytesToCorrupt()) {
+ Reopen(options);
+ ASSERT_OK(ExecuteWrite(nullptr));
+ Iterator* it = db_->NewIterator(ReadOptions());
+ it->SeekToFirst();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_TRUE(it->status().IsCorruption());
+ delete it;
+ Destroy(options);
+ SkipNotToCorruptEntry();
+ }
+}
+
+TEST_P(DbMemtableKVChecksumTest,
+ IteratorWithColumnFamilyCorruptAfterMemtableInsert) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:BeforeReturn:Encoded",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options = CurrentOptions();
+ options.memtable_protection_bytes_per_key =
+ memtable_protection_bytes_per_key_;
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ SkipNotToCorruptEntry();
+ while (MoreBytesToCorrupt()) {
+ Reopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(ExecuteWrite(handles_[1]));
+ Iterator* it = db_->NewIterator(ReadOptions(), handles_[1]);
+ it->SeekToFirst();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_TRUE(it->status().IsCorruption());
+ delete it;
+ Destroy(options);
+ SkipNotToCorruptEntry();
+ }
+}
+
+TEST_P(DbMemtableKVChecksumTest, FlushWithCorruptAfterMemtableInsert) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTable::Add:BeforeReturn:Encoded",
+ std::bind(&DbKvChecksumTest::CorruptNextByteCallBack, this,
+ std::placeholders::_1));
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options = CurrentOptions();
+ options.memtable_protection_bytes_per_key =
+ memtable_protection_bytes_per_key_;
+ if (op_type_ == WriteBatchOpType::kMerge) {
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ }
+
+ SkipNotToCorruptEntry();
+  // Unlike the other tests, we do not corrupt each byte since Flush() is slow.
+ Reopen(options);
+ ASSERT_OK(ExecuteWrite(nullptr));
+ ASSERT_TRUE(Flush().IsCorruption());
+ // DB enters read-only state when flush reads corrupted data
+ ASSERT_TRUE(dbfull()->TEST_GetBGError().IsCorruption());
+ Destroy(options);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_log_iter_test.cc b/src/rocksdb/db/db_log_iter_test.cc
new file mode 100644
index 000000000..4e982858c
--- /dev/null
+++ b/src/rocksdb/db/db_log_iter_test.cc
@@ -0,0 +1,305 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// The introduction of SyncPoint effectively disabled building and running this
+// test in Release builds, which is a pity, because it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestXactLogIterator : public DBTestBase {
+ public:
+ DBTestXactLogIterator()
+ : DBTestBase("db_log_iter_test", /*env_do_fsync=*/true) {}
+
+ std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+ const SequenceNumber seq) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ Status status = dbfull()->GetUpdatesSince(seq, &iter);
+ EXPECT_OK(status);
+ EXPECT_TRUE(iter->Valid());
+ return iter;
+ }
+};
+
+namespace {
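+// Drains the given TransactionLogIterator, verifying that batch sequence
+// numbers are strictly increasing. Stores the number of batches read in
+// `count` and returns the sequence number of the last batch. When `expect_ok`
+// is false, the iterator is expected to end in a non-OK status (e.g. after
+// hitting a truncated log).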
+SequenceNumber ReadRecords(std::unique_ptr<TransactionLogIterator>& iter,
+ int& count, bool expect_ok = true) {
+ count = 0;
+ SequenceNumber lastSequence = 0;
+ BatchResult res;
+ while (iter->Valid()) {
+ res = iter->GetBatch();
+ EXPECT_TRUE(res.sequence > lastSequence);
+ ++count;
+ lastSequence = res.sequence;
+ EXPECT_OK(iter->status());
+ iter->Next();
+ }
+ if (expect_ok) {
+ EXPECT_OK(iter->status());
+ } else {
+ EXPECT_NOK(iter->status());
+ }
+ return res.sequence;
+}
+
+void ExpectRecords(const int expected_no_records,
+ std::unique_ptr<TransactionLogIterator>& iter) {
+ int num_records;
+ ReadRecords(iter, num_records);
+ ASSERT_EQ(num_records, expected_no_records);
+}
+} // anonymous namespace
+
+TEST_F(DBTestXactLogIterator, TransactionLogIterator) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(0, "key1", DummyString(1024)));
+ ASSERT_OK(Put(1, "key2", DummyString(1024)));
+ ASSERT_OK(Put(1, "key2", DummyString(1024)));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3U);
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(3, iter);
+ }
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ env_->SleepForMicroseconds(2 * 1000 * 1000);
+ {
+ ASSERT_OK(Put(0, "key4", DummyString(1024)));
+ ASSERT_OK(Put(1, "key5", DummyString(1024)));
+ ASSERT_OK(Put(0, "key6", DummyString(1024)));
+ }
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(6, iter);
+ }
+ } while (ChangeCompactOptions());
+}
+
+#ifndef NDEBUG  // sync points are not included in -DNDEBUG builds
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorRace) {
+ static const int LOG_ITERATOR_RACE_TEST_COUNT = 2;
+ static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = {
+ {"WalManager::GetSortedWalFiles:1", "WalManager::PurgeObsoleteFiles:1",
+ "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"},
+ {"WalManager::GetSortedWalsOfType:1", "WalManager::PurgeObsoleteFiles:1",
+ "WalManager::PurgeObsoleteFiles:2",
+ "WalManager::GetSortedWalsOfType:2"}};
+ for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) {
+    // Set up sync point dependencies to reproduce the race condition where
+    // a log file is moved to the archive dir in the middle of GetSortedWalFiles
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {sync_points[test][0], sync_points[test][1]},
+ {sync_points[test][2], sync_points[test][3]},
+ });
+
+ do {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("key1", DummyString(1024)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("key2", DummyString(1024)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("key3", DummyString(1024)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(Put("key4", DummyString(1024)));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4U);
+ ASSERT_OK(dbfull()->FlushWAL(false));
+
+ {
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(4, iter);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+      // Trigger an async flush and a log move. The log move will wait until
+      // GetSortedWalFiles:1 is reached, to reproduce the race condition.
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(dbfull()->Flush(flush_options));
+
+ // "key5" would be written in a new memtable and log
+ ASSERT_OK(Put("key5", DummyString(1024)));
+ ASSERT_OK(dbfull()->FlushWAL(false));
+ {
+ // this iter would miss "key4" if not fixed
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(5, iter);
+ }
+ } while (ChangeCompactOptions());
+ }
+}
+#endif
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorStallAtLastRecord) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("key1", DummyString(1024)));
+ auto iter = OpenTransactionLogIter(0);
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_OK(Put("key2", DummyString(1024)));
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCheckAfterRestart) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("key1", DummyString(1024)));
+ ASSERT_OK(Put("key2", DummyString(1023)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ Reopen(options);
+ auto iter = OpenTransactionLogIter(0);
+ ExpectRecords(2, iter);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorCorruptedLog) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 1024; i++) {
+ ASSERT_OK(Put("key" + std::to_string(i), DummyString(10)));
+ }
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->FlushWAL(false));
+
+ // Corrupt this log to create a gap
+ ASSERT_OK(db_->DisableFileDeletions());
+
+ VectorLogPtr wal_files;
+ ASSERT_OK(db_->GetSortedWalFiles(wal_files));
+ ASSERT_FALSE(wal_files.empty());
+
+ const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName();
+ ASSERT_OK(test::TruncateFile(env_, logfile_path,
+ wal_files.front()->SizeFileBytes() / 2));
+
+ ASSERT_OK(db_->EnableFileDeletions());
+
+ // Insert a new entry to a new log file
+ ASSERT_OK(Put("key1025", DummyString(10)));
+ ASSERT_OK(db_->FlushWAL(false));
+
+ // Try to read from the beginning. Should stop before the gap and read less
+ // than 1025 entries
+ auto iter = OpenTransactionLogIter(0);
+ int count = 0;
+ SequenceNumber last_sequence_read = ReadRecords(iter, count, false);
+ ASSERT_LT(last_sequence_read, 1025U);
+
+ // Try to read past the gap, should be able to seek to key1025
+ auto iter2 = OpenTransactionLogIter(last_sequence_read + 1);
+ ExpectRecords(1, iter2);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBatchOperations) {
+ do {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024)));
+ ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024)));
+ ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024)));
+ ASSERT_OK(batch.Delete(handles_[0], "key2"));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Flush(0));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_OK(Put(1, "key4", DummyString(1024)));
+ auto iter = OpenTransactionLogIter(3);
+ ExpectRecords(2, iter);
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(handles_[1], "key1", DummyString(1024)));
+ ASSERT_OK(batch.Put(handles_[0], "key2", DummyString(1024)));
+ ASSERT_OK(batch.PutLogData(Slice("blob1")));
+ ASSERT_OK(batch.Put(handles_[1], "key3", DummyString(1024)));
+ ASSERT_OK(batch.PutLogData(Slice("blob2")));
+ ASSERT_OK(batch.Delete(handles_[0], "key2"));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ }
+
+ auto res = OpenTransactionLogIter(0)->GetBatch();
+ struct Handler : public WriteBatch::Handler {
+ std::string seen;
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& value) override {
+ seen += "Put(" + std::to_string(cf) + ", " + key.ToString() + ", " +
+ std::to_string(value.size()) + ")";
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) override {
+ seen += "Merge(" + std::to_string(cf) + ", " + key.ToString() + ", " +
+ std::to_string(value.size()) + ")";
+ return Status::OK();
+ }
+ void LogData(const Slice& blob) override {
+ seen += "LogData(" + blob.ToString() + ")";
+ }
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ seen += "Delete(" + std::to_string(cf) + ", " + key.ToString() + ")";
+ return Status::OK();
+ }
+ } handler;
+ ASSERT_OK(res.writeBatchPtr->Iterate(&handler));
+ ASSERT_EQ(
+ "Put(1, key1, 1024)"
+ "Put(0, key2, 1024)"
+ "LogData(blob1)"
+ "Put(1, key3, 1024)"
+ "LogData(blob2)"
+ "Delete(0, key2)",
+ handler.seen);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_logical_block_size_cache_test.cc b/src/rocksdb/db/db_logical_block_size_cache_test.cc
new file mode 100644
index 000000000..13c16618e
--- /dev/null
+++ b/src/rocksdb/db/db_logical_block_size_cache_test.cc
@@ -0,0 +1,521 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "test_util/testharness.h"
+
+#ifdef OS_LINUX
+#include "env/io_posix.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+
+namespace ROCKSDB_NAMESPACE {
+class EnvWithCustomLogicalBlockSizeCache : public EnvWrapper {
+ public:
+ EnvWithCustomLogicalBlockSizeCache(Env* env, LogicalBlockSizeCache* cache)
+ : EnvWrapper(env), cache_(cache) {}
+
+ Status RegisterDbPaths(const std::vector<std::string>& paths) override {
+ return cache_->RefAndCacheLogicalBlockSize(paths);
+ }
+
+ Status UnregisterDbPaths(const std::vector<std::string>& paths) override {
+ cache_->UnrefAndTryRemoveCachedLogicalBlockSize(paths);
+ return Status::OK();
+ }
+
+ private:
+ LogicalBlockSizeCache* cache_;
+};
+
+class DBLogicalBlockSizeCacheTest : public testing::Test {
+ public:
+ DBLogicalBlockSizeCacheTest()
+ : dbname_(test::PerThreadDBPath("logical_block_size_cache_test")),
+ data_path_0_(dbname_ + "/data_path_0"),
+ data_path_1_(dbname_ + "/data_path_1"),
+ cf_path_0_(dbname_ + "/cf_path_0"),
+ cf_path_1_(dbname_ + "/cf_path_1") {
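+    // Stub block-size callbacks: report the fd value itself as a file's
+    // logical block size and a fixed 1024 bytes for directories, so the tests
+    // below can assert on cache contents deterministically.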
+ auto get_fd_block_size = [&](int fd) { return fd; };
+ auto get_dir_block_size = [&](const std::string& /*dir*/, size_t* size) {
+ *size = 1024;
+ return Status::OK();
+ };
+ cache_.reset(
+ new LogicalBlockSizeCache(get_fd_block_size, get_dir_block_size));
+ env_.reset(
+ new EnvWithCustomLogicalBlockSizeCache(Env::Default(), cache_.get()));
+ }
+
+ protected:
+ std::string dbname_;
+ std::string data_path_0_;
+ std::string data_path_1_;
+ std::string cf_path_0_;
+ std::string cf_path_1_;
+ std::unique_ptr<LogicalBlockSizeCache> cache_;
+ std::unique_ptr<Env> env_;
+};
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenClose) {
+ // Tests that Open will cache the logical block size for data paths,
+ // and Close will remove the cached sizes.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+ options.db_paths = {{data_path_0_, 2048}, {data_path_1_, 2048}};
+
+ for (int i = 0; i < 2; i++) {
+ DB* db;
+ if (!i) {
+ printf("Open\n");
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ } else {
+#ifdef ROCKSDB_LITE
+ break;
+#else
+ printf("OpenForReadOnly\n");
+ ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db));
+#endif
+ }
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(data_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+ ASSERT_OK(db->Close());
+ ASSERT_EQ(0, cache_->Size());
+ delete db;
+ }
+ ASSERT_OK(DestroyDB(dbname_, options, {}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenDelete) {
+  // Tests that Open will cache the logical block size for the db path,
+  // and that deleting the db pointer will remove the cached sizes.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+
+ for (int i = 0; i < 2; i++) {
+ DB* db;
+ if (!i) {
+ printf("Open\n");
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ } else {
+#ifdef ROCKSDB_LITE
+ break;
+#else
+ printf("OpenForReadOnly\n");
+ ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db));
+#endif
+ }
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+ }
+ ASSERT_OK(DestroyDB(dbname_, options, {}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamily) {
+  // Tests that CreateColumnFamily will cache the cf_paths,
+  // that dropping the column family won't drop the cache,
+  // and that dropping and then destroying the column family handle will
+  // drop the cache.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+ ColumnFamilyOptions cf_options;
+ cf_options.cf_paths = {{cf_path_0_, 1024}, {cf_path_1_, 2048}};
+
+ DB* db;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+ ColumnFamilyHandle* cf = nullptr;
+ ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf));
+ ASSERT_EQ(3, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+ // Drop column family does not drop cache.
+ ASSERT_OK(db->DropColumnFamily(cf));
+ ASSERT_EQ(3, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+ // Delete handle will drop cache.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cf));
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+ ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, CreateColumnFamilies) {
+ // To test:
+ // (1) CreateColumnFamilies will cache the cf_paths in
+ // DBLogicalBlockSizeCache
+ // (2) Dropping column family handles associated with
+ // that cf_paths won't drop the cached cf_paths
+ // (3) Deleting all the column family handles associated
+ // with that cf_paths will drop the cached cf_paths
+
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+ ColumnFamilyOptions cf_options;
+ cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+ DB* db;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+ std::vector<ColumnFamilyHandle*> cfs;
+ ASSERT_OK(db->CreateColumnFamilies(cf_options, {"cf1", "cf2"}, &cfs));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+ // Drop column family does not drop cf_path_0_'s entry from cache
+ for (ColumnFamilyHandle* cf : cfs) {
+ ASSERT_OK(db->DropColumnFamily(cf));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+ }
+
+  // Deleting one cf handle will not drop cf_path_0_'s entry from the cache
+  // because another handle is still referencing cf_path_0_.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+
+ // Delete all cf handles and ensure the ref count of cf_path_0_ in cache_
+ // can be properly decreased by releasing any background reference to the
+ // ColumnFamilyData during db deletion
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ delete db;
+
+  // Now the ref count of cf_path_0_ in cache_ has been properly decreased and
+  // cf_path_0_'s entry is dropped from the cache.
+ ASSERT_EQ(0, cache_->Size());
+ ASSERT_OK(
+ DestroyDB(dbname_, options, {{"cf1", cf_options}, {"cf2", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, OpenWithColumnFamilies) {
+  // Tests that opening two column families with the same cf_path will cache
+  // the cf_path with 2 references to the cached size, that dropping a column
+  // family won't drop the cache, and that dropping and then destroying the
+  // column family handle will drop the cache.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+
+ ColumnFamilyOptions cf_options;
+ cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+ for (int i = 0; i < 2; i++) {
+ DB* db;
+ ColumnFamilyHandle* cf1 = nullptr;
+ ColumnFamilyHandle* cf2 = nullptr;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ ASSERT_OK(db->CreateColumnFamily(cf_options, "cf1", &cf1));
+ ASSERT_OK(db->CreateColumnFamily(cf_options, "cf2", &cf2));
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cf1));
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cf2));
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+
+ std::vector<ColumnFamilyHandle*> cfs;
+ if (!i) {
+ printf("Open\n");
+ ASSERT_OK(DB::Open(options, dbname_,
+ {{"cf1", cf_options},
+ {"cf2", cf_options},
+ {"default", ColumnFamilyOptions()}},
+ &cfs, &db));
+ } else {
+#ifdef ROCKSDB_LITE
+ break;
+#else
+ printf("OpenForReadOnly\n");
+ ASSERT_OK(DB::OpenForReadOnly(options, dbname_,
+ {{"cf1", cf_options},
+ {"cf2", cf_options},
+ {"default", ColumnFamilyOptions()}},
+ &cfs, &db));
+#endif
+ }
+
+ // Logical block sizes of dbname_ and cf_path_0_ are cached during Open.
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+ // Drop handles won't drop the cache.
+ ASSERT_OK(db->DropColumnFamily(cfs[0]));
+ ASSERT_OK(db->DropColumnFamily(cfs[1]));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+ // Delete 1st handle won't drop the cache for cf_path_0_.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ // Delete 2nd handle will drop the cache for cf_path_0_.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+ // Delete the default handle won't affect the cache because db still refers
+ // to the default CF.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[2]));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+ }
+ ASSERT_OK(
+ DestroyDB(dbname_, options, {{"cf1", cf_options}, {"cf2", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, DestroyColumnFamilyHandle) {
+  // Tests that destroying a column family handle without dropping the column
+  // family won't drop the cache, because compaction and flush might still need
+  // to get the logical block size when opening new files.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+ ColumnFamilyOptions cf_options;
+ cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+ DB* db;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ColumnFamilyHandle* cf = nullptr;
+ ASSERT_OK(db->CreateColumnFamily(cf_options, "cf", &cf));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ // Delete handle won't drop cache.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cf));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+
+ // Open with column families.
+ std::vector<ColumnFamilyHandle*> cfs;
+ for (int i = 0; i < 2; i++) {
+ if (!i) {
+ printf("Open\n");
+ ASSERT_OK(DB::Open(
+ options, dbname_,
+ {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db));
+ } else {
+#ifdef ROCKSDB_LITE
+ break;
+#else
+ printf("OpenForReadOnly\n");
+ ASSERT_OK(DB::OpenForReadOnly(
+ options, dbname_,
+ {{"cf", cf_options}, {"default", ColumnFamilyOptions()}}, &cfs, &db));
+#endif
+ }
+ // cf_path_0_ and dbname_ are cached.
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ // Deleting handle won't drop cache.
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[0]));
+ ASSERT_OK(db->DestroyColumnFamilyHandle(cfs[1]));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(dbname_));
+ ASSERT_EQ(1, cache_->GetRefCount(dbname_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ delete db;
+ ASSERT_EQ(0, cache_->Size());
+ }
+ ASSERT_OK(DestroyDB(dbname_, options, {{"cf", cf_options}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithDifferentPaths) {
+ // Tests the cache behavior when there are multiple DBs sharing the same env
+ // with different db_paths and cf_paths.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+
+ DB* db0;
+ ASSERT_OK(DB::Open(options, data_path_0_, &db0));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+
+ ColumnFamilyOptions cf_options0;
+ cf_options0.cf_paths = {{cf_path_0_, 1024}};
+ ColumnFamilyHandle* cf0;
+ ASSERT_OK(db0->CreateColumnFamily(cf_options0, "cf", &cf0));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ DB* db1;
+ ASSERT_OK(DB::Open(options, data_path_1_, &db1));
+ ASSERT_EQ(3, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+ ASSERT_TRUE(cache_->Contains(data_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+
+ ColumnFamilyOptions cf_options1;
+ cf_options1.cf_paths = {{cf_path_1_, 1024}};
+ ColumnFamilyHandle* cf1;
+ ASSERT_OK(db1->CreateColumnFamily(cf_options1, "cf", &cf1));
+ ASSERT_EQ(4, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+ ASSERT_TRUE(cache_->Contains(data_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+ ASSERT_TRUE(cache_->Contains(cf_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+
+ ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0));
+ delete db0;
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_1_));
+ ASSERT_TRUE(cache_->Contains(cf_path_1_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_1_));
+ ASSERT_OK(DestroyDB(data_path_0_, options, {{"cf", cf_options0}}));
+
+ ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1));
+ delete db1;
+ ASSERT_EQ(0, cache_->Size());
+ ASSERT_OK(DestroyDB(data_path_1_, options, {{"cf", cf_options1}}));
+}
+
+TEST_F(DBLogicalBlockSizeCacheTest, MultiDBWithSamePaths) {
+ // Tests the cache behavior when there are multiple DBs sharing the same env
+ // with the same db_paths and cf_paths.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_.get();
+ options.db_paths = {{data_path_0_, 1024}};
+ ColumnFamilyOptions cf_options;
+ cf_options.cf_paths = {{cf_path_0_, 1024}};
+
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+
+ DB* db0;
+ ASSERT_OK(DB::Open(options, dbname_ + "/db0", &db0));
+ ASSERT_EQ(1, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+
+ ColumnFamilyHandle* cf0;
+ ASSERT_OK(db0->CreateColumnFamily(cf_options, "cf", &cf0));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ DB* db1;
+ ASSERT_OK(DB::Open(options, dbname_ + "/db1", &db1));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+
+ ColumnFamilyHandle* cf1;
+ ASSERT_OK(db1->CreateColumnFamily(cf_options, "cf", &cf1));
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(2, cache_->GetRefCount(cf_path_0_));
+
+ ASSERT_OK(db0->DestroyColumnFamilyHandle(cf0));
+ delete db0;
+ ASSERT_EQ(2, cache_->Size());
+ ASSERT_TRUE(cache_->Contains(data_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(data_path_0_));
+ ASSERT_TRUE(cache_->Contains(cf_path_0_));
+ ASSERT_EQ(1, cache_->GetRefCount(cf_path_0_));
+ ASSERT_OK(DestroyDB(dbname_ + "/db0", options, {{"cf", cf_options}}));
+
+ ASSERT_OK(db1->DestroyColumnFamilyHandle(cf1));
+ delete db1;
+ ASSERT_EQ(0, cache_->Size());
+ ASSERT_OK(DestroyDB(dbname_ + "/db1", options, {{"cf", cf_options}}));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // OS_LINUX
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_memtable_test.cc b/src/rocksdb/db/db_memtable_test.cc
new file mode 100644
index 000000000..cae592db3
--- /dev/null
+++ b/src/rocksdb/db/db_memtable_test.cc
@@ -0,0 +1,344 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <memory>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBMemTableTest : public DBTestBase {
+ public:
+ DBMemTableTest() : DBTestBase("db_memtable_test", /*env_do_fsync=*/true) {}
+};
+
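+// Wraps a real memtable rep (a skiplist in these tests) and records the hint
+// pointers passed through InsertWithHint so tests can verify that hints are
+// reused across inserts sharing a prefix.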
+class MockMemTableRep : public MemTableRep {
+ public:
+ explicit MockMemTableRep(Allocator* allocator, MemTableRep* rep)
+ : MemTableRep(allocator), rep_(rep), num_insert_with_hint_(0) {}
+
+ KeyHandle Allocate(const size_t len, char** buf) override {
+ return rep_->Allocate(len, buf);
+ }
+
+ void Insert(KeyHandle handle) override { rep_->Insert(handle); }
+
+ void InsertWithHint(KeyHandle handle, void** hint) override {
+ num_insert_with_hint_++;
+ EXPECT_NE(nullptr, hint);
+ last_hint_in_ = *hint;
+ rep_->InsertWithHint(handle, hint);
+ last_hint_out_ = *hint;
+ }
+
+ bool Contains(const char* key) const override { return rep_->Contains(key); }
+
+ void Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg, const char* entry)) override {
+ rep_->Get(k, callback_args, callback_func);
+ }
+
+ size_t ApproximateMemoryUsage() override {
+ return rep_->ApproximateMemoryUsage();
+ }
+
+ Iterator* GetIterator(Arena* arena) override {
+ return rep_->GetIterator(arena);
+ }
+
+ void* last_hint_in() { return last_hint_in_; }
+ void* last_hint_out() { return last_hint_out_; }
+ int num_insert_with_hint() { return num_insert_with_hint_; }
+
+ private:
+ std::unique_ptr<MemTableRep> rep_;
+ void* last_hint_in_;
+ void* last_hint_out_;
+ int num_insert_with_hint_;
+};
+
+class MockMemTableRepFactory : public MemTableRepFactory {
+ public:
+ MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+ Allocator* allocator,
+ const SliceTransform* transform,
+ Logger* logger) override {
+ SkipListFactory factory;
+ MemTableRep* skiplist_rep =
+ factory.CreateMemTableRep(cmp, allocator, transform, logger);
+ mock_rep_ = new MockMemTableRep(allocator, skiplist_rep);
+ return mock_rep_;
+ }
+
+ MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator& cmp,
+ Allocator* allocator,
+ const SliceTransform* transform,
+ Logger* logger,
+ uint32_t column_family_id) override {
+ last_column_family_id_ = column_family_id;
+ return CreateMemTableRep(cmp, allocator, transform, logger);
+ }
+
+ const char* Name() const override { return "MockMemTableRepFactory"; }
+
+ MockMemTableRep* rep() { return mock_rep_; }
+
+ bool IsInsertConcurrentlySupported() const override { return false; }
+
+ uint32_t GetLastColumnFamilyId() { return last_column_family_id_; }
+
+ private:
+ MockMemTableRep* mock_rep_;
+  // Initialized to the maximum uint32_t value as an "unset" sentinel.
+ uint32_t last_column_family_id_ = static_cast<uint32_t>(-1);
+};
+
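+// A prefix extractor that treats everything up to and including the first '_'
+// as the prefix; keys without a '_' are outside the prefix domain.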
+class TestPrefixExtractor : public SliceTransform {
+ public:
+ const char* Name() const override { return "TestPrefixExtractor"; }
+
+ Slice Transform(const Slice& key) const override {
+ const char* p = separator(key);
+ if (p == nullptr) {
+ return Slice();
+ }
+ return Slice(key.data(), p - key.data() + 1);
+ }
+
+ bool InDomain(const Slice& key) const override {
+ return separator(key) != nullptr;
+ }
+
+ bool InRange(const Slice& /*key*/) const override { return false; }
+
+ private:
+ const char* separator(const Slice& key) const {
+ return reinterpret_cast<const char*>(memchr(key.data(), '_', key.size()));
+ }
+};
+
+// Test that MemTable::Add properly returns Status::TryAgain when inserting
+// duplicate keys
+TEST_F(DBMemTableTest, DuplicateSeq) {
+ SequenceNumber seq = 123;
+ std::string value;
+ MergeContext merge_context;
+ Options options;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ ReadRangeDelAggregator range_del_agg(&ikey_cmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+
+  // Write some keys and make sure duplicates return Status::TryAgain
+ ASSERT_OK(
+ mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */));
+ ASSERT_TRUE(
+ mem->Add(seq, kTypeValue, "key", "value2", nullptr /* kv_prot_info */)
+ .IsTryAgain());
+  // Changing the type should still cause the duplicate key error
+ ASSERT_TRUE(
+ mem->Add(seq, kTypeMerge, "key", "value2", nullptr /* kv_prot_info */)
+ .IsTryAgain());
+ // Changing the seq number will make the key fresh
+ ASSERT_OK(mem->Add(seq + 1, kTypeMerge, "key", "value2",
+ nullptr /* kv_prot_info */));
+ // Test with different types for duplicate keys
+ ASSERT_TRUE(
+ mem->Add(seq, kTypeDeletion, "key", "", nullptr /* kv_prot_info */)
+ .IsTryAgain());
+ ASSERT_TRUE(
+ mem->Add(seq, kTypeSingleDeletion, "key", "", nullptr /* kv_prot_info */)
+ .IsTryAgain());
+
+ // Test the duplicate keys under stress
+ for (int i = 0; i < 10000; i++) {
+ bool insert_dup = i % 10 == 1;
+ if (!insert_dup) {
+ seq++;
+ }
+ Status s = mem->Add(seq, kTypeValue, "foo", "value" + std::to_string(seq),
+ nullptr /* kv_prot_info */);
+ if (insert_dup) {
+ ASSERT_TRUE(s.IsTryAgain());
+ } else {
+ ASSERT_OK(s);
+ }
+ }
+ delete mem;
+
+ // Test with InsertWithHint
+ options.memtable_insert_with_hint_prefix_extractor.reset(
+ new TestPrefixExtractor()); // which uses _ to extract the prefix
+ ioptions = ImmutableOptions(options);
+ mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ // Insert a duplicate key with _ in it
+ ASSERT_OK(
+ mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */));
+ ASSERT_TRUE(
+ mem->Add(seq, kTypeValue, "key_1", "value", nullptr /* kv_prot_info */)
+ .IsTryAgain());
+ delete mem;
+
+ // Test when InsertConcurrently will be invoked
+ options.allow_concurrent_memtable_write = true;
+ ioptions = ImmutableOptions(options);
+ mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ MemTablePostProcessInfo post_process_info;
+ ASSERT_OK(mem->Add(seq, kTypeValue, "key", "value",
+ nullptr /* kv_prot_info */, true, &post_process_info));
+ ASSERT_TRUE(mem->Add(seq, kTypeValue, "key", "value",
+ nullptr /* kv_prot_info */, true, &post_process_info)
+ .IsTryAgain());
+ delete mem;
+}
+
+// A simple test to verify that concurrent merge writes are functional
+TEST_F(DBMemTableTest, ConcurrentMergeWrite) {
+ int num_ops = 1000;
+ std::string value;
+ MergeContext merge_context;
+ Options options;
+ // A merge operator that is not sensitive to concurrent writes since in this
+ // test we don't order the writes.
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ options.allow_concurrent_memtable_write = true;
+ ImmutableOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+
+ // Put 0 as the base
+ PutFixed64(&value, static_cast<uint64_t>(0));
+ ASSERT_OK(mem->Add(0, kTypeValue, "key", value, nullptr /* kv_prot_info */));
+ value.clear();
+
+ // Write Merge concurrently
+ ROCKSDB_NAMESPACE::port::Thread write_thread1([&]() {
+ MemTablePostProcessInfo post_process_info1;
+ std::string v1;
+ for (int seq = 1; seq < num_ops / 2; seq++) {
+ PutFixed64(&v1, seq);
+ ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v1, nullptr /* kv_prot_info */,
+ true, &post_process_info1));
+ v1.clear();
+ }
+ });
+ ROCKSDB_NAMESPACE::port::Thread write_thread2([&]() {
+ MemTablePostProcessInfo post_process_info2;
+ std::string v2;
+ for (int seq = num_ops / 2; seq < num_ops; seq++) {
+ PutFixed64(&v2, seq);
+ ASSERT_OK(mem->Add(seq, kTypeMerge, "key", v2, nullptr /* kv_prot_info */,
+ true, &post_process_info2));
+ v2.clear();
+ }
+ });
+ write_thread1.join();
+ write_thread2.join();
+
+ Status status;
+ ReadOptions roptions;
+ SequenceNumber max_covering_tombstone_seq = 0;
+ LookupKey lkey("key", kMaxSequenceNumber);
+ bool res = mem->Get(lkey, &value, /*columns=*/nullptr, /*timestamp=*/nullptr,
+ &status, &merge_context, &max_covering_tombstone_seq,
+ roptions, false /* immutable_memtable */);
+ ASSERT_OK(status);
+ ASSERT_TRUE(res);
+ uint64_t ivalue = DecodeFixed64(Slice(value).data());
+ uint64_t sum = 0;
+ for (int seq = 0; seq < num_ops; seq++) {
+ sum += seq;
+ }
+ ASSERT_EQ(ivalue, sum);
+
+ delete mem;
+}
+
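+// Verifies that insert hints are tracked per prefix: consecutive keys sharing
+// a prefix reuse the same hint, a new prefix starts with a null hint, and keys
+// outside the prefix domain bypass InsertWithHint entirely.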
+TEST_F(DBMemTableTest, InsertWithHint) {
+ Options options;
+ options.allow_concurrent_memtable_write = false;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(new MockMemTableRepFactory());
+ options.memtable_insert_with_hint_prefix_extractor.reset(
+ new TestPrefixExtractor());
+ options.env = env_;
+ Reopen(options);
+ MockMemTableRep* rep =
+ reinterpret_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+ ->rep();
+ ASSERT_OK(Put("foo_k1", "foo_v1"));
+ ASSERT_EQ(nullptr, rep->last_hint_in());
+ void* hint_foo = rep->last_hint_out();
+ ASSERT_OK(Put("foo_k2", "foo_v2"));
+ ASSERT_EQ(hint_foo, rep->last_hint_in());
+ ASSERT_EQ(hint_foo, rep->last_hint_out());
+ ASSERT_OK(Put("foo_k3", "foo_v3"));
+ ASSERT_EQ(hint_foo, rep->last_hint_in());
+ ASSERT_EQ(hint_foo, rep->last_hint_out());
+ ASSERT_OK(Put("bar_k1", "bar_v1"));
+ ASSERT_EQ(nullptr, rep->last_hint_in());
+ void* hint_bar = rep->last_hint_out();
+ ASSERT_NE(hint_foo, hint_bar);
+ ASSERT_OK(Put("bar_k2", "bar_v2"));
+ ASSERT_EQ(hint_bar, rep->last_hint_in());
+ ASSERT_EQ(hint_bar, rep->last_hint_out());
+ ASSERT_EQ(5, rep->num_insert_with_hint());
+ ASSERT_OK(Put("NotInPrefixDomain", "vvv"));
+ ASSERT_EQ(5, rep->num_insert_with_hint());
+ ASSERT_EQ("foo_v1", Get("foo_k1"));
+ ASSERT_EQ("foo_v2", Get("foo_k2"));
+ ASSERT_EQ("foo_v3", Get("foo_k3"));
+ ASSERT_EQ("bar_v1", Get("bar_k1"));
+ ASSERT_EQ("bar_v2", Get("bar_k2"));
+ ASSERT_EQ("vvv", Get("NotInPrefixDomain"));
+}
+
+TEST_F(DBMemTableTest, ColumnFamilyId) {
+ // Verifies MemTableRepFactory is told the right column family id.
+ Options options;
+ options.env = CurrentOptions().env;
+ options.allow_concurrent_memtable_write = false;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(new MockMemTableRepFactory());
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ ASSERT_OK(Put(cf, "key", "val"));
+ ASSERT_OK(Flush(cf));
+ ASSERT_EQ(
+ cf, static_cast<MockMemTableRepFactory*>(options.memtable_factory.get())
+ ->GetLastColumnFamilyId());
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operand_test.cc b/src/rocksdb/db/db_merge_operand_test.cc
new file mode 100644
index 000000000..cbec37138
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operand_test.cc
@@ -0,0 +1,448 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_builder.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "rocksdb/merge_operator.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/sortlist.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
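+// A string-append merge operator whose ShouldMerge() returns true once at
+// least `limit` operands have accumulated; the tests below use it to work
+// with only the latest operands.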
+class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+ public:
+ LimitedStringAppendMergeOp(int limit, char delim)
+ : StringAppendTESTOperator(delim), limit_(limit) {}
+
+ const char* Name() const override {
+ return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+ }
+
+ bool ShouldMerge(const std::vector<Slice>& operands) const override {
+ if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ size_t limit_ = 0;
+};
+} // anonymous namespace
+
+class DBMergeOperandTest : public DBTestBase {
+ public:
+ DBMergeOperandTest()
+ : DBTestBase("db_merge_operand_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBMergeOperandTest, CacheEvictedMergeOperandReadAfterFreeBug) {
+  // There was a bug where merge operands were read after being mistakenly
+  // freed in DB::GetMergeOperands, which is surfaced when the cache is full.
+ // See PR#9507 for more.
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.env = env_;
+ BlockBasedTableOptions table_options;
+
+ // Small cache to simulate cache full
+ table_options.block_cache = NewLRUCache(1);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ Reopen(options);
+ int num_records = 4;
+ int number_of_operands = 0;
+ std::vector<PinnableSlice> values(num_records);
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k1", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k1", "v3"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k1", "v4"));
+
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(number_of_operands, 4);
+ ASSERT_EQ(values[0].ToString(), "v1");
+ ASSERT_EQ(values[1].ToString(), "v2");
+ ASSERT_EQ(values[2].ToString(), "v3");
+ ASSERT_EQ(values[3].ToString(), "v4");
+}
+
+TEST_F(DBMergeOperandTest, FlushedMergeOperandReadAfterFreeBug) {
+ // Repro for a bug where a memtable containing a merge operand could be
+ // deleted before the merge operand was saved to the result.
+ auto options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen(options);
+
+ ASSERT_OK(Merge("key", "value"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::GetImpl:PostMemTableGet:0",
+ "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PreFlush"},
+ {"DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PostFlush",
+ "DBImpl::GetImpl:PostMemTableGet:1"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ auto flush_thread = port::Thread([&]() {
+ TEST_SYNC_POINT(
+ "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PreFlush");
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT(
+ "DBMergeOperandTest::FlushedMergeOperandReadAfterFreeBug:PostFlush");
+ });
+
+ PinnableSlice value;
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = 1;
+ int number_of_operands;
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "key", &value, &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(1, number_of_operands);
+
+ flush_thread.join();
+}
+
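+// Exercises GetMergeOperands with operands spread across the memtable, single
+// L0 files, multiple levels, and an immutable memtable, with Put and Delete
+// bases interleaved among the merges.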
+TEST_F(DBMergeOperandTest, GetMergeOperandsBasic) {
+ Options options;
+ options.create_if_missing = true;
+ // Use only the latest two merge operands.
+ options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+ options.env = env_;
+ Reopen(options);
+ int num_records = 4;
+ int number_of_operands = 0;
+ std::vector<PinnableSlice> values(num_records);
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // k0 value in memtable
+ ASSERT_OK(Put("k0", "PutARock"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k0", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "PutARock");
+
+ // k0.1 value in SST
+ ASSERT_OK(Put("k0.1", "RockInSST"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k0.1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "RockInSST");
+
+ // All k1 values are in memtable.
+ ASSERT_OK(Merge("k1", "a"));
+ ASSERT_OK(Put("k1", "x"));
+ ASSERT_OK(Merge("k1", "b"));
+ ASSERT_OK(Merge("k1", "c"));
+ ASSERT_OK(Merge("k1", "d"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "x");
+ ASSERT_EQ(values[1], "b");
+ ASSERT_EQ(values[2], "c");
+ ASSERT_EQ(values[3], "d");
+
+ // expected_max_number_of_operands is less than number of merge operands so
+ // status should be Incomplete.
+ merge_operands_info.expected_max_number_of_operands = num_records - 1;
+ Status status = db_->GetMergeOperands(
+ ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(),
+ &merge_operands_info, &number_of_operands);
+ ASSERT_EQ(status.IsIncomplete(), true);
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // All k1.1 values are in memtable.
+ ASSERT_OK(Merge("k1.1", "r"));
+ ASSERT_OK(Delete("k1.1"));
+ ASSERT_OK(Merge("k1.1", "c"));
+ ASSERT_OK(Merge("k1.1", "k"));
+ ASSERT_OK(Merge("k1.1", "s"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k1.1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "c");
+ ASSERT_EQ(values[1], "k");
+ ASSERT_EQ(values[2], "s");
+
+ // All k2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2", "q"));
+ ASSERT_OK(Merge("k2", "w"));
+ ASSERT_OK(Merge("k2", "e"));
+ ASSERT_OK(Merge("k2", "r"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k2", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "q");
+ ASSERT_EQ(values[1], "w");
+ ASSERT_EQ(values[2], "e");
+ ASSERT_EQ(values[3], "r");
+
+ // All k2.1 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2.1", "m"));
+ ASSERT_OK(Put("k2.1", "l"));
+ ASSERT_OK(Merge("k2.1", "n"));
+ ASSERT_OK(Merge("k2.1", "o"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k2.1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "l,n,o");
+
+ // All k2.2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2.2", "g"));
+ ASSERT_OK(Delete("k2.2"));
+ ASSERT_OK(Merge("k2.2", "o"));
+ ASSERT_OK(Merge("k2.2", "t"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k2.2", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "o,t");
+
+ // Do some compaction that will make the following tests more predictable
+ // Slice start("PutARock");
+ // Slice end("t");
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // All k3 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "de"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k3", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "ab");
+ ASSERT_EQ(values[1], "bc");
+ ASSERT_EQ(values[2], "cd");
+ ASSERT_EQ(values[3], "de");
+
+ // All k3.1 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3.1", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("k3.1", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.1", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.1", "de"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k3.1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "bc");
+ ASSERT_EQ(values[1], "cd");
+ ASSERT_EQ(values[2], "de");
+
+ // All k3.2 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3.2", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Delete("k3.2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.2", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3.2", "de"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k3.2", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "cd");
+ ASSERT_EQ(values[1], "de");
+
+ // All K4 values are in different levels
+ ASSERT_OK(Merge("k4", "ba"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_OK(Merge("k4", "cb"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(3);
+ ASSERT_OK(Merge("k4", "dc"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Merge("k4", "ed"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k4", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "ba");
+ ASSERT_EQ(values[1], "cb");
+ ASSERT_EQ(values[2], "dc");
+ ASSERT_EQ(values[3], "ed");
+
+ // First 3 k5 values are in SST and next 4 k5 values are in Immutable
+ // Memtable
+ ASSERT_OK(Merge("k5", "who"));
+ ASSERT_OK(Merge("k5", "am"));
+ ASSERT_OK(Merge("k5", "i"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("k5", "remember"));
+ ASSERT_OK(Merge("k5", "i"));
+ ASSERT_OK(Merge("k5", "am"));
+ ASSERT_OK(Merge("k5", "rocks"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k5", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "remember");
+ ASSERT_EQ(values[1], "i");
+ ASSERT_EQ(values[2], "am");
+}
+
+TEST_F(DBMergeOperandTest, BlobDBGetMergeOperandsBasic) {
+ Options options;
+ options.create_if_missing = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ // Use only the latest two merge operands.
+ options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+ options.env = env_;
+ Reopen(options);
+ int num_records = 4;
+ int number_of_operands = 0;
+ std::vector<PinnableSlice> values(num_records);
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // All k1 values are in memtable.
+ ASSERT_OK(Put("k1", "x"));
+ ASSERT_OK(Merge("k1", "b"));
+ ASSERT_OK(Merge("k1", "c"));
+ ASSERT_OK(Merge("k1", "d"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k1", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "x");
+ ASSERT_EQ(values[1], "b");
+ ASSERT_EQ(values[2], "c");
+ ASSERT_EQ(values[3], "d");
+
+  // expected_max_number_of_operands is less than the number of merge operands,
+  // so the status should be Incomplete.
+ merge_operands_info.expected_max_number_of_operands = num_records - 1;
+ Status status = db_->GetMergeOperands(
+ ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(),
+ &merge_operands_info, &number_of_operands);
+ ASSERT_EQ(status.IsIncomplete(), true);
+ merge_operands_info.expected_max_number_of_operands = num_records;
+
+ // All k2 values are flushed to L0 into a single file.
+ ASSERT_OK(Put("k2", "q"));
+ ASSERT_OK(Merge("k2", "w"));
+ ASSERT_OK(Merge("k2", "e"));
+ ASSERT_OK(Merge("k2", "r"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k2", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "q,w,e,r");
+
+ // Do some compaction that will make the following tests more predictable
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // All k3 values are flushed and are in different files.
+ ASSERT_OK(Put("k3", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "de"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k3", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "ab");
+ ASSERT_EQ(values[1], "bc");
+ ASSERT_EQ(values[2], "cd");
+ ASSERT_EQ(values[3], "de");
+
+ // All K4 values are in different levels
+ ASSERT_OK(Put("k4", "ba"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_OK(Merge("k4", "cb"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(3);
+ ASSERT_OK(Merge("k4", "dc"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Merge("k4", "ed"));
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "k4", values.data(), &merge_operands_info,
+ &number_of_operands));
+ ASSERT_EQ(values[0], "ba");
+ ASSERT_EQ(values[1], "cb");
+ ASSERT_EQ(values[2], "dc");
+ ASSERT_EQ(values[3], "ed");
+}
+
+TEST_F(DBMergeOperandTest, GetMergeOperandsLargeResultOptimization) {
+ // These constants are chosen to trigger the large result optimization
+ // (pinning a bundle of `DBImpl` resources).
+ const int kNumOperands = 1024;
+ const int kOperandLen = 1024;
+
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ std::vector<std::string> expected_merge_operands;
+ expected_merge_operands.reserve(kNumOperands);
+ for (int i = 0; i < kNumOperands; ++i) {
+ expected_merge_operands.emplace_back(rnd.RandomString(kOperandLen));
+ ASSERT_OK(Merge("key", expected_merge_operands.back()));
+ }
+
+ std::vector<PinnableSlice> merge_operands(kNumOperands);
+ GetMergeOperandsOptions merge_operands_info;
+ merge_operands_info.expected_max_number_of_operands = kNumOperands;
+ int num_merge_operands = 0;
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ "key", merge_operands.data(),
+ &merge_operands_info, &num_merge_operands));
+ ASSERT_EQ(num_merge_operands, kNumOperands);
+
+ // Ensures the large result optimization was used.
+ for (int i = 0; i < kNumOperands; ++i) {
+ ASSERT_TRUE(merge_operands[i].IsPinned());
+ }
+
+ // Add a Flush() to change the `SuperVersion` to challenge the resource
+ // pinning.
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < kNumOperands; ++i) {
+ ASSERT_EQ(expected_merge_operands[i], merge_operands[i]);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_merge_operator_test.cc b/src/rocksdb/db/db_merge_operator_test.cc
new file mode 100644
index 000000000..7c5505bd1
--- /dev/null
+++ b/src/rocksdb/db/db_merge_operator_test.cc
@@ -0,0 +1,669 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "port/stack_trace.h"
+#include "rocksdb/merge_operator.h"
+#include "util/random.h"
+#include "utilities/merge_operators.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+
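+// A ReadCallback that defers visibility decisions to a SnapshotChecker: a
+// sequence number is considered visible only if the checker reports it as
+// being in the given snapshot.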
+class TestReadCallback : public ReadCallback {
+ public:
+ TestReadCallback(SnapshotChecker* snapshot_checker,
+ SequenceNumber snapshot_seq)
+ : ReadCallback(snapshot_seq),
+ snapshot_checker_(snapshot_checker),
+ snapshot_seq_(snapshot_seq) {}
+
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return snapshot_checker_->CheckInSnapshot(seq, snapshot_seq_) ==
+ SnapshotCheckerResult::kInSnapshot;
+ }
+
+ private:
+ SnapshotChecker* snapshot_checker_;
+ SequenceNumber snapshot_seq_;
+};
+
+// Test merge operator functionality.
+class DBMergeOperatorTest : public DBTestBase {
+ public:
+ DBMergeOperatorTest()
+ : DBTestBase("db_merge_operator_test", /*env_do_fsync=*/false) {}
+
+ std::string GetWithReadCallback(SnapshotChecker* snapshot_checker,
+ const Slice& key,
+ const Snapshot* snapshot = nullptr) {
+ SequenceNumber seq = snapshot == nullptr ? db_->GetLatestSequenceNumber()
+ : snapshot->GetSequenceNumber();
+ TestReadCallback read_callback(snapshot_checker, seq);
+ ReadOptions read_opt;
+ read_opt.snapshot = snapshot;
+ PinnableSlice value;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = db_->DefaultColumnFamily();
+ get_impl_options.value = &value;
+ get_impl_options.callback = &read_callback;
+ Status s = dbfull()->GetImpl(read_opt, key, get_impl_options);
+ if (!s.ok()) {
+ return s.ToString();
+ }
+ return value.ToString();
+ }
+};
+
+TEST_F(DBMergeOperatorTest, LimitMergeOperands) {
+ class LimitedStringAppendMergeOp : public StringAppendTESTOperator {
+ public:
+ LimitedStringAppendMergeOp(int limit, char delim)
+ : StringAppendTESTOperator(delim), limit_(limit) {}
+
+ const char* Name() const override {
+ return "DBMergeOperatorTest::LimitedStringAppendMergeOp";
+ }
+
+ bool ShouldMerge(const std::vector<Slice>& operands) const override {
+ if (operands.size() > 0 && limit_ > 0 && operands.size() >= limit_) {
+ return true;
+ }
+ return false;
+ }
+
+ private:
+ size_t limit_ = 0;
+ };
+
+ Options options;
+ options.create_if_missing = true;
+ // Use only the latest two merge operands.
+ options.merge_operator = std::make_shared<LimitedStringAppendMergeOp>(2, ',');
+ options.env = env_;
+ Reopen(options);
+ // All K1 values are in memtable.
+ ASSERT_OK(Merge("k1", "a"));
+ ASSERT_OK(Merge("k1", "b"));
+ ASSERT_OK(Merge("k1", "c"));
+ ASSERT_OK(Merge("k1", "d"));
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), "k1", &value));
+  // Make sure that only the latest two merge operands are used. If this were
+  // not the case, the value would be "a,b,c,d".
+ ASSERT_EQ(value, "c,d");
+
+ // All K2 values are flushed to L0 into a single file.
+ ASSERT_OK(Merge("k2", "a"));
+ ASSERT_OK(Merge("k2", "b"));
+ ASSERT_OK(Merge("k2", "c"));
+ ASSERT_OK(Merge("k2", "d"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Get(ReadOptions(), "k2", &value));
+ ASSERT_EQ(value, "c,d");
+
+ // All K3 values are flushed and are in different files.
+ ASSERT_OK(Merge("k3", "ab"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "bc"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "cd"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge("k3", "de"));
+ ASSERT_OK(db_->Get(ReadOptions(), "k3", &value));
+ ASSERT_EQ(value, "cd,de");
+
+ // All K4 values are in different levels
+ ASSERT_OK(Merge("k4", "ab"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(4);
+ ASSERT_OK(Merge("k4", "bc"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(3);
+ ASSERT_OK(Merge("k4", "cd"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_OK(Merge("k4", "de"));
+ ASSERT_OK(db_->Get(ReadOptions(), "k4", &value));
+ ASSERT_EQ(value, "cd,de");
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnRead) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "corrupted"));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "k1", &value).IsCorruption());
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}});
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnWrite) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.max_successive_merges = 3;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "v2"));
+ // Will trigger a merge when hitting max_successive_merges and the merge
+ // will fail. The delta will be inserted nevertheless.
+ ASSERT_OK(Merge("k1", "corrupted"));
+ // Data should stay unmerged after the error.
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v2"}, {"k1", "v1"}});
+}
+
+TEST_F(DBMergeOperatorTest, MergeErrorOnIteration) {
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+ options.env = env_;
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Merge("k1", "corrupted"));
+ ASSERT_OK(Put("k2", "v2"));
+ auto* iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k1");
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k2");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ VerifyDBInternal({{"k1", "corrupted"}, {"k1", "v1"}, {"k2", "v2"}});
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Merge("k1", "v1"));
+ ASSERT_OK(Put("k2", "v2"));
+ ASSERT_OK(Merge("k2", "corrupted"));
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsCorruption());
+ delete iter;
+ VerifyDBInternal({{"k1", "v1"}, {"k2", "corrupted"}, {"k2", "v2"}});
+}
+
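+// Parameterized on whether the block cache is disabled, so operand pinning is
+// exercised both with and without block cache backing.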
+class MergeOperatorPinningTest : public DBMergeOperatorTest,
+ public testing::WithParamInterface<bool> {
+ public:
+ MergeOperatorPinningTest() { disable_block_cache_ = GetParam(); }
+
+ bool disable_block_cache_;
+};
+
+INSTANTIATE_TEST_CASE_P(MergeOperatorPinningTest, MergeOperatorPinningTest,
+ ::testing::Bool());
+
+#ifndef ROCKSDB_LITE
+TEST_P(MergeOperatorPinningTest, OperandsMultiBlocks) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 1; // every block will contain one entry
+ table_options.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ const int kKeysPerFile = 10;
+ const int kOperandsPerKeyPerFile = 7;
+ const int kOperandSize = 100;
+  // Files to write in L0 before compacting to a lower level
+ const int kFilesPerLevel = 3;
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ int batch_num = 1;
+ int lvl_to_fill = 4;
+ int key_id = 0;
+ while (true) {
+ for (int j = 0; j < kKeysPerFile; j++) {
+ std::string key = Key(key_id % 35);
+ key_id++;
+ for (int k = 0; k < kOperandsPerKeyPerFile; k++) {
+ std::string val = rnd.RandomString(kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), key, val));
+ if (true_data[key].size() == 0) {
+ true_data[key] = val;
+ } else {
+ true_data[key] += "," + val;
+ }
+ }
+ }
+
+ if (lvl_to_fill == -1) {
+ // Keep last batch in memtable and stop
+ break;
+ }
+
+ ASSERT_OK(Flush());
+ if (batch_num % kFilesPerLevel == 0) {
+ if (lvl_to_fill != 0) {
+ MoveFilesToLevel(lvl_to_fill);
+ }
+ lvl_to_fill--;
+ }
+ batch_num++;
+ }
+
+ // 3 L0 files
+ // 1 L1 file
+ // 3 L2 files
+ // 1 L3 file
+ // 3 L4 Files
+ ASSERT_EQ(FilesPerLevel(), "3,1,3,1,3");
+
+ VerifyDBFromMap(true_data);
+}
+
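+// Wraps another MergeOperator and invokes user-supplied callbacks immediately
+// before and after each FullMergeV2() call, so tests can inject work (e.g.
+// cache eviction) around the merge.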
+class MergeOperatorHook : public MergeOperator {
+ public:
+ explicit MergeOperatorHook(std::shared_ptr<MergeOperator> _merge_op)
+ : merge_op_(_merge_op) {}
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ before_merge_();
+ bool res = merge_op_->FullMergeV2(merge_in, merge_out);
+ after_merge_();
+ return res;
+ }
+
+ const char* Name() const override { return merge_op_->Name(); }
+
+ std::shared_ptr<MergeOperator> merge_op_;
+ std::function<void()> before_merge_ = []() {};
+ std::function<void()> after_merge_ = []() {};
+};
+
+TEST_P(MergeOperatorPinningTest, EvictCacheBeforeMerge) {
+ Options options = CurrentOptions();
+
+ auto merge_hook =
+ std::make_shared<MergeOperatorHook>(MergeOperators::CreateMaxOperator());
+ options.merge_operator = merge_hook;
+ options.disable_auto_compactions = true;
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.max_open_files = 20;
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = disable_block_cache_;
+ if (bbto.no_block_cache == false) {
+ bbto.block_cache = NewLRUCache(64 * 1024 * 1024);
+ } else {
+ bbto.block_cache = nullptr;
+ }
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const int kNumOperands = 30;
+ const int kNumKeys = 1000;
+ const int kOperandSize = 100;
+ Random rnd(301);
+
+  // 1000 keys; every key has 30 operands, and every operand is in a different
+  // file.
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < kNumOperands; i++) {
+ for (int j = 0; j < kNumKeys; j++) {
+ std::string k = Key(j);
+ std::string v = rnd.RandomString(kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), k, v));
+
+ true_data[k] = std::max(true_data[k], v);
+ }
+ ASSERT_OK(Flush());
+ }
+
+ std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(file_numbers.size(), kNumOperands);
+ int merge_cnt = 0;
+
+ // Code executed before merge operation
+ merge_hook->before_merge_ = [&]() {
+ // Evict all tables from cache before every merge operation
+ auto* table_cache = dbfull()->TEST_table_cache();
+ for (uint64_t num : file_numbers) {
+ TableCache::Evict(table_cache, num);
+ }
+    // Decrease the cache capacity to force all unreferenced blocks to be
+    // evicted
+ if (bbto.block_cache) {
+ bbto.block_cache->SetCapacity(1);
+ }
+ merge_cnt++;
+ };
+
+ // Code executed after merge operation
+ merge_hook->after_merge_ = [&]() {
+ // Increase capacity again after doing the merge
+ if (bbto.block_cache) {
+ bbto.block_cache->SetCapacity(64 * 1024 * 1024);
+ }
+ };
+
+ size_t total_reads;
+ VerifyDBFromMap(true_data, &total_reads);
+ ASSERT_EQ(merge_cnt, total_reads);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ VerifyDBFromMap(true_data, &total_reads);
+}
+
+TEST_P(MergeOperatorPinningTest, TailingIterator) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateMaxOperator();
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const int kNumOperands = 100;
+ const int kNumWrites = 100000;
+
+ std::function<void()> writer_func = [&]() {
+ int k = 0;
+ for (int i = 0; i < kNumWrites; i++) {
+ ASSERT_OK(db_->Merge(WriteOptions(), Key(k), Key(k)));
+
+ if (i && i % kNumOperands == 0) {
+ k++;
+ }
+ if (i && i % 127 == 0) {
+ ASSERT_OK(Flush());
+ }
+ if (i && i % 317 == 0) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ }
+ };
+
+ std::function<void()> reader_func = [&]() {
+ ReadOptions ro;
+ ro.tailing = true;
+ Iterator* iter = db_->NewIterator(ro);
+ ASSERT_OK(iter->status());
+ iter->SeekToFirst();
+ for (int i = 0; i < (kNumWrites / kNumOperands); i++) {
+ while (!iter->Valid()) {
+ // wait for the key to be written
+ env_->SleepForMicroseconds(100);
+ iter->Seek(Key(i));
+ }
+ ASSERT_EQ(iter->key(), Key(i));
+ ASSERT_EQ(iter->value(), Key(i));
+
+ iter->Next();
+ }
+ ASSERT_OK(iter->status());
+
+ delete iter;
+ };
+
+ ROCKSDB_NAMESPACE::port::Thread writer_thread(writer_func);
+ ROCKSDB_NAMESPACE::port::Thread reader_thread(reader_func);
+
+ writer_thread.join();
+ reader_thread.join();
+}
+
+TEST_F(DBMergeOperatorTest, TailingIteratorMemtableUnrefedBySomeoneElse) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ // Overview of the test:
+ // * There are two merge operands for the same key: one in an sst file,
+ // another in a memtable.
+ // * Seek a tailing iterator to this key.
+ // * As part of the seek, the iterator will:
+ // (a) first visit the operand in the memtable and tell ForwardIterator
+ // to pin this operand, then
+ // (b) move on to the operand in the sst file, then pass both operands
+ // to merge operator.
+ // * The memtable may get flushed and unreferenced by another thread between
+ // (a) and (b). The test simulates it by flushing the memtable inside a
+ // SyncPoint callback located between (a) and (b).
+ // * In this case it's ForwardIterator's responsibility to keep the memtable
+ // pinned until (b) is complete. There used to be a bug causing
+ // ForwardIterator to not pin it in some circumstances. This test
+ // reproduces it.
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "sst"));
+ ASSERT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion A
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "memtable"));
+
+ // Pin SuperVersion A
+ std::unique_ptr<Iterator> someone_else(db_->NewIterator(ReadOptions()));
+ ASSERT_OK(someone_else->status());
+
+ bool pushed_first_operand = false;
+ bool stepped_to_next_operand = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBIter::MergeValuesNewToOld:PushedFirstOperand", [&](void*) {
+ EXPECT_FALSE(pushed_first_operand);
+ pushed_first_operand = true;
+ EXPECT_OK(db_->Flush(FlushOptions())); // Switch to SuperVersion B
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBIter::MergeValuesNewToOld:SteppedToNextOperand", [&](void*) {
+ EXPECT_FALSE(stepped_to_next_operand);
+ stepped_to_next_operand = true;
+ someone_else.reset(); // Unpin SuperVersion A
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ReadOptions ro;
+ ro.tailing = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+ iter->Seek("key");
+
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(std::string("sst,memtable"), iter->value().ToString());
+ EXPECT_TRUE(pushed_first_operand);
+ EXPECT_TRUE(stepped_to_next_operand);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBMergeOperatorTest, SnapshotCheckerAndReadCallback) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ class TestSnapshotChecker : public SnapshotChecker {
+ public:
+ SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber seq, SequenceNumber snapshot_seq) const override {
+ return IsInSnapshot(seq, snapshot_seq)
+ ? SnapshotCheckerResult::kInSnapshot
+ : SnapshotCheckerResult::kNotInSnapshot;
+ }
+
+ bool IsInSnapshot(SequenceNumber seq, SequenceNumber snapshot_seq) const {
+ switch (snapshot_seq) {
+ case 0:
+ return seq == 0;
+ case 1:
+ return seq <= 1;
+ case 2:
+ // seq = 2 not visible to snapshot with seq = 2
+ return seq <= 1;
+ case 3:
+ return seq <= 3;
+ case 4:
+          // seq = 4 not visible to snapshot with seq = 4
+ return seq <= 3;
+ default:
+          // seq >= 5 is uncommitted
+ return seq <= 4;
+ };
+ }
+ };
+ TestSnapshotChecker* snapshot_checker = new TestSnapshotChecker();
+ dbfull()->SetSnapshotChecker(snapshot_checker);
+
+ std::string value;
+ ASSERT_OK(Merge("foo", "v1"));
+ ASSERT_EQ(1, db_->GetLatestSequenceNumber());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+ ASSERT_OK(Merge("foo", "v2"));
+ ASSERT_EQ(2, db_->GetLatestSequenceNumber());
+ // v2 is not visible to latest snapshot, which has seq = 2.
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+ // Take a snapshot with seq = 2.
+ const Snapshot* snapshot1 = db_->GetSnapshot();
+ ASSERT_EQ(2, snapshot1->GetSequenceNumber());
+ // v2 is not visible to snapshot1, which has seq = 2
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+
+ // Verify flush doesn't alter the result.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo"));
+
+ ASSERT_OK(Merge("foo", "v3"));
+ ASSERT_EQ(3, db_->GetLatestSequenceNumber());
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+ ASSERT_OK(Merge("foo", "v4"));
+ ASSERT_EQ(4, db_->GetLatestSequenceNumber());
+ // v4 is not visible to latest snapshot, which has seq = 4.
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+ const Snapshot* snapshot2 = db_->GetSnapshot();
+ ASSERT_EQ(4, snapshot2->GetSequenceNumber());
+ // v4 is not visible to snapshot2, which has seq = 4.
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+
+ // Verify flush doesn't alter the result.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+ ASSERT_EQ("v1,v2,v3", GetWithReadCallback(snapshot_checker, "foo"));
+
+ ASSERT_OK(Merge("foo", "v5"));
+ ASSERT_EQ(5, db_->GetLatestSequenceNumber());
+ // v5 is uncommitted
+ ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
+
+ // full manual compaction.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Verify compaction doesn't alter the result.
+ ASSERT_EQ("v1", GetWithReadCallback(snapshot_checker, "foo", snapshot1));
+ ASSERT_EQ("v1,v2,v3",
+ GetWithReadCallback(snapshot_checker, "foo", snapshot2));
+ ASSERT_EQ("v1,v2,v3,v4", GetWithReadCallback(snapshot_checker, "foo"));
+
+ db_->ReleaseSnapshot(snapshot1);
+ db_->ReleaseSnapshot(snapshot2);
+}
+
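+// Runs the pinning test across all DBTestBase option configurations, again
+// parameterized on whether the block cache is disabled.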
+class PerConfigMergeOperatorPinningTest
+ : public DBMergeOperatorTest,
+ public testing::WithParamInterface<std::tuple<bool, int>> {
+ public:
+ PerConfigMergeOperatorPinningTest() {
+ std::tie(disable_block_cache_, option_config_) = GetParam();
+ }
+
+ bool disable_block_cache_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ MergeOperatorPinningTest, PerConfigMergeOperatorPinningTest,
+ ::testing::Combine(::testing::Bool(),
+ ::testing::Range(static_cast<int>(DBTestBase::kDefault),
+ static_cast<int>(DBTestBase::kEnd))));
+
+TEST_P(PerConfigMergeOperatorPinningTest, Randomized) {
+ if (ShouldSkipOptions(option_config_, kSkipMergePut)) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateMaxOperator();
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = disable_block_cache_;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+
+ const int kTotalMerges = 5000;
+ // Every key gets ~10 operands
+ const int kKeyRange = kTotalMerges / 10;
+ const int kOperandSize = 20;
+ const int kNumPutBefore = kKeyRange / 10; // 10% value
+ const int kNumPutAfter = kKeyRange / 10; // 10% overwrite
+ const int kNumDelete = kKeyRange / 10; // 10% delete
+
+ // kNumPutBefore keys will have base values
+ for (int i = 0; i < kNumPutBefore; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = rnd.RandomString(kOperandSize);
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ true_data[key] = value;
+ }
+
+ // Do kTotalMerges merges
+ for (int i = 0; i < kTotalMerges; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = rnd.RandomString(kOperandSize);
+ ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+
+ if (true_data[key] < value) {
+ true_data[key] = value;
+ }
+ }
+
+ // Overwrite random kNumPutAfter keys
+ for (int i = 0; i < kNumPutAfter; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ std::string value = rnd.RandomString(kOperandSize);
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ true_data[key] = value;
+ }
+
+ // Delete random kNumDelete keys
+ for (int i = 0; i < kNumDelete; i++) {
+ std::string key = Key(rnd.Next() % kKeyRange);
+ ASSERT_OK(db_->Delete(WriteOptions(), key));
+
+ true_data.erase(key);
+ }
+
+ VerifyDBFromMap(true_data);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_options_test.cc b/src/rocksdb/db/db_options_test.cc
new file mode 100644
index 000000000..691081db9
--- /dev/null
+++ b/src/rocksdb/db/db_options_test.cc
@@ -0,0 +1,1219 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include <limits>
+#include <string>
+#include <unordered_map>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/rate_limiter.h"
+#include "rocksdb/stats_history.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBOptionsTest : public DBTestBase {
+ public:
+ DBOptionsTest() : DBTestBase("db_options_test", /*env_do_fsync=*/true) {}
+
+#ifndef ROCKSDB_LITE
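+  // The helpers below serialize the mutable subset of the options and parse
+  // the result back into a name -> value map so tests can compare option
+  // states.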
+ std::unordered_map<std::string, std::string> GetMutableDBOptionsMap(
+ const DBOptions& options) {
+ std::string options_str;
+ std::unordered_map<std::string, std::string> mutable_map;
+ ConfigOptions config_options(options);
+ config_options.delimiter = "; ";
+
+ EXPECT_OK(GetStringFromMutableDBOptions(
+ config_options, MutableDBOptions(options), &options_str));
+ EXPECT_OK(StringToMap(options_str, &mutable_map));
+
+ return mutable_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetMutableCFOptionsMap(
+ const ColumnFamilyOptions& options) {
+ std::string options_str;
+ ConfigOptions config_options;
+ config_options.delimiter = "; ";
+
+ std::unordered_map<std::string, std::string> mutable_map;
+ EXPECT_OK(GetStringFromMutableCFOptions(
+ config_options, MutableCFOptions(options), &options_str));
+ EXPECT_OK(StringToMap(options_str, &mutable_map));
+ return mutable_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetRandomizedMutableCFOptionsMap(
+ Random* rnd) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ ImmutableDBOptions db_options(options);
+ test::RandomInitCFOptions(&options, options, rnd);
+ auto sanitized_options = SanitizeOptions(db_options, options);
+ auto opt_map = GetMutableCFOptionsMap(sanitized_options);
+ delete options.compaction_filter;
+ return opt_map;
+ }
+
+ std::unordered_map<std::string, std::string> GetRandomizedMutableDBOptionsMap(
+ Random* rnd) {
+ DBOptions db_options;
+ test::RandomInitDBOptions(&db_options, rnd);
+ auto sanitized_options = SanitizeOptions(dbname_, db_options);
+ return GetMutableDBOptionsMap(sanitized_options);
+ }
+#endif // ROCKSDB_LITE
+};
+
+TEST_F(DBOptionsTest, ImmutableTrackAndVerifyWalsInManifest) {
+ Options options;
+ options.env = env_;
+ options.track_and_verify_wals_in_manifest = true;
+
+ ImmutableDBOptions db_options(options);
+ ASSERT_TRUE(db_options.track_and_verify_wals_in_manifest);
+
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetDBOptions().track_and_verify_wals_in_manifest);
+
+ Status s =
+ dbfull()->SetDBOptions({{"track_and_verify_wals_in_manifest", "false"}});
+ ASSERT_FALSE(s.ok());
+}
+
+TEST_F(DBOptionsTest, ImmutableVerifySstUniqueIdInManifest) {
+ Options options;
+ options.env = env_;
+ options.verify_sst_unique_id_in_manifest = true;
+
+ ImmutableDBOptions db_options(options);
+ ASSERT_TRUE(db_options.verify_sst_unique_id_in_manifest);
+
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetDBOptions().verify_sst_unique_id_in_manifest);
+
+ Status s =
+ dbfull()->SetDBOptions({{"verify_sst_unique_id_in_manifest", "false"}});
+ ASSERT_FALSE(s.ok());
+}
+
+// RocksDB Lite doesn't support dynamic options.
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBOptionsTest, AvoidUpdatingOptions) {
+ Options options;
+ options.env = env_;
+ options.max_background_jobs = 4;
+ options.delayed_write_rate = 1024;
+
+ Reopen(options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ bool is_changed_stats = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::WriteOptionsFile:PersistOptions", [&](void* /*arg*/) {
+ ASSERT_FALSE(is_changed_stats); // should only save options file once
+ is_changed_stats = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // helper function to check the status and reset after each check
+ auto is_changed = [&] {
+ bool ret = is_changed_stats;
+ is_changed_stats = false;
+ return ret;
+ };
+
+ // without changing the value, but it's sanitized to a different value
+ ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "0"}}));
+ ASSERT_TRUE(is_changed());
+
+ // without changing the value
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_background_jobs", "4"}}));
+ ASSERT_FALSE(is_changed());
+
+ // changing the value
+ ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}}));
+ ASSERT_TRUE(is_changed());
+
+ // update again
+ ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "123"}}));
+ ASSERT_FALSE(is_changed());
+
+ // without changing a default value
+ ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "false"}}));
+ ASSERT_FALSE(is_changed());
+
+ // now change
+ ASSERT_OK(dbfull()->SetDBOptions({{"strict_bytes_per_sync", "true"}}));
+ ASSERT_TRUE(is_changed());
+
+ // multiple values without change
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"max_total_wal_size", "0"}, {"stats_dump_period_sec", "600"}}));
+ ASSERT_FALSE(is_changed());
+
+ // multiple values with change
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"max_open_files", "100"}, {"stats_dump_period_sec", "600"}}));
+ ASSERT_TRUE(is_changed());
+}
+
+TEST_F(DBOptionsTest, GetLatestDBOptions) {
+ // GetOptions should be able to get latest option changed by SetOptions.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ Random rnd(228);
+ Reopen(options);
+ auto new_options = GetRandomizedMutableDBOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetDBOptions(new_options));
+ ASSERT_EQ(new_options, GetMutableDBOptionsMap(dbfull()->GetDBOptions()));
+}
+
+TEST_F(DBOptionsTest, GetLatestCFOptions) {
+ // GetOptions should be able to get latest option changed by SetOptions.
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ Random rnd(228);
+ Reopen(options);
+ CreateColumnFamilies({"foo"}, options);
+ ReopenWithColumnFamilies({"default", "foo"}, options);
+ auto options_default = GetRandomizedMutableCFOptionsMap(&rnd);
+ auto options_foo = GetRandomizedMutableCFOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetOptions(handles_[0], options_default));
+ ASSERT_OK(dbfull()->SetOptions(handles_[1], options_foo));
+ ASSERT_EQ(options_default,
+ GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[0])));
+ ASSERT_EQ(options_foo,
+ GetMutableCFOptionsMap(dbfull()->GetOptions(handles_[1])));
+}
+
+TEST_F(DBOptionsTest, SetMutableTableOptions) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.blob_file_size = 16384;
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = true;
+ bbto.block_size = 8192;
+ bbto.block_restart_interval = 7;
+
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+ Options c_opts = dbfull()->GetOptions(cfh);
+
+ const auto* c_bbto =
+ c_opts.table_factory->GetOptions<BlockBasedTableOptions>();
+ ASSERT_NE(c_bbto, nullptr);
+ ASSERT_EQ(c_opts.blob_file_size, 16384);
+ ASSERT_EQ(c_bbto->no_block_cache, true);
+ ASSERT_EQ(c_bbto->block_size, 8192);
+ ASSERT_EQ(c_bbto->block_restart_interval, 7);
+ ASSERT_OK(dbfull()->SetOptions(
+ cfh, {{"table_factory.block_size", "16384"},
+ {"table_factory.block_restart_interval", "11"}}));
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+ // Now set an option that is not mutable - options should not change
+ ASSERT_NOK(
+ dbfull()->SetOptions(cfh, {{"table_factory.no_block_cache", "false"}}));
+ ASSERT_EQ(c_bbto->no_block_cache, true);
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+ // Set some that are mutable and some that are not - options should not change
+ ASSERT_NOK(dbfull()->SetOptions(
+ cfh, {{"table_factory.no_block_cache", "false"},
+ {"table_factory.block_size", "8192"},
+ {"table_factory.block_restart_interval", "7"}}));
+ ASSERT_EQ(c_bbto->no_block_cache, true);
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+ // Set some that are mutable and some that do not exist - options should not
+ // change
+ ASSERT_NOK(dbfull()->SetOptions(
+ cfh, {{"table_factory.block_size", "8192"},
+ {"table_factory.does_not_exist", "true"},
+ {"table_factory.block_restart_interval", "7"}}));
+ ASSERT_EQ(c_bbto->no_block_cache, true);
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 11);
+
+ // Trying to change the table factory fails
+ ASSERT_NOK(dbfull()->SetOptions(
+ cfh, {{"table_factory", TableFactory::kPlainTableName()}}));
+
+ // Set some on the table and some on the Column Family
+ ASSERT_OK(dbfull()->SetOptions(
+ cfh, {{"table_factory.block_size", "16384"},
+ {"blob_file_size", "32768"},
+ {"table_factory.block_restart_interval", "13"}}));
+ c_opts = dbfull()->GetOptions(cfh);
+ ASSERT_EQ(c_opts.blob_file_size, 32768);
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 13);
+ // Set some on the table and a bad one on the ColumnFamily - options should
+ // not change
+ ASSERT_NOK(dbfull()->SetOptions(
+ cfh, {{"table_factory.block_size", "1024"},
+ {"no_such_option", "32768"},
+ {"table_factory.block_restart_interval", "7"}}));
+ ASSERT_EQ(c_bbto->block_size, 16384);
+ ASSERT_EQ(c_bbto->block_restart_interval, 13);
+}
+
+TEST_F(DBOptionsTest, SetWithCustomMemTableFactory) {
+ class DummySkipListFactory : public SkipListFactory {
+ public:
+ static const char* kClassName() { return "DummySkipListFactory"; }
+ const char* Name() const override { return kClassName(); }
+ explicit DummySkipListFactory() : SkipListFactory(2) {}
+ };
+ {
+    // Verify that the DummySkipListFactory cannot be created by name
+ ConfigOptions config_options;
+ config_options.ignore_unsupported_options = false;
+ std::unique_ptr<MemTableRepFactory> factory;
+ ASSERT_NOK(MemTableRepFactory::CreateFromString(
+ config_options, DummySkipListFactory::kClassName(), &factory));
+ }
+ Options options;
+ options.create_if_missing = true;
+ // Try with fail_if_options_file_error=false/true to update the options
+ for (bool on_error : {false, true}) {
+ options.fail_if_options_file_error = on_error;
+ options.env = env_;
+ options.disable_auto_compactions = false;
+
+ options.memtable_factory.reset(new DummySkipListFactory());
+ Reopen(options);
+
+ ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+ ASSERT_OK(
+ dbfull()->SetOptions(cfh, {{"disable_auto_compactions", "true"}}));
+ ColumnFamilyDescriptor cfd;
+ ASSERT_OK(cfh->GetDescriptor(&cfd));
+ ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+ DummySkipListFactory::kClassName());
+ ColumnFamilyHandle* test = nullptr;
+ ASSERT_OK(dbfull()->CreateColumnFamily(options, "test", &test));
+ ASSERT_OK(test->GetDescriptor(&cfd));
+ ASSERT_STREQ(cfd.options.memtable_factory->Name(),
+ DummySkipListFactory::kClassName());
+
+ ASSERT_OK(dbfull()->DropColumnFamily(test));
+ delete test;
+ }
+}
+
+TEST_F(DBOptionsTest, SetBytesPerSync) {
+ const size_t kValueSize = 1024 * 1024; // 1MB
+ Options options;
+ options.create_if_missing = true;
+ options.bytes_per_sync = 1024 * 1024;
+ options.use_direct_reads = false;
+ options.write_buffer_size = 400 * kValueSize;
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+ Reopen(options);
+ int counter = 0;
+ int low_bytes_per_sync = 0;
+ int i = 0;
+ const std::string kValue(kValueSize, 'v');
+ ASSERT_EQ(options.bytes_per_sync, dbfull()->GetDBOptions().bytes_per_sync);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0", [&](void* /*arg*/) { counter++; });
+
+ WriteOptions write_opts;
+ // should sync approximately 40MB/1MB ~= 40 times.
+ for (i = 0; i < 40; i++) {
+ ASSERT_OK(Put(Key(i), kValue, write_opts));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ low_bytes_per_sync = counter;
+ ASSERT_GT(low_bytes_per_sync, 35);
+ ASSERT_LT(low_bytes_per_sync, 45);
+
+ counter = 0;
+ // 8388608 = 8 * 1024 * 1024
+ ASSERT_OK(dbfull()->SetDBOptions({{"bytes_per_sync", "8388608"}}));
+ ASSERT_EQ(8388608, dbfull()->GetDBOptions().bytes_per_sync);
+ // should sync approximately 40MB*2/8MB ~= 10 times.
+  // Data will be 40MB*2 because of the previous Puts too.
+ for (i = 0; i < 40; i++) {
+ ASSERT_OK(Put(Key(i), kValue, write_opts));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_GT(counter, 5);
+ ASSERT_LT(counter, 15);
+
+ // Redundant assert. But leaving it here just to get the point across that
+ // low_bytes_per_sync > counter.
+ ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, SetWalBytesPerSync) {
+ const size_t kValueSize = 1024 * 1024 * 3;
+ Options options;
+ options.create_if_missing = true;
+ options.wal_bytes_per_sync = 512;
+ options.write_buffer_size = 100 * kValueSize;
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(512, dbfull()->GetDBOptions().wal_bytes_per_sync);
+ std::atomic_int counter{0};
+ int low_bytes_per_sync = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::RangeSync:0",
+ [&](void* /*arg*/) { counter.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ const std::string kValue(kValueSize, 'v');
+ int i = 0;
+ for (; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+  // Do not flush. If we flush here, SwitchWAL will reuse the old WAL file since
+  // it's empty and will not pick up the new wal_bytes_per_sync value.
+ low_bytes_per_sync = counter;
+ // 5242880 = 1024 * 1024 * 5
+ ASSERT_OK(dbfull()->SetDBOptions({{"wal_bytes_per_sync", "5242880"}}));
+ ASSERT_EQ(5242880, dbfull()->GetDBOptions().wal_bytes_per_sync);
+ counter = 0;
+ i = 0;
+ for (; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+ ASSERT_GT(counter, 0);
+ ASSERT_GT(low_bytes_per_sync, 0);
+ ASSERT_GT(low_bytes_per_sync, counter);
+}
+
+TEST_F(DBOptionsTest, WritableFileMaxBufferSize) {
+ Options options;
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 1024 * 1024;
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_manifest_file_size = 1;
+ options.env = env_;
+ int buffer_size = 1024 * 1024;
+ Reopen(options);
+ ASSERT_EQ(buffer_size,
+ dbfull()->GetDBOptions().writable_file_max_buffer_size);
+
+ std::atomic<int> match_cnt(0);
+ std::atomic<int> unmatch_cnt(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::WritableFileWriter:0", [&](void* arg) {
+ int value = static_cast<int>(reinterpret_cast<uintptr_t>(arg));
+ if (value == buffer_size) {
+ match_cnt++;
+ } else {
+ unmatch_cnt++;
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ int i = 0;
+ for (; i < 3; i++) {
+ ASSERT_OK(Put("foo", std::to_string(i)));
+ ASSERT_OK(Put("bar", std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(unmatch_cnt, 0);
+ ASSERT_GE(match_cnt, 11);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"writable_file_max_buffer_size", "524288"}}));
+ buffer_size = 512 * 1024;
+ match_cnt = 0;
+ unmatch_cnt = 0; // SetDBOptions() will create a WritableFileWriter
+
+ ASSERT_EQ(buffer_size,
+ dbfull()->GetDBOptions().writable_file_max_buffer_size);
+ i = 0;
+ for (; i < 3; i++) {
+ ASSERT_OK(Put("foo", std::to_string(i)));
+ ASSERT_OK(Put("bar", std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(unmatch_cnt, 0);
+ ASSERT_GE(match_cnt, 11);
+}
+
+TEST_F(DBOptionsTest, SetOptionsAndReopen) {
+ Random rnd(1044);
+ auto rand_opts = GetRandomizedMutableCFOptionsMap(&rnd);
+ ASSERT_OK(dbfull()->SetOptions(rand_opts));
+ // Verify if DB can be reopen after setting options.
+ Options options;
+ options.env = env_;
+ ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBOptionsTest, EnableAutoCompactionAndTriggerStall) {
+ const std::string kValue(1024, 'v');
+ for (int method_type = 0; method_type < 2; method_type++) {
+ for (int option_type = 0; option_type < 4; option_type++) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 1024 * 1024 * 10;
+ options.compression = CompressionType::kNoCompression;
+ options.level0_file_num_compaction_trigger = 1;
+ options.level0_stop_writes_trigger = std::numeric_limits<int>::max();
+ options.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
+ options.hard_pending_compaction_bytes_limit =
+ std::numeric_limits<uint64_t>::max();
+ options.soft_pending_compaction_bytes_limit =
+ std::numeric_limits<uint64_t>::max();
+ options.env = env_;
+
+ DestroyAndReopen(options);
+ int i = 0;
+ for (; i < 1024; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+ ASSERT_OK(Flush());
+ for (; i < 1024 * 2; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ uint64_t l0_size = SizeAtLevel(0);
+
+ switch (option_type) {
+ case 0:
+ // test with level0_stop_writes_trigger
+ options.level0_stop_writes_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ break;
+ case 1:
+ options.level0_slowdown_writes_trigger = 2;
+ break;
+ case 2:
+ options.hard_pending_compaction_bytes_limit = l0_size;
+ options.soft_pending_compaction_bytes_limit = l0_size;
+ break;
+ case 3:
+ options.soft_pending_compaction_bytes_limit = l0_size;
+ break;
+ }
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay());
+
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBOptionsTest::EnableAutoCompactionAndTriggerStall:1",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction():BeforePickCompaction",
+ "DBOptionsTest::EnableAutoCompactionAndTriggerStall:2"},
+ {"DBOptionsTest::EnableAutoCompactionAndTriggerStall:3",
+ "DBImpl::BackgroundCompaction():AfterPickCompaction"}});
+ // Block background compaction.
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ switch (method_type) {
+ case 0:
+ ASSERT_OK(
+ dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ break;
+ case 1:
+ ASSERT_OK(dbfull()->EnableAutoCompaction(
+ {dbfull()->DefaultColumnFamily()}));
+ break;
+ }
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:1");
+      // Wait for the stall condition to be recalculated.
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:2");
+
+ switch (option_type) {
+ case 0:
+ ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+ break;
+ case 1:
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ break;
+ case 2:
+ ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped());
+ break;
+ case 3:
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ break;
+ }
+ TEST_SYNC_POINT("DBOptionsTest::EnableAutoCompactionAndTriggerStall:3");
+
+ // Background compaction executed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().IsStopped());
+ ASSERT_FALSE(dbfull()->TEST_write_controler().NeedsDelay());
+ }
+ }
+}
+
+TEST_F(DBOptionsTest, SetOptionsMayTriggerCompaction) {
+ Options options;
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 1000;
+ options.env = env_;
+ Reopen(options);
+ for (int i = 0; i < 3; i++) {
+ // Need to insert two keys to avoid trivial move.
+ ASSERT_OK(Put("foo", std::to_string(i)));
+ ASSERT_OK(Put("bar", std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "3"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel());
+}
+
+TEST_F(DBOptionsTest, SetBackgroundCompactionThreads) {
+ Options options;
+ options.create_if_missing = true;
+ options.max_background_compactions = 1; // default value
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_background_compactions", "3"}}));
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
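+  // The raised compaction limit is only reflected once the write controller
+  // enters the stopped state (via the stop token below).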
+ auto stop_token = dbfull()->TEST_write_controler().GetStopToken();
+ ASSERT_EQ(3, dbfull()->TEST_BGCompactionsAllowed());
+}
+
+TEST_F(DBOptionsTest, SetBackgroundFlushThreads) {
+ Options options;
+ options.create_if_missing = true;
+ options.max_background_flushes = 1;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(1, dbfull()->TEST_BGFlushesAllowed());
+ ASSERT_EQ(1, env_->GetBackgroundThreads(Env::Priority::HIGH));
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_background_flushes", "3"}}));
+ ASSERT_EQ(3, env_->GetBackgroundThreads(Env::Priority::HIGH));
+ ASSERT_EQ(3, dbfull()->TEST_BGFlushesAllowed());
+}
+
+TEST_F(DBOptionsTest, SetBackgroundJobs) {
+ Options options;
+ options.create_if_missing = true;
+ options.max_background_jobs = 8;
+ options.env = env_;
+ Reopen(options);
+
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ options.max_background_jobs = 12;
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"max_background_jobs",
+ std::to_string(options.max_background_jobs)}}));
+ }
+
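+    // A quarter of max_background_jobs is reserved for flushes; compactions
+    // stay at 1 until the write controller is stopped, at which point the
+    // remaining three quarters become available to them.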
+ const int expected_max_flushes = options.max_background_jobs / 4;
+
+ ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed());
+ ASSERT_EQ(1, dbfull()->TEST_BGCompactionsAllowed());
+
+ auto stop_token = dbfull()->TEST_write_controler().GetStopToken();
+
+ const int expected_max_compactions = 3 * expected_max_flushes;
+
+ ASSERT_EQ(expected_max_flushes, dbfull()->TEST_BGFlushesAllowed());
+ ASSERT_EQ(expected_max_compactions, dbfull()->TEST_BGCompactionsAllowed());
+
+ ASSERT_EQ(expected_max_flushes,
+ env_->GetBackgroundThreads(Env::Priority::HIGH));
+ ASSERT_EQ(expected_max_compactions,
+ env_->GetBackgroundThreads(Env::Priority::LOW));
+ }
+}
+
+TEST_F(DBOptionsTest, AvoidFlushDuringShutdown) {
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ WriteOptions write_without_wal;
+ write_without_wal.disableWAL = true;
+
+ ASSERT_FALSE(options.avoid_flush_during_shutdown);
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1", write_without_wal));
+ Reopen(options);
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_EQ("1", FilesPerLevel());
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v2", write_without_wal));
+ ASSERT_OK(dbfull()->SetDBOptions({{"avoid_flush_during_shutdown", "true"}}));
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ("", FilesPerLevel());
+}
+
+TEST_F(DBOptionsTest, SetDelayedWriteRateOption) {
+ Options options;
+ options.create_if_missing = true;
+ options.delayed_write_rate = 2 * 1024U * 1024U;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(2 * 1024U * 1024U,
+ dbfull()->TEST_write_controler().max_delayed_write_rate());
+
+ ASSERT_OK(dbfull()->SetDBOptions({{"delayed_write_rate", "20000"}}));
+ ASSERT_EQ(20000, dbfull()->TEST_write_controler().max_delayed_write_rate());
+}
+
+TEST_F(DBOptionsTest, MaxTotalWalSizeChange) {
+ Random rnd(1044);
+ const auto value_size = size_t(1024);
+ std::string value = rnd.RandomString(value_size);
+
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ CreateColumnFamilies({"1", "2", "3"}, options);
+ ReopenWithColumnFamilies({"default", "1", "2", "3"}, options);
+
+ WriteOptions write_options;
+
+ const int key_count = 100;
+ for (int i = 0; i < key_count; ++i) {
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ ASSERT_OK(Put(static_cast<int>(cf), Key(i), value));
+ }
+ }
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_total_wal_size", "10"}}));
+
+ for (size_t cf = 0; cf < handles_.size(); ++cf) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
+ ASSERT_EQ("1", FilesPerLevel(static_cast<int>(cf)));
+ }
+}
+
+TEST_F(DBOptionsTest, SetStatsDumpPeriodSec) {
+ Options options;
+ options.create_if_missing = true;
+ options.stats_dump_period_sec = 5;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec);
+
+ for (int i = 0; i < 20; i++) {
+ unsigned int num = rand() % 5000 + 1;
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"stats_dump_period_sec", std::to_string(num)}}));
+ ASSERT_EQ(num, dbfull()->GetDBOptions().stats_dump_period_sec);
+ }
+ Close();
+}
+
+TEST_F(DBOptionsTest, SetOptionsStatsPersistPeriodSec) {
+ Options options;
+ options.create_if_missing = true;
+ options.stats_persist_period_sec = 5;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+ ASSERT_OK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "12345"}}));
+ ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec);
+ ASSERT_NOK(dbfull()->SetDBOptions({{"stats_persist_period_sec", "abcde"}}));
+ ASSERT_EQ(12345u, dbfull()->GetDBOptions().stats_persist_period_sec);
+}
+
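+// Runs FindObsoleteFiles() under the DB mutex without forcing a full scan and
+// checks whether the resulting candidate file set is empty as expected.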
+static void assert_candidate_files_empty(DBImpl* dbfull, const bool empty) {
+ dbfull->TEST_LockMutex();
+ JobContext job_context(0);
+ dbfull->FindObsoleteFiles(&job_context, false);
+ ASSERT_EQ(empty, job_context.full_scan_candidate_files.empty());
+ dbfull->TEST_UnlockMutex();
+ if (job_context.HaveSomethingToDelete()) {
+ // fulfill the contract of FindObsoleteFiles by calling PurgeObsoleteFiles
+ // afterwards; otherwise the test may hang on shutdown
+ dbfull->PurgeObsoleteFiles(job_context);
+ }
+ job_context.Clean();
+}
+
+TEST_F(DBOptionsTest, DeleteObsoleteFilesPeriodChange) {
+ Options options;
+ options.env = env_;
+ SetTimeElapseOnlySleepOnReopen(&options);
+ options.create_if_missing = true;
+ ASSERT_OK(TryReopen(options));
+
+ // Verify that candidate files set is empty when no full scan requested.
+ assert_candidate_files_empty(dbfull(), true);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "0"}}));
+
+  // After delete_obsolete_files_period_micros is updated to 0, the next call
+  // to FindObsoleteFiles should do a full scan.
+ assert_candidate_files_empty(dbfull(), false);
+
+ ASSERT_OK(
+ dbfull()->SetDBOptions({{"delete_obsolete_files_period_micros", "20"}}));
+
+ assert_candidate_files_empty(dbfull(), true);
+
+ env_->MockSleepForMicroseconds(20);
+ assert_candidate_files_empty(dbfull(), true);
+
+ env_->MockSleepForMicroseconds(1);
+ assert_candidate_files_empty(dbfull(), false);
+
+ Close();
+}
+
+TEST_F(DBOptionsTest, MaxOpenFilesChange) {
+ SpecialEnv env(env_);
+ Options options;
+ options.env = CurrentOptions().env;
+ options.max_open_files = -1;
+
+ Reopen(options);
+
+ Cache* tc = dbfull()->TEST_table_cache();
+
+ ASSERT_EQ(-1, dbfull()->GetDBOptions().max_open_files);
+ ASSERT_LT(2000, tc->GetCapacity());
+ ASSERT_OK(dbfull()->SetDBOptions({{"max_open_files", "1024"}}));
+ ASSERT_EQ(1024, dbfull()->GetDBOptions().max_open_files);
+ // examine the table cache (actual size should be 1014)
+ ASSERT_GT(1500, tc->GetCapacity());
+ Close();
+}
+
+TEST_F(DBOptionsTest, SanitizeDelayedWriteRate) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.delayed_write_rate = 0;
+ Reopen(options);
+ ASSERT_EQ(16 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate);
+
+ options.rate_limiter.reset(NewGenericRateLimiter(31 * 1024 * 1024));
+ Reopen(options);
+ ASSERT_EQ(31 * 1024 * 1024, dbfull()->GetDBOptions().delayed_write_rate);
+}
+
+TEST_F(DBOptionsTest, SanitizeUniversalTTLCompaction) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.compaction_style = kCompactionStyleUniversal;
+
+ options.ttl = 0;
+ options.periodic_compaction_seconds = 0;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(0, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 0;
+ options.periodic_compaction_seconds = 100;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 100;
+ options.periodic_compaction_seconds = 0;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+
+ options.ttl = 100;
+ options.periodic_compaction_seconds = 500;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+ ASSERT_EQ(100, dbfull()->GetOptions().periodic_compaction_seconds);
+}
+
+TEST_F(DBOptionsTest, SanitizeTtlDefault) {
+ Options options;
+ options.env = CurrentOptions().env;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+ options.compaction_style = kCompactionStyleLevel;
+ options.ttl = 0;
+ Reopen(options);
+ ASSERT_EQ(0, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+}
+
+TEST_F(DBOptionsTest, SanitizeFIFOPeriodicCompaction) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.env = CurrentOptions().env;
+ options.ttl = 0;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100;
+ Reopen(options);
+ ASSERT_EQ(100, dbfull()->GetOptions().ttl);
+
+ options.ttl = 100 * 24 * 60 * 60;
+ Reopen(options);
+ ASSERT_EQ(100 * 24 * 60 * 60, dbfull()->GetOptions().ttl);
+
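+  // When both ttl and periodic_compaction_seconds are set under FIFO
+  // compaction, ttl is capped at periodic_compaction_seconds.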
+ options.ttl = 200;
+ options.periodic_compaction_seconds = 300;
+ Reopen(options);
+ ASSERT_EQ(200, dbfull()->GetOptions().ttl);
+
+ options.ttl = 500;
+ options.periodic_compaction_seconds = 300;
+ Reopen(options);
+ ASSERT_EQ(300, dbfull()->GetOptions().ttl);
+}
+
+TEST_F(DBOptionsTest, SetFIFOCompactionOptions) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.arena_block_size = 4096;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.compaction_options_fifo.allow_compaction = false;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ // Test dynamically changing ttl.
+ options.ttl = 1 * 60 * 60; // 1 hour
+ ASSERT_OK(TryReopen(options));
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ env_->MockSleepForSeconds(61);
+
+ // No files should be compacted as ttl is set to 1 hour.
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 3600);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Set ttl to 1 minute. So all files should get deleted.
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "60"}}));
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ // Test dynamically changing compaction_options_fifo.max_table_files_size
+  options.compaction_options_fifo.max_table_files_size = 500 << 10;  // 500KB
+ options.ttl = 0;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // No files should be compacted as max_table_files_size is set to 500 KB.
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 500 << 10);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Set max_table_files_size to 12 KB. So only 1 file should remain now.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=12288;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 12 << 10);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+
+ // Test dynamically changing compaction_options_fifo.allow_compaction
+ options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
+ options.ttl = 0;
+ options.compaction_options_fifo.allow_compaction = false;
+ options.level0_file_num_compaction_trigger = 6;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // No files should be compacted as max_table_files_size is set to 500 KB and
+ // allow_compaction is false
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+  // Set allow_compaction to true. So the number of files should be between 1
+  // and 5.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GE(NumTableFilesAtLevel(0), 1);
+ ASSERT_LE(NumTableFilesAtLevel(0), 5);
+}
+
+TEST_F(DBOptionsTest, CompactionReadaheadSizeChange) {
+ SpecialEnv env(env_);
+ Options options;
+ options.env = &env;
+
+ options.compaction_readahead_size = 0;
+ options.level0_file_num_compaction_trigger = 2;
+ const std::string kValue(1024, 'v');
+ Reopen(options);
+
+ ASSERT_EQ(0, dbfull()->GetDBOptions().compaction_readahead_size);
+ ASSERT_OK(dbfull()->SetDBOptions({{"compaction_readahead_size", "256"}}));
+ ASSERT_EQ(256, dbfull()->GetDBOptions().compaction_readahead_size);
+ for (int i = 0; i < 1024; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+ ASSERT_OK(Flush());
+ for (int i = 0; i < 1024 * 2; i++) {
+ ASSERT_OK(Put(Key(i), kValue));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(256, env_->compaction_readahead_size_);
+ Close();
+}
+
+TEST_F(DBOptionsTest, FIFOTtlBackwardCompatible) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.create_if_missing = true;
+ options.env = CurrentOptions().env;
+
+ ASSERT_OK(TryReopen(options));
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // In release 6.0, ttl was promoted from a secondary level option under
+ // compaction_options_fifo to a top level option under ColumnFamilyOptions.
+ // We still need to handle old SetOptions calls but should ignore
+ // ttl under compaction_options_fifo.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{allow_compaction=true;max_table_files_size=1024;ttl=731;}"},
+ {"ttl", "60"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 60);
+
+ // Put ttl as the first option inside compaction_options_fifo. That works as
+ // it doesn't overwrite any other option.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{ttl=985;allow_compaction=true;max_table_files_size=1024;}"},
+ {"ttl", "191"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 191);
+}
+
+TEST_F(DBOptionsTest, ChangeCompression) {
+ if (!Snappy_Supported() || !LZ4_Supported()) {
+ return;
+ }
+ Options options;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.level0_file_num_compaction_trigger = 2;
+ options.create_if_missing = true;
+ options.compression = CompressionType::kLZ4Compression;
+ options.bottommost_compression = CompressionType::kNoCompression;
+ options.bottommost_compression_opts.level = 2;
+ options.bottommost_compression_opts.parallel_threads = 1;
+ options.env = CurrentOptions().env;
+
+ ASSERT_OK(TryReopen(options));
+
+ CompressionType compression_used = CompressionType::kLZ4Compression;
+ CompressionOptions compression_opt_used;
+ bool compacted = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* c = reinterpret_cast<Compaction*>(arg);
+ compression_used = c->output_compression();
+ compression_opt_used = c->output_compression_opts();
+ compacted = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("foo", "foofoofoo"));
+ ASSERT_OK(Put("bar", "foofoofoo"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "foofoofoo"));
+ ASSERT_OK(Put("bar", "foofoofoo"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(compacted);
+ ASSERT_EQ(CompressionType::kNoCompression, compression_used);
+ ASSERT_EQ(options.compression_opts.level, compression_opt_used.level);
+ ASSERT_EQ(options.compression_opts.parallel_threads,
+ compression_opt_used.parallel_threads);
+
+ compression_used = CompressionType::kLZ4Compression;
+ compacted = false;
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"bottommost_compression", "kSnappyCompression"},
+ {"bottommost_compression_opts", "0:6:0:0:4:true"}}));
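+  // The colon-separated string assigns CompressionOptions fields positionally;
+  // the "6" sets the compression level and the trailing "true" marks the
+  // options enabled, as checked below.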
+ ASSERT_OK(Put("foo", "foofoofoo"));
+ ASSERT_OK(Put("bar", "foofoofoo"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "foofoofoo"));
+ ASSERT_OK(Put("bar", "foofoofoo"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(compacted);
+ ASSERT_EQ(CompressionType::kSnappyCompression, compression_used);
+ ASSERT_EQ(6, compression_opt_used.level);
+  // Right now parallel_threads is not yet allowed to be changed.
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBOptionsTest, BottommostCompressionOptsWithFallbackType) {
+ // Verify the bottommost compression options still take effect even when the
+ // bottommost compression type is left at its default value. Verify for both
+ // automatic and manual compaction.
+ if (!Snappy_Supported() || !LZ4_Supported()) {
+ return;
+ }
+
+ constexpr int kUpperCompressionLevel = 1;
+ constexpr int kBottommostCompressionLevel = 2;
+ constexpr int kNumL0Files = 2;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.compression = CompressionType::kLZ4Compression;
+ options.compression_opts.level = kUpperCompressionLevel;
+ options.bottommost_compression_opts.level = kBottommostCompressionLevel;
+ options.bottommost_compression_opts.enabled = true;
+ Reopen(options);
+
+ CompressionType compression_used = CompressionType::kDisableCompressionOption;
+ CompressionOptions compression_opt_used;
+ bool compacted = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionPicker::RegisterCompaction:Registered", [&](void* arg) {
+ Compaction* c = static_cast<Compaction*>(arg);
+ compression_used = c->output_compression();
+ compression_opt_used = c->output_compression_opts();
+ compacted = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // First, verify for automatic compaction.
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put("foo", "foofoofoo"));
+ ASSERT_OK(Put("bar", "foofoofoo"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_TRUE(compacted);
+ ASSERT_EQ(CompressionType::kLZ4Compression, compression_used);
+ ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level);
+
+ // Second, verify for manual compaction.
+ compacted = false;
+ compression_used = CompressionType::kDisableCompressionOption;
+ compression_opt_used = CompressionOptions();
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
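+  // kForceOptimized forces the bottommost level to be recompacted, so the
+  // registered-compaction callback also fires for this manual compaction.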
+ ASSERT_OK(dbfull()->CompactRange(cro, nullptr, nullptr));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_TRUE(compacted);
+ ASSERT_EQ(CompressionType::kLZ4Compression, compression_used);
+ ASSERT_EQ(kBottommostCompressionLevel, compression_opt_used.level);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_properties_test.cc b/src/rocksdb/db/db_properties_test.cc
new file mode 100644
index 000000000..85cd5c04e
--- /dev/null
+++ b/src/rocksdb/db/db_properties_test.cc
@@ -0,0 +1,2206 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <stdio.h>
+
+#include <algorithm>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "options/cf_options.h"
+#include "port/stack_trace.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/perf_level.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/table_builder.h"
+#include "test_util/mock_time_env.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBPropertiesTest : public DBTestBase {
+ public:
+ DBPropertiesTest()
+ : DBTestBase("db_properties_test", /*env_do_fsync=*/false) {}
+
+ void AssertDbStats(const std::map<std::string, std::string>& db_stats,
+ double expected_uptime, int expected_user_bytes_written,
+ int expected_wal_bytes_written,
+ int expected_user_writes_by_self,
+ int expected_user_writes_with_wal) {
+ ASSERT_EQ(std::to_string(expected_uptime), db_stats.at("db.uptime"));
+ ASSERT_EQ(std::to_string(expected_wal_bytes_written),
+ db_stats.at("db.wal_bytes_written"));
+ ASSERT_EQ("0", db_stats.at("db.wal_syncs"));
+ ASSERT_EQ(std::to_string(expected_user_bytes_written),
+ db_stats.at("db.user_bytes_written"));
+ ASSERT_EQ("0", db_stats.at("db.user_writes_by_other"));
+ ASSERT_EQ(std::to_string(expected_user_writes_by_self),
+ db_stats.at("db.user_writes_by_self"));
+ ASSERT_EQ(std::to_string(expected_user_writes_with_wal),
+ db_stats.at("db.user_writes_with_wal"));
+ ASSERT_EQ("0", db_stats.at("db.user_write_stall_micros"));
+ }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, Empty) {
+ do {
+ Options options;
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.allow_concurrent_memtable_write = false;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::string num;
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("1", num);
+
+ // Block sync calls
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+ ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("2", num);
+
+ ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger compaction
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ("1", num);
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ // Release sync calls
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->EnableFileDeletions(false));
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("0", num);
+
+ ASSERT_OK(db_->EnableFileDeletions());
+ ASSERT_TRUE(
+ dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num));
+ ASSERT_EQ("1", num);
+ } while (ChangeOptions());
+}
+
+TEST_F(DBPropertiesTest, CurrentVersionNumber) {
+ uint64_t v1, v2, v3;
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v1));
+ ASSERT_OK(Put("12345678", ""));
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v2));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v3));
+
+ ASSERT_EQ(v1, v2);
+ ASSERT_GT(v3, v2);
+}
+
+TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) {
+ const int kKeySize = 100;
+ const int kValueSize = 500;
+ const int kKeyNum = 100;
+
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.write_buffer_size = (kKeySize + kValueSize) * kKeyNum / 10;
+ // Make them never flush
+ options.min_write_buffer_number_to_merge = 1000;
+ options.max_write_buffer_number = 1000;
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"one", "two", "three", "four"}, options);
+
+ Random rnd(301);
+ for (auto* handle : handles_) {
+ for (int i = 0; i < kKeyNum; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), handle, rnd.RandomString(kKeySize),
+ rnd.RandomString(kValueSize)));
+ }
+ }
+
+ uint64_t manual_sum = 0;
+ uint64_t api_sum = 0;
+ uint64_t value = 0;
+ for (auto* handle : handles_) {
+ ASSERT_TRUE(
+ db_->GetIntProperty(handle, DB::Properties::kSizeAllMemTables, &value));
+ manual_sum += value;
+ }
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables,
+ &api_sum));
+ ASSERT_GT(manual_sum, 0);
+ ASSERT_EQ(manual_sum, api_sum);
+
+ ASSERT_FALSE(db_->GetAggregatedIntProperty(DB::Properties::kDBStats, &value));
+
+ uint64_t before_flush_trm;
+ uint64_t after_flush_trm;
+ for (auto* handle : handles_) {
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(
+ DB::Properties::kEstimateTableReadersMem, &before_flush_trm));
+
+ // Issue flush and expect larger memory usage of table readers.
+ ASSERT_OK(db_->Flush(FlushOptions(), handle));
+
+ ASSERT_TRUE(db_->GetAggregatedIntProperty(
+ DB::Properties::kEstimateTableReadersMem, &after_flush_trm));
+ ASSERT_GT(after_flush_trm, before_flush_trm);
+ }
+}
+
+namespace {
+void ResetTableProperties(TableProperties* tp) {
+ tp->data_size = 0;
+ tp->index_size = 0;
+ tp->filter_size = 0;
+ tp->raw_key_size = 0;
+ tp->raw_value_size = 0;
+ tp->num_data_blocks = 0;
+ tp->num_entries = 0;
+ tp->num_deletions = 0;
+ tp->num_merge_operands = 0;
+ tp->num_range_deletions = 0;
+}
+
+void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) {
+ double dummy_double;
+ std::replace(tp_string.begin(), tp_string.end(), ';', ' ');
+ std::replace(tp_string.begin(), tp_string.end(), '=', ' ');
+ ResetTableProperties(tp);
+ sscanf(tp_string.c_str(),
+ "# data blocks %" SCNu64 " # entries %" SCNu64 " # deletions %" SCNu64
+ " # merge operands %" SCNu64 " # range deletions %" SCNu64
+ " raw key size %" SCNu64
+ " raw average key size %lf "
+ " raw value size %" SCNu64
+ " raw average value size %lf "
+ " data block size %" SCNu64 " index block size (user-key? %" SCNu64
+ ", delta-value? %" SCNu64 ") %" SCNu64 " filter block size %" SCNu64,
+ &tp->num_data_blocks, &tp->num_entries, &tp->num_deletions,
+ &tp->num_merge_operands, &tp->num_range_deletions, &tp->raw_key_size,
+ &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size,
+ &tp->index_key_is_user_key, &tp->index_value_is_delta_encoded,
+ &tp->index_size, &tp->filter_size);
+}
+
+void VerifySimilar(uint64_t a, uint64_t b, double bias) {
+ ASSERT_EQ(a == 0U, b == 0U);
+ if (a == 0) {
+ return;
+ }
+ double dbl_a = static_cast<double>(a);
+ double dbl_b = static_cast<double>(b);
+ if (dbl_a > dbl_b) {
+ ASSERT_LT(static_cast<double>(dbl_a - dbl_b) / (dbl_a + dbl_b), bias);
+ } else {
+ ASSERT_LT(static_cast<double>(dbl_b - dbl_a) / (dbl_a + dbl_b), bias);
+ }
+}
+
+void VerifyTableProperties(
+ const TableProperties& base_tp, const TableProperties& new_tp,
+ double filter_size_bias = CACHE_LINE_SIZE >= 256 ? 0.18 : 0.1,
+ double index_size_bias = 0.1, double data_size_bias = 0.1,
+ double num_data_blocks_bias = 0.05) {
+ VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias);
+ VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias);
+ VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias);
+ VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks,
+ num_data_blocks_bias);
+
+ ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size);
+ ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size);
+ ASSERT_EQ(base_tp.num_entries, new_tp.num_entries);
+ ASSERT_EQ(base_tp.num_deletions, new_tp.num_deletions);
+ ASSERT_EQ(base_tp.num_range_deletions, new_tp.num_range_deletions);
+
+  // Merge operands may become Puts, so we only have an upper bound on the
+  // exact number of merge operands.
+ ASSERT_GE(base_tp.num_merge_operands, new_tp.num_merge_operands);
+}
+
+void GetExpectedTableProperties(
+ TableProperties* expected_tp, const int kKeySize, const int kValueSize,
+ const int kPutsPerTable, const int kDeletionsPerTable,
+ const int kMergeOperandsPerTable, const int kRangeDeletionsPerTable,
+ const int kTableCount, const int kBloomBitsPerKey, const size_t kBlockSize,
+ const bool index_key_is_user_key, const bool value_delta_encoding) {
+ const int kKeysPerTable =
+ kPutsPerTable + kDeletionsPerTable + kMergeOperandsPerTable;
+ const int kPutCount = kTableCount * kPutsPerTable;
+ const int kDeletionCount = kTableCount * kDeletionsPerTable;
+ const int kMergeCount = kTableCount * kMergeOperandsPerTable;
+ const int kRangeDeletionCount = kTableCount * kRangeDeletionsPerTable;
+ const int kKeyCount =
+ kPutCount + kDeletionCount + kMergeCount + kRangeDeletionCount;
+ const int kAvgSuccessorSize = kKeySize / 5;
+ const int kEncodingSavePerKey = kKeySize / 4;
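+  // Each internal key carries an 8-byte footer (packed sequence number and
+  // value type), hence the "+ 8" terms below.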
+ expected_tp->raw_key_size = kKeyCount * (kKeySize + 8);
+ expected_tp->raw_value_size =
+ (kPutCount + kMergeCount + kRangeDeletionCount) * kValueSize;
+ expected_tp->num_entries = kKeyCount;
+ expected_tp->num_deletions = kDeletionCount + kRangeDeletionCount;
+ expected_tp->num_merge_operands = kMergeCount;
+ expected_tp->num_range_deletions = kRangeDeletionCount;
+ expected_tp->num_data_blocks =
+ kTableCount *
+ (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) /
+ kBlockSize;
+ expected_tp->data_size =
+ kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize));
+ expected_tp->index_size =
+ expected_tp->num_data_blocks *
+ (kAvgSuccessorSize + (index_key_is_user_key ? 0 : 8) -
+ // discount 1 byte as value size is not encoded in value delta encoding
+ (value_delta_encoding ? 1 : 0));
+ expected_tp->filter_size =
+ kTableCount * ((kKeysPerTable * kBloomBitsPerKey + 7) / 8 +
+ /*average-ish overhead*/ CACHE_LINE_SIZE / 2);
+}
+} // anonymous namespace
+
+TEST_F(DBPropertiesTest, ValidatePropertyInfo) {
+ for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) {
+ // If C++ gets a std::string_literal, this would be better to check at
+ // compile-time using static_assert.
+ ASSERT_TRUE(ppt_name_and_info.first.empty() ||
+ !isdigit(ppt_name_and_info.first.back()));
+
+ int count = 0;
+ count += (ppt_name_and_info.second.handle_string == nullptr) ? 0 : 1;
+ count += (ppt_name_and_info.second.handle_int == nullptr) ? 0 : 1;
+ count += (ppt_name_and_info.second.handle_string_dbimpl == nullptr) ? 0 : 1;
+ ASSERT_TRUE(count == 1);
+ }
+}
+
+TEST_F(DBPropertiesTest, ValidateSampleNumber) {
+ // When "max_open_files" is -1, we read all the files for
+ // "rocksdb.estimate-num-keys" computation, which is the ground truth.
+  // Otherwise, we sample the 20 newest files to make an estimate.
+  // Formula: latest_20_files_active_key_ratio * total_files
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_stop_writes_trigger = 1000;
+ DestroyAndReopen(options);
+ int key = 0;
+ for (int files = 20; files >= 10; files -= 10) {
+ for (int i = 0; i < files; i++) {
+ int rows = files / 10;
+ for (int j = 0; j < rows; j++) {
+ ASSERT_OK(db_->Put(WriteOptions(), std::to_string(++key), "foo"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ }
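+  // 30 files hold 50 keys in total. The 20 newest files are the 10 single-key
+  // files plus 10 two-key files, i.e. 30 keys across 20 files, so the sampled
+  // estimate is 1.5 keys/file * 30 files = 45.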
+ std::string num;
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ("45", num);
+ options.max_open_files = -1;
+ Reopen(options);
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ("50", num);
+}
+
+TEST_F(DBPropertiesTest, AggregatedTableProperties) {
+ for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) {
+ const int kDeletionsPerTable = 0;
+ const int kMergeOperandsPerTable = 15;
+ const int kRangeDeletionsPerTable = 5;
+ const int kPutsPerTable = 100;
+ const int kKeySize = 80;
+ const int kValueSize = 200;
+ const int kBloomBitsPerKey = 20;
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.merge_operator.reset(new TestPutOperator());
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(
+ NewBloomFilterPolicy(kBloomBitsPerKey, false));
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Hold open a snapshot to prevent range tombstones from being compacted
+ // away.
+ ManagedSnapshot snapshot(db_);
+
+ Random rnd(5632);
+ for (int table = 1; table <= kTableCount; ++table) {
+ for (int i = 0; i < kPutsPerTable; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize),
+ rnd.RandomString(kValueSize)));
+ }
+ for (int i = 0; i < kDeletionsPerTable; i++) {
+ ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize)));
+ }
+ for (int i = 0; i < kMergeOperandsPerTable; i++) {
+ ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize),
+ rnd.RandomString(kValueSize)));
+ }
+ for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+ std::string start = rnd.RandomString(kKeySize);
+ std::string end = start;
+ end.resize(kValueSize);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ start, end));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ std::string property;
+ db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property);
+ TableProperties output_tp;
+ ParseTablePropertiesString(property, &output_tp);
+ bool index_key_is_user_key = output_tp.index_key_is_user_key > 0;
+ bool value_is_delta_encoded = output_tp.index_value_is_delta_encoded > 0;
+
+ TableProperties expected_tp;
+ GetExpectedTableProperties(
+ &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+ kMergeOperandsPerTable, kRangeDeletionsPerTable, kTableCount,
+ kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+ value_is_delta_encoded);
+
+ VerifyTableProperties(expected_tp, output_tp);
+ }
+}
+
+TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10;
+ options.level0_file_num_compaction_trigger = 6;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 4500 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+  options.max_open_files = 11;  // Make sure no preloading of table readers
+
+  // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 11;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ int key_index = 0;
+ Random rnd(301);
+ for (int num = 0; num < 8; num++) {
+ ASSERT_OK(Put("foo", "bar"));
+ GenerateNewFile(&rnd, &key_index);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ std::string prop;
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
+
+  // Get() after flushes. See that the latency histogram is tracked.
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Reopen and issue Get(). See that the latency is tracked.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+
+ // Test for getting immutable_db_options_.statistics
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.options-statistics", &prop));
+ ASSERT_NE(std::string::npos, prop.find("rocksdb.block.cache.miss"));
+ ASSERT_EQ(std::string::npos, prop.find("rocksdb.db.f.micros"));
+
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Reopen and iterate. See that the latency is tracked.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+ }
+ ASSERT_OK(iter->status());
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // CF 1 should show no histogram.
+ ASSERT_TRUE(
+ dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+  // Put something and read it back; CF 1 should now show a histogram.
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("bar", Get(1, "foo"));
+
+ ASSERT_TRUE(
+ dbfull()->GetProperty(handles_[1], "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+  // Setting options.max_open_files to -1 preloads table readers on open.
+ options.max_open_files = -1;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(dbfull()->GetProperty(dbfull()->DefaultColumnFamily(),
+ "rocksdb.cf-file-histogram", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+ for (int key = 0; key < key_index; key++) {
+ Get(Key(key));
+ }
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+
+ // Clear internal stats
+ ASSERT_OK(dbfull()->ResetStats());
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram"));
+ ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram"));
+}
+
+TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) {
+ const int kTableCount = 100;
+ const int kDeletionsPerTable = 0;
+ const int kMergeOperandsPerTable = 2;
+ const int kRangeDeletionsPerTable = 2;
+ const int kPutsPerTable = 10;
+ const int kKeySize = 50;
+ const int kValueSize = 400;
+ const int kMaxLevel = 7;
+ const int kBloomBitsPerKey = 20;
+ Random rnd(301);
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 8192;
+ options.max_bytes_for_level_base = 10000;
+ options.max_bytes_for_level_multiplier = 2;
+  // This ensures no compaction is happening when we call GetProperty().
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new TestPutOperator());
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(
+ NewBloomFilterPolicy(kBloomBitsPerKey, false));
+ table_options.block_size = 1024;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+
+ // Hold open a snapshot to prevent range tombstones from being compacted away.
+ ManagedSnapshot snapshot(db_);
+
+ std::string level_tp_strings[kMaxLevel];
+ std::string tp_string;
+ TableProperties level_tps[kMaxLevel];
+ TableProperties tp, sum_tp, expected_tp;
+ for (int table = 1; table <= kTableCount; ++table) {
+ for (int i = 0; i < kPutsPerTable; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), rnd.RandomString(kKeySize),
+ rnd.RandomString(kValueSize)));
+ }
+ for (int i = 0; i < kDeletionsPerTable; i++) {
+ ASSERT_OK(db_->Delete(WriteOptions(), rnd.RandomString(kKeySize)));
+ }
+ for (int i = 0; i < kMergeOperandsPerTable; i++) {
+ ASSERT_OK(db_->Merge(WriteOptions(), rnd.RandomString(kKeySize),
+ rnd.RandomString(kValueSize)));
+ }
+ for (int i = 0; i < kRangeDeletionsPerTable; i++) {
+ std::string start = rnd.RandomString(kKeySize);
+ std::string end = start;
+ end.resize(kValueSize);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ start, end));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ResetTableProperties(&sum_tp);
+ for (int level = 0; level < kMaxLevel; ++level) {
+ db_->GetProperty(DB::Properties::kAggregatedTablePropertiesAtLevel +
+ std::to_string(level),
+ &level_tp_strings[level]);
+ ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]);
+ sum_tp.data_size += level_tps[level].data_size;
+ sum_tp.index_size += level_tps[level].index_size;
+ sum_tp.filter_size += level_tps[level].filter_size;
+ sum_tp.raw_key_size += level_tps[level].raw_key_size;
+ sum_tp.raw_value_size += level_tps[level].raw_value_size;
+ sum_tp.num_data_blocks += level_tps[level].num_data_blocks;
+ sum_tp.num_entries += level_tps[level].num_entries;
+ sum_tp.num_deletions += level_tps[level].num_deletions;
+ sum_tp.num_merge_operands += level_tps[level].num_merge_operands;
+ sum_tp.num_range_deletions += level_tps[level].num_range_deletions;
+ }
+ db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string);
+ ParseTablePropertiesString(tp_string, &tp);
+ bool index_key_is_user_key = tp.index_key_is_user_key > 0;
+ bool value_is_delta_encoded = tp.index_value_is_delta_encoded > 0;
+ ASSERT_EQ(sum_tp.data_size, tp.data_size);
+ ASSERT_EQ(sum_tp.index_size, tp.index_size);
+ ASSERT_EQ(sum_tp.filter_size, tp.filter_size);
+ ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size);
+ ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size);
+ ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks);
+ ASSERT_EQ(sum_tp.num_entries, tp.num_entries);
+ ASSERT_EQ(sum_tp.num_deletions, tp.num_deletions);
+ ASSERT_EQ(sum_tp.num_merge_operands, tp.num_merge_operands);
+ ASSERT_EQ(sum_tp.num_range_deletions, tp.num_range_deletions);
+ if (table > 3) {
+ GetExpectedTableProperties(
+ &expected_tp, kKeySize, kValueSize, kPutsPerTable, kDeletionsPerTable,
+ kMergeOperandsPerTable, kRangeDeletionsPerTable, table,
+ kBloomBitsPerKey, table_options.block_size, index_key_is_user_key,
+ value_is_delta_encoded);
+ // Gives larger bias here as index block size, filter block size,
+ // and data block size become much harder to estimate in this test.
+ VerifyTableProperties(expected_tp, tp, CACHE_LINE_SIZE >= 256 ? 0.6 : 0.5,
+ 0.5, 0.5, 0.25);
+ }
+ }
+}
+
+TEST_F(DBPropertiesTest, NumImmutableMemTable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.write_buffer_size = 1000000;
+ options.max_write_buffer_size_to_maintain =
+ 5 * static_cast<int64_t>(options.write_buffer_size);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::string big_value(1000000 * 2, 'x');
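+    // Each value is twice write_buffer_size, so every Put fills the active
+    // memtable; with min_write_buffer_number_to_merge == 3, the resulting
+    // immutable memtables pile up instead of being flushed right away.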
+ std::string num;
+ uint64_t value;
+ SetPerfLevel(kEnableTime);
+ ASSERT_TRUE(GetPerfLevel() == kEnableTime);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+ ASSERT_EQ(num, "1");
+
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k2");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.cur-size-active-mem-table", &num));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "2");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &num));
+ ASSERT_EQ(num, "2");
+ get_perf_context()->Reset();
+ Get(1, "k2");
+ ASSERT_EQ(2, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k3");
+ ASSERT_EQ(1, static_cast<int>(get_perf_context()->get_from_memtable_count));
+ get_perf_context()->Reset();
+ Get(1, "k1");
+ ASSERT_EQ(3, static_cast<int>(get_perf_context()->get_from_memtable_count));
+
+ ASSERT_OK(Flush(1));
+ ASSERT_TRUE(dbfull()->GetProperty(handles_[1],
+ "rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty(
+ handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num));
+ ASSERT_EQ(num, "3");
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.cur-size-active-mem-table", &value));
+    // "192" is the size of the metadata of two empty skiplists; this would
+    // break if we change the default skiplist implementation.
+ ASSERT_GE(value, 192);
+
+ uint64_t int_num;
+ uint64_t base_total_size;
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.estimate-num-keys", &base_total_size));
+
+ ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", ""));
+ ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3"));
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num));
+ ASSERT_EQ(int_num, 2U);
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-entries-active-mem-table", &int_num));
+ ASSERT_EQ(int_num, 3U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num));
+ ASSERT_EQ(int_num, 4U);
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.estimate-num-keys", &int_num));
+ ASSERT_EQ(int_num, base_total_size + 1);
+
+ SetPerfLevel(kDisable);
+ ASSERT_TRUE(GetPerfLevel() == kDisable);
+ } while (ChangeCompactOptions());
+}
+
+// TODO(techdept) : Disabled flaky test #12863555
+TEST_F(DBPropertiesTest, DISABLED_GetProperty) {
+ // Set sizes to both background thread pool to be 1 and block them.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = 1;
+ options.compaction_options_universal.size_ratio = 50;
+ options.max_background_compactions = 1;
+ options.max_background_flushes = 1;
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.write_buffer_size = 1000000;
+ Reopen(options);
+
+ std::string big_value(1000000 * 2, 'x');
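+  // Each value is twice write_buffer_size, so every Put fills the active
+  // memtable, and the blocked HIGH-priority thread keeps the resulting
+  // immutable memtables from being flushed.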
+ std::string num;
+ uint64_t int_num;
+ SetPerfLevel(kEnableTime);
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "1");
+ get_perf_context()->Reset();
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing"));
+ ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num));
+ ASSERT_EQ(num, "2");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "2");
+ // Verify the same set of properties through GetIntProperty
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num));
+ ASSERT_EQ(int_num, 2U);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num));
+ ASSERT_EQ(int_num, 1U);
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num));
+ ASSERT_EQ(int_num, 0U);
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+ dbfull()->TEST_WaitForFlushMemTable();
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value));
+ ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value));
+ dbfull()->TEST_WaitForFlushMemTable();
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num));
+ ASSERT_EQ(num, "0");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num));
+ ASSERT_EQ(num, "1");
+ ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num));
+ ASSERT_EQ(num, "4");
+
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ // Wait for compaction to be done. This is important because otherwise RocksDB
+ // might schedule a compaction when reopening the database, failing assertion
+ // (A) as a result.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ options.max_open_files = 10;
+ Reopen(options);
+  // After reopening, no table reader is loaded, so there is no memory used
+  // for table readers.
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U); // (A)
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ // After reading a key, at least one table reader is loaded.
+ Get("k5");
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ // Test rocksdb.num-live-versions
+ {
+ options.level0_file_num_compaction_trigger = 20;
+ Reopen(options);
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 1U);
+
+ // Use an iterator to hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ // Use an iterator to hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 3U);
+
+ iter2.reset();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 2U);
+
+ iter1.reset();
+ ASSERT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num));
+ ASSERT_EQ(int_num, 1U);
+ }
+}
+
+TEST_F(DBPropertiesTest, ApproximateMemoryUsage) {
+ const int kNumRounds = 10;
+ // TODO(noetzli) kFlushesPerRound does not really correlate with how many
+ // flushes happen.
+ const int kFlushesPerRound = 10;
+ const int kWritesPerFlush = 10;
+ const int kKeySize = 100;
+ const int kValueSize = 1000;
+ Options options;
+ options.write_buffer_size = 1000; // small write buffer
+ options.min_write_buffer_number_to_merge = 4;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ std::vector<Iterator*> iters;
+
+ uint64_t active_mem;
+ uint64_t unflushed_mem;
+ uint64_t all_mem;
+ uint64_t prev_all_mem;
+
+  // Phase 0. Verify that the initial values of all these properties are the
+  // same, as we have no mem-tables.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(all_mem, active_mem);
+ ASSERT_EQ(all_mem, unflushed_mem);
+
+  // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" to equal
+  // "size-all-mem-tables".
+ for (int r = 0; r < kNumRounds; ++r) {
+ for (int f = 0; f < kFlushesPerRound; ++f) {
+ for (int w = 0; w < kWritesPerFlush; ++w) {
+ ASSERT_OK(
+ Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize)));
+ }
+ }
+ // Make sure that there is no flush between getting the two properties.
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // With no iterators, these two numbers should be the same.
+ ASSERT_EQ(unflushed_mem, all_mem);
+ }
+ prev_all_mem = all_mem;
+
+ // Phase 2. Keep issuing Put() but also create new iterators. This time we
+ // expect "size-all-mem-tables" > "cur-size-all-mem-tables".
+ for (int r = 0; r < kNumRounds; ++r) {
+ iters.push_back(db_->NewIterator(ReadOptions()));
+ for (int f = 0; f < kFlushesPerRound; ++f) {
+ for (int w = 0; w < kWritesPerFlush; ++w) {
+ ASSERT_OK(
+ Put(rnd.RandomString(kKeySize), rnd.RandomString(kValueSize)));
+ }
+ }
+    // Force a flush to prevent one from happening between getting the
+    // properties, or after getting the properties and before the new round.
+ ASSERT_OK(Flush());
+
+ // In the second round, add iterators.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_GT(all_mem, active_mem);
+ ASSERT_GT(all_mem, unflushed_mem);
+ ASSERT_GT(all_mem, prev_all_mem);
+ prev_all_mem = all_mem;
+ }
+
+ // Phase 3. Delete iterators and expect "size-all-mem-tables" shrinks
+ // whenever we release an iterator.
+ for (auto* iter : iters) {
+ ASSERT_OK(iter->status());
+ delete iter;
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+    // Expect the size to shrink.
+ ASSERT_LT(all_mem, prev_all_mem);
+ prev_all_mem = all_mem;
+ }
+
+  // Expect all three of these counters to be the same.
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(active_mem, unflushed_mem);
+ ASSERT_EQ(unflushed_mem, all_mem);
+
+  // Phase 5. Reopen, and expect all three of these counters to be the same
+  // again.
+ Reopen(options);
+ dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem);
+ dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem);
+ dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem);
+ ASSERT_EQ(active_mem, unflushed_mem);
+ ASSERT_EQ(unflushed_mem, all_mem);
+}
+
+TEST_F(DBPropertiesTest, EstimatePendingCompBytes) {
+ // Set sizes to both background thread pool to be 1 and block them.
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.compaction_style = kCompactionStyleLevel;
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_background_compactions = 1;
+ options.max_background_flushes = 1;
+ options.max_write_buffer_number = 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain = 0;
+ options.write_buffer_size = 1000000;
+ Reopen(options);
+
+ std::string big_value(1000000 * 2, 'x');
+ std::string num;
+ uint64_t int_num;
+
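+  // level0_file_num_compaction_trigger is 2 and the LOW-priority thread is
+  // blocked, so the pending compaction bytes stay at 0 with a single L0 file
+  // and turn positive once more files accumulate.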
+ ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-pending-compaction-bytes", &int_num));
+ ASSERT_EQ(int_num, 0U);
+}
+
+TEST_F(DBPropertiesTest, EstimateCompressionRatio) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ const int kNumL0Files = 3;
+ const int kNumEntriesPerFile = 1000;
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 3;
+ Reopen(options);
+
+ ASSERT_OK(db_->SetOptions(
+ {{"compression_per_level", "kNoCompression:kSnappyCompression"}}));
+ auto opts = db_->GetOptions();
+ ASSERT_EQ(opts.compression_per_level.size(), 2);
+ ASSERT_EQ(opts.compression_per_level[0], kNoCompression);
+ ASSERT_EQ(opts.compression_per_level[1], kSnappyCompression);
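+  // Levels beyond the listed entries reuse the last entry, so L0 stays
+  // uncompressed while L1 and above use Snappy.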
+
+  // Compression ratio is -1.0 when there are no open files at the level.
+ ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+
+ const std::string kVal(100, 'a');
+ for (int i = 0; i < kNumL0Files; ++i) {
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ // Put common data ("key") at end to prevent delta encoding from
+ // compressing the key effectively
+ std::string key = std::to_string(i) + std::to_string(j) + "key";
+ ASSERT_OK(dbfull()->Put(WriteOptions(), key, kVal));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // no compression at L0, so ratio is less than one
+ ASSERT_LT(CompressionRatioAtLevel(0), 1.0);
+ ASSERT_GT(CompressionRatioAtLevel(0), 0.0);
+ ASSERT_EQ(CompressionRatioAtLevel(1), -1.0);
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+
+ ASSERT_EQ(CompressionRatioAtLevel(0), -1.0);
+ // Data at L1 should be highly compressed thanks to Snappy and redundant data
+ // in values (ratio is 12.846 as of 4/19/2016).
+ ASSERT_GT(CompressionRatioAtLevel(1), 10.0);
+}
+
+#endif // ROCKSDB_LITE
+
+class CountingUserTblPropCollector : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "CountingUserTblPropCollector"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
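+    // The count is varint-encoded; a consumer decodes it with GetVarint32().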
+ *properties = UserCollectedProperties{
+ {"CountingUserTblPropCollector", message_},
+ {"Count", encoded},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ ++count_;
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ std::string message_ = "Rocksdb";
+ uint32_t count_ = 0;
+};
+
+class CountingUserTblPropCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+ explicit CountingUserTblPropCollectorFactory(
+ uint32_t expected_column_family_id)
+ : expected_column_family_id_(expected_column_family_id),
+ num_created_(0) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) override {
+ EXPECT_EQ(expected_column_family_id_, context.column_family_id);
+ num_created_++;
+ return new CountingUserTblPropCollector();
+ }
+ const char* Name() const override {
+ return "CountingUserTblPropCollectorFactory";
+ }
+ void set_expected_column_family_id(uint32_t v) {
+ expected_column_family_id_ = v;
+ }
+ uint32_t expected_column_family_id_;
+ uint32_t num_created_;
+};
+
+class CountingDeleteTabPropCollector : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "CountingDeleteTabPropCollector"; }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType type, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ if (type == kEntryDelete) {
+ num_deletes_++;
+ }
+ return Status::OK();
+ }
+
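+ // Mark the file as needing compaction once more than ten deletions have
+ // been seen; the need-compaction tests below rely on this hint.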
+ bool NeedCompact() const override { return num_deletes_ > 10; }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ *properties =
+ UserCollectedProperties{{"num_delete", std::to_string(num_deletes_)}};
+ return Status::OK();
+ }
+
+ private:
+ uint32_t num_deletes_ = 0;
+};
+
+class CountingDeleteTabPropCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new CountingDeleteTabPropCollector();
+ }
+ const char* Name() const override {
+ return "CountingDeleteTabPropCollectorFactory";
+ }
+};
+
+class BlockCountingTablePropertiesCollector : public TablePropertiesCollector {
+ public:
+ static const std::string kNumSampledBlocksPropertyName;
+
+ const char* Name() const override {
+ return "BlockCountingTablePropertiesCollector";
+ }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ (*properties)[kNumSampledBlocksPropertyName] =
+ std::to_string(num_sampled_blocks_);
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ return Status::OK();
+ }
+
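+ // A nonzero compressed-size estimate is only reported for blocks that were
+ // sampled for compression, so this counter tracks the number of sampled
+ // blocks.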
+ void BlockAdd(uint64_t /* block_uncomp_bytes */,
+ uint64_t block_compressed_bytes_fast,
+ uint64_t block_compressed_bytes_slow) override {
+ if (block_compressed_bytes_fast > 0 || block_compressed_bytes_slow > 0) {
+ num_sampled_blocks_++;
+ }
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{
+ {kNumSampledBlocksPropertyName, std::to_string(num_sampled_blocks_)},
+ };
+ }
+
+ private:
+ uint32_t num_sampled_blocks_ = 0;
+};
+
+const std::string
+ BlockCountingTablePropertiesCollector::kNumSampledBlocksPropertyName =
+ "NumSampledBlocks";
+
+class BlockCountingTablePropertiesCollectorFactory
+ : public TablePropertiesCollectorFactory {
+ public:
+ const char* Name() const override {
+ return "BlockCountingTablePropertiesCollectorFactory";
+ }
+
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /* context */) override {
+ return new BlockCountingTablePropertiesCollector();
+ }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = (1 << 30);
+ options.table_properties_collector_factories.resize(1);
+ std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+ std::make_shared<CountingUserTblPropCollectorFactory>(0);
+ options.table_properties_collector_factories[0] = collector_factory;
+ Reopen(options);
+ // Create 4 tables
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(4U, props.size());
+ uint32_t sum = 0;
+ for (const auto& item : props) {
+ auto& user_collected = item.second->user_collected_properties;
+ ASSERT_TRUE(user_collected.find("CountingUserTblPropCollector") !=
+ user_collected.end());
+ ASSERT_EQ(user_collected.at("CountingUserTblPropCollector"), "Rocksdb");
+ ASSERT_TRUE(user_collected.find("Count") != user_collected.end());
+ Slice key(user_collected.at("Count"));
+ uint32_t count;
+ ASSERT_TRUE(GetVarint32(&key, &count));
+ sum += count;
+ }
+ ASSERT_EQ(10u + 11u + 12u + 13u, sum);
+
+ ASSERT_GT(collector_factory->num_created_, 0U);
+ collector_factory->num_created_ = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_GT(collector_factory->num_created_, 0U);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBPropertiesTest, UserDefinedTablePropertiesContext) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 3;
+ options.table_properties_collector_factories.resize(1);
+ std::shared_ptr<CountingUserTblPropCollectorFactory> collector_factory =
+ std::make_shared<CountingUserTblPropCollectorFactory>(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Create 2 files
+ for (int table = 0; table < 2; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(1, std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush(1));
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ // Trigger automatic compactions.
+ for (int table = 0; table < 3; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(1, std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ // Come back to write to default column family
+ collector_factory->num_created_ = 0;
+ collector_factory->set_expected_column_family_id(0); // default CF
+ // Create 2 tables in the default column family
+ for (int table = 0; table < 2; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ // Trigger automatic compactions.
+ for (int table = 0; table < 3; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_GT(collector_factory->num_created_, 0U);
+
+ collector_factory->num_created_ = 0;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_GT(collector_factory->num_created_, 0U);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBPropertiesTest, TablePropertiesNeedCompactTest) {
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.target_file_size_base = 2048;
+ options.max_bytes_for_level_base = 10240;
+ options.max_bytes_for_level_multiplier = 4;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.num_levels = 8;
+ options.env = env_;
+
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+ std::make_shared<CountingDeleteTabPropCollectorFactory>();
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ DestroyAndReopen(options);
+
+ const int kMaxKey = 1000;
+ for (int i = 0; i < kMaxKey; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(102)));
+ ASSERT_OK(Put(Key(kMaxKey + i), rnd.RandomString(102)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (NumTableFilesAtLevel(0) == 1) {
+ // Clear Level 0 so that when we later flush a file with deletions,
+ // we don't trigger an organic compaction.
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(Put(Key(kMaxKey * 2), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ {
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(kMaxKey - 100));
+ while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+ iter->Next();
+ ++c;
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(c, 200);
+ }
+
+ ASSERT_OK(Delete(Key(0)));
+ for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Delete(Key(kMaxKey * 2)));
+
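+ // Over 200 deletions were just written, far above the collector's
+ // threshold of 10, so the flushed file will be marked as needing compaction
+ // and the wait below should observe that compaction.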
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(kMaxKey - 100));
+ while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) {
+ iter->Next();
+ ++c;
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(c, 0);
+ ASSERT_LT(get_perf_context()->internal_delete_skipped_count, 30u);
+ ASSERT_LT(get_perf_context()->internal_key_skipped_count, 30u);
+ SetPerfLevel(kDisable);
+ }
+}
+
+TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) {
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.max_write_buffer_number = 8;
+ options.level0_file_num_compaction_trigger = 10;
+ options.level0_slowdown_writes_trigger = 10;
+ options.level0_stop_writes_trigger = 10;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+ std::make_shared<CountingDeleteTabPropCollectorFactory>();
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ DestroyAndReopen(options);
+
+ const int kMaxKey = 100;
+ for (int i = 0; i < kMaxKey; i++) {
+ ASSERT_OK(Put(Key(i), ""));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ for (int i = 1; i < kMaxKey - 1; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+
+ // Restart the DB. Although the number of files didn't reach
+ // options.level0_file_num_compaction_trigger, compaction should
+ // still be triggered because of the need-compaction hint.
+ options.disable_auto_compactions = false;
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ {
+ SetPerfLevel(kEnableCount);
+ get_perf_context()->Reset();
+ int c = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+ c++;
+ }
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(c, 2);
+ ASSERT_EQ(get_perf_context()->internal_delete_skipped_count, 0);
+ // We iterate every key twice. Is it a bug?
+ ASSERT_LE(get_perf_context()->internal_key_skipped_count, 2);
+ SetPerfLevel(kDisable);
+ }
+}
+
+// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage.
+TEST_F(DBPropertiesTest, BlockAddForCompressionSampling) {
+ // Sampled compression requires at least one of the following four types.
+ if (!Snappy_Supported() && !Zlib_Supported() && !LZ4_Supported() &&
+ !ZSTD_Supported()) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.table_properties_collector_factories.emplace_back(
+ std::make_shared<BlockCountingTablePropertiesCollectorFactory>());
+
+ for (bool sample_for_compression : {false, true}) {
+ // For simplicity/determinism, sample 100% when enabled, or 0% when disabled
+ options.sample_for_compression = sample_for_compression ? 1 : 0;
+
+ DestroyAndReopen(options);
+
+ // Setup the following LSM:
+ //
+ // L0_0 ["a", "b"]
+ // L1_0 ["a", "b"]
+ //
+ // L0_0 was created by flush. L1_0 was created by compaction. Each file
+ // contains one data block.
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(Put("b", "val"));
+ ASSERT_OK(Flush());
+ if (i == 1) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ }
+
+ // A `BlockAdd()` should have been seen for files generated by flush or
+ // compaction when `sample_for_compression` is enabled.
+ TablePropertiesCollection file_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
+ ASSERT_EQ(2, file_to_props.size());
+ for (const auto& file_and_props : file_to_props) {
+ auto& user_props = file_and_props.second->user_collected_properties;
+ ASSERT_TRUE(user_props.find(BlockCountingTablePropertiesCollector::
+ kNumSampledBlocksPropertyName) !=
+ user_props.end());
+ ASSERT_EQ(user_props.at(BlockCountingTablePropertiesCollector::
+ kNumSampledBlocksPropertyName),
+ std::to_string(sample_for_compression ? 1 : 0));
+ }
+ }
+}
+
+class CompressionSamplingDBPropertiesTest
+ : public DBPropertiesTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ CompressionSamplingDBPropertiesTest() : fast_(GetParam()) {}
+
+ protected:
+ const bool fast_;
+};
+
+INSTANTIATE_TEST_CASE_P(CompressionSamplingDBPropertiesTest,
+ CompressionSamplingDBPropertiesTest, ::testing::Bool());
+
+// Excluded from RocksDB lite tests due to `GetPropertiesOfAllTables()` usage.
+TEST_P(CompressionSamplingDBPropertiesTest,
+ EstimateDataSizeWithCompressionSampling) {
+ Options options = CurrentOptions();
+ if (fast_) {
+ // One of the following light compression libraries must be present.
+ if (LZ4_Supported()) {
+ options.compression = kLZ4Compression;
+ } else if (Snappy_Supported()) {
+ options.compression = kSnappyCompression;
+ } else {
+ return;
+ }
+ } else {
+ // One of the following heavy compression libraries must be present.
+ if (ZSTD_Supported()) {
+ options.compression = kZSTD;
+ } else if (Zlib_Supported()) {
+ options.compression = kZlibCompression;
+ } else {
+ return;
+ }
+ }
+ options.disable_auto_compactions = true;
+ // For simplicity/determinism, sample 100%.
+ options.sample_for_compression = 1;
+ Reopen(options);
+
+ // Setup the following LSM:
+ //
+ // L0_0 ["a", "b"]
+ // L1_0 ["a", "b"]
+ //
+ // L0_0 was created by flush. L1_0 was created by compaction. Each file
+ // contains one data block. The value consists of compressible data so the
+ // data block should be stored compressed.
+ std::string val(1024, 'a');
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("a", val));
+ ASSERT_OK(Put("b", val));
+ ASSERT_OK(Flush());
+ if (i == 1) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ }
+
+ TablePropertiesCollection file_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&file_to_props));
+ ASSERT_EQ(2, file_to_props.size());
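+ // With 100% sampling, the estimated data size for the configured compression
+ // speed class should match the file's actual data size exactly.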
+ for (const auto& file_and_props : file_to_props) {
+ ASSERT_GT(file_and_props.second->data_size, 0);
+ if (fast_) {
+ ASSERT_EQ(file_and_props.second->data_size,
+ file_and_props.second->fast_compression_estimated_data_size);
+ } else {
+ ASSERT_EQ(file_and_props.second->data_size,
+ file_and_props.second->slow_compression_estimated_data_size);
+ }
+ }
+}
+
+TEST_F(DBPropertiesTest, EstimateNumKeysUnderflow) {
+ Options options = CurrentOptions();
+ Reopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Delete("foo"));
+ ASSERT_OK(Delete("foo"));
+ uint64_t num_keys = 0;
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &num_keys));
+ ASSERT_EQ(0, num_keys);
+}
+
+TEST_F(DBPropertiesTest, EstimateOldestKeyTime) {
+ uint64_t oldest_key_time = 0;
+ Options options = CurrentOptions();
+ SetTimeElapseOnlySleepOnReopen(&options);
+
+ // "rocksdb.estimate-oldest-key-time" only available to fifo compaction.
+ for (auto compaction : {kCompactionStyleLevel, kCompactionStyleUniversal,
+ kCompactionStyleNone}) {
+ options.compaction_style = compaction;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_FALSE(dbfull()->GetIntProperty(
+ DB::Properties::kEstimateOldestKeyTime, &oldest_key_time));
+ }
+
+ int64_t mock_start_time;
+ ASSERT_OK(env_->GetCurrentTime(&mock_start_time));
+
+ options.compaction_style = kCompactionStyleFIFO;
+ options.ttl = 300;
+ options.max_open_files = -1;
+ options.compaction_options_fifo.allow_compaction = false;
+ DestroyAndReopen(options);
+
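+ // Under FIFO compaction the property tracks the oldest remaining data, so it
+ // stays at the time of the first Put until older files are dropped by the
+ // compactions below.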
+ env_->MockSleepForSeconds(100);
+ ASSERT_OK(Put("k1", "v1"));
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time - mock_start_time);
+ ASSERT_OK(Flush());
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+ env_->MockSleepForSeconds(100); // -> 200
+ ASSERT_OK(Put("k2", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+ env_->MockSleepForSeconds(100); // -> 300
+ ASSERT_OK(Put("k3", "v3"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("3", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(100, oldest_key_time - mock_start_time);
+
+ env_->MockSleepForSeconds(150); // -> 450
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("2", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(200, oldest_key_time - mock_start_time);
+
+ env_->MockSleepForSeconds(100); // -> 550
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("1", FilesPerLevel());
+ ASSERT_TRUE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+ ASSERT_EQ(300, oldest_key_time - mock_start_time);
+
+ env_->MockSleepForSeconds(100); // -> 650
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel());
+ ASSERT_FALSE(dbfull()->GetIntProperty(DB::Properties::kEstimateOldestKeyTime,
+ &oldest_key_time));
+}
+
+TEST_F(DBPropertiesTest, SstFilesSize) {
+ struct TestListener : public EventListener {
+ void OnCompactionCompleted(DB* db,
+ const CompactionJobInfo& /*info*/) override {
+ assert(callback_triggered == false);
+ assert(size_before_compaction > 0);
+ callback_triggered = true;
+ uint64_t total_sst_size = 0;
+ uint64_t live_sst_size = 0;
+ bool ok = db->GetIntProperty(DB::Properties::kTotalSstFilesSize,
+ &total_sst_size);
+ ASSERT_TRUE(ok);
+ // total_sst_size includes files both before and after compaction.
+ ASSERT_GT(total_sst_size, size_before_compaction);
+ ok =
+ db->GetIntProperty(DB::Properties::kLiveSstFilesSize, &live_sst_size);
+ ASSERT_TRUE(ok);
+ // live_sst_size only includes files remaining after compaction.
+ ASSERT_GT(live_sst_size, 0);
+ ASSERT_LT(live_sst_size, size_before_compaction);
+ }
+
+ uint64_t size_before_compaction = 0;
+ bool callback_triggered = false;
+ };
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ Options options;
+ options.env = CurrentOptions().env;
+ options.disable_auto_compactions = true;
+ options.listeners.push_back(listener);
+ Reopen(options);
+
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put("key" + std::to_string(i), std::string(1000, 'v')));
+ }
+ ASSERT_OK(Flush());
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Delete("key" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t sst_size;
+ bool ok = db_->GetIntProperty(DB::Properties::kTotalSstFilesSize, &sst_size);
+ ASSERT_TRUE(ok);
+ ASSERT_GT(sst_size, 0);
+ listener->size_before_compaction = sst_size;
+ // Compact to clean all keys and trigger listener.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_TRUE(listener->callback_triggered);
+}
+
+TEST_F(DBPropertiesTest, MinObsoleteSstNumberToKeep) {
+ class TestListener : public EventListener {
+ public:
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ if (info.reason == TableFileCreationReason::kCompaction) {
+ // Verify the property indicates that SSTs created by a running
+ // compaction cannot be deleted.
+ uint64_t created_file_num;
+ FileType created_file_type;
+ std::string filename =
+ info.file_path.substr(info.file_path.rfind('/') + 1);
+ ASSERT_TRUE(
+ ParseFileName(filename, &created_file_num, &created_file_type));
+ ASSERT_EQ(kTableFile, created_file_type);
+
+ uint64_t keep_sst_lower_bound;
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kMinObsoleteSstNumberToKeep,
+ &keep_sst_lower_bound));
+
+ ASSERT_LE(keep_sst_lower_bound, created_file_num);
+ validated_ = true;
+ }
+ }
+
+ void SetDB(DB* db) { db_ = db; }
+
+ int GetNumCompactions() { return num_compactions_; }
+
+ // True if we've verified the property for at least one output file
+ bool Validated() { return validated_; }
+
+ private:
+ int num_compactions_ = 0;
+ bool validated_ = false;
+ DB* db_ = nullptr;
+ };
+
+ const int kNumL0Files = 4;
+
+ std::shared_ptr<TestListener> listener = std::make_shared<TestListener>();
+
+ Options options = CurrentOptions();
+ options.listeners.push_back(listener);
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ DestroyAndReopen(options);
+ listener->SetDB(db_);
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // Make sure they overlap in keyspace to prevent trivial move
+ ASSERT_OK(Put("key1", "val"));
+ ASSERT_OK(Put("key2", "val"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_TRUE(listener->Validated());
+}
+
+TEST_F(DBPropertiesTest, BlobCacheProperties) {
+ Options options;
+ uint64_t value;
+
+ options.env = CurrentOptions().env;
+
+ // Test with empty blob cache.
+ constexpr size_t kCapacity = 100;
+ LRUCacheOptions co;
+ co.capacity = kCapacity;
+ co.num_shard_bits = 0;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto blob_cache = NewLRUCache(co);
+ options.blob_cache = blob_cache;
+
+ Reopen(options);
+
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+ ASSERT_EQ(0, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
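+ // The properties below read directly from the configured blob cache, so
+ // entries inserted into blob_cache by hand are reflected immediately.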
+ // Insert unpinned blob to the cache and check size.
+ constexpr size_t kSize1 = 70;
+ ASSERT_OK(blob_cache->Insert("blob1", nullptr /*value*/, kSize1,
+ nullptr /*deleter*/));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+ ASSERT_EQ(kSize1, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert pinned blob to the cache and check size.
+ constexpr size_t kSize2 = 60;
+ Cache::Handle* blob2 = nullptr;
+ ASSERT_OK(blob_cache->Insert("blob2", nullptr /*value*/, kSize2,
+ nullptr /*deleter*/, &blob2));
+ ASSERT_NE(nullptr, blob2);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+ // blob1 is evicted.
+ ASSERT_EQ(kSize2, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2, value);
+
+ // Insert another pinned blob to make the cache over-sized.
+ constexpr size_t kSize3 = 80;
+ Cache::Handle* blob3 = nullptr;
+ ASSERT_OK(blob_cache->Insert("blob3", nullptr /*value*/, kSize3,
+ nullptr /*deleter*/, &blob3));
+ ASSERT_NE(nullptr, blob3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+ ASSERT_EQ(kSize2 + kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2 + kSize3, value);
+
+ // Check size after release.
+ blob_cache->Release(blob2);
+ blob_cache->Release(blob3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlobCacheUsage, &value));
+ // blob2 will be evicted, while blob3 remains in the cache after release.
+ ASSERT_EQ(kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlobCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+}
+
+TEST_F(DBPropertiesTest, BlockCacheProperties) {
+ Options options;
+ uint64_t value;
+
+ options.env = CurrentOptions().env;
+
+ // Block cache properties are not available for tables other than
+ // block-based table.
+ options.table_factory.reset(NewPlainTableFactory());
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ options.table_factory.reset(NewCuckooTableFactory());
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ // Block cache properties are not available if block cache is not used.
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_FALSE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_FALSE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+
+ // Test with empty block cache.
+ constexpr size_t kCapacity = 100;
+ LRUCacheOptions co;
+ co.capacity = kCapacity;
+ co.num_shard_bits = 0;
+ co.metadata_charge_policy = kDontChargeCacheMetadata;
+ auto block_cache = NewLRUCache(co);
+ table_options.block_cache = block_cache;
+ table_options.no_block_cache = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(0, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert unpinned item to the cache and check size.
+ constexpr size_t kSize1 = 50;
+ ASSERT_OK(block_cache->Insert("item1", nullptr /*value*/, kSize1,
+ nullptr /*deleter*/));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(kSize1, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+
+ // Insert pinned item to the cache and check size.
+ constexpr size_t kSize2 = 30;
+ Cache::Handle* item2 = nullptr;
+ ASSERT_OK(block_cache->Insert("item2", nullptr /*value*/, kSize2,
+ nullptr /*deleter*/, &item2));
+ ASSERT_NE(nullptr, item2);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ ASSERT_EQ(kSize1 + kSize2, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2, value);
+
+ // Insert another pinned item to make the cache over-sized.
+ constexpr size_t kSize3 = 80;
+ Cache::Handle* item3 = nullptr;
+ ASSERT_OK(block_cache->Insert("item3", nullptr /*value*/, kSize3,
+ nullptr /*deleter*/, &item3));
+ ASSERT_NE(nullptr, item3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ // item1 is evicted.
+ ASSERT_EQ(kSize2 + kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(kSize2 + kSize3, value);
+
+ // Check size after release.
+ block_cache->Release(item2);
+ block_cache->Release(item3);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheCapacity, &value));
+ ASSERT_EQ(kCapacity, value);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kBlockCacheUsage, &value));
+ // item2 will be evicted, while item3 remains in the cache after release.
+ ASSERT_EQ(kSize3, value);
+ ASSERT_TRUE(
+ db_->GetIntProperty(DB::Properties::kBlockCachePinnedUsage, &value));
+ ASSERT_EQ(0, value);
+}
+
+TEST_F(DBPropertiesTest, GetMapPropertyDbStats) {
+ auto mock_clock = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+ CompositeEnvWrapper env(env_, mock_clock);
+
+ Options opts = CurrentOptions();
+ opts.env = &env;
+ Reopen(opts);
+
+ {
+ std::map<std::string, std::string> db_stats;
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+ AssertDbStats(db_stats, 0.0 /* expected_uptime */,
+ 0 /* expected_user_bytes_written */,
+ 0 /* expected_wal_bytes_written */,
+ 0 /* expected_user_writes_by_self */,
+ 0 /* expected_user_writes_with_wal */);
+ }
+
+ {
+ mock_clock->SleepForMicroseconds(1500000);
+
+ std::map<std::string, std::string> db_stats;
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+ AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+ 0 /* expected_user_bytes_written */,
+ 0 /* expected_wal_bytes_written */,
+ 0 /* expected_user_writes_by_self */,
+ 0 /* expected_user_writes_with_wal */);
+ }
+
+ int expected_user_bytes_written = 0;
+ {
+ // Write with WAL disabled.
+ WriteOptions write_opts;
+ write_opts.disableWAL = true;
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("key", "val"));
+ expected_user_bytes_written += static_cast<int>(batch.GetDataSize());
+
+ ASSERT_OK(db_->Write(write_opts, &batch));
+
+ std::map<std::string, std::string> db_stats;
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+ AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+ expected_user_bytes_written,
+ 0 /* expected_wal_bytes_written */,
+ 1 /* expected_user_writes_by_self */,
+ 0 /* expected_user_writes_with_wal */);
+ }
+
+ int expected_wal_bytes_written = 0;
+ {
+ // Write with WAL enabled.
+ WriteBatch batch;
+ ASSERT_OK(batch.Delete("key"));
+ expected_user_bytes_written += static_cast<int>(batch.GetDataSize());
+ expected_wal_bytes_written += static_cast<int>(batch.GetDataSize());
+
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ std::map<std::string, std::string> db_stats;
+ ASSERT_TRUE(db_->GetMapProperty(DB::Properties::kDBStats, &db_stats));
+ AssertDbStats(db_stats, 1.5 /* expected_uptime */,
+ expected_user_bytes_written, expected_wal_bytes_written,
+ 2 /* expected_user_writes_by_self */,
+ 1 /* expected_user_writes_with_wal */);
+ }
+
+ Close();
+}
+
+TEST_F(DBPropertiesTest, GetMapPropertyBlockCacheEntryStats) {
+ // Currently only verifies the expected properties are present
+ std::map<std::string, std::string> values;
+ ASSERT_TRUE(
+ db_->GetMapProperty(DB::Properties::kBlockCacheEntryStats, &values));
+
+ ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::CacheId()) !=
+ values.end());
+ ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::CacheCapacityBytes()) !=
+ values.end());
+ ASSERT_TRUE(
+ values.find(
+ BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds()) !=
+ values.end());
+ ASSERT_TRUE(
+ values.find(BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds()) !=
+ values.end());
+ for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+ CacheEntryRole role = static_cast<CacheEntryRole>(i);
+ ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::EntryCount(role)) !=
+ values.end());
+ ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::UsedBytes(role)) !=
+ values.end());
+ ASSERT_TRUE(values.find(BlockCacheEntryStatsMapKeys::UsedPercent(role)) !=
+ values.end());
+ }
+
+ // There should be no extra values in the map: three per-role keys plus the
+ // four cache-level keys checked above.
+ ASSERT_EQ(3 * kNumCacheEntryRoles + 4, values.size());
+}
+
+namespace {
+std::string PopMetaIndexKey(InternalIterator* meta_iter) {
+ Status s = meta_iter->status();
+ if (!s.ok()) {
+ return s.ToString();
+ } else if (meta_iter->Valid()) {
+ std::string rv = meta_iter->key().ToString();
+ meta_iter->Next();
+ return rv;
+ } else {
+ return "NOT_FOUND";
+ }
+}
+
+} // anonymous namespace
+
+TEST_F(DBPropertiesTest, TableMetaIndexKeys) {
+ // This is to detect unexpected churn in metaindex block keys. This is more
+ // of a "table test" but table_test.cc doesn't depend on db_test_util.h and
+ // we need ChangeOptions() for broad coverage.
+ constexpr int kKeyCount = 100;
+ do {
+ Options options;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ // Create an SST file
+ for (int key = 0; key < kKeyCount; key++) {
+ ASSERT_OK(Put(Key(key), "val"));
+ }
+ ASSERT_OK(Flush());
+
+ // Find its file number
+ std::vector<LiveFileMetaData> files;
+ db_->GetLiveFilesMetaData(&files);
+ // 1 SST file
+ ASSERT_EQ(1, files.size());
+
+ // Open it for inspection
+ std::string sst_file =
+ files[0].directory + "/" + files[0].relative_filename;
+ std::unique_ptr<FSRandomAccessFile> f;
+ ASSERT_OK(env_->GetFileSystem()->NewRandomAccessFile(
+ sst_file, FileOptions(), &f, nullptr));
+ std::unique_ptr<RandomAccessFileReader> r;
+ r.reset(new RandomAccessFileReader(std::move(f), sst_file));
+ uint64_t file_size = 0;
+ ASSERT_OK(env_->GetFileSize(sst_file, &file_size));
+
+ // Read metaindex
+ BlockContents bc;
+ ASSERT_OK(ReadMetaIndexBlockInFile(r.get(), file_size, 0U,
+ ImmutableOptions(options), &bc));
+ Block metaindex_block(std::move(bc));
+ std::unique_ptr<InternalIterator> meta_iter;
+ meta_iter.reset(metaindex_block.NewMetaIterator());
+ meta_iter->SeekToFirst();
+
+ if (strcmp(options.table_factory->Name(),
+ TableFactory::kBlockBasedTableName()) == 0) {
+ auto bbto = options.table_factory->GetOptions<BlockBasedTableOptions>();
+ if (bbto->filter_policy) {
+ if (bbto->partition_filters) {
+ // The key names are intentionally hard-coded here to detect
+ // accidental compatibility regressions.
+ EXPECT_EQ("partitionedfilter.rocksdb.BuiltinBloomFilter",
+ PopMetaIndexKey(meta_iter.get()));
+ } else {
+ EXPECT_EQ("fullfilter.rocksdb.BuiltinBloomFilter",
+ PopMetaIndexKey(meta_iter.get()));
+ }
+ }
+ if (bbto->index_type == BlockBasedTableOptions::kHashSearch) {
+ EXPECT_EQ("rocksdb.hashindex.metadata",
+ PopMetaIndexKey(meta_iter.get()));
+ EXPECT_EQ("rocksdb.hashindex.prefixes",
+ PopMetaIndexKey(meta_iter.get()));
+ }
+ }
+ EXPECT_EQ("rocksdb.properties", PopMetaIndexKey(meta_iter.get()));
+ EXPECT_EQ("NOT_FOUND", PopMetaIndexKey(meta_iter.get()));
+ } while (ChangeOptions());
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_range_del_test.cc b/src/rocksdb/db/db_range_del_test.cc
new file mode 100644
index 000000000..d576f2217
--- /dev/null
+++ b/src/rocksdb/db/db_range_del_test.cc
@@ -0,0 +1,2807 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(cbi): parameterize the test to cover user-defined timestamp cases
+class DBRangeDelTest : public DBTestBase {
+ public:
+ DBRangeDelTest() : DBTestBase("db_range_del_test", /*env_do_fsync=*/false) {}
+
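+ // Encodes the key as the raw 8-byte representation of a uint64_t; tests that
+ // use these keys pair them with test::Uint64Comparator so they sort
+ // numerically.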
+ std::string GetNumericStr(int key) {
+ uint64_t uint64_key = static_cast<uint64_t>(key);
+ std::string str;
+ str.resize(8);
+ memcpy(&str[0], static_cast<void*>(&uint64_key), 8);
+ return str;
+ }
+};
+
+// PlainTableFactory, WriteBatchWithIndex, and NumTableFilesAtLevel() are not
+// supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, NonBlockBasedTableNotSupported) {
+ // TODO: figure out why MmapReads trips the iterator pinning assertion in
+ // RangeDelAggregator. Ideally it would be supported; otherwise it should at
+ // least be explicitly unsupported.
+ for (auto config : {kPlainTableAllBytesPrefix, /* kWalDirAndMmapReads */}) {
+ option_config_ = config;
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "dr1", "dr1")
+ .IsNotSupported());
+ }
+}
+
+TEST_F(DBRangeDelTest, WriteBatchWithIndexNotSupported) {
+ WriteBatchWithIndex indexedBatch{};
+ ASSERT_TRUE(indexedBatch.DeleteRange(db_->DefaultColumnFamily(), "dr1", "dr1")
+ .IsNotSupported());
+ ASSERT_TRUE(indexedBatch.DeleteRange("dr1", "dr1").IsNotSupported());
+}
+
+TEST_F(DBRangeDelTest, EndSameAsStartCoversNothing) {
+ ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "b"));
+ ASSERT_EQ("val", Get("b"));
+}
+
+TEST_F(DBRangeDelTest, EndComesBeforeStartInvalidArgument) {
+ ASSERT_OK(db_->Put(WriteOptions(), "b", "val"));
+ ASSERT_TRUE(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "a")
+ .IsInvalidArgument());
+ ASSERT_EQ("val", Get("b"));
+}
+
+TEST_F(DBRangeDelTest, FlushOutputHasOnlyRangeTombstones) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "dr1", "dr2"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, DictionaryCompressionWithOnlyRangeTombstones) {
+ Options opts = CurrentOptions();
+ opts.compression_opts.max_dict_bytes = 16384;
+ Reopen(opts);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
+ "dr2"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputHasOnlyRangeTombstone) {
+ do {
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.statistics = CreateDBStatistics();
+ DestroyAndReopen(opts);
+
+ // The snapshot prevents the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(0, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+ db_->ReleaseSnapshot(snapshot);
+ // Skip cuckoo memtables, which do not support snapshots. Skip non-leveled
+ // compactions as the above assertions about the number of files in a level
+ // do not hold true.
+ } while (ChangeOptions(kRangeDelSkipConfigs | kSkipUniversalCompaction |
+ kSkipFIFOCompaction));
+}
+
+TEST_F(DBRangeDelTest, CompactionOutputFilesExactlyFilled) {
+ // regression test for exactly filled compaction output files. Previously
+ // another file would be generated containing all range deletions, which
+ // could invalidate the non-overlapping file boundary invariant.
+ const int kNumPerFile = 4, kNumFiles = 2, kFileBytes = 9 << 10;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ options.num_levels = 2;
+ options.target_file_size_base = kFileBytes;
+ BlockBasedTableOptions table_options;
+ table_options.block_size_deviation = 50; // each block holds two keys
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ // The snapshot prevents the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(1)));
+
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(rnd.RandomString(3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ if (j == 0 && i > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ }
+ }
+ // put extra key to trigger final flush
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MaxCompactionBytesCutsOutputFiles) {
+ // Ensures range deletion spanning multiple compaction output files that are
+ // cut by max_compaction_bytes will have non-overlapping key-ranges.
+ // https://github.com/facebook/rocksdb/issues/1778
+ const int kNumFiles = 2, kNumPerFile = 1 << 8, kBytesPerVal = 1 << 12;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.disable_auto_compactions = true;
+ opts.level0_file_num_compaction_trigger = kNumFiles;
+ opts.max_compaction_bytes = kNumPerFile * kBytesPerVal;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ // Want max_compaction_bytes to trigger the end of compaction output files, not
+ // target_file_size_base, so the latter is raised to 100 * max_compaction_bytes
+ // via SetOptions() below, after the initial files have been moved to L2.
+ opts.target_file_size_base = 1;
+ DestroyAndReopen(opts);
+
+ // The snapshot prevents the range tombstone from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ Random rnd(301);
+
+ ASSERT_OK(Put(GetNumericStr(0), rnd.RandomString(kBytesPerVal)));
+ ASSERT_OK(
+ Put(GetNumericStr(kNumPerFile - 1), rnd.RandomString(kBytesPerVal)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(GetNumericStr(kNumPerFile), rnd.RandomString(kBytesPerVal)));
+ ASSERT_OK(
+ Put(GetNumericStr(kNumPerFile * 2 - 1), rnd.RandomString(kBytesPerVal)));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(2);
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(NumTableFilesAtLevel(2), 2);
+
+ ASSERT_OK(
+ db_->SetOptions(db_->DefaultColumnFamily(),
+ {{"target_file_size_base",
+ std::to_string(100 * opts.max_compaction_bytes)}}));
+
+ // The range deletion spans the whole key-range, so it will be included in
+ // all output files.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(0),
+ GetNumericStr(kNumFiles * kNumPerFile - 1)));
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 1MB (256 values, each 4K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(rnd.RandomString(kBytesPerVal));
+ ASSERT_OK(Put(GetNumericStr(kNumPerFile * i + j), values[j]));
+ }
+ // extra entry to trigger SpecialSkipListFactory's flush
+ ASSERT_OK(Put(GetNumericStr(kNumPerFile), ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr,
+ /*column_family=*/nullptr,
+ /*disallow_trivial_move=*/true));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GE(NumTableFilesAtLevel(1), 2);
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+ for (size_t i = 0; i + 1 < files[1].size(); ++i) {
+ ASSERT_TRUE(InternalKeyComparator(opts.comparator)
+ .Compare(files[1][i].largest, files[1][i + 1].smallest) <
+ 0);
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SentinelsOmittedFromOutputFile) {
+ // Regression test for bug where sentinel range deletions (i.e., ones with
+ // sequence number of zero) were included in output files.
+ // The snapshot prevents the range tombstones from being dropped as obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // Gaps between the ranges create sentinels in our internal representation.
+ std::vector<std::pair<std::string, std::string>> range_dels = {
+ {"a", "b"}, {"c", "d"}, {"e", "f"}};
+ for (const auto& range_del : range_dels) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ range_del.first, range_del.second));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
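+ // A sentinel tombstone carries sequence number zero, so if one had leaked
+ // into the output file its smallest seqno would be zero.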
+ ASSERT_GT(files[0][0].fd.smallest_seqno, 0);
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, FlushRangeDelsSameStartKey) {
+ ASSERT_OK(db_->Put(WriteOptions(), "b1", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+ ASSERT_OK(db_->Put(WriteOptions(), "b2", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ // first iteration verifies query correctness in memtable, second verifies
+ // query correctness for a single SST file
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ }
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+ ASSERT_OK(db_->Get(ReadOptions(), "b2", &value));
+ }
+}
+
+TEST_F(DBRangeDelTest, CompactRangeDelsSameStartKey) {
+ ASSERT_OK(db_->Put(WriteOptions(), "unused",
+ "val")); // prevents empty after compaction
+ ASSERT_OK(db_->Put(WriteOptions(), "b1", "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "c"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ }
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "b1", &value).IsNotFound());
+ }
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) {
+ const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ DestroyAndReopen(opts);
+
+ // Write a third of the keys before the snapshot, a third between the snapshot
+ // and the tombstone, and a third after the tombstone. Keys older than the
+ // snapshot or newer than the tombstone should be preserved.
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 3) {
+ snapshot = db_->GetSnapshot();
+ } else if (i == 2 * kNum / 3) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin),
+ GetNumericStr(kRangeEnd)));
+ }
+ ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ for (int i = 0; i < kNum; ++i) {
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string value;
+ if (i < kRangeBegin || i > kRangeEnd || i < kNum / 3 || i >= 2 * kNum / 3) {
+ ASSERT_OK(db_->Get(read_opts, GetNumericStr(i), &value));
+ } else {
+ ASSERT_TRUE(db_->Get(read_opts, GetNumericStr(i), &value).IsNotFound());
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) {
+ const int kNumPerFile = 100, kNumFiles = 4;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.disable_auto_compactions = true;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ opts.num_levels = 2;
+ opts.statistics = CreateDBStatistics();
+ DestroyAndReopen(opts);
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ if (i > 0) {
+ // The range tombstone covers the first half of the previous file's keys.
+ ASSERT_OK(db_->DeleteRange(
+ WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr((i - 1) * kNumPerFile),
+ GetNumericStr((i - 1) * kNumPerFile + kNumPerFile / 2)));
+ }
+ // Make sure a given key appears in each file so compaction won't be able to
+ // use trivial move, which would happen if the ranges were non-overlapping.
+ // Also, we need an extra element since flush is only triggered when the
+ // number of keys is one greater than SpecialSkipListFactory's limit.
+ // We choose a key outside the key-range used by the test to avoid conflict.
+ ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(kNumPerFile * kNumFiles),
+ "val"));
+
+ for (int j = 0; j < kNumPerFile; ++j) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), GetNumericStr(i * kNumPerFile + j), "val"));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
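+ // Each of the first kNumFiles - 1 files had half of its kNumPerFile keys
+ // covered by a range tombstone, so that many keys are dropped in total.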
+ ASSERT_EQ((kNumFiles - 1) * kNumPerFile / 2,
+ TestGetTickerCount(opts, COMPACTION_KEY_DROP_RANGE_DEL));
+
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumPerFile; ++j) {
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string value;
+ if (i == kNumFiles - 1 || j >= kNumPerFile / 2) {
+ ASSERT_OK(
+ db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value));
+ } else {
+ ASSERT_TRUE(
+ db_->Get(read_opts, GetNumericStr(i * kNumPerFile + j), &value)
+ .IsNotFound());
+ }
+ }
+ }
+}
+
+TEST_F(DBRangeDelTest, ValidLevelSubcompactionBoundaries) {
+ const int kNumPerFile = 100, kNumFiles = 4, kFileBytes = 100 << 10;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.max_bytes_for_level_base = 2 * kFileBytes;
+ options.max_subcompactions = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ options.num_levels = 3;
+ options.target_file_size_base = kFileBytes;
+ options.target_file_size_multiplier = 1;
+ options.max_compaction_bytes = 1500;
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (int j = 0; j < kNumFiles; ++j) {
+ if (i > 0) {
+ // delete [95,105) in two files, [295,305) in next two
+ int mid = (j + (1 - j % 2)) * kNumPerFile;
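+ // For j = 0,1 this yields mid = 100; for j = 2,3 it yields mid = 300
+ // (kNumPerFile == 100).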
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(mid - 5), Key(mid + 5)));
+ }
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int k = 0; k < kNumPerFile; k++) {
+ values.push_back(rnd.RandomString(990));
+ ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ if (j < kNumFiles - 1) {
+ // background compaction may happen early for kNumFiles'th file
+ ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+ }
+ if (j == options.level0_file_num_compaction_trigger - 1) {
+ // When i == 1, compaction will output some files to L1, at which point
+ // L1 is not bottommost so range deletions cannot be compacted away. The
+ // new L1 files must be generated with non-overlapping key ranges even
+ // though multiple subcompactions see the same ranges deleted, else an
+ // assertion will fail.
+ //
+ // Only enable auto-compactions when we're ready; otherwise, the
+ // oversized L0 (relative to base_level) causes the compaction to run
+ // earlier.
+ ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+ {{"disable_auto_compactions", "true"}}));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+ }
+ }
+ }
+}
+
+TEST_F(DBRangeDelTest, ValidUniversalSubcompactionBoundaries) {
+ const int kNumPerFile = 100, kFilesPerLevel = 4, kNumLevels = 4;
+ Options options = CurrentOptions();
+ options.compaction_options_universal.min_merge_width = kFilesPerLevel;
+ options.compaction_options_universal.max_merge_width = kFilesPerLevel;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = kFilesPerLevel;
+ options.max_subcompactions = 4;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ options.num_levels = kNumLevels;
+ options.target_file_size_base = kNumPerFile << 10;
+ options.target_file_size_multiplier = 1;
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevels - 1; ++i) {
+ for (int j = 0; j < kFilesPerLevel; ++j) {
+ if (i == kNumLevels - 2) {
+ // insert range deletions [95,105) in two files, [295,305) in next two
+ // to prepare L1 for later manual compaction.
+ int mid = (j + (1 - j % 2)) * kNumPerFile;
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(mid - 5), Key(mid + 5)));
+ }
+ std::vector<std::string> values;
+ // Write 100KB (100 values, each 1K)
+ for (int k = 0; k < kNumPerFile; k++) {
+ values.push_back(rnd.RandomString(990));
+ ASSERT_OK(Put(Key(j * kNumPerFile + k), values[k]));
+ }
+ // put extra key to trigger flush
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ if (j < kFilesPerLevel - 1) {
+ // background compaction may happen early for kFilesPerLevel'th file
+ ASSERT_EQ(NumTableFilesAtLevel(0), j + 1);
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(kNumLevels - 1 - i), kFilesPerLevel - 1);
+ }
+ // Now L1-L3 are full. When we compact L1->L2 we should see (1) subcompactions
+ // happen since the input level > 0, and (2) range deletions preserved since
+ // the output level is not the bottommost. If no file boundary assertion
+ // fails, that probably means universal compaction + subcompaction + range
+ // deletion are compatible.
+ ASSERT_OK(dbfull()->RunManualCompaction(
+ static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+ ->cfd(),
+ 1 /* input_level */, 2 /* output_level */, CompactRangeOptions(),
+ nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+ true /* disallow_trivial_move */,
+ std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+ "" /*trim_ts*/));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBRangeDelTest, CompactionRemovesCoveredMergeOperands) {
+ const int kNumPerFile = 3, kNumFiles = 3;
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(2 * kNumPerFile));
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ opts.num_levels = 2;
+ Reopen(opts);
+
+ // Iterates kNumFiles * kNumPerFile + 1 times since flushing the last file
+ // requires an extra entry.
+ for (int i = 0; i <= kNumFiles * kNumPerFile; ++i) {
+ if (i % kNumPerFile == 0 && i / kNumPerFile == kNumFiles - 1) {
+ // Delete merge operands from all but the last file
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key", "key_"));
+ }
+ std::string val;
+ PutFixed64(&val, i);
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+ // we need to prevent trivial move using Puts so compaction will actually
+ // process the merge operands.
+ ASSERT_OK(db_->Put(WriteOptions(), "prevent_trivial_move", ""));
+ if (i > 0 && i % kNumPerFile == 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ }
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 45); // 1+2+...+9
+ ASSERT_EQ(expected, actual);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ expected.clear();
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ uint64_t tmp;
+ Slice tmp2(actual);
+ GetFixed64(&tmp2, &tmp);
+ PutFixed64(&expected, 30); // 6+7+8+9 (earlier operands covered by tombstone)
+ ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, PutDeleteRangeMergeFlush) {
+ // Test the sequence of operations: (1) Put, (2) DeleteRange, (3) Merge, (4)
+ // Flush. The `CompactionIterator` previously had a bug where we forgot to
+ // check for covering range tombstones when processing the (1) Put, causing
+ // it to reappear after the flush.
+ Options opts = CurrentOptions();
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ Reopen(opts);
+
+ std::string val;
+ PutFixed64(&val, 1);
+ ASSERT_OK(db_->Put(WriteOptions(), "key", val));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key",
+ "key_"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ReadOptions read_opts;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 1);
+ ASSERT_EQ(expected, actual);
+}
+
+// NumTableFilesAtLevel() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+TEST_F(DBRangeDelTest, ObsoleteTombstoneCleanup) {
+ // During compaction to bottommost level, verify range tombstones older than
+ // the oldest snapshot are removed, while others are preserved.
+ Options opts = CurrentOptions();
+ opts.disable_auto_compactions = true;
+ opts.num_levels = 2;
+ opts.statistics = CreateDBStatistics();
+ Reopen(opts);
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr1",
+ "dr10")); // obsolete after compaction
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "dr2",
+ "dr20")); // protected by snapshot
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, TestGetTickerCount(opts, COMPACTION_RANGE_DEL_DROP_OBSOLETE));
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, TableEvictedDuringScan) {
+ // The RangeDelAggregator holds pointers into range deletion blocks created by
+ // table readers. This test ensures the aggregator can still access those
+ // blocks even if it outlives the table readers that created them.
+ //
+ // DBIter always keeps readers open for L0 files. So, in order to test
+ // aggregator outliving reader, we need to have deletions in L1 files, which
+ // are opened/closed on-demand during the scan. This is accomplished by
+ // setting kNumRanges > level0_stop_writes_trigger, which prevents deletions
+ // from all lingering in L0 (there is at most one range deletion per L0 file).
+ //
+ // The first L1 file will contain a range deletion since its begin key is 0.
+ // SeekToFirst() references that table's reader and adds its range tombstone
+ // to the aggregator. Upon advancing beyond that table's key-range via Next(),
+ // the table reader will be unreferenced by the iterator. Since we manually
+ // call Evict() on all readers before the full scan, this unreference causes
+ // the reader's refcount to drop to zero and thus be destroyed.
+ //
+ // When it is destroyed, we do not remove its range deletions from the
+ // aggregator. So, subsequent calls to Next() must be able to use these
+ // deletions to decide whether a key is covered. This will work as long as
+ // the aggregator properly references the range deletion block.
+ const int kNum = 25, kRangeBegin = 0, kRangeEnd = 7, kNumRanges = 5;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.level0_file_num_compaction_trigger = 4;
+ opts.level0_stop_writes_trigger = 4;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ opts.num_levels = 2;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.block_cache = NewLRUCache(8 << 20);
+ opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
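+ // The shared block cache set up above holds the range deletion blocks loaded
+ // by the table readers; the scan below calls EraseUnRefEntries() so those
+ // blocks are freed as soon as their refcounts drop to zero.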
+ DestroyAndReopen(opts);
+
+ // Hold a snapshot so range deletions can't become obsolete during compaction
+ // to bottommost level (i.e., L1).
+ const Snapshot* snapshot = db_->GetSnapshot();
+ for (int i = 0; i < kNum; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+ if (i > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ if (i >= kNum / 2 && i < kNum / 2 + kNumRanges) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin),
+ GetNumericStr(kRangeEnd)));
+ }
+ }
+ // Must be > 1 so the first L1 file can be closed before scan finishes
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+ std::vector<uint64_t> file_numbers = ListTableFiles(env_, dbname_);
+
+ ReadOptions read_opts;
+ auto* iter = db_->NewIterator(read_opts);
+ ASSERT_OK(iter->status());
+ int expected = kRangeEnd;
+ iter->SeekToFirst();
+ for (auto file_number : file_numbers) {
+ // Evicting the cache entries leaves the iterator holding the only reference
+ // to each table reader, so a reader is destroyed as soon as the iterator
+ // releases it.
+ TableCache::Evict(dbfull()->TEST_table_cache(), file_number);
+ }
+ for (; iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ ++expected;
+ // Keep clearing block cache's LRU so range deletion block can be freed as
+ // soon as its refcount drops to zero.
+ bbto.block_cache->EraseUnRefEntries();
+ }
+ ASSERT_EQ(kNum, expected);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+
+ // Also test proper cache handling in GetRangeTombstoneIterator,
+ // via TablesRangeTombstoneSummary. (This once triggered memory leak
+ // report with ASAN.)
+ opts.max_open_files = 1;
+ Reopen(opts);
+
+ std::string str;
+ ASSERT_OK(dbfull()->TablesRangeTombstoneSummary(db_->DefaultColumnFamily(),
+ 100, &str));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromMutableMemtable) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromImmutableMemtable) {
+ do {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 3;
+ opts.min_write_buffer_number_to_merge = 2;
+ // SpecialSkipListFactory lets us specify maximum number of elements the
+ // memtable can hold. It switches the active memtable to immutable (flush is
+ // prevented by the above options) upon inserting an element that would
+ // overflow the memtable.
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ DestroyAndReopen(opts);
+
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Put(WriteOptions(), "blah", "val"));
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredKeyFromSst) {
+ do {
+ DestroyAndReopen(CurrentOptions());
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ReadOptions read_opts;
+ std::string value;
+ ASSERT_TRUE(db_->Get(read_opts, "key", &value).IsNotFound());
+ db_->ReleaseSnapshot(snapshot);
+ } while (ChangeOptions(kRangeDelSkipConfigs));
+}
+
+TEST_F(DBRangeDelTest, GetCoveredMergeOperandFromMemtable) {
+ const int kNumMergeOps = 10;
+ Options opts = CurrentOptions();
+ opts.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ Reopen(opts);
+
+ for (int i = 0; i < kNumMergeOps; ++i) {
+ std::string val;
+ PutFixed64(&val, i);
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", val));
+ if (i == kNumMergeOps / 2) {
+ // deletes [0, 5]
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key", "key_"));
+ }
+ }
+
+ ReadOptions read_opts;
+ std::string expected, actual;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 30); // 6+7+8+9
+ ASSERT_EQ(expected, actual);
+
+ expected.clear();
+ read_opts.ignore_range_deletions = true;
+ ASSERT_OK(db_->Get(read_opts, "key", &actual));
+ PutFixed64(&expected, 45); // 0+1+2+...+9
+ ASSERT_EQ(expected, actual);
+}
+
+TEST_F(DBRangeDelTest, GetIgnoresRangeDeletions) {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 4;
+ opts.min_write_buffer_number_to_merge = 3;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ Reopen(opts);
+
+ ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val"));
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ for (std::string key : {"sst_key", "imm_key", "mem_key"}) {
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, key, &value));
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) {
+ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ DestroyAndReopen(opts);
+
+ // Write half of the keys before the tombstone and half after the tombstone.
+ // Only covered keys (i.e., within the range and older than the tombstone)
+ // should be deleted.
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 2) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin),
+ GetNumericStr(kRangeEnd)));
+ }
+ ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+ }
+ ReadOptions read_opts;
+ auto* iter = db_->NewIterator(read_opts);
+ ASSERT_OK(iter->status());
+
+ int expected = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ if (expected == kRangeBegin - 1) {
+ expected = kNum / 2;
+ } else {
+ ++expected;
+ }
+ }
+ ASSERT_EQ(kNum, expected);
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) {
+ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+ Options opts = CurrentOptions();
+ opts.comparator = test::Uint64Comparator();
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ DestroyAndReopen(opts);
+
+ const Snapshot* snapshot = nullptr;
+ // Take a snapshot before the range tombstone and verify an iterator using
+ // that snapshot sees all keys written before it, i.e., none are hidden by
+ // the later tombstone.
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 2) {
+ snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ GetNumericStr(kRangeBegin),
+ GetNumericStr(kRangeEnd)));
+ }
+ ASSERT_OK(db_->Put(WriteOptions(), GetNumericStr(i), "val"));
+ }
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ auto* iter = db_->NewIterator(read_opts);
+ ASSERT_OK(iter->status());
+
+ int expected = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(GetNumericStr(expected), iter->key());
+ ++expected;
+ }
+ ASSERT_EQ(kNum / 2, expected);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, IteratorIgnoresRangeDeletions) {
+ Options opts = CurrentOptions();
+ opts.max_write_buffer_number = 4;
+ opts.min_write_buffer_number_to_merge = 3;
+ opts.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ Reopen(opts);
+
+ ASSERT_OK(db_->Put(WriteOptions(), "sst_key", "val"));
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->Put(WriteOptions(), "imm_key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ASSERT_OK(db_->Put(WriteOptions(), "mem_key", "val"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ ReadOptions read_opts;
+ read_opts.ignore_range_deletions = true;
+ auto* iter = db_->NewIterator(read_opts);
+ ASSERT_OK(iter->status());
+ int i = 0;
+ std::string expected[] = {"imm_key", "mem_key", "sst_key"};
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++i) {
+ std::string key;
+ ASSERT_EQ(expected[i], iter->key());
+ }
+ ASSERT_EQ(3, i);
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+#ifndef ROCKSDB_UBSAN_RUN
+TEST_F(DBRangeDelTest, TailingIteratorRangeTombstoneUnsupported) {
+ ASSERT_OK(db_->Put(WriteOptions(), "key", "val"));
+ // snapshot prevents key from being deleted during flush
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+
+ // The iterations check that tailing iterators are unsupported while the
+ // range tombstone is in the memtable, then L0, then L1.
+ for (int i = 0; i < 3; ++i) {
+ ReadOptions read_opts;
+ read_opts.tailing = true;
+ auto* iter = db_->NewIterator(read_opts);
+ if (i == 2) {
+ // For L1+, iterators over files are created on-demand, so need seek
+ iter->SeekToFirst();
+ }
+ ASSERT_TRUE(iter->status().IsNotSupported());
+
+ delete iter;
+ if (i == 0) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ } else if (i == 1) {
+ MoveFilesToLevel(1);
+ }
+ }
+ db_->ReleaseSnapshot(snapshot);
+}
+#endif // !ROCKSDB_UBSAN_RUN
+
+TEST_F(DBRangeDelTest, SubcompactionHasEmptyDedicatedRangeDelFile) {
+ const int kNumFiles = 2, kNumKeysPerFile = 4;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+ options.max_subcompactions = 2;
+ options.num_levels = 2;
+ options.target_file_size_base = 4096;
+ Reopen(options);
+
+ // need an L1 file for subcompaction to be triggered
+ ASSERT_OK(
+ db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(0), "val"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+
+ // put enough keys to fill up the first subcompaction, and later range-delete
+ // them so that the first subcompaction outputs no key-values. In that case
+ // it'll consider making an SST file dedicated to range deletions.
+ for (int i = 0; i < kNumKeysPerFile; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+ std::string(1024, 'a')));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(kNumKeysPerFile)));
+
+ // The above range tombstone can be dropped, so that one alone won't cause a
+ // dedicated file to be opened. Add one protected by a snapshot so that it
+ // must be considered. Make its range outside the first subcompaction's range
+ // to exercise the tricky part of the code.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(kNumKeysPerFile + 1),
+ Key(kNumKeysPerFile + 2)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ ASSERT_OK(db_->EnableAutoCompaction({db_->DefaultColumnFamily()}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, MemtableBloomFilter) {
+ // Regression test for #2743. The range delete tombstones in the memtable
+ // should still be added (and applied) even when Get() skips its point-key
+ // lookup in the memtable due to the prefix bloom filter.
+ const int kMemtableSize = 1 << 20; // 1MB
+ const int kMemtablePrefixFilterSize = 1 << 13; // 8KB
+ const int kNumKeys = 1000;
+ const int kPrefixLen = 8;
+ Options options = CurrentOptions();
+ options.memtable_prefix_bloom_size_ratio =
+ static_cast<double>(kMemtablePrefixFilterSize) / kMemtableSize;
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(kPrefixLen));
+ options.write_buffer_size = kMemtableSize;
+ Reopen(options);
+
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(kNumKeys)));
+ for (int i = 0; i < kNumKeys; ++i) {
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &value).IsNotFound());
+ }
+}
+
+TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) {
+ // This test originally verified that compaction treated files containing a
+ // split range deletion in the input level as an atomic unit. I.e.,
+ // compacting any input-level file(s) containing a portion of the range
+ // deletion causes all other input-level files containing portions of that
+ // same range deletion to be included in the compaction. Range deletion
+ // tombstones are now truncated to sstable boundaries which removed the need
+ // for that behavior (which could lead to excessively large
+ // compactions).
+ const int kNumFilesPerLevel = 4, kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(2 /* num_entries_flush */));
+ // max file size could be 2x of target file size, so set it to half of that
+ options.target_file_size_base = kValueBytes / 2;
+ // disable dynamic_file_size, as it will cut L1 files into more files (than
+ // kNumFilesPerLevel).
+ options.level_compaction_dynamic_file_size = false;
+ options.max_compaction_bytes = 1500;
+ // i == 0: CompactFiles
+ // i == 1: CompactRange
+ // i == 2: automatic compaction
+ for (int i = 0; i < 3; ++i) {
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // snapshot protects range tombstone from dropping due to becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(2 * kNumFilesPerLevel)));
+
+ Random rnd(301);
+ std::string value = rnd.RandomString(kValueBytes);
+ for (int j = 0; j < kNumFilesPerLevel; ++j) {
+ // give files overlapping key-ranges to prevent trivial move
+ ASSERT_OK(Put(Key(j), value));
+ ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+ if (j > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(j, NumTableFilesAtLevel(0));
+ }
+ }
+ // put extra key to trigger final flush
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(kNumFilesPerLevel, NumTableFilesAtLevel(1));
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ if (i == 0) {
+ ASSERT_OK(db_->CompactFiles(
+ CompactionOptions(), {meta.levels[1].files[0].name}, 2 /* level */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ } else if (i == 1) {
+ auto begin_str = Key(0), end_str = Key(1);
+ Slice begin = begin_str, end = end_str;
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin, &end));
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+ } else if (i == 2) {
+ ASSERT_OK(db_->SetOptions(db_->DefaultColumnFamily(),
+ {{"max_bytes_for_level_base", "10000"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ }
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+ db_->ReleaseSnapshot(snapshot);
+ }
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneEndKeyAsSstableUpperBound) {
+ // Test the handling of the range-tombstone end-key as the
+ // upper-bound for an sstable.
+
+ const int kNumFilesPerLevel = 2, kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumFilesPerLevel;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(2 /* num_entries_flush */));
+ options.target_file_size_base = kValueBytes;
+ options.disable_auto_compactions = true;
+ // disable it for now, otherwise the L1 files are going to be cut before
+ // key 1:
+ // L1: [0] [1,4]
+ // L2: [0,0]
+ // because the grandparent file is between [0]->[1] and its size is more than
+ // 1/8 of the target size (4k).
+ options.level_compaction_dynamic_file_size = false;
+
+ DestroyAndReopen(options);
+
+ // Create an initial sstable at L2:
+ // [key000000#1,1, key000000#1,1]
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // A snapshot protects the range tombstone from dropping due to
+ // becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(2 * kNumFilesPerLevel)));
+
+ // Create 2 additional sstables in L0. Note that the first sstable
+ // contains the range tombstone.
+ // [key000000#3,1, key000004#72057594037927935,15]
+ // [key000001#5,1, key000002#6,1]
+ Random rnd(301);
+ std::string value = rnd.RandomString(kValueBytes);
+ for (int j = 0; j < kNumFilesPerLevel; ++j) {
+ // Give files overlapping key-ranges to prevent a trivial move when we
+ // compact from L0 to L1.
+ ASSERT_OK(Put(Key(j), value));
+ ASSERT_OK(Put(Key(2 * kNumFilesPerLevel - 1 - j), value));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(j + 1, NumTableFilesAtLevel(0));
+ }
+ // Compact the 2 L0 sstables to L1, resulting in the following LSM. There
+ // are 2 sstables generated in L1 due to the target_file_size_base setting.
+ // L1:
+ // [key000000#3,1, key000002#72057594037927935,15]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ // L2:
+ // [key000000#1,1, key000000#1,1]
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ {
+ // Compact the second sstable in L1:
+ // L1:
+ // [key000000#3,1, key000002#72057594037927935,15]
+ // L2:
+ // [key000000#1,1, key000000#1,1]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ //
+ // At the same time, verify the compaction does not cause the key at the
+ // endpoint (key000002#6,1) to disappear.
+ ASSERT_EQ(value, Get(Key(2)));
+ auto begin_str = Key(3);
+ const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+ ASSERT_EQ(value, Get(Key(2)));
+ }
+
+ {
+ // Compact the first sstable in L1. This should be copacetic, but
+ // was previously resulting in overlapping sstables in L2 due to
+ // mishandling of the range tombstone end-key when used as the
+ // largest key for an sstable. The resulting LSM structure should
+ // be:
+ //
+ // L2:
+ // [key000000#1,1, key000001#72057594037927935,15]
+ // [key000001#5,1, key000002#72057594037927935,15]
+ // [key000002#6,1, key000004#72057594037927935,15]
+ auto begin_str = Key(0);
+ const ROCKSDB_NAMESPACE::Slice begin = begin_str;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, &begin, &begin));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_EQ(3, NumTableFilesAtLevel(2));
+ }
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UnorderedTombstones) {
+ // Regression test for #2752. Range delete tombstones between
+ // different snapshot stripes are not stored in order, so the first
+ // tombstone of each snapshot stripe should be checked as a candidate
+ // for the smallest key.
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ auto cf = db_->DefaultColumnFamily();
+
+ ASSERT_OK(db_->Put(WriteOptions(), cf, "a", "a"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cf));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "b", "c"));
+ // Hold a snapshot to separate these two delete ranges.
+ auto snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "a", "b"));
+ ASSERT_OK(db_->Flush(FlushOptions(), cf));
+ db_->ReleaseSnapshot(snapshot);
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(cf, &files);
+ ASSERT_EQ(1, files[0].size());
+ ASSERT_EQ("a", files[0][0].smallest.user_key());
+ ASSERT_EQ("c", files[0][0].largest.user_key());
+
+ std::string v;
+ auto s = db_->Get(ReadOptions(), "a", &v);
+ ASSERT_TRUE(s.IsNotFound());
+}
+
+class MockMergeOperator : public MergeOperator {
+ // Mock non-associative operator. Non-associativity is expressed by lack of
+ // implementation for any `PartialMerge*` functions.
+ public:
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ assert(merge_out != nullptr);
+ merge_out->new_value = merge_in.operand_list.back().ToString();
+ return true;
+ }
+
+ const char* Name() const override { return "MockMergeOperator"; }
+};
+
+TEST_F(DBRangeDelTest, KeyAtOverlappingEndpointReappears) {
+ // This test uses a non-associative merge operator since that is a convenient
+ // way to get compaction to write out files with overlapping user-keys at the
+ // endpoints. Note, however, overlapping endpoints can also occur with other
+ // value types (Put, etc.), assuming the right snapshots are present.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ const int kNumFiles = 4;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ // Push dummy data to L3 so that our actual test files on L0-L2
+ // will not be considered "bottommost" level, otherwise compaction
+ // may prevent us from creating overlapping user keys, since on the
+ // bottommost level MergeHelper combines all merge operands into a single
+ // value.
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "dummy"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(3);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = rnd.RandomString(kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", value));
+ }
+ if (i == kNumFiles - 1) {
+ // Take snapshot to prevent covered merge operands from being dropped by
+ // compaction.
+ snapshot = db_->GetSnapshot();
+ // The DeleteRange is the last write so all merge operands are covered.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key", "key_"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+ std::string value;
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(
+ 0 /* level */, nullptr /* begin */, nullptr /* end */,
+ nullptr /* column_family */, true /* disallow_trivial_move */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ // Now we have multiple files at L1 all containing a single user key, thus
+ // guaranteeing overlap in the file endpoints.
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ // Verify no merge operands reappeared after the compaction.
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ // Compact and verify again. It's worthwhile because now the files have
+ // tighter endpoints, so we can verify that doesn't mess anything up.
+ ASSERT_OK(dbfull()->TEST_CompactRange(
+ 1 /* level */, nullptr /* begin */, nullptr /* end */,
+ nullptr /* column_family */, true /* disallow_trivial_move */));
+ ASSERT_GT(NumTableFilesAtLevel(2), 1);
+ ASSERT_TRUE(db_->Get(ReadOptions(), "key", &value).IsNotFound());
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, UntruncatedTombstoneDoesNotDeleteNewerKey) {
+ // Verify a key newer than a range tombstone cannot be deleted by being
+ // compacted to the bottom level (and thus having its seqnum zeroed) before
+ // the range tombstone. This used to happen when range tombstones were
+ // untruncated on reads such that they extended past their file boundaries.
+ //
+ // Test summary:
+ //
+ // - L1 is bottommost.
+ // - A couple snapshots are strategically taken to prevent seqnums from being
+ // zeroed, range tombstone from being dropped, merge operands from being
+ // dropped, and merge operands from being combined.
+ // - Left half of files in L1 all have same user key, ensuring their file
+ // boundaries overlap. In the past this would cause range tombstones to be
+ // untruncated.
+ // - Right half of L1 files all have different keys, ensuring no overlap.
+ // - A range tombstone spans all L1 keys, so it is stored in every L1 file.
+ // - Keys in the right side of the key-range are overwritten. These are
+ // compacted down to L1 after releasing snapshots such that their seqnums
+ // will be zeroed.
+ // - A full range scan is performed. If the tombstone in the left L1 files
+ // were untruncated, it would now cover keys newer than it (but with zeroed
+ // seqnums) in the right L1 files.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ const int kNumFiles = 4;
+ const int kMaxKey = kNumFiles * kFileBytes / kValueBytes;
+ const int kKeysOverwritten = 10;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.num_levels = 2;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ // - snapshots[0] prevents merge operands from being combined during
+ // compaction.
+ // - snapshots[1] prevents merge operands from being dropped due to the
+ // covering range tombstone.
+ const Snapshot* snapshots[] = {nullptr, nullptr};
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = rnd.RandomString(kValueBytes);
+ std::string key;
+ if (i < kNumFiles / 2) {
+ key = Key(0);
+ } else {
+ key = Key(1 + i * kFileBytes / kValueBytes + j);
+ }
+ ASSERT_OK(db_->Merge(WriteOptions(), key, value));
+ }
+ if (i == 0) {
+ snapshots[0] = db_->GetSnapshot();
+ }
+ if (i == kNumFiles - 1) {
+ snapshots[1] = db_->GetSnapshot();
+ // The DeleteRange is the last write so all merge operands are covered.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(kMaxKey + 1)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+ auto get_key_count = [this]() -> int {
+ auto* iter = db_->NewIterator(ReadOptions());
+ assert(iter->status().ok());
+ iter->SeekToFirst();
+ int keys_found = 0;
+ for (; iter->Valid(); iter->Next()) {
+ ++keys_found;
+ }
+ delete iter;
+ return keys_found;
+ };
+
+ // All keys should be covered
+ ASSERT_EQ(0, get_key_count());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+ nullptr /* end_key */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ // Roughly the left half of L1 files should have overlapping boundary keys,
+ // while the right half should not.
+ ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+ // Now overwrite a few keys that are in L1 files that definitely don't have
+ // overlapping boundary keys.
+ for (int i = kMaxKey; i > kMaxKey - kKeysOverwritten; --i) {
+ auto value = rnd.RandomString(kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), Key(i), value));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ // The overwritten keys are in L0 now, so clearly aren't covered by the range
+ // tombstone in L1.
+ ASSERT_EQ(kKeysOverwritten, get_key_count());
+
+ // Release snapshots so seqnums can be zeroed when L0->L1 happens.
+ db_->ReleaseSnapshot(snapshots[0]);
+ db_->ReleaseSnapshot(snapshots[1]);
+
+ auto begin_key_storage = Key(kMaxKey - kKeysOverwritten + 1);
+ auto end_key_storage = Key(kMaxKey);
+ Slice begin_key(begin_key_storage);
+ Slice end_key(end_key_storage);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &begin_key, &end_key));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GE(NumTableFilesAtLevel(1), kNumFiles);
+
+ ASSERT_EQ(kKeysOverwritten, get_key_count());
+}
+
+TEST_F(DBRangeDelTest, DeletedMergeOperandReappearsIterPrev) {
+ // Exposes a bug where we were using
+ // `RangeDelPositioningMode::kBackwardTraversal` while scanning merge operands
+ // in the forward direction. Confusingly, this case happened during
+ // `DBIter::Prev`. It could cause an assertion failure or reappearing keys.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 1 << 10;
+ // Need multiple keys so we can get results when calling `Prev()` after
+ // `SeekToLast()`.
+ const int kNumKeys = 3;
+ const int kNumFiles = 4;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.merge_operator.reset(new MockMergeOperator());
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ const Snapshot* snapshot = nullptr;
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = rnd.RandomString(kValueBytes);
+ ASSERT_OK(db_->Merge(WriteOptions(), Key(j % kNumKeys), value));
+ if (i == 0 && j == kNumKeys) {
+ // Take snapshot to prevent covered merge operands from being dropped or
+ // merged by compaction.
+ snapshot = db_->GetSnapshot();
+ // Do a DeleteRange near the beginning so only the oldest merge operand
+ // for each key is covered. This ensures the sequence of events:
+ //
+ // - `DBIter::Prev()` is called
+ // - After several versions of the same user key are encountered,
+ // it decides to seek using `DBIter::FindValueForCurrentKeyUsingSeek`.
+ // - Binary searches to the newest version of the key, which is in the
+ // leftmost file containing the user key.
+ // - Scans forwards to collect all merge operands. Eventually reaches
+ // the rightmost file containing the oldest merge operand, which
+ // should be covered by the `DeleteRange`. If `RangeDelAggregator`
+ // were not properly using `kForwardTraversal` here, that operand
+ // would reappear.
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(0), Key(kNumKeys + 1)));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(kNumFiles, NumTableFilesAtLevel(0));
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr /* begin_key */,
+ nullptr /* end_key */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ auto* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+ iter->SeekToLast();
+ int keys_found = 0;
+ for (; iter->Valid(); iter->Prev()) {
+ ++keys_found;
+ }
+ delete iter;
+ ASSERT_EQ(kNumKeys, keys_found);
+
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeys) {
+ const int kFileBytes = 1 << 20;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(10)));
+
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ auto* iter = db_->NewIterator(read_opts);
+ ASSERT_OK(iter->status());
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(Key(0), iter->key());
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, SnapshotPreventsDroppedKeysInImmMemTables) {
+ const int kFileBytes = 1 << 20;
+
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = kFileBytes;
+ Reopen(options);
+
+ // Block the flush thread so the immutable memtables stay pinned in memory.
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator",
+ "DBImpl::BGWorkFlush"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
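+ // The dependency above makes the background flush ("DBImpl::BGWorkFlush")
+ // wait until the "AfterNewIterator" sync point below fires, so the immutable
+ // memtable cannot be flushed away before the iterator is created.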
+
+ ASSERT_OK(Put(Key(0), "a"));
+ std::unique_ptr<const Snapshot, std::function<void(const Snapshot*)>>
+ snapshot(db_->GetSnapshot(),
+ [this](const Snapshot* s) { db_->ReleaseSnapshot(s); });
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(10)));
+
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot.get();
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_OK(iter->status());
+
+ TEST_SYNC_POINT("SnapshotPreventsDroppedKeysInImmMemTables:AfterNewIterator");
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(Key(0), iter->key());
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneWrittenToMinimalSsts) {
+ // Adapted from
+ // https://github.com/cockroachdb/cockroach/blob/de8b3ea603dd1592d9dc26443c2cc92c356fbc2f/pkg/storage/engine/rocksdb_test.go#L1267-L1398.
+ // Regression test for issue where range tombstone was written to more files
+ // than necessary when it began exactly at the begin key in the next
+ // compaction output file.
+ const int kFileBytes = 1 << 20;
+ const int kValueBytes = 4 << 10;
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ // Leave a bit of slack in the size limits; we enforce them more strictly
+ // when manually flushing/compacting.
+ options.max_compaction_bytes = 2 * kFileBytes;
+ options.target_file_size_base = 2 * kFileBytes;
+ options.write_buffer_size = 2 * kFileBytes;
+ Reopen(options);
+
+ Random rnd(301);
+ for (char first_char : {'a', 'b', 'c'}) {
+ for (int i = 0; i < kFileBytes / kValueBytes; ++i) {
+ std::string key(1, first_char);
+ key.append(Key(i));
+ std::string value = rnd.RandomString(kValueBytes);
+ ASSERT_OK(Put(key, value));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ }
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(3, NumTableFilesAtLevel(2));
+
+ // Populate the memtable lightly while spanning the whole key-space. The
+ // `max_compaction_bytes` setting will cause the L0->L1 compaction to output
+ // multiple files to prevent a large L1->L2 compaction later.
+ ASSERT_OK(Put("a", "val"));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "c" + Key(1), "d"));
+ // Our compaction output file cutting logic currently only considers point
+ // keys. So, in order for the range tombstone to have a chance at landing at
+ // the start of a new file, we need a point key at the range tombstone's
+ // start.
+ // TODO(ajkr): remove this `Put` after file cutting accounts for range
+ // tombstones (#3977).
+ ASSERT_OK(Put("c" + Key(1), "value"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ // Ensure manual L0->L1 compaction cuts the outputs before the range tombstone
+ // and the range tombstone is only placed in the second SST.
+ std::string begin_key_storage("c" + Key(1));
+ Slice begin_key(begin_key_storage);
+ std::string end_key_storage("d");
+ Slice end_key(end_key_storage);
+ ASSERT_OK(dbfull()->TEST_CompactRange(
+ 0 /* level */, &begin_key /* begin */, &end_key /* end */,
+ nullptr /* column_family */, true /* disallow_trivial_move */));
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ std::vector<LiveFileMetaData> all_metadata;
+ std::vector<LiveFileMetaData> l1_metadata;
+ db_->GetLiveFilesMetaData(&all_metadata);
+ for (const auto& metadata : all_metadata) {
+ if (metadata.level == 1) {
+ l1_metadata.push_back(metadata);
+ }
+ }
+ std::sort(l1_metadata.begin(), l1_metadata.end(),
+ [&](const LiveFileMetaData& a, const LiveFileMetaData& b) {
+ return options.comparator->Compare(a.smallestkey, b.smallestkey) <
+ 0;
+ });
+ ASSERT_EQ("a", l1_metadata[0].smallestkey);
+ ASSERT_EQ("a", l1_metadata[0].largestkey);
+ ASSERT_EQ("c" + Key(1), l1_metadata[1].smallestkey);
+ ASSERT_EQ("d", l1_metadata[1].largestkey);
+
+ TablePropertiesCollection all_table_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&all_table_props));
+ int64_t num_range_deletions = 0;
+ for (const auto& name_and_table_props : all_table_props) {
+ const auto& name = name_and_table_props.first;
+ const auto& table_props = name_and_table_props.second;
+ // The range tombstone should only be output to the second L1 SST.
+ if (name.size() >= l1_metadata[1].name.size() &&
+ name.substr(name.size() - l1_metadata[1].name.size())
+ .compare(l1_metadata[1].name) == 0) {
+ ASSERT_EQ(1, table_props->num_range_deletions);
+ ++num_range_deletions;
+ } else {
+ ASSERT_EQ(0, table_props->num_range_deletions);
+ }
+ }
+ ASSERT_EQ(1, num_range_deletions);
+}
+
+TEST_F(DBRangeDelTest, OverlappedTombstones) {
+ const int kNumPerFile = 4, kNumFiles = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 9 * 1024;
+ options.max_compaction_bytes = 9 * 1024;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(rnd.RandomString(3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+ Key((kNumFiles)*kNumPerFile + 1)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+
+ // The tombstone range is not broken up into multiple SSTs, even though this
+ // may incur a large compaction with L2.
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ std::vector<std::vector<FileMetaData>> files;
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBRangeDelTest, OverlappedKeys) {
+ const int kNumPerFile = 4, kNumFiles = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 9 * 1024;
+ options.max_compaction_bytes = 9 * 1024;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; ++i) {
+ std::vector<std::string> values;
+ // Write 12K (4 values, each 3K)
+ for (int j = 0; j < kNumPerFile; j++) {
+ values.push_back(rnd.RandomString(3 << 10));
+ ASSERT_OK(Put(Key(i * kNumPerFile + j), values[j]));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(2, NumTableFilesAtLevel(2));
+
+ for (int i = 1; i < kNumFiles * kNumPerFile + 1; i++) {
+ ASSERT_OK(Put(Key(i), "0x123"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // The key range is broken up into three SSTs to avoid a future big
+ // compaction with the grandparent.
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, nullptr,
+ true /* disallow_trivial_move */));
+ // L1->L2 compaction size is limited to max_compaction_bytes
+ ASSERT_EQ(3, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBRangeDelTest, IteratorRefresh) {
+ // Refreshing an iterator after a range tombstone is added should cause the
+ // deleted range of keys to disappear.
+ for (bool sv_changed : {false, true}) {
+ ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2"));
+
+ auto* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "key2", "key3"));
+
+ if (sv_changed) {
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+
+ ASSERT_OK(iter->Refresh());
+ ASSERT_OK(iter->status());
+ iter->SeekToFirst();
+ ASSERT_EQ("key1", iter->key());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ delete iter;
+ }
+}
+
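+// Helpers asserting that an iterator has exhausted its range cleanly, i.e.,
+// it is invalid with an OK status rather than invalid due to an error.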
+void VerifyIteratorReachesEnd(InternalIterator* iter) {
+ ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+}
+
+void VerifyIteratorReachesEnd(Iterator* iter) {
+ ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+}
+
+TEST_F(DBRangeDelTest, IteratorReseek) {
+ // Range tombstone triggers reseek (seeking to a range tombstone end key) in
+ // merging iterator. Test set up:
+ // one memtable: range tombstone [0, 1)
+ // one immutable memtable: range tombstone [1, 2)
+ // one L0 file with range tombstone [2, 3)
+ // one L1 file with range tombstone [3, 4)
+ // Seek(0) should trigger cascading reseeks at all levels below memtable.
+ // Seek(1) should trigger cascading reseeks at all levels below immutable
+ // memtable. SeekToFirst and SeekToLast trigger no reseek.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ // L1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+ Key(4)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ // L0
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(3)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ // Immutable memtable
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(1),
+ Key(2)));
+ ASSERT_OK(static_cast_with_check<DBImpl>(db_)->TEST_SwitchMemtable());
+ std::string value;
+ ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
+ "rocksdb.num-immutable-mem-table", &value));
+ ASSERT_EQ(1, std::stoi(value));
+ // live memtable
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(1)));
+ // this memtable is still active
+ ASSERT_TRUE(dbfull()->GetProperty(db_->DefaultColumnFamily(),
+ "rocksdb.num-immutable-mem-table", &value));
+ ASSERT_EQ(1, std::stoi(value));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ get_perf_context()->Reset();
+ iter->Seek(Key(0));
+ // Reseeked immutable memtable, L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 3);
+ VerifyIteratorReachesEnd(iter);
+ get_perf_context()->Reset();
+ iter->SeekForPrev(Key(1));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+ VerifyIteratorReachesEnd(iter);
+ get_perf_context()->Reset();
+ iter->SeekToFirst();
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+ VerifyIteratorReachesEnd(iter);
+ iter->SeekToLast();
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+ VerifyIteratorReachesEnd(iter);
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, ReseekDuringNextAndPrev) {
+ // Range tombstone triggers reseek during Next()/Prev() in merging iterator.
+ // Test set up:
+ // memtable has: [0, 1) [2, 3)
+ // L0 has: 2
+ // L1 has: 1, 2, 3
+ // Seek(0) will reseek to 1 for L0 and L1. Seek(1) will not trigger any
+ // reseek. Then Next() determines 2 is covered by [2, 3) and tries to
+ // reseek to 3 for L0 and L1. A similar story is tested for Prev() and
+ // SeekForPrev().
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ // L1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // Memtable
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(1)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(3)));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ auto iter_test_forward = [&] {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(1));
+
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(3));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+ // Next to Prev
+ get_perf_context()->Reset();
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(1));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+ // Prev to Next
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(3));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+ iter->Next();
+ VerifyIteratorReachesEnd(iter);
+ };
+
+ get_perf_context()->Reset();
+ iter->Seek(Key(0));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+ iter_test_forward();
+ get_perf_context()->Reset();
+ iter->Seek(Key(1));
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+ iter_test_forward();
+
+ get_perf_context()->Reset();
+ iter->SeekForPrev(Key(2));
+ // Reseeked L0 and L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+ iter_test_forward();
+ get_perf_context()->Reset();
+ iter->SeekForPrev(Key(1));
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+ iter_test_forward();
+
+ get_perf_context()->Reset();
+ iter->SeekToFirst();
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 0);
+ iter_test_forward();
+
+ iter->SeekToLast();
+ iter->Prev();
+ iter_test_forward();
+ delete iter;
+}
+
+ // Range tombstone triggers reseek in the merging iterator when covering a
+ // key from the same level. Test set up:
+ // in merging iterator. Test set up:
+ // memtable has: [0, 1)
+ // L0 has: [2, 3), 2
+ // L1 has: 1, 2, 3
+ // Seek(0) will reseek to 1 for L0 and L1.
+ // Then Next() will reseek to 3 for L1 since 2 in L0 is covered by [2, 3) in
+ // L0.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+
+ DestroyAndReopen(options);
+ // L1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(3)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // Memtable
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(0),
+ Key(1)));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ get_perf_context()->Reset();
+ iter->Seek(Key(0));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(1));
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 2);
+
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(3));
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+
+ delete iter;
+}
+
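+// SST partitioner used by TombstoneAcrossFileBoundary below: ShouldPartition()
+// requests a cut whenever the current user key is Key(5), so compaction
+// output files are split right before that key. CanDoTrivialMove() returns
+// false so the partitioner cannot be bypassed by trivial moves.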
+class TombstoneTestSstPartitioner : public SstPartitioner {
+ public:
+ const char* Name() const override { return "SingleKeySstPartitioner"; }
+
+ PartitionerResult ShouldPartition(
+ const PartitionerRequest& request) override {
+ if (cmp->Compare(*request.current_user_key, DBTestBase::Key(5)) == 0) {
+ return kRequired;
+ } else {
+ return kNotRequired;
+ }
+ }
+
+ bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+ const Slice& /*largest_user_key*/) override {
+ return false;
+ }
+
+ const Comparator* cmp = BytewiseComparator();
+};
+
+class TombstoneTestSstPartitionerFactory : public SstPartitionerFactory {
+ public:
+ static const char* kClassName() {
+ return "TombstoneTestSstPartitionerFactory";
+ }
+ const char* Name() const override { return kClassName(); }
+
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& /* context */) const override {
+ return std::unique_ptr<SstPartitioner>(new TombstoneTestSstPartitioner());
+ }
+};
+
+TEST_F(DBRangeDelTest, TombstoneAcrossFileBoundary) {
+ // Verify that a range tombstone across file boundary covers keys from older
+ // levels. Test set up:
+ // L1_0: 1, 3, [2, 6)
+ // L1_1: 5, 7, [2, 6) ([2, 6) is from compaction with L1_0)
+ // L2: 5
+ // Seek(1) and then Next() should move the L1 level iterator to
+ // L1_1. Check if 5 is returned after Next().
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 2 * 1024;
+ options.max_compaction_bytes = 2 * 1024;
+
+ // Make sure L1 files are split before "5"
+ auto factory = std::make_shared<TombstoneTestSstPartitionerFactory>();
+ options.sst_partitioner_factory = factory;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // L2
+ // The L2 file should be smaller than max_compaction_bytes; otherwise the
+ // L1 file would be cut before 7.
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 9)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(1 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(7), rnd.RandomString(1 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(1 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(1 << 10)));
+ // Prevent keys being compacted away
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(6)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ get_perf_context()->Reset();
+ iter->Seek(Key(1));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(1));
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(7));
+ // 1 reseek into L2 when key 5 in L2 is covered by [2, 6) from L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, NonOverlappingTombstonAtBoundary) {
+ // Verify that a range tombstone across file boundary covers keys from older
+ // levels.
+ // Test set up:
+ // L1_0: 1, 3, [4, 7) L1_1: 6, 8, [4, 7)
+ // L2: 5
+ // Note that [4, 7) is at the end of L1_0 and does not overlap with any point
+ // key in L1_0. [4, 7) from L1_0 should cover 5 if the sentinel key works.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 2 * 1024;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(8), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(4 << 10)));
+ // Prevent keys being compacted away
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+ Key(7)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(3));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(3));
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(8));
+ // 1 reseek into L1 since 5 from L2 is covered by [4, 7) from L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+ for (auto& k : {4, 5, 6}) {
+ get_perf_context()->Reset();
+ iter->Seek(Key(k));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(8));
+ // 1 reseek into L1
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count, 1);
+ }
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, OlderLevelHasNewerData) {
+ // L1_0: 1, 3, [2, 7) L1_1: 5, 6 at a newer sequence number than [2, 7)
+ // Compact L1_1 to L2. Seek(3) should not skip 5 or 6.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), rnd.RandomString(4 << 10)));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(7)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L1_1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ auto key = Key(6);
+ Slice begin(key);
+ EXPECT_OK(dbfull()->TEST_CompactRange(1, &begin, nullptr));
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(3));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(5));
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), Key(6));
+ delete iter;
+ db_->ReleaseSnapshot(snapshot);
+}
+
+TEST_F(DBRangeDelTest, LevelBoundaryDefinedByTombstone) {
+ // L1 has: 1, 2, [4, 5)
+ // L2 has: 4
+ // Seek(3), which is past all point keys in L1; check whether the
+ // sentinel key from L1 works in this case.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(4), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+ Key(5)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(3));
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ get_perf_context()->Reset();
+ iter->SeekForPrev(Key(5));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(2));
+ db_->ReleaseSnapshot(snapshot);
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyFile) {
+ // L1_0: 1, 2, L1_1: [3, 5)
+ // L2: 3
+ // Seek(2) then Next() should advance the L1 iterator into L1_1.
+ // If the sentinel works with a tombstone-only file, it should cover the key
+ // in L2. Similar story for SeekForPrev(4).
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+ Key(5)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(2));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(2));
+ iter->Next();
+ VerifyIteratorReachesEnd(iter);
+ iter->SeekForPrev(Key(4));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(2));
+ iter->Next();
+ VerifyIteratorReachesEnd(iter);
+ delete iter;
+}
+
+void VerifyIteratorKey(InternalIterator* iter,
+ const std::vector<std::string>& expected_keys,
+ bool forward = true) {
+ for (auto& key : expected_keys) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->user_key(), key);
+ if (forward) {
+ iter->Next();
+ } else {
+ iter->Prev();
+ }
+ }
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyLevel) {
+ // L1 [3, 5)
+ // L2 has: 3, 4
+ // Any kind of iterator seek should skip 3 and 4 in L2.
+ // L1 level iterator should produce sentinel key.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+
+ DestroyAndReopen(options);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+ Key(5)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ get_perf_context()->Reset();
+ uint64_t expected_reseek = 0;
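+ // Each seek that lands on a key covered by [3, 5) triggers one reseek:
+ // Seek(i) lands on 3 or 4 in L2 for i < 5, SeekForPrev(i) lands on 4 or 3
+ // for i > 2, and SeekToFirst()/SeekToLast() always land on a covered key.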
+ for (auto i = 0; i < 7; ++i) {
+ iter->Seek(Key(i));
+ VerifyIteratorReachesEnd(iter);
+ if (i < 5) {
+ ++expected_reseek;
+ }
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+ expected_reseek);
+ iter->SeekForPrev(Key(i));
+ VerifyIteratorReachesEnd(iter);
+ if (i > 2) {
+ ++expected_reseek;
+ }
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+ expected_reseek);
+ iter->SeekToFirst();
+ VerifyIteratorReachesEnd(iter);
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+ ++expected_reseek);
+ iter->SeekToLast();
+ VerifyIteratorReachesEnd(iter);
+ ASSERT_EQ(get_perf_context()->internal_range_del_reseek_count,
+ ++expected_reseek);
+ }
+ delete iter;
+
+ // Check L1 LevelIterator behavior
+ ColumnFamilyData* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+ ->cfd();
+ SuperVersion* sv = cfd->GetSuperVersion();
+ Arena arena;
+ ReadOptions read_options;
+ MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), &arena,
+ false /* prefix seek */);
+ InternalIterator* level_iter = sv->current->TEST_GetLevelIterator(
+ read_options, &merge_iter_builder, 1 /* level */, true);
+ // This is needed to make LevelIterator range tombstone aware
+ auto miter = merge_iter_builder.Finish();
+ auto k = Key(3);
+ IterKey target;
+ target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+ level_iter->Seek(target.GetInternalKey());
+ // sentinel key (file boundary as a fake key)
+ VerifyIteratorKey(level_iter, {Key(5)});
+ VerifyIteratorReachesEnd(level_iter);
+
+ k = Key(5);
+ target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+ level_iter->SeekForPrev(target.GetInternalKey());
+ VerifyIteratorKey(level_iter, {Key(3)}, false);
+ VerifyIteratorReachesEnd(level_iter);
+
+ level_iter->SeekToFirst();
+ VerifyIteratorKey(level_iter, {Key(5)});
+ VerifyIteratorReachesEnd(level_iter);
+
+ level_iter->SeekToLast();
+ VerifyIteratorKey(level_iter, {Key(3)}, false);
+ VerifyIteratorReachesEnd(level_iter);
+
+ miter->~InternalIterator();
+}
+
+TEST_F(DBRangeDelTest, TombstoneOnlyWithOlderVisibleKey) {
+ // L1: [3, 5)
+ // L2: 2, 4, 5
+ // 2 and 5 should be visible
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+
+ DestroyAndReopen(options);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), "foobar"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+ Key(5)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ auto iter_test_backward = [&] {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(5));
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(2));
+ iter->Prev();
+ VerifyIteratorReachesEnd(iter);
+ };
+ auto iter_test_forward = [&] {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(2));
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(5));
+ iter->Next();
+ VerifyIteratorReachesEnd(iter);
+ };
+ iter->Seek(Key(4));
+ iter_test_backward();
+ iter->SeekForPrev(Key(4));
+ iter->Next();
+ iter_test_backward();
+
+ iter->Seek(Key(4));
+ iter->Prev();
+ iter_test_forward();
+ iter->SeekForPrev(Key(4));
+ iter_test_forward();
+
+ iter->SeekToFirst();
+ iter_test_forward();
+ iter->SeekToLast();
+ iter_test_backward();
+
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, TombstoneSentinelDirectionChange) {
+ // L1: 7
+ // L2: [4, 6)
+ // L3: 4
+ // Seek(5) will have the sentinel key 6 at the top of the min heap in the
+ // merging iterator; then do a Prev() and check how the sentinel behaves.
+ // Redo the test after Put(5) into L1 so that there is a visible key in range
+ // [4, 6).
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+
+ DestroyAndReopen(options);
+ // L3
+ ASSERT_OK(db_->Put(WriteOptions(), Key(4), "bar"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(3);
+ ASSERT_EQ(1, NumTableFilesAtLevel(3));
+ // L2
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(4),
+ Key(6)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(7), "foobar"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(5));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(7));
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+ delete iter;
+
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), "foobar"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek(Key(5));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(5));
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid() && iter->status().ok());
+ delete iter;
+}
+
+// The right sentinel is exercised in many of the test cases above.
+TEST_F(DBRangeDelTest, LeftSentinelKeyTest) {
+ // L1_0: 0, 1 L1_1: [2, 3), 5
+ // L2: 2
+ // SeekForPrev(4) should give 1 due to sentinel key keeping [2, 3) alive.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+ options.max_compaction_bytes = 1024;
+
+ DestroyAndReopen(options);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_0
+ Random rnd(301);
+ ASSERT_OK(db_->Put(WriteOptions(), Key(0), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L1_1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), "bar"));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(3)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->SeekForPrev(Key(4));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(1));
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key(0));
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, LeftSentinelKeyTestWithNewerKey) {
+ // L1_0: 1, 2 newer than L1_1, L1_1: [2, 4), 5
+ // L2: 3
+ // SeekForPrev(4) then Prev() should give 2 and then 1.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+ options.max_compaction_bytes = 1024;
+
+ DestroyAndReopen(options);
+ // L2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(3), "foo"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1_1
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), "bar"));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(2),
+ Key(4)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L1_0
+ Random rnd(301);
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+ // Used to verify sequence number of iterator key later.
+ auto seq = dbfull()->TEST_GetLastVisibleSequence();
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
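+ // Use a raw internal iterator so the sequence number and value type of the
+ // returned entry can be checked below.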
+ Arena arena;
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter;
+ iter.set(
+ dbfull()->NewInternalIterator(read_options, &arena, kMaxSequenceNumber));
+
+ auto k = Key(4);
+ IterKey target;
+ target.SetInternalKey(k, 0 /* sequence_number */, kValueTypeForSeekForPrev);
+ iter->SeekForPrev(target.GetInternalKey());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->user_key(), Key(2));
+ SequenceNumber actual_seq;
+ ValueType type;
+ UnPackSequenceAndType(ExtractInternalKeyFooter(iter->key()), &actual_seq,
+ &type);
+ ASSERT_EQ(seq, actual_seq);
+ // might as well check type
+ ASSERT_EQ(type, kTypeValue);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->user_key(), Key(1));
+ iter->Prev();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+}
+
+TEST_F(DBRangeDelTest, SentinelKeyCommonCaseTest) {
+ // L1 has 3 files
+ // L1_0: 1, 2 L1_1: [3, 4) 5, 6, [7, 8) L1_2: 9
+ // Check iterator operations on LevelIterator.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.target_file_size_base = 3 * 1024;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // L1_0
+ ASSERT_OK(db_->Put(WriteOptions(), Key(1), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(2), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ // L1_1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(3),
+ Key(4)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(5), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Put(WriteOptions(), Key(6), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), Key(7),
+ Key(8)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(2, NumTableFilesAtLevel(1));
+
+ // L1_2
+ ASSERT_OK(db_->Put(WriteOptions(), Key(9), rnd.RandomString(4 << 10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(3, NumTableFilesAtLevel(1));
+
+ ColumnFamilyData* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+ ->cfd();
+ SuperVersion* sv = cfd->GetSuperVersion();
+ Arena arena;
+ ReadOptions read_options;
+ MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), &arena,
+ false /* prefix seek */);
+ InternalIterator* level_iter = sv->current->TEST_GetLevelIterator(
+ read_options, &merge_iter_builder, 1 /* level */, true);
+ // This is needed to make LevelIterator range tombstone aware
+ auto miter = merge_iter_builder.Finish();
+ auto k = Key(7);
+ IterKey target;
+ target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+ level_iter->Seek(target.GetInternalKey());
+ // The last Key(9) is a sentinel key.
+ VerifyIteratorKey(level_iter, {Key(8), Key(9), Key(9)});
+ ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+ k = Key(6);
+ target.SetInternalKey(k, kMaxSequenceNumber, kValueTypeForSeek);
+ level_iter->Seek(target.GetInternalKey());
+ VerifyIteratorKey(level_iter, {Key(6), Key(8), Key(9), Key(9)});
+ ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+ k = Key(4);
+ target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+ level_iter->SeekForPrev(target.GetInternalKey());
+ VerifyIteratorKey(level_iter, {Key(3), Key(2), Key(1), Key(1)}, false);
+ ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+ k = Key(5);
+ target.SetInternalKey(k, 0, kValueTypeForSeekForPrev);
+ level_iter->SeekForPrev(target.GetInternalKey());
+ VerifyIteratorKey(level_iter, {Key(5), Key(3), Key(2), Key(1), Key(1)},
+ false);
+
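+ // In a full forward scan, each file also emits its largest key once as a
+ // file-boundary sentinel: 2 for L1_0, 8 for L1_1 (the end of [7, 8)), and
+ // 9 for L1_2.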
+ level_iter->SeekToFirst();
+ VerifyIteratorKey(level_iter, {Key(1), Key(2), Key(2), Key(5), Key(6), Key(8),
+ Key(9), Key(9)});
+ ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+ level_iter->SeekToLast();
+ VerifyIteratorKey(
+ level_iter,
+ {Key(9), Key(9), Key(6), Key(5), Key(3), Key(2), Key(1), Key(1)}, false);
+ ASSERT_TRUE(!level_iter->Valid() && level_iter->status().ok());
+
+ miter->~InternalIterator();
+}
+
+TEST_F(DBRangeDelTest, PrefixSentinelKey) {
+ // L1: ['aaaa', 'aaad'), 'bbbb'
+ // L2: 'aaac', 'aaae'
+ // Prefix extracts first 3 chars
+ // Seek('aaab') should give 'aaae' as first key.
+ // This tests a previous bug where prefix seek saw that the prefix does not
+ // exist in the SST file, set the file iterator to null in LevelIterator, and
+ // could skip to the next SST file. In this case, we should keep the file's
+ // tombstone alive.
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ table_options.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // L2:
+ ASSERT_OK(db_->Put(WriteOptions(), "aaac", rnd.RandomString(10)));
+ ASSERT_OK(db_->Put(WriteOptions(), "aaae", rnd.RandomString(10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(2);
+ ASSERT_EQ(1, NumTableFilesAtLevel(2));
+
+ // L1
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "aaaa",
+ "aaad"));
+ ASSERT_OK(db_->Put(WriteOptions(), "bbbb", rnd.RandomString(10)));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ MoveFilesToLevel(1);
+ ASSERT_EQ(1, NumTableFilesAtLevel(1));
+
+ auto iter = db_->NewIterator(ReadOptions());
+ iter->Seek("aaab");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), "aaae");
+ delete iter;
+}
+
+TEST_F(DBRangeDelTest, RefreshMemtableIter) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "z"));
+ ReadOptions ro;
+ ro.read_tier = kMemtableTier;
+ std::unique_ptr<Iterator> iter{db_->NewIterator(ro)};
+ ASSERT_OK(Flush());
+ // The first Refresh() re-initializes the iterator. A previous bug did not
+ // reset iter.memtable_range_tombstone_iter_ to nullptr, which caused the
+ // subsequent Refresh() to double free.
+ ASSERT_OK(iter->Refresh());
+ ASSERT_OK(iter->Refresh());
+}
+
+TEST_F(DBRangeDelTest, RangeTombstoneRespectIterateUpperBound) {
+ // Memtable: a, [b, bz)
+ // Do a Seek on `a` with iterate_upper_bound set to "az"; the range
+ // tombstone [b, bz) should not be processed (added to and popped from the
+ // min_heap in MergingIterator).
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a", "bar"));
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "b", "bz"));
+
+ // I could not find a cleaner way to test this without relying on an
+ // implementation detail. Checking the value of
+ // `internal_range_del_reseek_count` did not work, since the BlockBasedTable
+ // iterator becomes !Valid() when the point key is out of bounds, and that
+ // reseek only happens when a point key is covered by some range tombstone.
+ SyncPoint::GetInstance()->SetCallBack("MergeIterator::PopDeleteRangeStart",
+ [](void*) {
+ // there should not be any range
+ // tombstone in the heap.
+ FAIL();
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ReadOptions read_opts;
+ std::string upper_bound = "az";
+ Slice upper_bound_slice = upper_bound;
+ read_opts.iterate_upper_bound = &upper_bound_slice;
+ std::unique_ptr<Iterator> iter{db_->NewIterator(read_opts)};
+ iter->Seek("a");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), "a");
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+}
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_rate_limiter_test.cc b/src/rocksdb/db/db_rate_limiter_test.cc
new file mode 100644
index 000000000..e44cc047d
--- /dev/null
+++ b/src/rocksdb/db/db_rate_limiter_test.cc
@@ -0,0 +1,451 @@
+// Copyright (c) 2022-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <string>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "util/file_checksum_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBRateLimiterOnReadTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool, bool>> {
+ public:
+ explicit DBRateLimiterOnReadTest()
+ : DBTestBase("db_rate_limiter_on_read_test", /*env_do_fsync=*/false),
+ use_direct_io_(std::get<0>(GetParam())),
+ use_block_cache_(std::get<1>(GetParam())),
+ use_readahead_(std::get<2>(GetParam())) {}
+
+ void Init() {
+ options_ = GetOptions();
+ Reopen(options_);
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumKeysPerFile + j), "val"));
+ }
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+ }
+
+ BlockBasedTableOptions GetTableOptions() {
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = !use_block_cache_;
+ return table_options;
+ }
+
+ ReadOptions GetReadOptions() {
+ ReadOptions read_options;
+ read_options.rate_limiter_priority = Env::IO_USER;
+ read_options.readahead_size = use_readahead_ ? kReadaheadBytes : 0;
+ return read_options;
+ }
+
+ Options GetOptions() {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.file_checksum_gen_factory.reset(new FileChecksumGenCrc32cFactory());
+ options.rate_limiter.reset(NewGenericRateLimiter(
+ 1 << 20 /* rate_bytes_per_sec */, 100 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kAllIo));
+ options.table_factory.reset(NewBlockBasedTableFactory(GetTableOptions()));
+ options.use_direct_reads = use_direct_io_;
+ return options;
+ }
+
+ protected:
+ const static int kNumKeysPerFile = 1;
+ const static int kNumFiles = 3;
+ const static int kReadaheadBytes = 32 << 10; // 32KB
+
+ Options options_;
+ const bool use_direct_io_;
+ const bool use_block_cache_;
+ const bool use_readahead_;
+};
+
+std::string GetTestNameSuffix(
+ ::testing::TestParamInfo<std::tuple<bool, bool, bool>> info) {
+ std::ostringstream oss;
+ if (std::get<0>(info.param)) {
+ oss << "DirectIO";
+ } else {
+ oss << "BufferedIO";
+ }
+ if (std::get<1>(info.param)) {
+ oss << "_BlockCache";
+ } else {
+ oss << "_NoBlockCache";
+ }
+ if (std::get<2>(info.param)) {
+ oss << "_Readahead";
+ } else {
+ oss << "_NoReadahead";
+ }
+ return oss.str();
+}
+
+#ifndef ROCKSDB_LITE
+INSTANTIATE_TEST_CASE_P(DBRateLimiterOnReadTest, DBRateLimiterOnReadTest,
+ ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool()),
+ GetTestNameSuffix);
+#else // ROCKSDB_LITE
+// Cannot use direct I/O in lite mode.
+INSTANTIATE_TEST_CASE_P(DBRateLimiterOnReadTest, DBRateLimiterOnReadTest,
+ ::testing::Combine(::testing::Values(false),
+ ::testing::Bool(),
+ ::testing::Bool()),
+ GetTestNameSuffix);
+#endif // ROCKSDB_LITE
+
+TEST_P(DBRateLimiterOnReadTest, Get) {
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ int expected = 0;
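+ // Each Get() that misses the block cache issues exactly one rate-limited
+ // read at Env::IO_USER; with the block cache enabled, the repeated Get()
+ // below is served from cache and issues no additional read.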
+ for (int i = 0; i < kNumFiles; ++i) {
+ {
+ std::string value;
+ ASSERT_OK(db_->Get(GetReadOptions(), Key(i * kNumKeysPerFile), &value));
+ ++expected;
+ }
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ {
+ std::string value;
+ ASSERT_OK(db_->Get(GetReadOptions(), Key(i * kNumKeysPerFile), &value));
+ if (!use_block_cache_) {
+ ++expected;
+ }
+ }
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+ }
+}
+
+TEST_P(DBRateLimiterOnReadTest, NewMultiGet) {
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ const int kNumKeys = kNumFiles * kNumKeysPerFile;
+ int64_t expected = 0;
+ {
+ std::vector<std::string> key_bufs;
+ key_bufs.reserve(kNumKeys);
+ std::vector<Slice> keys;
+ keys.reserve(kNumKeys);
+ for (int i = 0; i < kNumKeys; ++i) {
+ key_bufs.emplace_back(Key(i));
+ keys.emplace_back(key_bufs[i]);
+ }
+ std::vector<Status> statuses(kNumKeys);
+ std::vector<PinnableSlice> values(kNumKeys);
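+ // Each of the kNumKeys keys lives in its own file, so the batched
+ // MultiGet() is expected to issue one rate-limited read per key, all
+ // charged at Env::IO_USER.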
+ const int64_t prev_total_rl_req = options_.rate_limiter->GetTotalRequests();
+ db_->MultiGet(GetReadOptions(), dbfull()->DefaultColumnFamily(), kNumKeys,
+ keys.data(), values.data(), statuses.data());
+ const int64_t cur_total_rl_req = options_.rate_limiter->GetTotalRequests();
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_TRUE(statuses[i].ok());
+ }
+ ASSERT_GT(cur_total_rl_req, prev_total_rl_req);
+ ASSERT_EQ(cur_total_rl_req - prev_total_rl_req,
+ options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+ }
+ expected += kNumKeys;
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+TEST_P(DBRateLimiterOnReadTest, OldMultiGet) {
+ // The old `vector<Status>`-returning `MultiGet()` APIs use `Read()`, which
+ // supports rate limiting.
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ const int kNumKeys = kNumFiles * kNumKeysPerFile;
+ int expected = 0;
+ {
+ std::vector<std::string> key_bufs;
+ key_bufs.reserve(kNumKeys);
+ std::vector<Slice> keys;
+ keys.reserve(kNumKeys);
+ for (int i = 0; i < kNumKeys; ++i) {
+ key_bufs.emplace_back(Key(i));
+ keys.emplace_back(key_bufs[i]);
+ }
+ std::vector<std::string> values;
+ std::vector<Status> statuses =
+ db_->MultiGet(GetReadOptions(), keys, &values);
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(statuses[i]);
+ }
+ }
+ expected += kNumKeys;
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+TEST_P(DBRateLimiterOnReadTest, Iterator) {
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(GetReadOptions()));
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ int expected = 0;
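+ // Each step of the forward scan reads one new data block, so one
+ // rate-limited request is expected per iteration.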
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++expected;
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+ }
+
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ // When `use_block_cache_ == true`, the reverse scan will access the blocks
+ // loaded to cache during the above forward scan, in which case no further
+ // file reads are expected.
+ if (!use_block_cache_) {
+ ++expected;
+ }
+ }
+ // Reverse scan does not read evenly (one block per iteration) due to
+ // descending seqno ordering, so wait until after the loop to check total.
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+#if !defined(ROCKSDB_LITE)
+
+TEST_P(DBRateLimiterOnReadTest, VerifyChecksum) {
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ ASSERT_OK(db_->VerifyChecksum(GetReadOptions()));
+ // The files are tiny so there should have just been one read per file.
+ int expected = kNumFiles;
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+TEST_P(DBRateLimiterOnReadTest, VerifyFileChecksums) {
+ if (use_direct_io_ && !IsDirectIOSupported()) {
+ return;
+ }
+ Init();
+
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ ASSERT_OK(db_->VerifyFileChecksums(GetReadOptions()));
+ // The files are tiny so there should have just been one read per file.
+ int expected = kNumFiles;
+ ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+
+#endif // !defined(ROCKSDB_LITE)
+
+class DBRateLimiterOnWriteTest : public DBTestBase {
+ public:
+ explicit DBRateLimiterOnWriteTest()
+ : DBTestBase("db_rate_limiter_on_write_test", /*env_do_fsync=*/false) {}
+
+ void Init() {
+ options_ = GetOptions();
+ ASSERT_OK(TryReopenWithColumnFamilies({"default"}, options_));
+ Random rnd(301);
+ for (int i = 0; i < kNumFiles; i++) {
+ ASSERT_OK(Put(0, kStartKey, rnd.RandomString(2)));
+ ASSERT_OK(Put(0, kEndKey, rnd.RandomString(2)));
+ ASSERT_OK(Flush(0));
+ }
+ }
+
+ Options GetOptions() {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.rate_limiter.reset(NewGenericRateLimiter(
+ 1 << 20 /* rate_bytes_per_sec */, 100 * 1000 /* refill_period_us */,
+ 10 /* fairness */, RateLimiter::Mode::kWritesOnly));
+ options.table_factory.reset(
+ NewBlockBasedTableFactory(BlockBasedTableOptions()));
+ return options;
+ }
+
+ protected:
+ inline const static int64_t kNumFiles = 3;
+ inline const static std::string kStartKey = "a";
+ inline const static std::string kEndKey = "b";
+ Options options_;
+};
+
+TEST_F(DBRateLimiterOnWriteTest, Flush) {
+ std::int64_t prev_total_request = 0;
+
+ Init();
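+ // Init() flushes kNumFiles memtables; each flush is expected to issue one
+ // rate-limited write request at Env::IO_HIGH.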
+
+ std::int64_t actual_flush_request =
+ options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+ prev_total_request;
+ std::int64_t expected_flush_request = kNumFiles;
+ EXPECT_EQ(actual_flush_request, expected_flush_request);
+ EXPECT_EQ(actual_flush_request,
+ options_.rate_limiter->GetTotalRequests(Env::IO_HIGH));
+}
+
+TEST_F(DBRateLimiterOnWriteTest, Compact) {
+ Init();
+
+ // Pre-compaction:
+ // level-0 : `kNumFiles` SST files overlapping on [kStartKey, kEndKey]
+#ifndef ROCKSDB_LITE
+ std::string files_per_level_pre_compaction = std::to_string(kNumFiles);
+ ASSERT_EQ(files_per_level_pre_compaction, FilesPerLevel(0 /* cf */));
+#endif // !ROCKSDB_LITE
+
+ std::int64_t prev_total_request =
+ options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_LOW));
+
+ Compact(kStartKey, kEndKey);
+
+ std::int64_t actual_compaction_request =
+ options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+ prev_total_request;
+
+ // Post-compaction:
+ // level-0 : 0 SST files
+ // level-1 : 1 SST file
+#ifndef ROCKSDB_LITE
+ std::string files_per_level_post_compaction = "0,1";
+ ASSERT_EQ(files_per_level_post_compaction, FilesPerLevel(0 /* cf */));
+#endif // !ROCKSDB_LITE
+
+ std::int64_t expected_compaction_request = 1;
+ EXPECT_EQ(actual_compaction_request, expected_compaction_request);
+ EXPECT_EQ(actual_compaction_request,
+ options_.rate_limiter->GetTotalRequests(Env::IO_LOW));
+}
+
+class DBRateLimiterOnWriteWALTest
+ : public DBRateLimiterOnWriteTest,
+ public ::testing::WithParamInterface<std::tuple<
+ bool /* WriteOptions::disableWal */,
+ bool /* Options::manual_wal_flush */,
+ Env::IOPriority /* WriteOptions::rate_limiter_priority */>> {
+ public:
+ static std::string GetTestNameSuffix(
+ ::testing::TestParamInfo<std::tuple<bool, bool, Env::IOPriority>> info) {
+ std::ostringstream oss;
+ if (std::get<0>(info.param)) {
+ oss << "DisableWAL";
+ } else {
+ oss << "EnableWAL";
+ }
+ if (std::get<1>(info.param)) {
+ oss << "_ManualWALFlush";
+ } else {
+ oss << "_AutoWALFlush";
+ }
+ if (std::get<2>(info.param) == Env::IO_USER) {
+ oss << "_RateLimitAutoWALFlush";
+ } else if (std::get<2>(info.param) == Env::IO_TOTAL) {
+ oss << "_NoRateLimitAutoWALFlush";
+ } else {
+ oss << "_RateLimitAutoWALFlushWithIncorrectPriority";
+ }
+ return oss.str();
+ }
+
+ explicit DBRateLimiterOnWriteWALTest()
+ : disable_wal_(std::get<0>(GetParam())),
+ manual_wal_flush_(std::get<1>(GetParam())),
+ rate_limiter_priority_(std::get<2>(GetParam())) {}
+
+ void Init() {
+ options_ = GetOptions();
+ options_.manual_wal_flush = manual_wal_flush_;
+ Reopen(options_);
+ }
+
+ WriteOptions GetWriteOptions() {
+ WriteOptions write_options;
+ write_options.disableWAL = disable_wal_;
+ write_options.rate_limiter_priority = rate_limiter_priority_;
+ return write_options;
+ }
+
+ protected:
+ bool disable_wal_;
+ bool manual_wal_flush_;
+ Env::IOPriority rate_limiter_priority_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBRateLimiterOnWriteWALTest, DBRateLimiterOnWriteWALTest,
+ ::testing::Values(std::make_tuple(false, false, Env::IO_TOTAL),
+ std::make_tuple(false, false, Env::IO_USER),
+ std::make_tuple(false, false, Env::IO_HIGH),
+ std::make_tuple(false, true, Env::IO_USER),
+ std::make_tuple(true, false, Env::IO_USER)),
+ DBRateLimiterOnWriteWALTest::GetTestNameSuffix);
+
+TEST_P(DBRateLimiterOnWriteWALTest, AutoWalFlush) {
+ Init();
+
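+ // Rate-limiting the automatic WAL flush is only supported with
+ // Env::IO_USER, the WAL enabled, and manual_wal_flush disabled; any other
+ // combination with a non-IO_TOTAL priority is rejected as InvalidArgument
+ // below.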
+ const bool no_rate_limit_auto_wal_flush =
+ (rate_limiter_priority_ == Env::IO_TOTAL);
+ const bool valid_arg = (rate_limiter_priority_ == Env::IO_USER &&
+ !disable_wal_ && !manual_wal_flush_);
+
+ std::int64_t prev_total_request =
+ options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL);
+ ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+
+ Status s = Put("foo", "v1", GetWriteOptions());
+
+ if (no_rate_limit_auto_wal_flush || valid_arg) {
+ EXPECT_TRUE(s.ok());
+ } else {
+ EXPECT_TRUE(s.IsInvalidArgument());
+ EXPECT_TRUE(s.ToString().find("WriteOptions::rate_limiter_priority") !=
+ std::string::npos);
+ }
+
+ std::int64_t actual_auto_wal_flush_request =
+ options_.rate_limiter->GetTotalRequests(Env::IO_TOTAL) -
+ prev_total_request;
+ std::int64_t expected_auto_wal_flush_request = valid_arg ? 1 : 0;
+
+ EXPECT_EQ(actual_auto_wal_flush_request, expected_auto_wal_flush_request);
+ EXPECT_EQ(actual_auto_wal_flush_request,
+ options_.rate_limiter->GetTotalRequests(Env::IO_USER));
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_readonly_with_timestamp_test.cc b/src/rocksdb/db/db_readonly_with_timestamp_test.cc
new file mode 100644
index 000000000..3f53e7806
--- /dev/null
+++ b/src/rocksdb/db/db_readonly_with_timestamp_test.cc
@@ -0,0 +1,960 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_with_timestamp_test_util.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBReadOnlyTestWithTimestamp : public DBBasicTestWithTimestampBase {
+ public:
+ DBReadOnlyTestWithTimestamp()
+ : DBBasicTestWithTimestampBase("db_readonly_test_with_timestamp") {}
+
+ protected:
+#ifndef ROCKSDB_LITE
+ void CheckDBOpenedAsCompactedDBWithOneLevel0File() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ // Only 1 L0 file.
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ // L0 is the max level.
+ ASSERT_EQ(storage_info->num_non_empty_levels(), 1);
+ }
+
+ void CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ // L0 has no files.
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ // All levels other than L0 and the highest non-empty level have no files.
+ for (int i = 1; i < storage_info->num_non_empty_levels() - 1; ++i) {
+ ASSERT_FALSE(storage_info->LevelFilesBrief(i).num_files > 0);
+ }
+
+ // The highest non-empty level has some files.
+ int highest_non_empty_level = storage_info->num_non_empty_levels() - 1;
+ ASSERT_TRUE(
+ storage_info->LevelFilesBrief(highest_non_empty_level).num_files > 0);
+ }
+#endif // !ROCKSDB_LITE
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGetReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ IteratorAndGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ const std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ IteratorAndGetWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ ASSERT_TRUE(
+ db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, IteratorAndGet) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+
+ auto get_value_and_check = [](DB* db, ReadOptions read_opts, Slice key,
+ Slice expected_value, std::string expected_ts) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_OK(db->Get(read_opts, key.ToString(), &value_from_get, &timestamp));
+ ASSERT_EQ(expected_value, value_from_get);
+ ASSERT_EQ(expected_ts, timestamp);
+ };
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ int count = 0;
+ uint64_t key = 0;
+ // Forward iterate.
+ for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
+ it->Next(), ++count, ++key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ size_t expected_count = kMaxKey - start_keys[i] + 1;
+ ASSERT_EQ(expected_count, count);
+
+ // Backward iterate.
+ count = 0;
+ for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+ it->Prev(), ++count, --key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
+
+ // SeekToFirst()/SeekToLast() with lower/upper bounds.
+ // Then iter with lower and upper bounds.
+ uint64_t l = 0;
+ uint64_t r = kMaxKey + 1;
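+ // Shrink [l, r) from both ends each round and verify that forward and
+ // backward scans respect iterate_lower_bound/iterate_upper_bound.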
+ while (l < r) {
+ std::string lb_str = Key1(l);
+ Slice lb = lb_str;
+ std::string ub_str = Key1(r);
+ Slice ub = ub_str;
+ read_opts.iterate_lower_bound = &lb;
+ read_opts.iterate_upper_bound = &ub;
+ it.reset(db_->NewIterator(read_opts));
+ for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
+ it->Valid(); it->Next(), ++key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ ASSERT_EQ(r - std::max(l, start_keys[i]), count);
+
+ for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
+ it->Valid(); it->Prev(), --key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ l += (kMaxKey / 100);
+ r -= (kMaxKey / 100);
+ }
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, Iterators) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ const std::string read_timestamp = Timestamp(2, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ std::vector<Iterator*> iters;
+ ASSERT_OK(db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters));
+ ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
+
+ int count = 0;
+ uint64_t key = 0;
+ // Forward iterate.
+ for (iters[0]->Seek(Key1(0)), key = 0; iters[0]->Valid();
+ iters[0]->Next(), ++count, ++key) {
+ CheckIterUserEntry(iters[0], Key1(key), kTypeValue,
+ "value" + std::to_string(key), write_timestamp);
+ }
+
+ size_t expected_count = kMaxKey - 0 + 1;
+ ASSERT_EQ(expected_count, count);
+ delete iters[0];
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, IteratorsReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ IteratorsReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ const std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ IteratorsWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database in read only mode to test its timestamp support.
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ReadOptions read_opts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBGetReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
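+ // max_open_files == -1 is required for the read-only open below to take
+ // the compacted-DB (CompactedDBImpl) code path.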
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ const std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBGetWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ ASSERT_TRUE(
+ db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBGetWithOnlyOneL0File) {
+ const int kNumKeysPerFile = 1026 * 2;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ int count = 0;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key, ++count) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_OK(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp));
+ ASSERT_EQ("value" + std::to_string(i), value_from_get);
+ ASSERT_EQ(write_timestamps[i], timestamp);
+ }
+ size_t expected_count = kMaxKey - start_keys[i] + 1;
+ ASSERT_EQ(expected_count, count);
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBGetWithOnlyHighestNonEmptyLevelFiles) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles();
+
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ int count = 0;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key, ++count) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_OK(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp));
+ ASSERT_EQ("value" + std::to_string(i), value_from_get);
+ ASSERT_EQ(write_timestamps[i], timestamp);
+ }
+ size_t expected_count = kMaxKey - start_keys[i] + 1;
+ ASSERT_EQ(expected_count, count);
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBMultiGetReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ key_strs.push_back(Key1(key));
+ }
+ for (const auto& key_str : key_strs) {
+ keys.emplace_back(key_str);
+ }
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+ std::vector<Status> status_list =
+ db_->MultiGet(read_opts, keys, &values, &timestamps);
+ for (const auto& status : status_list) {
+ ASSERT_TRUE(status.IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBMultiGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ key_strs.push_back(Key1(key));
+ }
+ for (const auto& key_str : key_strs) {
+ keys.emplace_back(key_str);
+ }
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+ std::vector<Status> status_list =
+ db_->MultiGet(read_opts, keys, &values, &timestamps);
+ for (const auto& status : status_list) {
+ ASSERT_TRUE(status.IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBMultiGetWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 1026;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(0));
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ ReadOptions read_opts;
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ key_strs.push_back(Key1(key));
+ }
+ for (const auto& key_str : key_strs) {
+ keys.emplace_back(key_str);
+ }
+ std::vector<std::string> values;
+ std::vector<Status> status_list = db_->MultiGet(read_opts, keys, &values);
+ for (const auto& status : status_list) {
+ ASSERT_TRUE(status.IsInvalidArgument());
+ }
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp, CompactedDBMultiGetWithOnlyOneL0File) {
+ const int kNumKeysPerFile = 1026 * 2;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOneLevel0File();
+
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ key_strs.push_back(Key1(key));
+ }
+ for (const auto& key_str : key_strs) {
+ keys.emplace_back(key_str);
+ }
+ size_t batch_size = kMaxKey - start_keys[i] + 1;
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+ std::vector<Status> status_list =
+ db_->MultiGet(read_opts, keys, &values, &timestamps);
+ ASSERT_EQ(batch_size, values.size());
+ ASSERT_EQ(batch_size, timestamps.size());
+ for (uint64_t idx = 0; idx < values.size(); ++idx) {
+ ASSERT_EQ("value" + std::to_string(i), values[idx]);
+ ASSERT_EQ(write_timestamps[i], timestamps[idx]);
+ ASSERT_OK(status_list[idx]);
+ }
+ }
+
+ Close();
+}
+
+TEST_F(DBReadOnlyTestWithTimestamp,
+ CompactedDBMultiGetWithOnlyHighestNonEmptyLevelFiles) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Close();
+
+ // Reopen the database in read only mode as a Compacted DB to test its
+ // timestamp support.
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ CheckDBOpenedAsCompactedDBWithOnlyHighestNonEmptyLevelFiles();
+
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ key_strs.push_back(Key1(key));
+ }
+ for (const auto& key_str : key_strs) {
+ keys.emplace_back(key_str);
+ }
+ size_t batch_size = kMaxKey - start_keys[i] + 1;
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+ std::vector<Status> status_list =
+ db_->MultiGet(read_opts, keys, &values, &timestamps);
+ ASSERT_EQ(batch_size, values.size());
+ ASSERT_EQ(batch_size, timestamps.size());
+ for (uint64_t idx = 0; idx < values.size(); ++idx) {
+ ASSERT_EQ("value" + std::to_string(i), values[idx]);
+ ASSERT_EQ(write_timestamps[i], timestamps[idx]);
+ ASSERT_OK(status_list[idx]);
+ }
+ }
+
+ Close();
+}
+#endif // !ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_secondary_test.cc b/src/rocksdb/db/db_secondary_test.cc
new file mode 100644
index 000000000..20d7534e0
--- /dev/null
+++ b/src/rocksdb/db/db_secondary_test.cc
@@ -0,0 +1,1693 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_impl/db_impl_secondary.h"
+#include "db/db_test_util.h"
+#include "db/db_with_timestamp_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/utilities/transaction_db.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class DBSecondaryTestBase : public DBBasicTestWithTimestampBase {
+ public:
+ explicit DBSecondaryTestBase(const std::string& dbname)
+ : DBBasicTestWithTimestampBase(dbname),
+ secondary_path_(),
+ handles_secondary_(),
+ db_secondary_(nullptr) {
+ secondary_path_ =
+ test::PerThreadDBPath(env_, "/db_secondary_test_secondary");
+ }
+
+ ~DBSecondaryTestBase() override {
+ CloseSecondary();
+ if (getenv("KEEP_DB") != nullptr) {
+ fprintf(stdout, "Secondary DB is still at %s\n", secondary_path_.c_str());
+ } else {
+ Options options;
+ options.env = env_;
+ EXPECT_OK(DestroyDB(secondary_path_, options));
+ }
+ }
+
+ protected:
+ Status ReopenAsSecondary(const Options& options) {
+ return DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_);
+ }
+
+ void OpenSecondary(const Options& options);
+
+ Status TryOpenSecondary(const Options& options);
+
+ void OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options);
+
+ void CloseSecondary() {
+ for (auto h : handles_secondary_) {
+ ASSERT_OK(db_secondary_->DestroyColumnFamilyHandle(h));
+ }
+ handles_secondary_.clear();
+ delete db_secondary_;
+ db_secondary_ = nullptr;
+ }
+
+ DBImplSecondary* db_secondary_full() {
+ return static_cast<DBImplSecondary*>(db_secondary_);
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int expected_log,
+ int expected_sst, int expected_manifest) const;
+
+ std::string secondary_path_;
+ std::vector<ColumnFamilyHandle*> handles_secondary_;
+ DB* db_secondary_;
+};
+
+void DBSecondaryTestBase::OpenSecondary(const Options& options) {
+ ASSERT_OK(TryOpenSecondary(options));
+}
+
+Status DBSecondaryTestBase::TryOpenSecondary(const Options& options) {
+ Status s =
+ DB::OpenAsSecondary(options, dbname_, secondary_path_, &db_secondary_);
+ return s;
+}
+
+void DBSecondaryTestBase::OpenSecondaryWithColumnFamilies(
+ const std::vector<std::string>& column_families, const Options& options) {
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ for (const auto& cf_name : column_families) {
+ cf_descs.emplace_back(cf_name, options);
+ }
+ Status s = DB::OpenAsSecondary(options, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_OK(s);
+}
+
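+// Counts the WAL, SST, and MANIFEST files under `dir` by parsing their file
+// names and asserts that the counts match the expected values.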
+void DBSecondaryTestBase::CheckFileTypeCounts(const std::string& dir,
+ int expected_log,
+ int expected_sst,
+ int expected_manifest) const {
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dir, &filenames));
+
+ int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kWalFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(expected_log, log_cnt);
+ ASSERT_EQ(expected_sst, sst_cnt);
+ ASSERT_EQ(expected_manifest, manifest_cnt);
+}
+
+class DBSecondaryTest : public DBSecondaryTestBase {
+ public:
+ explicit DBSecondaryTest() : DBSecondaryTestBase("db_secondary_test") {}
+};
+
+TEST_F(DBSecondaryTest, FailOpenIfLoggerCreationFail) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ Reopen(options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) {
+ auto* s = reinterpret_cast<Status*>(arg);
+ assert(s);
+ *s = Status::IOError("Injected");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.max_open_files = -1;
+ Status s = TryOpenSecondary(options);
+ ASSERT_EQ(nullptr, options.info_log);
+ ASSERT_TRUE(s.IsIOError());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBSecondaryTest, NonExistingDb) {
+ Destroy(last_options_);
+
+ Options options = GetDefaultOptions();
+ options.env = env_;
+ options.max_open_files = -1;
+ const std::string dbname = "/doesnt/exist";
+ Status s =
+ DB::OpenAsSecondary(options, dbname, secondary_path_, &db_secondary_);
+ ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBSecondaryTest, ReopenAsSecondary) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "foo_value"));
+ ASSERT_OK(Put("bar", "bar_value"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ Close();
+
+ ASSERT_OK(ReopenAsSecondary(options));
+ ASSERT_EQ("foo_value", Get("foo"));
+ ASSERT_EQ("bar_value", Get("bar"));
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ auto db1 = static_cast<DBImplSecondary*>(db_);
+ ASSERT_NE(nullptr, db1);
+ Iterator* iter = db1->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ if (0 == count) {
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value", iter->value().ToString());
+ } else if (1 == count) {
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value", iter->value().ToString());
+ }
+ ++count;
+ }
+ delete iter;
+ ASSERT_EQ(2, count);
+}
+
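+// The next several tests exercise TEST_CompactWithoutInstallation: the
+// secondary runs a compaction described by a CompactionServiceInput and
+// reports the output files through a CompactionServiceResult without
+// installing them into its own version, roughly mirroring a remote
+// compaction worker.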
+TEST_F(DBSecondaryTest, SimpleInternalCompaction) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ CompactionServiceInput input;
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ for (auto& file : meta.levels[0].files) {
+ ASSERT_EQ(0, meta.levels[0].level);
+ input.input_files.push_back(file.name);
+ }
+ ASSERT_EQ(input.input_files.size(), 3);
+
+ input.output_level = 1;
+ ASSERT_OK(db_->GetDbIdentity(input.db_id));
+ Close();
+
+ options.max_open_files = -1;
+ OpenSecondary(options);
+ auto cfh = db_secondary_->DefaultColumnFamily();
+
+ CompactionServiceResult result;
+ ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input, &result));
+
+ ASSERT_EQ(result.output_files.size(), 1);
+ InternalKey smallest, largest;
+ smallest.DecodeFrom(result.output_files[0].smallest_internal_key);
+ largest.DecodeFrom(result.output_files[0].largest_internal_key);
+ ASSERT_EQ(smallest.user_key().ToString(), "bar");
+ ASSERT_EQ(largest.user_key().ToString(), "foo");
+ ASSERT_EQ(result.output_level, 1);
+ ASSERT_EQ(result.output_path, this->secondary_path_);
+ ASSERT_EQ(result.num_output_records, 2);
+ ASSERT_GT(result.bytes_written, 0);
+ ASSERT_OK(result.status);
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionMultiLevels) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ const int kRangeL2 = 10;
+ const int kRangeL1 = 30;
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i * kRangeL2), "value" + std::to_string(i)));
+ ASSERT_OK(Put(Key((i + 1) * kRangeL2 - 1), "value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(2);
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(Key(i * kRangeL1), "value" + std::to_string(i)));
+ ASSERT_OK(Put(Key((i + 1) * kRangeL1 - 1), "value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ MoveFilesToLevel(1);
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put(Key(i * 30), "value" + std::to_string(i)));
+ ASSERT_OK(Put(Key(i * 30 + 50), "value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+
+  // Pick 2 files on level 0 for compaction; they overlap 3 files on L1.
+ CompactionServiceInput input1;
+ input1.input_files.push_back(meta.levels[0].files[2].name);
+ input1.input_files.push_back(meta.levels[0].files[3].name);
+ input1.input_files.push_back(meta.levels[1].files[0].name);
+ input1.input_files.push_back(meta.levels[1].files[1].name);
+ input1.input_files.push_back(meta.levels[1].files[2].name);
+
+ input1.output_level = 1;
+ ASSERT_OK(db_->GetDbIdentity(input1.db_id));
+
+ options.max_open_files = -1;
+ Close();
+
+ OpenSecondary(options);
+ auto cfh = db_secondary_->DefaultColumnFamily();
+ CompactionServiceResult result;
+ ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input1, &result));
+ ASSERT_OK(result.status);
+
+  // Pick 2 files on level 1 for compaction; they overlap 6 files on L2.
+ CompactionServiceInput input2;
+ input2.input_files.push_back(meta.levels[1].files[1].name);
+ input2.input_files.push_back(meta.levels[1].files[2].name);
+ for (int i = 3; i < 9; i++) {
+ input2.input_files.push_back(meta.levels[2].files[i].name);
+ }
+
+ input2.output_level = 2;
+ input2.db_id = input1.db_id;
+ ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input2, &result));
+ ASSERT_OK(result.status);
+
+ CloseSecondary();
+
+  // Delete all L2 files without updating the MANIFEST.
+ for (auto& file : meta.levels[2].files) {
+ ASSERT_OK(env_->DeleteFile(dbname_ + file.name));
+ }
+ OpenSecondary(options);
+ cfh = db_secondary_->DefaultColumnFamily();
+ Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input2, &result);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_OK(result.status);
+
+  // TODO: The L0 -> L1 compaction should succeed; currently the version is
+  // not built if any input file is missing.
+ // ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(OpenAndCompactOptions(),
+ // cfh, input1, &result));
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionCompactedFiles) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ CompactionServiceInput input;
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ for (auto& file : meta.levels[0].files) {
+ ASSERT_EQ(0, meta.levels[0].level);
+ input.input_files.push_back(file.name);
+ }
+ ASSERT_EQ(input.input_files.size(), 3);
+
+ input.output_level = 1;
+ ASSERT_OK(db_->GetDbIdentity(input.db_id));
+
+  // Trigger a compaction so the input files above become obsolete and are
+  // deleted before the secondary instance tries to compact them.
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(3)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(3)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ Close();
+
+ options.max_open_files = -1;
+ OpenSecondary(options);
+ auto cfh = db_secondary_->DefaultColumnFamily();
+
+ CompactionServiceResult result;
+ Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input, &result);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_OK(result.status);
+}
+
+TEST_F(DBSecondaryTest, InternalCompactionMissingFiles) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ CompactionServiceInput input;
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(&meta);
+ for (auto& file : meta.levels[0].files) {
+ ASSERT_EQ(0, meta.levels[0].level);
+ input.input_files.push_back(file.name);
+ }
+ ASSERT_EQ(input.input_files.size(), 3);
+
+ input.output_level = 1;
+ ASSERT_OK(db_->GetDbIdentity(input.db_id));
+
+ Close();
+
+ ASSERT_OK(env_->DeleteFile(dbname_ + input.input_files[0]));
+
+ options.max_open_files = -1;
+ OpenSecondary(options);
+ auto cfh = db_secondary_->DefaultColumnFamily();
+
+ CompactionServiceResult result;
+ Status s = db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input, &result);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ ASSERT_OK(result.status);
+
+ input.input_files.erase(input.input_files.begin());
+
+ ASSERT_OK(db_secondary_full()->TEST_CompactWithoutInstallation(
+ OpenAndCompactOptions(), cfh, input, &result));
+ ASSERT_OK(result.status);
+}
+
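+// The catch-up tests below rely on TryCatchUpWithPrimary, which is expected
+// to tail the primary's MANIFEST and WALs so the secondary sees newly
+// flushed files and recent writes without reopening.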
+TEST_F(DBSecondaryTest, OpenAsSecondary) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ }
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ const auto verify_db_func = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ(foo_val, value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ(bar_val, value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ(foo_val, iter->value().ToString());
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ(bar_val, iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+ };
+
+ verify_db_func("foo_value2", "bar_value2");
+
+ ASSERT_OK(Put("foo", "new_foo_value"));
+ ASSERT_OK(Put("bar", "new_bar_value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value", "new_bar_value");
+}
+
+namespace {
+class TraceFileEnv : public EnvWrapper {
+ public:
+ explicit TraceFileEnv(Env* _target) : EnvWrapper(_target) {}
+ static const char* kClassName() { return "TraceFileEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& env_options) override {
+ class TracedRandomAccessFile : public RandomAccessFile {
+ public:
+ TracedRandomAccessFile(std::unique_ptr<RandomAccessFile>&& target,
+ std::atomic<int>& counter)
+ : target_(std::move(target)), files_closed_(counter) {}
+ ~TracedRandomAccessFile() override {
+ files_closed_.fetch_add(1, std::memory_order_relaxed);
+ }
+ Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ return target_->Read(offset, n, result, scratch);
+ }
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ std::atomic<int>& files_closed_;
+ };
+ Status s = target()->NewRandomAccessFile(f, r, env_options);
+ if (s.ok()) {
+ r->reset(new TracedRandomAccessFile(std::move(*r), files_closed_));
+ }
+ return s;
+ }
+
+ int files_closed() const {
+ return files_closed_.load(std::memory_order_relaxed);
+ }
+
+ private:
+ std::atomic<int> files_closed_{0};
+};
+} // anonymous namespace
+
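+// SecondaryCloseFiles uses TraceFileEnv (above) to count destroyed
+// random-access files, as a rough check that the secondary closes obsolete
+// table files after catching up with the compacted primary.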
+TEST_F(DBSecondaryTest, SecondaryCloseFiles) {
+ Options options;
+ options.env = env_;
+ options.max_open_files = 1;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ Options options1;
+ std::unique_ptr<Env> traced_env(new TraceFileEnv(env_));
+ options1.env = traced_env.get();
+ OpenSecondary(options1);
+
+ static const auto verify_db = [&]() {
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+ std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(ReadOptions()));
+ for (iter1->SeekToFirst(), iter2->SeekToFirst();
+ iter1->Valid() && iter2->Valid(); iter1->Next(), iter2->Next()) {
+ ASSERT_EQ(iter1->key(), iter2->key());
+ ASSERT_EQ(iter1->value(), iter2->value());
+ }
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_FALSE(iter2->Valid());
+ };
+
+ ASSERT_OK(Put("a", "value"));
+ ASSERT_OK(Put("c", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db();
+
+ ASSERT_OK(Put("b", "value"));
+ ASSERT_OK(Put("d", "value"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db();
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ASSERT_EQ(2, static_cast<TraceFileEnv*>(traced_env.get())->files_closed());
+
+ Status s = db_secondary_->SetDBOptions({{"max_open_files", "-1"}});
+ ASSERT_TRUE(s.IsNotSupported());
+ CloseSecondary();
+}
+
+TEST_F(DBSecondaryTest, OpenAsSecondaryWALTailing) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ }
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ const auto verify_db_func = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ(foo_val, value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ(bar_val, value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ(foo_val, iter->value().ToString());
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ(bar_val, iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+ };
+
+ verify_db_func("foo_value2", "bar_value2");
+
+ ASSERT_OK(Put("foo", "new_foo_value"));
+ ASSERT_OK(Put("bar", "new_bar_value"));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value", "new_bar_value");
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "new_foo_value_1"));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db_func("new_foo_value_1", "new_bar_value");
+}
+
+TEST_F(DBSecondaryTest, SecondaryTailingBug_ISSUE_8467) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ }
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ const auto verify_db = [&](const std::string& foo_val,
+ const std::string& bar_val) {
+ std::string value;
+ ReadOptions ropts;
+ Status s = db_secondary_->Get(ropts, "foo", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(foo_val, value);
+
+ s = db_secondary_->Get(ropts, "bar", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(bar_val, value);
+ };
+
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db("foo_value2", "bar_value2");
+ }
+}
+
+TEST_F(DBSecondaryTest, RefreshIterator) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ std::unique_ptr<Iterator> it(db_secondary_->NewIterator(ReadOptions()));
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ if (0 == i) {
+ it->Seek("foo");
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+
+ ASSERT_OK(it->Refresh());
+
+ it->Seek("foo");
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("foo", it->key());
+ ASSERT_EQ("foo_value0", it->value());
+ } else {
+ it->Seek("foo");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("foo", it->key());
+ ASSERT_EQ("foo_value" + std::to_string(i - 1), it->value());
+ ASSERT_OK(it->status());
+
+ ASSERT_OK(it->Refresh());
+
+ it->Seek("foo");
+ ASSERT_OK(it->status());
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("foo", it->key());
+ ASSERT_EQ("foo_value" + std::to_string(i), it->value());
+ }
+ }
+}
+
+TEST_F(DBSecondaryTest, OpenWithNonExistColumnFamily) {
+ Options options;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options1);
+ cf_descs.emplace_back("pikachu", options1);
+ cf_descs.emplace_back("eevee", options1);
+ Status s = DB::OpenAsSecondary(options1, dbname_, secondary_path_, cf_descs,
+ &handles_secondary_, &db_secondary_);
+ ASSERT_NOK(s);
+}
+
+TEST_F(DBSecondaryTest, OpenWithSubsetOfColumnFamilies) {
+ Options options;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ASSERT_EQ(0, handles_secondary_.size());
+ ASSERT_NE(nullptr, db_secondary_);
+
+ ASSERT_OK(Put(0 /*cf*/, "foo", "foo_value"));
+ ASSERT_OK(Put(1 /*cf*/, "foo", "foo_value"));
+ ASSERT_OK(Flush(0 /*cf*/));
+ ASSERT_OK(Flush(1 /*cf*/));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchToNewManifestDuringOpen) {
+ Options options;
+ options.env = env_;
+ Reopen(options);
+ Close();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:0",
+ "VersionSet::ProcessManifestWrites:BeforeNewManifest"},
+ {"DBImpl::Open:AfterDeleteFiles",
+ "ReactiveVersionSet::MaybeSwitchManifest:AfterGetCurrentManifestPath:"
+ "1"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread ro_db_thread([&]() {
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ Status s = TryOpenSecondary(options1);
+ ASSERT_TRUE(s.IsTryAgain());
+
+ // Try again
+ OpenSecondary(options1);
+ CloseSecondary();
+ });
+ Reopen(options);
+ ro_db_thread.join();
+}
+
+TEST_F(DBSecondaryTest, MissingTableFileDuringOpen) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+}
+
+TEST_F(DBSecondaryTest, MissingTableFile) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ for (int i = 0; i != options.level0_file_num_compaction_trigger; ++i) {
+ ASSERT_OK(Put("foo", "foo_value" + std::to_string(i)));
+ ASSERT_OK(Put("bar", "bar_value" + std::to_string(i)));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_NE(nullptr, db_secondary_full());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_NOK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_NOK(db_secondary_->Get(ropts, "bar", &value));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ASSERT_OK(db_secondary_->Get(ropts, "foo", &value));
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ ASSERT_OK(db_secondary_->Get(ropts, "bar", &value));
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ value);
+ Iterator* iter = db_secondary_->NewIterator(ropts);
+ ASSERT_NE(nullptr, iter);
+ iter->Seek("bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar", iter->key().ToString());
+ ASSERT_EQ("bar_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo", iter->key().ToString());
+ ASSERT_EQ("foo_value" +
+ std::to_string(options.level0_file_num_compaction_trigger - 1),
+ iter->value().ToString());
+ size_t count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ++count;
+ }
+ ASSERT_EQ(2, count);
+ delete iter;
+}
+
+TEST_F(DBSecondaryTest, PrimaryDropColumnFamily) {
+ Options options;
+ options.env = env_;
+ const std::string kCfName1 = "pikachu";
+ CreateAndReopenWithCF({kCfName1}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kCfName1}, options1);
+ ASSERT_EQ(2, handles_secondary_.size());
+
+ ASSERT_OK(Put(1 /*cf*/, "foo", "foo_val_1"));
+ ASSERT_OK(Flush(1 /*cf*/));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+ ASSERT_EQ("foo_val_1", value);
+
+ ASSERT_OK(dbfull()->DropColumnFamily(handles_[1]));
+ Close();
+ CheckFileTypeCounts(dbname_, 1, 0, 1);
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ value.clear();
+ ASSERT_OK(db_secondary_->Get(ropts, handles_secondary_[1], "foo", &value));
+ ASSERT_EQ("foo_val_1", value);
+}
+
+TEST_F(DBSecondaryTest, SwitchManifest) {
+ Options options;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 4;
+ const std::string cf1_name("test_cf");
+ CreateAndReopenWithCF({cf1_name}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name},
+ options1);
+
+ const int kNumFiles = options.level0_file_num_compaction_trigger - 1;
+ // Keep it smaller than 10 so that key0, key1, ..., key9 are sorted as 0, 1,
+ // ..., 9.
+ const int kNumKeys = 10;
+  // Create kNumFiles SST files.
+ for (int i = 0; i != kNumFiles; ++i) {
+ for (int j = 0; j != kNumKeys; ++j) {
+ ASSERT_OK(Put("key" + std::to_string(j), "value_" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ const auto& range_scan_db = [&]() {
+ ReadOptions tmp_ropts;
+ tmp_ropts.total_order_seek = true;
+ tmp_ropts.verify_checksums = true;
+ std::unique_ptr<Iterator> iter(db_secondary_->NewIterator(tmp_ropts));
+ int cnt = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++cnt) {
+ ASSERT_EQ("key" + std::to_string(cnt), iter->key().ToString());
+ ASSERT_EQ("value_" + std::to_string(kNumFiles - 1),
+ iter->value().ToString());
+ }
+ };
+
+ range_scan_db();
+
+  // While the secondary instance still keeps the old MANIFEST open, close the
+  // primary, restart it, perform a full compaction, close it again, and
+  // restart it once more, so that the next time the secondary tries to catch
+  // up with the primary it skips the MANIFEST in the middle.
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ range_scan_db();
+}
+
+TEST_F(DBSecondaryTest, SwitchManifestTwice) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ const std::string cf1_name("test_cf");
+ CreateAndReopenWithCF({cf1_name}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kDefaultColumnFamilyName, cf1_name},
+ options1);
+
+ ASSERT_OK(Put("0", "value0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ std::string value;
+ ReadOptions ropts;
+ ropts.verify_checksums = true;
+ ASSERT_OK(db_secondary_->Get(ropts, "0", &value));
+ ASSERT_EQ("value0", value);
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, cf1_name}, options);
+ ASSERT_OK(Put("0", "value1"));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+
+ ASSERT_OK(db_secondary_->Get(ropts, "0", &value));
+ ASSERT_EQ("value1", value);
+}
+
+TEST_F(DBSecondaryTest, DISABLED_SwitchWAL) {
+ const int kNumKeysPerMemtable = 1;
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ const auto& verify_db = [](DB* db1, DB* db2) {
+ ASSERT_NE(nullptr, db1);
+ ASSERT_NE(nullptr, db2);
+ ReadOptions read_opts;
+ read_opts.verify_checksums = true;
+ std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts));
+ std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts));
+ it1->SeekToFirst();
+ it2->SeekToFirst();
+ for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+ ASSERT_EQ(it1->key(), it2->key());
+ ASSERT_EQ(it1->value(), it2->value());
+ }
+ ASSERT_FALSE(it1->Valid());
+ ASSERT_FALSE(it2->Valid());
+
+ for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+ std::string value;
+ ASSERT_OK(db2->Get(read_opts, it1->key(), &value));
+ ASSERT_EQ(it1->value(), value);
+ }
+ for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+ std::string value;
+ ASSERT_OK(db1->Get(read_opts, it2->key(), &value));
+ ASSERT_EQ(it2->value(), value);
+ }
+ };
+ for (int k = 0; k != 16; ++k) {
+ ASSERT_OK(Put("key" + std::to_string(k), "value" + std::to_string(k)));
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db(dbfull(), db_secondary_);
+ }
+}
+
+TEST_F(DBSecondaryTest, DISABLED_SwitchWALMultiColumnFamilies) {
+ const int kNumKeysPerMemtable = 1;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:ContextCleanedUp",
+ "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ const std::string kCFName1 = "pikachu";
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+ CreateAndReopenWithCF({kCFName1}, options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondaryWithColumnFamilies({kCFName1}, options1);
+ ASSERT_EQ(2, handles_secondary_.size());
+
+ const auto& verify_db = [](DB* db1,
+ const std::vector<ColumnFamilyHandle*>& handles1,
+ DB* db2,
+ const std::vector<ColumnFamilyHandle*>& handles2) {
+ ASSERT_NE(nullptr, db1);
+ ASSERT_NE(nullptr, db2);
+ ReadOptions read_opts;
+ read_opts.verify_checksums = true;
+ ASSERT_EQ(handles1.size(), handles2.size());
+ for (size_t i = 0; i != handles1.size(); ++i) {
+ std::unique_ptr<Iterator> it1(db1->NewIterator(read_opts, handles1[i]));
+ std::unique_ptr<Iterator> it2(db2->NewIterator(read_opts, handles2[i]));
+ it1->SeekToFirst();
+ it2->SeekToFirst();
+ for (; it1->Valid() && it2->Valid(); it1->Next(), it2->Next()) {
+ ASSERT_EQ(it1->key(), it2->key());
+ ASSERT_EQ(it1->value(), it2->value());
+ }
+ ASSERT_FALSE(it1->Valid());
+ ASSERT_FALSE(it2->Valid());
+
+ for (it1->SeekToFirst(); it1->Valid(); it1->Next()) {
+ std::string value;
+ ASSERT_OK(db2->Get(read_opts, handles2[i], it1->key(), &value));
+ ASSERT_EQ(it1->value(), value);
+ }
+ for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
+ std::string value;
+ ASSERT_OK(db1->Get(read_opts, handles1[i], it2->key(), &value));
+ ASSERT_EQ(it2->value(), value);
+ }
+ }
+ };
+ for (int k = 0; k != 8; ++k) {
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(0 /*cf*/, "key" + std::to_string(k),
+ "value" + std::to_string(k)));
+ ASSERT_OK(Put(1 /*cf*/, "key" + std::to_string(k),
+ "value" + std::to_string(k)));
+ }
+ TEST_SYNC_POINT(
+ "DBSecondaryTest::SwitchWALMultipleColumnFamilies:BeforeCatchUp");
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ verify_db(dbfull(), handles_, db_secondary_, handles_secondary_);
+ SyncPoint::GetInstance()->ClearTrace();
+ }
+}
+
+TEST_F(DBSecondaryTest, CatchUpAfterFlush) {
+ const int kNumKeysPerMemtable = 16;
+ Options options;
+ options.env = env_;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 2;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+ Reopen(options);
+
+ Options options1;
+ options1.env = env_;
+ options1.max_open_files = -1;
+ OpenSecondary(options1);
+
+ WriteOptions write_opts;
+ WriteBatch wb;
+ ASSERT_OK(wb.Put("key0", "value0"));
+ ASSERT_OK(wb.Put("key1", "value1"));
+ ASSERT_OK(dbfull()->Write(write_opts, &wb));
+ ReadOptions read_opts;
+ std::unique_ptr<Iterator> iter1(db_secondary_->NewIterator(read_opts));
+ iter1->Seek("key0");
+ ASSERT_FALSE(iter1->Valid());
+ iter1->Seek("key1");
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ iter1->Seek("key0");
+ ASSERT_FALSE(iter1->Valid());
+ iter1->Seek("key1");
+ ASSERT_FALSE(iter1->Valid());
+ ASSERT_OK(iter1->status());
+ std::unique_ptr<Iterator> iter2(db_secondary_->NewIterator(read_opts));
+ iter2->Seek("key0");
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ("value0", iter2->value());
+ iter2->Seek("key1");
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_OK(iter2->status());
+ ASSERT_EQ("value1", iter2->value());
+
+ {
+ WriteBatch wb1;
+ ASSERT_OK(wb1.Put("key0", "value01"));
+ ASSERT_OK(wb1.Put("key1", "value11"));
+ ASSERT_OK(dbfull()->Write(write_opts, &wb1));
+ }
+
+ {
+ WriteBatch wb2;
+ ASSERT_OK(wb2.Put("key0", "new_value0"));
+ ASSERT_OK(wb2.Delete("key1"));
+ ASSERT_OK(dbfull()->Write(write_opts, &wb2));
+ }
+
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_secondary_->TryCatchUpWithPrimary());
+ std::unique_ptr<Iterator> iter3(db_secondary_->NewIterator(read_opts));
+ // iter3 should not see value01 and value11 at all.
+ iter3->Seek("key0");
+ ASSERT_TRUE(iter3->Valid());
+ ASSERT_EQ("new_value0", iter3->value());
+ iter3->Seek("key1");
+ ASSERT_FALSE(iter3->Valid());
+ ASSERT_OK(iter3->status());
+}
+
+TEST_F(DBSecondaryTest, CheckConsistencyWhenOpen) {
+ bool called = false;
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImplSecondary::CheckConsistency:AfterFirstAttempt", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ called = true;
+ auto* s = reinterpret_cast<Status*>(arg);
+ ASSERT_NOK(*s);
+ });
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::CheckConsistency:AfterGetLiveFilesMetaData",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+ "DBImpl::CheckConsistency:BeforeGetFileSize"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("a", "value0"));
+ ASSERT_OK(Put("c", "value0"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("b", "value1"));
+ ASSERT_OK(Put("d", "value1"));
+ ASSERT_OK(Flush());
+ port::Thread thread([this]() {
+ Options opts;
+ opts.env = env_;
+ opts.max_open_files = -1;
+ OpenSecondary(opts);
+ });
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ thread.join();
+ ASSERT_TRUE(called);
+}
+
+TEST_F(DBSecondaryTest, StartFromInconsistent) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Flush());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ *(reinterpret_cast<Status*>(arg)) =
+ Status::Corruption("Inject corruption");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Options options1;
+ options1.env = env_;
+ Status s = TryOpenSecondary(options1);
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBSecondaryTest, InconsistencyDuringCatchUp) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(Flush());
+
+ Options options1;
+ options1.env = env_;
+ OpenSecondary(options1);
+
+ {
+ std::string value;
+ ASSERT_OK(db_secondary_->Get(ReadOptions(), "foo", &value));
+ ASSERT_EQ("value", value);
+ }
+
+ ASSERT_OK(Put("bar", "value1"));
+ ASSERT_OK(Flush());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+ ASSERT_NE(nullptr, arg);
+ *(reinterpret_cast<Status*>(arg)) =
+ Status::Corruption("Inject corruption");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Status s = db_secondary_->TryCatchUpWithPrimary();
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBSecondaryTest, OpenWithTransactionDB) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+
+ // Destroy the DB to recreate as a TransactionDB.
+ Close();
+ Destroy(options, true);
+
+ // Create a TransactionDB.
+ TransactionDB* txn_db = nullptr;
+ TransactionDBOptions txn_db_opts;
+ ASSERT_OK(TransactionDB::Open(options, txn_db_opts, dbname_, &txn_db));
+ ASSERT_NE(txn_db, nullptr);
+ db_ = txn_db;
+
+ std::vector<std::string> cfs = {"new_CF"};
+ CreateColumnFamilies(cfs, options);
+ ASSERT_EQ(handles_.size(), 1);
+
+ WriteOptions wopts;
+ TransactionOptions txn_opts;
+ Transaction* txn1 = txn_db->BeginTransaction(wopts, txn_opts, nullptr);
+ ASSERT_NE(txn1, nullptr);
+ ASSERT_OK(txn1->Put(handles_[0], "k1", "v1"));
+ ASSERT_OK(txn1->Commit());
+ delete txn1;
+
+ options = CurrentOptions();
+ options.max_open_files = -1;
+ ASSERT_OK(TryOpenSecondary(options));
+}
+
+class DBSecondaryTestWithTimestamp : public DBSecondaryTestBase {
+ public:
+ explicit DBSecondaryTestWithTimestamp()
+ : DBSecondaryTestBase("db_secondary_test_with_timestamp") {}
+};
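+
+// The timestamp tests below mirror the compacted read-only DB checks above:
+// malformed or unexpected read timestamps are expected to surface as
+// InvalidArgument through a secondary instance, while well-formed reads
+// return the written value and timestamp.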
+TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGetReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+  // Reopen the database as a secondary instance to test its timestamp
+  // support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+ IteratorAndGetReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+  // Reopen the database as a secondary instance to test its timestamp
+  // support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ const std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_TRUE(db_->Get(read_opts, Key1(key), &value_from_get, &timestamp)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+ IteratorAndGetWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+  // Reopen the database as a secondary instance to test its timestamp
+  // support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_TRUE(iter->status().IsInvalidArgument());
+ }
+
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ std::string value_from_get;
+ ASSERT_TRUE(
+ db_->Get(read_opts, Key1(key), &value_from_get).IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, IteratorAndGet) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+
+ // Reopen the database as a secondary instance to test its timestamp support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ auto get_value_and_check = [](DB* db, ReadOptions read_opts, Slice key,
+ Slice expected_value, std::string expected_ts) {
+ std::string value_from_get;
+ std::string timestamp;
+ ASSERT_OK(db->Get(read_opts, key.ToString(), &value_from_get, &timestamp));
+ ASSERT_EQ(expected_value, value_from_get);
+ ASSERT_EQ(expected_ts, timestamp);
+ };
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ int count = 0;
+ uint64_t key = 0;
+ // Forward iterate.
+ for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
+ it->Next(), ++count, ++key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ size_t expected_count = kMaxKey - start_keys[i] + 1;
+ ASSERT_EQ(expected_count, count);
+
+ // Backward iterate.
+ count = 0;
+ for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+ it->Prev(), ++count, --key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
+
+ // SeekToFirst()/SeekToLast() with lower/upper bounds.
+ // Then iterate with lower and upper bounds.
+ uint64_t l = 0;
+ uint64_t r = kMaxKey + 1;
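+ // Tighten the [lb, ub) window from both ends by kMaxKey / 100 per iteration
+ // and re-verify SeekToFirst()/SeekToLast() within the bounds.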
+ while (l < r) {
+ std::string lb_str = Key1(l);
+ Slice lb = lb_str;
+ std::string ub_str = Key1(r);
+ Slice ub = ub_str;
+ read_opts.iterate_lower_bound = &lb;
+ read_opts.iterate_upper_bound = &ub;
+ it.reset(db_->NewIterator(read_opts));
+ for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
+ it->Valid(); it->Next(), ++key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ ASSERT_EQ(r - std::max(l, start_keys[i]), count);
+
+ for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
+ it->Valid(); it->Prev(), --key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ get_value_and_check(db_, read_opts, it->key(), it->value(),
+ write_timestamps[i]);
+ }
+ l += (kMaxKey / 100);
+ r -= (kMaxKey / 100);
+ }
+ }
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, IteratorsReadTimestampSizeMismatch) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database as a secondary instance to test its timestamp support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ std::string different_size_read_timestamp;
+ PutFixed32(&different_size_read_timestamp, 2);
+ Slice different_size_read_ts = different_size_read_timestamp;
+ read_opts.timestamp = &different_size_read_ts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+ IteratorsReadTimestampSpecifiedWithoutWriteTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database as a secondary instance to test its timestamp support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ const std::string read_timestamp = Timestamp(2, 0);
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp,
+ IteratorsWriteWithTimestampReadWithoutTimestamp) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database as a secondary instance to test its timestamp support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ {
+ std::vector<Iterator*> iters;
+ ASSERT_TRUE(
+ db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters)
+ .IsInvalidArgument());
+ }
+
+ Close();
+}
+
+TEST_F(DBSecondaryTestWithTimestamp, Iterators) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::string write_timestamp = Timestamp(1, 0);
+ const std::string read_timestamp = Timestamp(2, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamp,
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ }
+
+ // Reopen the database as a secondary instance to test its timestamp support.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReopenAsSecondary(options));
+
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ std::vector<Iterator*> iters;
+ ASSERT_OK(db_->NewIterators(read_opts, {db_->DefaultColumnFamily()}, &iters));
+ ASSERT_EQ(static_cast<uint64_t>(1), iters.size());
+
+ int count = 0;
+ uint64_t key = 0;
+ // Forward iterate.
+ for (iters[0]->Seek(Key1(0)), key = 0; iters[0]->Valid();
+ iters[0]->Next(), ++count, ++key) {
+ CheckIterUserEntry(iters[0], Key1(key), kTypeValue,
+ "value" + std::to_string(key), write_timestamp);
+ }
+
+ size_t expected_count = kMaxKey - 0 + 1;
+ ASSERT_EQ(expected_count, count);
+ delete iters[0];
+
+ Close();
+}
+#endif  // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_sst_test.cc b/src/rocksdb/db/db_sst_test.cc
new file mode 100644
index 000000000..7f031444a
--- /dev/null
+++ b/src/rocksdb/db/db_sst_test.cc
@@ -0,0 +1,1868 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "env/mock_env.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/sst_file_manager.h"
+#include "rocksdb/table.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBSSTTest : public DBTestBase {
+ public:
+ DBSSTTest() : DBTestBase("db_sst_test", /*env_do_fsync=*/true) {}
+};
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (auto fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+ void ClearFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+#endif // ROCKSDB_LITE
+
+TEST_F(DBSSTTest, DontDeletePendingOutputs) {
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ // Every time we write to a table file, call FOF/POF with a full DB scan. This
+ // will make sure our pending_outputs_ protection works correctly.
+ std::function<void()> purge_obsolete_files_function = [&]() {
+ JobContext job_context(0);
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&job_context, true /*force*/);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->PurgeObsoleteFiles(job_context);
+ job_context.Clean();
+ };
+
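+ // FOF/POF above stands for FindObsoleteFiles()/PurgeObsoleteFiles(); the
+ // test Env's table_write_callback_ hook runs the purge on every table file
+ // write.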
+ env_->table_write_callback_ = &purge_obsolete_files_function;
+
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(Put("a", "begin"));
+ ASSERT_OK(Put("z", "end"));
+ ASSERT_OK(Flush());
+ }
+
+ // If pending output guard does not work correctly, PurgeObsoleteFiles() will
+ // delete the file that Compaction is trying to create, causing this: error
+ // db/db_test.cc:975: IO error:
+ // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory
+ Compact("a", "b");
+}
+
+// 1 Create some SST files by inserting K-V pairs into the DB
+// 2 Close the DB and change the suffix from ".sst" to ".ldb" for every other
+//   SST file
+// 3 Open the DB and check that all keys can be read
+TEST_F(DBSSTTest, SSTsWithLdbSuffixHandling) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_id = 0;
+ for (int i = 0; i < 10; ++i) {
+ GenerateNewFile(&rnd, &key_id, false);
+ }
+ ASSERT_OK(Flush());
+ Close();
+ int const num_files = GetSstFileCount(dbname_);
+ ASSERT_GT(num_files, 0);
+
+ Reopen(options);
+ std::vector<std::string> values;
+ values.reserve(key_id);
+ for (int k = 0; k < key_id; ++k) {
+ values.push_back(Get(Key(k)));
+ }
+ Close();
+
+ std::vector<std::string> filenames;
+ GetSstFiles(env_, dbname_, &filenames);
+ int num_ldb_files = 0;
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ if (i & 1) {
+ continue;
+ }
+ std::string const rdb_name = dbname_ + "/" + filenames[i];
+ std::string const ldb_name = Rocks2LevelTableFileName(rdb_name);
+ ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok());
+ ++num_ldb_files;
+ }
+ ASSERT_GT(num_ldb_files, 0);
+ ASSERT_EQ(num_files, GetSstFileCount(dbname_));
+
+ Reopen(options);
+ for (int k = 0; k < key_id; ++k) {
+ ASSERT_EQ(values[k], Get(Key(k)));
+ }
+ Destroy(options);
+}
+
+// Check that we don't crash when opening DB with
+// DBOptions::skip_checking_sst_file_sizes_on_db_open = true.
+TEST_F(DBSSTTest, SkipCheckingSSTFileSizesOnDBOpen) {
+ ASSERT_OK(Put("pika", "choo"));
+ ASSERT_OK(Flush());
+
+ // Just open the DB with the option set to true and check that we don't crash.
+ Options options;
+ options.env = env_;
+ options.skip_checking_sst_file_sizes_on_db_open = true;
+ Reopen(options);
+
+ ASSERT_EQ("choo", Get("pika"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBSSTTest, DontDeleteMovedFile) {
+ // This test triggers move compaction and verifies that the file is not
+ // deleted when it's part of move compaction
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->(move)->L2 compactions
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ // If the moved file is actually deleted (the move-safeguard in
+ // ~Version::Version() is not there), we get this failure:
+ // Corruption: Can't access /000009.sst
+ Reopen(options);
+}
+
+// This reproduces a bug where we don't delete a file because when it was
+// supposed to be deleted, it was blocked by pending_outputs
+// Consider:
+// 1. current file_number is 13
+// 2. compaction (1) starts, blocks deletion of all files starting with 13
+// (pending outputs)
+// 3. file 13 is created by compaction (2)
+// 4. file 13 is consumed by compaction (3) and file 15 is created. Since file
+// 13 has no references, it is put into VersionSet::obsolete_files_
+// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. File 13
+// is deleted from obsolete_files_ set.
+// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by
+// pending outputs since compaction (1) is still running. It is not deleted and
+// it is not present in obsolete_files_ anymore. Therefore, we never delete it.
+TEST_F(DBSSTTest, DeleteObsoleteFilesPendingOutputs) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 2 * 1024 * 1024; // 2 MB
+ options.max_bytes_for_level_base = 1024 * 1024; // 1 MB
+ options.level0_file_num_compaction_trigger =
+ 2; // trigger compaction when we have 2 files
+ options.max_background_flushes = 2;
+ options.max_background_compactions = 2;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ Reopen(options);
+
+ Random rnd(301);
+ // Create two 1MB sst files
+ for (int i = 0; i < 2; ++i) {
+ // Create 1MB sst file
+ for (int j = 0; j < 100; ++j) {
+ ASSERT_OK(Put(Key(i * 50 + j), rnd.RandomString(10 * 1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // this should execute both L0->L1 and L1->(move)->L2 compactions
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,0,1", FilesPerLevel(0));
+
+ test::SleepingBackgroundTask blocking_thread;
+ port::Mutex mutex_;
+ bool already_blocked(false);
+
+ // block the flush
+ std::function<void()> block_first_time = [&]() {
+ bool blocking = false;
+ {
+ MutexLock l(&mutex_);
+ if (!already_blocked) {
+ blocking = true;
+ already_blocked = true;
+ }
+ }
+ if (blocking) {
+ blocking_thread.DoSleep();
+ }
+ };
+ env_->table_write_callback_ = &block_first_time;
+ // Insert 2.5MB of data, which should trigger a flush because we exceed
+ // write_buffer_size. The flush will be blocked by block_first_time;
+ // the pending file number is protecting all the files created after it.
+ for (int j = 0; j < 256; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(10 * 1024)));
+ }
+ blocking_thread.WaitUntilSleeping();
+
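+ // While the flush is blocked, compact the single L2 file down to L3; the
+ // next assertion shows one file on level 3.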
+ ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr));
+
+ ASSERT_EQ("0,0,0,1", FilesPerLevel(0));
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+ auto file_on_L2 = metadata[0].name;
+ listener->SetExpectedFileName(dbname_ + file_on_L2);
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr, nullptr,
+ true /* disallow trivial move */));
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+
+ // finish the flush!
+ blocking_thread.WakeUp();
+ blocking_thread.WaitUntilDone();
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ // The file just flushed is too big for L0 and L1, so it gets moved to L2.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,0,1,0,1", FilesPerLevel(0));
+
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 2U);
+
+ // This file should have been deleted during last compaction
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(dbname_ + file_on_L2));
+ listener->VerifyMatchedCount(1);
+}
+
+// Test that producing an empty .sst file does not write it out to
+// disk, and that the DeleteFile() env method is not called for
+// removing the non-existing file later.
+TEST_F(DBSSTTest, DeleteFileNotCalledForNotCreatedSSTFile) {
+ Options options = CurrentOptions();
+ options.env = env_;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ Reopen(options);
+
+ // Flush the empty database.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ // We expect no .sst files.
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 0U);
+
+ // We expect no file deletions.
+ listener->VerifyMatchedCount(0);
+}
+
+// Test that producing a non-empty .sst file does write it out to
+// disk, and that the DeleteFile() env method is not called for removing
+// the file later.
+TEST_F(DBSSTTest, DeleteFileNotCalledForCreatedSSTFile) {
+ Options options = CurrentOptions();
+ options.env = env_;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ Reopen(options);
+
+ ASSERT_OK(Put("pika", "choo"));
+
+ // Flush the non-empty database.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("1", FilesPerLevel(0));
+
+ // We expect 1 .sst file.
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(metadata.size(), 1U);
+
+ // We expect no file deletions.
+ listener->VerifyMatchedCount(0);
+}
+
+TEST_F(DBSSTTest, DBWithSstFileManager) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ int files_added = 0;
+ int files_deleted = 0;
+ int files_moved = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnAddFile", [&](void* /*arg*/) { files_added++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnDeleteFile",
+ [&](void* /*arg*/) { files_deleted++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 25; i++) {
+ GenerateNewRandomFile(&rnd);
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Verify that we are tracking all sst files in dbname_
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ // Verify that we are tracking all sst files in dbname_
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ // Verify the total files size
+ uint64_t total_files_size = 0;
+ for (auto& file_to_size : files_in_db) {
+ total_files_size += file_to_size.second;
+ }
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+ // We flushed at least 25 files
+ ASSERT_GE(files_added, 25);
+ // Compaction must have deleted some files
+ ASSERT_GT(files_deleted, 0);
+ // No files were moved
+ ASSERT_EQ(files_moved, 0);
+
+ Close();
+ Reopen(options);
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ // Verify that we track all the files again after the DB is closed and opened
+ Close();
+ sst_file_manager.reset(NewSstFileManager(env_));
+ options.sst_file_manager = sst_file_manager;
+ sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Reopen(options);
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFiles) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ int files_added = 0;
+ int files_deleted = 0;
+ int files_moved = 0;
+ int files_scheduled_to_delete = 0;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnAddFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ files_added++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnDeleteFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ files_deleted++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
+ assert(arg);
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ ++files_scheduled_to_delete;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.enable_blob_files = true;
+ options.blob_file_size = 32; // create one blob per file
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put("Key_" + std::to_string(i), "Value_" + std::to_string(i)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Verify that we are tracking all sst and blob files in dbname_
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ }
+
+ std::vector<uint64_t> blob_files = GetBlobFileNumbers();
+ ASSERT_EQ(files_added, blob_files.size());
+ // No blob file is obsoleted.
+ ASSERT_EQ(files_deleted, 0);
+ ASSERT_EQ(files_scheduled_to_delete, 0);
+ // No files were moved.
+ ASSERT_EQ(files_moved, 0);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+
+ // Verify that we are tracking all sst and blob files in dbname_
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ // Verify the total files size
+ uint64_t total_files_size = 0;
+ for (auto& file_to_size : files_in_db) {
+ total_files_size += file_to_size.second;
+ }
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+ Close();
+
+ Reopen(options);
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ // Verify that we track all the files again after the DB is closed and opened.
+ Close();
+
+ sst_file_manager.reset(NewSstFileManager(env_));
+ options.sst_file_manager = sst_file_manager;
+ sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Reopen(options);
+
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ // Destroying the DB removes all the blob files from the SST file manager, and
+ // blob file deletions go through ScheduleFileDeletion.
+ ASSERT_EQ(files_deleted, 0);
+ ASSERT_EQ(files_scheduled_to_delete, 0);
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ ASSERT_EQ(files_deleted, blob_files.size());
+ ASSERT_EQ(files_scheduled_to_delete, blob_files.size());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBSSTTest, DBWithSstFileManagerForBlobFilesWithGC) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.enable_blob_files = true;
+ options.blob_file_size = 32; // create one blob per file
+ options.disable_auto_compactions = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+
+ int files_added = 0;
+ int files_deleted = 0;
+ int files_moved = 0;
+ int files_scheduled_to_delete = 0;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnAddFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ files_added++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnDeleteFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ files_deleted++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
+ assert(arg);
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (file_path->find(".blob") != std::string::npos) {
+ ++files_scheduled_to_delete;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnMoveFile", [&](void* /*arg*/) { files_moved++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ constexpr char first_key[] = "first_key";
+ constexpr char first_value[] = "first_value";
+ constexpr char second_key[] = "second_key";
+ constexpr char second_value[] = "second_value";
+
+ ASSERT_OK(Put(first_key, first_value));
+ ASSERT_OK(Put(second_key, second_value));
+ ASSERT_OK(Flush());
+
+ constexpr char third_key[] = "third_key";
+ constexpr char third_value[] = "third_value";
+ constexpr char fourth_key[] = "fourth_key";
+ constexpr char fourth_value[] = "fourth_value";
+ constexpr char fifth_key[] = "fifth_key";
+ constexpr char fifth_value[] = "fifth_value";
+
+ ASSERT_OK(Put(third_key, third_value));
+ ASSERT_OK(Put(fourth_key, fourth_value));
+ ASSERT_OK(Put(fifth_key, fifth_value));
+ ASSERT_OK(Flush());
+
+ const std::vector<uint64_t> original_blob_files = GetBlobFileNumbers();
+
+ ASSERT_EQ(original_blob_files.size(), 5);
+ ASSERT_EQ(files_added, 5);
+ ASSERT_EQ(files_deleted, 0);
+ ASSERT_EQ(files_scheduled_to_delete, 0);
+ ASSERT_EQ(files_moved, 0);
+ {
+ // Verify that we are tracking all sst and blob files in dbname_
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ }
+
+ const size_t cutoff_index = static_cast<size_t>(
+ options.blob_garbage_collection_age_cutoff * original_blob_files.size());
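+ // With an age cutoff of 0.5 and 5 blob files, cutoff_index is 2, so the two
+ // oldest blob files are the ones eligible for garbage collection below.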
+
+ size_t expected_number_of_files = original_blob_files.size();
+ // Note: turning off enable_blob_files before the compaction results in
+ // garbage collected values getting inlined.
+ ASSERT_OK(db_->SetOptions({{"enable_blob_files", "false"}}));
+ expected_number_of_files -= cutoff_index;
+ files_added = 0;
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ sfm->WaitForEmptyTrash();
+
+ ASSERT_EQ(Get(first_key), first_value);
+ ASSERT_EQ(Get(second_key), second_value);
+ ASSERT_EQ(Get(third_key), third_value);
+ ASSERT_EQ(Get(fourth_key), fourth_value);
+ ASSERT_EQ(Get(fifth_key), fifth_value);
+
+ const std::vector<uint64_t> new_blob_files = GetBlobFileNumbers();
+
+ ASSERT_EQ(new_blob_files.size(), expected_number_of_files);
+ // No new file is added.
+ ASSERT_EQ(files_added, 0);
+ ASSERT_EQ(files_deleted, cutoff_index);
+ ASSERT_EQ(files_scheduled_to_delete, cutoff_index);
+ ASSERT_EQ(files_moved, 0);
+
+ // Original blob files below the cutoff should be gone; original blob files at
+ // or above the cutoff should still be there.
+ for (size_t i = cutoff_index; i < original_blob_files.size(); ++i) {
+ ASSERT_EQ(new_blob_files[i - cutoff_index], original_blob_files[i]);
+ }
+
+ {
+ // Verify that we are tracking all sst and blob files in dbname_
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db));
+ ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db));
+ ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db);
+ }
+
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(files_deleted, 5);
+ ASSERT_EQ(files_scheduled_to_delete, 5);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+class DBSSTTestRateLimit : public DBSSTTest,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBSSTTestRateLimit() : DBSSTTest() {}
+ ~DBSSTTestRateLimit() override {}
+};
+
+TEST_P(DBSSTTestRateLimit, RateLimitedDelete) {
+ Destroy(last_options_);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBSSTTest::RateLimitedDelete:1",
+ "DeleteScheduler::BackgroundEmptyTrash"},
+ });
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "InstrumentedCondVar::TimedWaitInternal", [&](void* arg) {
+ // Turn timed wait into a simulated sleep
+ uint64_t* abs_time_us = static_cast<uint64_t*>(arg);
+ uint64_t cur_time = env_->NowMicros();
+ if (*abs_time_us > cur_time) {
+ env_->MockSleepForMicroseconds(*abs_time_us - cur_time);
+ }
+
+ // Plus an additional short, random amount
+ env_->MockSleepForMicroseconds(Random::GetTLSInstance()->Uniform(10));
+
+ // Set the wait-until time to before the (actual) current time to force no
+ // sleep.
+ *abs_time_us = Env::Default()->NowMicros();
+ });
+
+ // Disable PeriodicTaskScheduler as it also has TimedWait, which could update
+ // the simulated sleep time
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::StartPeriodicTaskScheduler:DisableScheduler", [&](void* arg) {
+ bool* disable_scheduler = static_cast<bool*>(arg);
+ *disable_scheduler = true;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool different_wal_dir = GetParam();
+ Options options = CurrentOptions();
+ SetTimeElapseOnlySleepOnReopen(&options);
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.statistics = CreateDBStatistics();
+ if (different_wal_dir) {
+ options.wal_dir = alternative_wal_dir_;
+ }
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(1.1);
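+ // A trash/DB ratio above 1 keeps these deletions on the rate-limited trash
+ // path instead of deleting them immediately (checked via FILES_MARKED_TRASH
+ // below).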
+
+ WriteOptions wo;
+ if (!different_wal_dir) {
+ wo.disableWAL = true;
+ }
+ Reopen(options);
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ ASSERT_OK(Put("Key2", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key3", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key1", DummyString(1024, v), wo));
+ ASSERT_OK(Put("Key4", DummyString(1024, v), wo));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ uint64_t delete_start_time = env_->NowMicros();
+ // Hold BackgroundEmptyTrash
+ TEST_SYNC_POINT("DBSSTTest::RateLimitedDelete:1");
+ sfm->WaitForEmptyTrash();
+ uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time;
+
+ uint64_t total_files_size = 0;
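+ // The expected penalty for each file is the cumulative trash size divided by
+ // the deletion rate, expressed in microseconds (hence the 1000000 factor).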
+ uint64_t expected_penalty = 0;
+ ASSERT_EQ(penalties.size(), metadata.size());
+ for (size_t i = 0; i < metadata.size(); i++) {
+ total_files_size += metadata[i].size;
+ expected_penalty = ((total_files_size * 1000000) / rate_bytes_per_sec);
+ ASSERT_EQ(expected_penalty, penalties[i]);
+ }
+ ASSERT_GT(time_spent_deleting, expected_penalty * 0.9);
+ ASSERT_LT(time_spent_deleting, expected_penalty * 1.1);
+ ASSERT_EQ(4, options.statistics->getAndResetTickerCount(FILES_MARKED_TRASH));
+ ASSERT_EQ(
+ 0, options.statistics->getAndResetTickerCount(FILES_DELETED_IMMEDIATELY));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(RateLimitedDelete, DBSSTTestRateLimit,
+ ::testing::Bool());
+
+TEST_F(DBSSTTest, RateLimitedWALDelete) {
+ Destroy(last_options_);
+
+ std::vector<uint64_t> penalties;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::BackgroundEmptyTrash:Wait",
+ [&](void* arg) { penalties.push_back(*(static_cast<uint64_t*>(arg))); });
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env_;
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+ SetTimeElapseOnlySleepOnReopen(&options);
+
+ ASSERT_OK(TryReopen(options));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ ASSERT_OK(Put("Key2", DummyString(1024, v)));
+ ASSERT_OK(Put("Key3", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Put("Key1", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ // Compaction will move the 4 files in L0 to trash and create 1 L1 file
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(penalties.size(), 8);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class DBWALTestWithParam
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<std::string, bool>> {
+ public:
+ explicit DBWALTestWithParam()
+ : DBTestBase("db_wal_test_with_params", /*env_do_fsync=*/true) {
+ wal_dir_ = std::get<0>(GetParam());
+ wal_dir_same_as_dbname_ = std::get<1>(GetParam());
+ }
+
+ std::string wal_dir_;
+ bool wal_dir_same_as_dbname_;
+};
+
+TEST_P(DBWALTestWithParam, WALTrashCleanupOnOpen) {
+ class MyEnv : public EnvWrapper {
+ public:
+ MyEnv(Env* t) : EnvWrapper(t), fake_log_delete(false) {}
+ const char* Name() const override { return "MyEnv"; }
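+ // When fake_log_delete is set, pretend deletions of ".log.trash" files
+ // succeeded so the trash WAL files remain on disk for the reopen check below.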
+ Status DeleteFile(const std::string& fname) override {
+ if (fname.find(".log.trash") != std::string::npos && fake_log_delete) {
+ return Status::OK();
+ }
+
+ return target()->DeleteFile(fname);
+ }
+
+ void set_fake_log_delete(bool fake) { fake_log_delete = fake; }
+
+ private:
+ bool fake_log_delete;
+ };
+
+ std::unique_ptr<MyEnv> env(new MyEnv(env_));
+ Destroy(last_options_);
+
+ env->set_fake_log_delete(true);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.env = env.get();
+ options.wal_dir = dbname_ + wal_dir_;
+
+ int64_t rate_bytes_per_sec = 1024 * 10; // 10 KB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(rate_bytes_per_sec);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(3.1);
+
+ Reopen(options);
+
+ // Create 4 files in L0
+ for (char v = 'a'; v <= 'd'; v++) {
+ if (v == 'c') {
+ // Maximize the chance that the last log file will be preserved in trash
+ // before restarting the DB.
+ // We have to set this on the 2nd to last file for it to delay deletion
+ // on the last file. (Quirk of DeleteScheduler::BackgroundEmptyTrash())
+ options.sst_file_manager->SetDeleteRateBytesPerSecond(1);
+ }
+ ASSERT_OK(Put("Key2", DummyString(1024, v)));
+ ASSERT_OK(Put("Key3", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Put("Key1", DummyString(1024, v)));
+ ASSERT_OK(Put("Key4", DummyString(1024, v)));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ Close();
+
+ options.sst_file_manager.reset();
+ std::vector<std::string> filenames;
+ int trash_log_count = 0;
+ if (!wal_dir_same_as_dbname_) {
+ // Forcibly create some trash log files
+ std::unique_ptr<WritableFile> result;
+ ASSERT_OK(env->NewWritableFile(options.wal_dir + "/1000.log.trash", &result,
+ EnvOptions()));
+ result.reset();
+ }
+ ASSERT_OK(env->GetChildren(options.wal_dir, &filenames));
+ for (const std::string& fname : filenames) {
+ if (fname.find(".log.trash") != std::string::npos) {
+ trash_log_count++;
+ }
+ }
+ ASSERT_GE(trash_log_count, 1);
+
+ env->set_fake_log_delete(false);
+ Reopen(options);
+
+ filenames.clear();
+ trash_log_count = 0;
+ ASSERT_OK(env->GetChildren(options.wal_dir, &filenames));
+ for (const std::string& fname : filenames) {
+ if (fname.find(".log.trash") != std::string::npos) {
+ trash_log_count++;
+ }
+ }
+ ASSERT_EQ(trash_log_count, 0);
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DBWALTestWithParam, DBWALTestWithParam,
+ ::testing::Values(std::make_tuple("", true),
+ std::make_tuple("_wal_dir", false)));
+
+TEST_F(DBSSTTest, OpenDBWithExistingTrash) {
+ Options options = CurrentOptions();
+
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 1024 * 1024 /* 1 MB/sec */));
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ Destroy(last_options_);
+
+ // Add some trash files to the db directory so the DB can clean them up
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "001.sst.trash"));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "002.sst.trash"));
+ ASSERT_OK(WriteStringToFile(env_, "abc", dbname_ + "/" + "003.sst.trash"));
+
+ // Reopen the DB and verify that it deletes existing trash files
+ Reopen(options);
+ sfm->WaitForEmptyTrash();
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "001.sst.trash"));
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "002.sst.trash"));
+ ASSERT_NOK(env_->FileExists(dbname_ + "/" + "003.sst.trash"));
+}
+
+// Create a DB with 2 db_paths and generate multiple files in the 2 db_paths
+// using CompactRangeOptions. Make sure that files deleted from the first
+// db_path were deleted using DeleteScheduler and that files in the second
+// path were not.
+TEST_F(DBSSTTest, DeleteSchedulerMultipleDBPaths) {
+ std::atomic<int> bg_delete_file(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ // The deletion scheduler sometimes skips marking a file as trash according to
+ // a heuristic. In that case the deletion will go through the below SyncPoint.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteFile", [&](void* /*arg*/) { bg_delete_file++; });
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.db_paths.emplace_back(dbname_, 1024 * 100);
+ options.db_paths.emplace_back(dbname_ + "_2", 1024 * 100);
+ options.env = env_;
+
+ int64_t rate_bytes_per_sec = 1024 * 1024; // 1 MB/sec
+ Status s;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", rate_bytes_per_sec, false, &s,
+ /* max_trash_db_ratio= */ 1.1));
+
+ ASSERT_OK(s);
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ // Create 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'A'), wo));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+ // Compaction will delete files from L0 in the first db path and generate a
+ // new file in L1 in the second db path
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ Slice begin("Key0");
+ Slice end("Key3");
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ // Create 4 files in L0
+ for (int i = 4; i < 8; i++) {
+ ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'B'), wo));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("4,1", FilesPerLevel(0));
+
+ // Compaction will delete files from L0 in the first db path and generate a
+ // new file in L1 in the second db path
+ begin = "Key4";
+ end = "Key7";
+ ASSERT_OK(db_->CompactRange(compact_options, &begin, &end));
+ ASSERT_EQ("0,2", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(bg_delete_file, 8);
+
+ // Compaction will delete both files and regenerate a file in L1 in the second
+ // db path. The deleted files should still be cleaned up via the delete
+ // scheduler.
+ compact_options.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(bg_delete_file, 10);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DestroyDBWithRateLimitedDelete) {
+ int bg_delete_file = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DeleteScheduler::DeleteTrashFile:DeleteFile",
+ [&](void* /*arg*/) { bg_delete_file++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Status s;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.sst_file_manager.reset(
+ NewSstFileManager(env_, nullptr, "", 0, false, &s, 0));
+ ASSERT_OK(s);
+ DestroyAndReopen(options);
+
+ // Create 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put("Key" + std::to_string(i), DummyString(1024, 'A')));
+ ASSERT_OK(Flush());
+ }
+ // We created 4 sst files in L0
+ ASSERT_EQ("4", FilesPerLevel(0));
+
+ // Close DB and destroy it using DeleteScheduler
+ Close();
+
+ int num_sst_files = 0;
+ int num_wal_files = 0;
+ std::vector<std::string> db_files;
+ ASSERT_OK(env_->GetChildren(dbname_, &db_files));
+ for (std::string f : db_files) {
+ if (f.substr(f.find_last_of(".") + 1) == "sst") {
+ num_sst_files++;
+ } else if (f.substr(f.find_last_of(".") + 1) == "log") {
+ num_wal_files++;
+ }
+ }
+ ASSERT_GT(num_sst_files, 0);
+ ASSERT_GT(num_wal_files, 0);
+
+ auto sfm = static_cast<SstFileManagerImpl*>(options.sst_file_manager.get());
+
+ sfm->SetDeleteRateBytesPerSecond(1024 * 1024);
+ // Set an extra high trash ratio to prevent immediate/non-rate limited
+ // deletions
+ sfm->delete_scheduler()->SetMaxTrashDBRatio(1000.0);
+ ASSERT_OK(DestroyDB(dbname_, options));
+ sfm->WaitForEmptyTrash();
+ ASSERT_EQ(bg_delete_file, num_sst_files + num_wal_files);
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowed) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // Generate a file containing 100 keys.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+
+ uint64_t first_file_size = 0;
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &first_file_size));
+ ASSERT_EQ(sfm->GetTotalSize(), first_file_size);
+
+ // Set the maximum allowed space usage to just above the current total size
+ sfm->SetMaxAllowedSpaceUsage(first_file_size + 1);
+
+ ASSERT_OK(Put("key1", "val1"));
+ // This flush will cause bg_error_ and will fail
+ ASSERT_NOK(Flush());
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowedWithBlobFiles) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+
+ uint64_t files_size = 0;
+ uint64_t total_files_size = 0;
+ std::unordered_map<std::string, uint64_t> files_in_db;
+
+ ASSERT_OK(GetAllDataFiles(kBlobFile, &files_in_db, &files_size));
+ // Make sure blob files are considered by SstFileManager in size limits.
+ ASSERT_GT(files_size, 0);
+ total_files_size = files_size;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &files_size));
+ total_files_size += files_size;
+ ASSERT_EQ(sfm->GetTotalSize(), total_files_size);
+
+ // Set the maximum allowed space usage to just above the current total size.
+ sfm->SetMaxAllowedSpaceUsage(total_files_size + 1);
+
+ bool max_allowed_space_reached = false;
+ bool delete_blob_file = false;
+ // Sync point called after blob file is closed and max allowed space is
+ // checked.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileCompletionCallback::CallBack::MaxAllowedSpaceReached",
+ [&](void* /*arg*/) { max_allowed_space_reached = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable::AfterDeleteFile",
+ [&](void* /*arg*/) { delete_blob_file = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {
+ "BuildTable::AfterDeleteFile",
+ "DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1",
+ },
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key1", "val1"));
+ // This flush will fail
+ ASSERT_NOK(Flush());
+ ASSERT_TRUE(max_allowed_space_reached);
+
+ TEST_SYNC_POINT("DBSSTTest::DBWithMaxSpaceAllowedWithBlobFiles:1");
+ ASSERT_TRUE(delete_blob_file);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, CancellingCompactionsWorks) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.level0_file_num_compaction_trigger = 2;
+ options.statistics = CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ int completed_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* /*arg*/) {
+ sfm->SetMaxAllowedSpaceUsage(0);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun",
+ [&](void* /*arg*/) { completed_compactions++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t total_file_size = 0;
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size));
+ // Set the maximum allowed space usage to just above twice the current total size
+ sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+ // Generate another file to trigger compaction.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // Because we set a callback in CancelledCompaction, we actually
+ // let the compaction run
+ ASSERT_GT(completed_compactions, 0);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ // Make sure the stat is bumped
+ ASSERT_GT(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ COMPACTION_CANCELLED),
+ 0);
+ ASSERT_EQ(0,
+ dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ FILES_MARKED_TRASH));
+ ASSERT_EQ(4,
+ dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ FILES_DELETED_IMMEDIATELY));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, CancellingManualCompactionsWorks) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.statistics = CreateDBStatistics();
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t total_file_size = 0;
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_file_size));
+ // Set the maximum allowed space usage to just above twice the current total size
+ sfm->SetMaxAllowedSpaceUsage(2 * total_file_size + 1);
+
+ // Generate another file to trigger compaction.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+
+ // OK, now trigger a manual compaction
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsCompactionTooLarge());
+
+ // Wait for manual compaction to get scheduled and finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ // Make sure the stat is bumped
+ ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ COMPACTION_CANCELLED),
+ 1);
+
+ // Now make sure CompactFiles also gets cancelled
+ auto l0_files = collector->GetFlushedFiles();
+ ASSERT_TRUE(
+ dbfull()
+ ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(), l0_files, 0)
+ .IsCompactionTooLarge());
+
+ // Wait for manual compaction to get scheduled and finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_EQ(dbfull()->immutable_db_options().statistics.get()->getTickerCount(
+ COMPACTION_CANCELLED),
+ 2);
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+
+ // Now remove the space limit, let the compaction through, and make sure
+ // GetCompactionsReservedSize returns to normal
+ sfm->SetMaxAllowedSpaceUsage(0);
+ int completed_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactFilesImpl:End", [&](void* /*arg*/) { completed_compactions++; });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(dbfull()->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+ l0_files, 0));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_EQ(sfm->GetCompactionsReservedSize(), 0);
+ ASSERT_GT(completed_compactions, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, DBWithMaxSpaceAllowedRandomized) {
+ // This test will set a maximum allowed space for the DB, then it will
+ // keep filling the DB until the limit is reached and bg_error_ is set.
+ // When bg_error_ is set we will verify that the DB size is greater
+ // than the limit.
+
+ std::vector<int> max_space_limits_mbs = {1, 10};
+ std::atomic<bool> bg_error_set(false);
+
+ std::atomic<int> reached_max_space_on_flush(0);
+ std::atomic<int> reached_max_space_on_compaction(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached",
+ [&](void* arg) {
+ Status* bg_error = static_cast<Status*>(arg);
+ bg_error_set = true;
+ reached_max_space_on_flush++;
+ // clear error to ensure compaction callback is called
+ *bg_error = Status::OK();
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction():CancelledCompaction", [&](void* arg) {
+ bool* enough_room = static_cast<bool*>(arg);
+ *enough_room = true;
+ });
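+ // Forcing enough_room to true lets compactions proceed despite the space
+ // limit, so the compaction-path MaxAllowedSpaceReached callback below can
+ // also fire.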
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached",
+ [&](void* /*arg*/) {
+ bg_error_set = true;
+ reached_max_space_on_compaction++;
+ });
+
+ for (auto limit_mb : max_space_limits_mbs) {
+ bg_error_set = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.write_buffer_size = 1024 * 512; // 512 Kb
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024);
+
+ // It is easy to detect if the test is stuck in a loop. No need for
+ // complex termination logic.
+ while (true) {
+ auto s = Put(rnd.RandomString(10), rnd.RandomString(50));
+ if (!s.ok()) {
+ break;
+ }
+ }
+ ASSERT_TRUE(bg_error_set);
+ uint64_t total_sst_files_size = 0;
+ std::unordered_map<std::string, uint64_t> files_in_db;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &files_in_db, &total_sst_files_size));
+ ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+
+ ASSERT_GT(reached_max_space_on_flush, 0);
+ ASSERT_GT(reached_max_space_on_compaction, 0);
+}
+
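+// For reference, a minimal sketch (not part of the tests above) of how an
+// application would wire up the space limit exercised here, using only the
+// public SstFileManager API; the 64 MB cap is an illustrative value:
+//
+//   std::shared_ptr<SstFileManager> sfm(NewSstFileManager(Env::Default()));
+//   sfm->SetMaxAllowedSpaceUsage(64ull << 20);  // 64 MB cap
+//   Options options;
+//   options.sst_file_manager = sfm;
+//   // Once a flush or compaction would push the DB past the cap, a
+//   // background error is raised or the compaction is cancelled, as the
+//   // tests above demonstrate.
+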
+TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFiles) {
+ // Open DB with infinite max open files
+ // - First iteration uses 1 thread to open files
+ // - Second iteration uses 5 threads to open files
+ for (int iter = 0; iter < 2; iter++) {
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 100000;
+ options.disable_auto_compactions = true;
+ options.max_open_files = -1;
+ if (iter == 0) {
+ options.max_file_opening_threads = 1;
+ } else {
+ options.max_file_opening_threads = 5;
+ }
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ // Create 12 files in L0 (then move them to L2)
+ for (int i = 0; i < 12; i++) {
+ std::string k = "L2_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush());
+ }
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+ // Create 12 Files in L0
+ for (int i = 0; i < 12; i++) {
+ std::string k = "L0_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush());
+ }
+ Close();
+
+ // Reopening the DB will load all existing files
+ Reopen(options);
+ ASSERT_EQ("12,0,12", FilesPerLevel(0));
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files);
+
+ for (const auto& level : files) {
+ for (const auto& file : level) {
+ ASSERT_TRUE(file.table_reader_handle != nullptr);
+ }
+ }
+
+ for (int i = 0; i < 12; i++) {
+ ASSERT_EQ(Get("L0_" + Key(i)), "L0_" + Key(i) + std::string(1000, 'a'));
+ ASSERT_EQ(Get("L2_" + Key(i)), "L2_" + Key(i) + std::string(1000, 'a'));
+ }
+ }
+}
+
+TEST_F(DBSSTTest, OpenDBWithInfiniteMaxOpenFilesSubjectToMemoryLimit) {
+ for (CacheEntryRoleOptions::Decision charge_table_reader :
+ {CacheEntryRoleOptions::Decision::kEnabled,
+ CacheEntryRoleOptions::Decision::kDisabled}) {
+ // Open DB with infinite max open files
+ // - First iteration uses 1 thread to open files
+ // - Second iteration uses 5 threads to open files
+ for (int iter = 0; iter < 2; iter++) {
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 100000;
+ options.disable_auto_compactions = true;
+ options.max_open_files = -1;
+
+ BlockBasedTableOptions table_options;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ if (iter == 0) {
+ options.max_file_opening_threads = 1;
+ } else {
+ options.max_file_opening_threads = 5;
+ }
+
+ DestroyAndReopen(options);
+
+ // Create 5 files in L0 (then move them to L2)
+ for (int i = 0; i < 5; i++) {
+ std::string k = "L2_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush()) << i;
+ }
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+
+ // Create 5 Files in L0
+ for (int i = 0; i < 5; i++) {
+ std::string k = "L0_" + Key(i);
+ ASSERT_OK(Put(k, k + std::string(1000, 'a')));
+ ASSERT_OK(Flush());
+ }
+ Close();
+
+ table_options.cache_usage_options.options_overrides.insert(
+ {CacheEntryRole::kBlockBasedTableReader,
+ {/*.charged = */ charge_table_reader}});
+ table_options.block_cache =
+ NewLRUCache(1024 /* capacity */, 0 /* num_shard_bits */,
+ true /* strict_capacity_limit */);
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ // Reopening the DB will try to load all existing files, conditionally
+ // subject to memory limit
+ Status s = TryReopen(options);
+
+ if (charge_table_reader == CacheEntryRoleOptions::Decision::kEnabled) {
+ EXPECT_TRUE(s.IsMemoryLimit());
+ EXPECT_TRUE(s.ToString().find(
+ kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+ CacheEntryRole::kBlockBasedTableReader)]) !=
+ std::string::npos);
+ EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
+ std::string::npos);
+
+ } else {
+ EXPECT_TRUE(s.ok());
+ ASSERT_EQ("5,0,5", FilesPerLevel(0));
+ }
+ }
+ }
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSize) {
+ // We don't propagate the oldest-key-time table property on compaction and
+ // just write 0 as the default value. This affects the exact table size,
+ // since we encode table properties as varint64. Force the time to be 0 to
+ // work around it. The workaround should be removed once we propagate the
+ // property on compaction.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:oldest_ancester_time", [&](void* arg) {
+ uint64_t* current_time = static_cast<uint64_t*>(arg);
+ *current_time = 0;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ DestroyAndReopen(options);
+ // Generate 5 files in L0
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 10; j++) {
+ std::string val = "val_file_" + std::to_string(i);
+ ASSERT_OK(Put(Key(j), val));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("5", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> live_files_meta;
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+ uint64_t single_file_size = live_files_meta[0].size;
+
+ uint64_t live_sst_files_size = 0;
+ uint64_t total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 5
+ // Total SST files = 5
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+ ASSERT_OK(iter1->status());
+
+ // Compact 5 files into 1 file in L0
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 1);
+
+ live_sst_files_size = 0;
+ total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 1 (compacted file)
+ // Total SST files = 6 (5 original files + compacted file)
+ ASSERT_EQ(live_sst_files_size, 1 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+ ASSERT_OK(iter2->status());
+
+ // Delete all keys and compact, this will delete all live files
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 0);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 6 (5 original files + compacted file)
+ ASSERT_EQ(total_sst_files_size, 6 * single_file_size);
+
+ ASSERT_OK(iter1->status());
+ iter1.reset();
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 1 (compacted file)
+ ASSERT_EQ(total_sst_files_size, 1 * single_file_size);
+
+ ASSERT_OK(iter2->status());
+ iter2.reset();
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 0
+ ASSERT_EQ(total_sst_files_size, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBSSTTest, OpenDBWithoutGetFileSizeInvocations) {
+ Options options = CurrentOptions();
+ std::unique_ptr<MockEnv> env{MockEnv::Create(Env::Default())};
+ options.env = env.get();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ options.enable_blob_files = true;
+ options.blob_file_size = 32; // create one blob per file
+ options.skip_checking_sst_file_sizes_on_db_open = true;
+
+ DestroyAndReopen(options);
+ // Generate 5 files in L0
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 10; j++) {
+ std::string val = "val_file_" + std::to_string(i);
+ ASSERT_OK(Put(Key(j), val));
+ }
+ ASSERT_OK(Flush());
+ }
+ Close();
+
+ bool is_get_file_size_called = false;
+ SyncPoint::GetInstance()->SetCallBack(
+ "MockFileSystem::GetFileSize:CheckFileType", [&](void* arg) {
+ std::string* filename = reinterpret_cast<std::string*>(arg);
+ if (filename->find(".blob") != std::string::npos) {
+ is_get_file_size_called = true;
+ }
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+ ASSERT_FALSE(is_get_file_size_called);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Destroy(options);
+}
+
+TEST_F(DBSSTTest, GetTotalSstFilesSizeVersionsFilesShared) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.compression = kNoCompression;
+ DestroyAndReopen(options);
+ // Generate 5 files in L0
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Put(Key(i), "val"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ("5", FilesPerLevel(0));
+
+ std::vector<LiveFileMetaData> live_files_meta;
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+ uint64_t single_file_size = live_files_meta[0].size;
+
+ uint64_t live_sst_files_size = 0;
+ uint64_t total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+
+ // Live SST files = 5
+ // Total SST files = 5
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter1(dbfull()->NewIterator(ReadOptions()));
+ ASSERT_OK(iter1->status());
+
+ // Compaction will do trivial move from L0 to L1
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,5", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 5);
+
+ live_sst_files_size = 0;
+ total_sst_files_size = 0;
+ for (const auto& file_meta : live_files_meta) {
+ live_sst_files_size += file_meta.size;
+ }
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 5
+ // Total SST files = 5 (used in 2 version)
+ ASSERT_EQ(live_sst_files_size, 5 * single_file_size);
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ // hold current version
+ std::unique_ptr<Iterator> iter2(dbfull()->NewIterator(ReadOptions()));
+ ASSERT_OK(iter2->status());
+
+ // Delete all keys and compact, this will delete all live files
+ for (int i = 0; i < 5; i++) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("", FilesPerLevel(0));
+
+ live_files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&live_files_meta);
+ ASSERT_EQ(live_files_meta.size(), 0);
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 5 (used in 2 version)
+ ASSERT_EQ(total_sst_files_size, 5 * single_file_size);
+
+ ASSERT_OK(iter1->status());
+ iter1.reset();
+ ASSERT_OK(iter2->status());
+ iter2.reset();
+
+ ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.total-sst-files-size",
+ &total_sst_files_size));
+ // Live SST files = 0
+ // Total SST files = 0
+ ASSERT_EQ(total_sst_files_size, 0);
+}
+
+// This test checks whether blob files are recorded by the SST File Manager
+// when the compaction job creates/deletes them, and in the case of atomic
+// flush.
+TEST_F(DBSSTTest, DBWithSFMForBlobFilesAtomicFlush) {
+ std::shared_ptr<SstFileManager> sst_file_manager(NewSstFileManager(env_));
+ auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager.get());
+ Options options = CurrentOptions();
+ options.sst_file_manager = sst_file_manager;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.disable_auto_compactions = true;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+ options.atomic_flush = true;
+
+ int files_added = 0;
+ int files_deleted = 0;
+ int files_scheduled_to_delete = 0;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnAddFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (EndsWith(*file_path, ".blob")) {
+ files_added++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::OnDeleteFile", [&](void* arg) {
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (EndsWith(*file_path, ".blob")) {
+ files_deleted++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileManagerImpl::ScheduleFileDeletion", [&](void* arg) {
+ assert(arg);
+ const std::string* const file_path =
+ static_cast<const std::string*>(arg);
+ if (EndsWith(*file_path, ".blob")) {
+ ++files_scheduled_to_delete;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ ASSERT_OK(Put("key_1", "value_1"));
+ ASSERT_OK(Put("key_2", "value_2"));
+ ASSERT_OK(Put("key_3", "value_3"));
+ ASSERT_OK(Put("key_4", "value_4"));
+ ASSERT_OK(Flush());
+
+ // Overwrite will create the garbage data.
+ ASSERT_OK(Put("key_3", "new_value_3"));
+ ASSERT_OK(Put("key_4", "new_value_4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key5", "blob_value5"));
+ ASSERT_OK(Put("Key6", "blob_value6"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(files_added, 3);
+ ASSERT_EQ(files_deleted, 0);
+ ASSERT_EQ(files_scheduled_to_delete, 0);
+ files_added = 0;
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+ // Compaction job will create a new file and delete the older files.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(files_added, 1);
+ ASSERT_EQ(files_scheduled_to_delete, 1);
+
+ sfm->WaitForEmptyTrash();
+
+ ASSERT_EQ(files_deleted, 1);
+
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
+
+ ASSERT_EQ(files_scheduled_to_delete, 4);
+
+ sfm->WaitForEmptyTrash();
+
+ ASSERT_EQ(files_deleted, 4);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
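+// The tests above rely on the SyncPoint test-hook pattern used throughout
+// this file. A minimal sketch of the lifecycle, assuming a sync point named
+// "Some::Marker" (a hypothetical name) exists in the code under test:
+//
+//   SyncPoint::GetInstance()->SetCallBack(
+//       "Some::Marker", [&](void* arg) { /* inspect or mutate *arg */ });
+//   SyncPoint::GetInstance()->EnableProcessing();
+//   // ... run the code that hits the marker ...
+//   SyncPoint::GetInstance()->DisableProcessing();
+//   SyncPoint::GetInstance()->ClearAllCallBacks();
+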
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_statistics_test.cc b/src/rocksdb/db/db_statistics_test.cc
new file mode 100644
index 000000000..4d4655361
--- /dev/null
+++ b/src/rocksdb/db/db_statistics_test.cc
@@ -0,0 +1,215 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <string>
+
+#include "db/db_test_util.h"
+#include "monitoring/thread_status_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/statistics.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBStatisticsTest : public DBTestBase {
+ public:
+ DBStatisticsTest()
+ : DBTestBase("db_statistics_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBStatisticsTest, CompressionStatsTest) {
+ CompressionType type;
+
+ if (Snappy_Supported()) {
+ type = kSnappyCompression;
+ fprintf(stderr, "using snappy\n");
+ } else if (Zlib_Supported()) {
+ type = kZlibCompression;
+ fprintf(stderr, "using zlib\n");
+ } else if (BZip2_Supported()) {
+ type = kBZip2Compression;
+ fprintf(stderr, "using bzip2\n");
+ } else if (LZ4_Supported()) {
+ type = kLZ4Compression;
+ fprintf(stderr, "using lz4\n");
+ } else if (XPRESS_Supported()) {
+ type = kXpressCompression;
+ fprintf(stderr, "using xpress\n");
+ } else if (ZSTD_Supported()) {
+ type = kZSTD;
+ fprintf(stderr, "using ZSTD\n");
+ } else {
+ fprintf(stderr, "skipping test, compression disabled\n");
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.compression = type;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+ DestroyAndReopen(options);
+
+ int kNumKeysWritten = 100000;
+
+ // Check that compressions occur and are counted when compression is turned on
+ Random rnd(301);
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED), 0);
+
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ }
+ ASSERT_GT(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED), 0);
+
+ options.compression = kNoCompression;
+ DestroyAndReopen(options);
+ uint64_t currentCompressions =
+ options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+ uint64_t currentDecompressions =
+ options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED);
+
+ // Check that compressions do not occur when turned off
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED) -
+ currentCompressions,
+ 0);
+
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ }
+ ASSERT_EQ(options.statistics->getTickerCount(NUMBER_BLOCK_DECOMPRESSED) -
+ currentDecompressions,
+ 0);
+}
+
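+// For reference, a minimal sketch of the Statistics wiring these tests depend
+// on (public API only; the chosen ticker and histogram are illustrative):
+//
+//   Options options;
+//   options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+//   options.statistics->set_stats_level(StatsLevel::kExceptTimeForMutex);
+//   // ... open the DB and do some work ...
+//   uint64_t n = options.statistics->getTickerCount(NUMBER_BLOCK_COMPRESSED);
+//   HistogramData h;
+//   options.statistics->histogramData(DB_WRITE, &h);
+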
+TEST_F(DBStatisticsTest, MutexWaitStatsDisabledByDefault) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ const uint64_t kMutexWaitDelay = 100;
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+ kMutexWaitDelay);
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_EQ(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), 0);
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, MutexWaitStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ const uint64_t kMutexWaitDelay = 100;
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT,
+ kMutexWaitDelay);
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_GE(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay);
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0);
+}
+
+TEST_F(DBStatisticsTest, ResetStats) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ for (int i = 0; i < 2; ++i) {
+ // pick arbitrary ticker and histogram. On first iteration they're zero
+ // because db is unused. On second iteration they're zero due to Reset().
+ ASSERT_EQ(0, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+ HistogramData histogram_data;
+ options.statistics->histogramData(DB_WRITE, &histogram_data);
+ ASSERT_EQ(0.0, histogram_data.max);
+
+ if (i == 0) {
+ // The Put() makes some of the ticker/histogram stats nonzero until we
+ // Reset().
+ ASSERT_OK(Put("hello", "rocksdb"));
+ ASSERT_EQ(1, TestGetTickerCount(options, NUMBER_KEYS_WRITTEN));
+ options.statistics->histogramData(DB_WRITE, &histogram_data);
+ ASSERT_GT(histogram_data.max, 0.0);
+ ASSERT_OK(options.statistics->Reset());
+ }
+ }
+}
+
+TEST_F(DBStatisticsTest, ExcludeTickers) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ options.statistics->set_stats_level(StatsLevel::kExceptTickers);
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_EQ(0, options.statistics->getTickerCount(BYTES_WRITTEN));
+ options.statistics->set_stats_level(StatsLevel::kExceptHistogramOrTimers);
+ Reopen(options);
+ ASSERT_EQ("value", Get("foo"));
+ ASSERT_GT(options.statistics->getTickerCount(BYTES_READ), 0);
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBStatisticsTest, VerifyChecksumReadStat) {
+ Options options = CurrentOptions();
+ options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Reopen(options);
+
+ // Expected to be populated regardless of `PerfLevel` in user thread
+ SetPerfLevel(kDisable);
+
+ {
+ // Scenario 0: only WAL data. Not verified so require ticker to be zero.
+ ASSERT_OK(Put("foo", "value"));
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+ ASSERT_OK(db_->VerifyChecksum());
+ ASSERT_EQ(0,
+ options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES));
+ }
+
+ // Create one SST.
+ ASSERT_OK(Flush());
+ std::unordered_map<std::string, uint64_t> table_files;
+ uint64_t table_files_size = 0;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &table_files, &table_files_size));
+
+ {
+ // Scenario 1: Table verified in `VerifyFileChecksums()`. This should read
+ // the whole file so we require the ticker stat exactly matches the file
+ // size.
+ ASSERT_OK(options.statistics->Reset());
+ ASSERT_OK(db_->VerifyFileChecksums(ReadOptions()));
+ ASSERT_EQ(table_files_size,
+ options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES));
+ }
+
+ {
+ // Scenario 2: Table verified in `VerifyChecksum()`. This opens a
+ // `TableReader` to verify each block. It can involve duplicate reads of the
+ // same data so we set a lower-bound only.
+ ASSERT_OK(options.statistics->Reset());
+ ASSERT_OK(db_->VerifyChecksum());
+ ASSERT_GE(options.statistics->getTickerCount(VERIFY_CHECKSUM_READ_BYTES),
+ table_files_size);
+ }
+}
+
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_table_properties_test.cc b/src/rocksdb/db/db_table_properties_test.cc
new file mode 100644
index 000000000..981a514ad
--- /dev/null
+++ b/src/rocksdb/db/db_table_properties_test.cc
@@ -0,0 +1,625 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "table/format.h"
+#include "table/meta_blocks.h"
+#include "table/table_properties_internal.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+// A helper function that ensures the table properties returned by
+// `GetPropertiesOfAllTablesTest` are correct.
+// This test assumes the entry count differs across tables.
+namespace {
+
+void VerifyTableProperties(DB* db, uint64_t expected_entries_size) {
+ TablePropertiesCollection props;
+ ASSERT_OK(db->GetPropertiesOfAllTables(&props));
+
+ ASSERT_EQ(4U, props.size());
+ std::unordered_set<uint64_t> unique_entries;
+
+ // Indirect check: each table should contribute a distinct entry count
+ uint64_t sum = 0;
+ for (const auto& item : props) {
+ unique_entries.insert(item.second->num_entries);
+ sum += item.second->num_entries;
+ }
+
+ ASSERT_EQ(props.size(), unique_entries.size());
+ ASSERT_EQ(expected_entries_size, sum);
+
+ VerifySstUniqueIds(props);
+}
+} // anonymous namespace
+
+class DBTablePropertiesTest : public DBTestBase,
+ public testing::WithParamInterface<std::string> {
+ public:
+ DBTablePropertiesTest()
+ : DBTestBase("db_table_properties_test", /*env_do_fsync=*/false) {}
+ TablePropertiesCollection TestGetPropertiesOfTablesInRange(
+ std::vector<Range> ranges, std::size_t* num_properties = nullptr,
+ std::size_t* num_files = nullptr);
+};
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfAllTablesTest) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 8;
+ // Part of strategy to prevent pinning table files
+ options.max_open_files = 42;
+ Reopen(options);
+
+ // Create 4 tables
+ for (int table = 0; table < 4; ++table) {
+ // Use old meta name for table properties for one file
+ if (table == 3) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:Meta", [&](void* meta) {
+ *reinterpret_cast<const std::string**>(meta) =
+ &kPropertiesBlockOldName;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ }
+ // Build file
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ std::string original_session_id;
+ ASSERT_OK(db_->GetDbSessionId(original_session_id));
+
+ // Part of strategy to prevent pinning table files
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionEditHandler::LoadTables:skip_load_table_files",
+ [&](void* skip_load) { *reinterpret_cast<bool*>(skip_load) = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // 1. Read table properties directly from file
+ Reopen(options);
+ // Clear out auto-opened files
+ dbfull()->TEST_table_cache()->EraseUnRefEntries();
+ ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+ // 2. Put two tables into the table cache, then read all table properties
+ Reopen(options);
+ // Clear out auto-opened files
+ dbfull()->TEST_table_cache()->EraseUnRefEntries();
+ ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
+ // Fetch a key from the 1st and 2nd tables, which internally places those
+ // tables in the table cache.
+ for (int i = 0; i < 2; ++i) {
+ Get(std::to_string(i * 100 + 0));
+ }
+
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+ // 3. Put all tables to table cache
+ Reopen(options);
+ // Fetch a key from each table, which places all of them in the table cache.
+ for (int i = 0; i < 4; ++i) {
+ Get(std::to_string(i * 100 + 0));
+ }
+ VerifyTableProperties(db_, 10 + 11 + 12 + 13);
+
+ // 4. Try to read CORRUPT properties (a) directly from file, and (b)
+ // through reader on Get
+
+ // It's not practical to prevent table file read on Open, so we
+ // corrupt after open and after purging table cache.
+ for (bool direct : {true, false}) {
+ Reopen(options);
+ // Clear out auto-opened files
+ dbfull()->TEST_table_cache()->EraseUnRefEntries();
+ ASSERT_EQ(dbfull()->TEST_table_cache()->GetUsage(), 0U);
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ std::string sst_file = props.begin()->first;
+
+ // Corrupt the file's TableProperties using session id
+ std::string contents;
+ ASSERT_OK(
+ ReadFileToString(env_->GetFileSystem().get(), sst_file, &contents));
+ size_t pos = contents.find(original_session_id);
+ ASSERT_NE(pos, std::string::npos);
+ ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast<int>(pos), 1,
+ /*verify checksum fails*/ false));
+
+ // Try to read CORRUPT properties
+ if (direct) {
+ ASSERT_TRUE(db_->GetPropertiesOfAllTables(&props).IsCorruption());
+ } else {
+ bool found_corruption = false;
+ for (int i = 0; i < 4; ++i) {
+ std::string result = Get(std::to_string(i * 100 + 0));
+ if (result.find_first_of("Corruption: block checksum mismatch") !=
+ std::string::npos) {
+ found_corruption = true;
+ }
+ }
+ ASSERT_TRUE(found_corruption);
+ }
+
+ // UN-corrupt file for next iteration
+ ASSERT_OK(test::CorruptFile(env_, sst_file, static_cast<int>(pos), 1,
+ /*verify checksum fails*/ false));
+ }
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTablePropertiesTest, InvalidIgnored) {
+ // RocksDB versions 2.5 - 2.7 generate some properties that Block considers
+ // invalid in some way. This approximates that.
+
+ // Inject properties block data that Block considers invalid
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:BlockData",
+ [&](void* block_data) {
+ *reinterpret_cast<Slice*>(block_data) = Slice("X");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // Corrupting the table properties corrupts the unique id.
+ // Ignore the unique id recorded in the manifest.
+ auto options = CurrentOptions();
+ options.verify_sst_unique_id_in_manifest = false;
+ Reopen(options);
+
+ // Build file
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), std::to_string(i), "val"));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // Not crashing is good enough
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+}
+
+TEST_F(DBTablePropertiesTest, CreateOnDeletionCollectorFactory) {
+ ConfigOptions options;
+ options.ignore_unsupported_options = false;
+
+ std::shared_ptr<TablePropertiesCollectorFactory> factory;
+ std::string id = CompactOnDeletionCollectorFactory::kClassName();
+ ASSERT_OK(
+ TablePropertiesCollectorFactory::CreateFromString(options, id, &factory));
+ auto del_factory = factory->CheckedCast<CompactOnDeletionCollectorFactory>();
+ ASSERT_NE(del_factory, nullptr);
+ ASSERT_EQ(0U, del_factory->GetWindowSize());
+ ASSERT_EQ(0U, del_factory->GetDeletionTrigger());
+ ASSERT_EQ(0.0, del_factory->GetDeletionRatio());
+ ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString(
+ options, "window_size=100; deletion_trigger=90; id=" + id, &factory));
+ del_factory = factory->CheckedCast<CompactOnDeletionCollectorFactory>();
+ ASSERT_NE(del_factory, nullptr);
+ ASSERT_EQ(100U, del_factory->GetWindowSize());
+ ASSERT_EQ(90U, del_factory->GetDeletionTrigger());
+ ASSERT_EQ(0.0, del_factory->GetDeletionRatio());
+ ASSERT_OK(TablePropertiesCollectorFactory::CreateFromString(
+ options,
+ "window_size=100; deletion_trigger=90; deletion_ratio=0.5; id=" + id,
+ &factory));
+ del_factory = factory->CheckedCast<CompactOnDeletionCollectorFactory>();
+ ASSERT_NE(del_factory, nullptr);
+ ASSERT_EQ(100U, del_factory->GetWindowSize());
+ ASSERT_EQ(90U, del_factory->GetDeletionTrigger());
+ ASSERT_EQ(0.5, del_factory->GetDeletionRatio());
+}
+
+TablePropertiesCollection
+DBTablePropertiesTest::TestGetPropertiesOfTablesInRange(
+ std::vector<Range> ranges, std::size_t* num_properties,
+ std::size_t* num_files) {
+ // Since we dereference the first element of the vector, it cannot be empty;
+ // otherwise we would pass an address into random memory.
+ EXPECT_GT(ranges.size(), 0U);
+ // run the query
+ TablePropertiesCollection props;
+ EXPECT_OK(db_->GetPropertiesOfTablesInRange(
+ db_->DefaultColumnFamily(), &ranges[0], ranges.size(), &props));
+
+ // Make sure that we have received properties only for those files that fall
+ // within the requested ranges.
+ std::vector<LiveFileMetaData> vmd;
+ db_->GetLiveFilesMetaData(&vmd);
+ for (auto& md : vmd) {
+ std::string fn = md.db_path + md.name;
+ bool in_range = false;
+ for (auto& r : ranges) {
+ // smallestkey <= limit && largestkey >= start
+ if (r.limit.compare(md.smallestkey) >= 0 &&
+ r.start.compare(md.largestkey) <= 0) {
+ in_range = true;
+ EXPECT_GT(props.count(fn), 0);
+ }
+ }
+ if (!in_range) {
+ EXPECT_EQ(props.count(fn), 0);
+ }
+ }
+
+ if (num_properties) {
+ *num_properties = props.size();
+ }
+
+ if (num_files) {
+ *num_files = vmd.size();
+ }
+ return props;
+}
+
+TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) {
+ // Fixed random seed
+ Random rnd(301);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 2048;
+ options.max_bytes_for_level_base = 40960;
+ options.max_bytes_for_level_multiplier = 4;
+ options.hard_pending_compaction_bytes_limit = 16 * 1024;
+ options.num_levels = 8;
+ options.env = env_;
+
+ DestroyAndReopen(options);
+
+ // build a decent LSM
+ for (int i = 0; i < 10000; i++) {
+ ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (NumTableFilesAtLevel(0) == 0) {
+ ASSERT_OK(Put(test::RandomKey(&rnd, 5), rnd.RandomString(102)));
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(db_->PauseBackgroundWork());
+
+ // Ensure that we have at least L0, L1 and L2
+ ASSERT_GT(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2), 0);
+
+ // Query the largest range
+ std::size_t num_properties, num_files;
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+ &num_properties, &num_files);
+ ASSERT_EQ(num_properties, num_files);
+
+ // Query the empty range
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::SMALLEST))},
+ &num_properties, &num_files);
+ ASSERT_GT(num_files, 0);
+ ASSERT_EQ(num_properties, 0);
+
+ // Query the middle range
+ TestGetPropertiesOfTablesInRange(
+ {Range(test::RandomKey(&rnd, 5, test::RandomKeyType::MIDDLE),
+ test::RandomKey(&rnd, 5, test::RandomKeyType::LARGEST))},
+ &num_properties, &num_files);
+ ASSERT_GT(num_files, 0);
+ ASSERT_GT(num_files, num_properties);
+ ASSERT_GT(num_properties, 0);
+
+ // Query a bunch of random ranges
+ for (int j = 0; j < 100; j++) {
+ // create a bunch of ranges
+ std::vector<std::string> random_keys;
+ // Random::Uniform() can return zero, and passing empty ranges would make
+ // TestGetPropertiesOfTablesInRange() dereference random memory via the
+ // empty ranges[0]. So keep n greater than zero, and even, since the loop
+ // below requires random_keys.size() to be even.
+ auto n = 2 * (rnd.Uniform(50) + 1);
+
+ for (uint32_t i = 0; i < n; ++i) {
+ random_keys.push_back(test::RandomKey(&rnd, 5));
+ }
+
+ ASSERT_GT(random_keys.size(), 0U);
+ ASSERT_EQ((random_keys.size() % 2), 0U);
+
+ std::vector<Range> ranges;
+ auto it = random_keys.begin();
+ while (it != random_keys.end()) {
+ ranges.push_back(Range(*it, *(it + 1)));
+ it += 2;
+ }
+
+ TestGetPropertiesOfTablesInRange(std::move(ranges));
+ }
+}
+
+TEST_F(DBTablePropertiesTest, GetColumnFamilyNameProperty) {
+ std::string kExtraCfName = "pikachu";
+ CreateAndReopenWithCF({kExtraCfName}, CurrentOptions());
+
+ // Create one table per CF, then verify it was created with the column family
+ // name property.
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ ASSERT_OK(Put(cf, "key", "val"));
+ ASSERT_OK(Flush(cf));
+
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+ ASSERT_EQ(1U, fname_to_props.size());
+
+ std::string expected_cf_name;
+ if (cf > 0) {
+ expected_cf_name = kExtraCfName;
+ } else {
+ expected_cf_name = kDefaultColumnFamilyName;
+ }
+ ASSERT_EQ(expected_cf_name,
+ fname_to_props.begin()->second->column_family_name);
+ ASSERT_EQ(cf, static_cast<uint32_t>(
+ fname_to_props.begin()->second->column_family_id));
+ }
+}
+
+TEST_F(DBTablePropertiesTest, GetDbIdentifiersProperty) {
+ CreateAndReopenWithCF({"goku"}, CurrentOptions());
+
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ ASSERT_OK(Put(cf, "key", "val"));
+ ASSERT_OK(Put(cf, "foo", "bar"));
+ ASSERT_OK(Flush(cf));
+
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+ ASSERT_EQ(1U, fname_to_props.size());
+
+ std::string id, sid;
+ ASSERT_OK(db_->GetDbIdentity(id));
+ ASSERT_OK(db_->GetDbSessionId(sid));
+ ASSERT_EQ(id, fname_to_props.begin()->second->db_id);
+ ASSERT_EQ(sid, fname_to_props.begin()->second->db_session_id);
+ }
+}
+
+class DBTableHostnamePropertyTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<int, std::string>> {
+ public:
+ DBTableHostnamePropertyTest()
+ : DBTestBase("db_table_hostname_property_test",
+ /*env_do_fsync=*/false) {}
+};
+
+TEST_P(DBTableHostnamePropertyTest, DbHostLocationProperty) {
+ option_config_ = std::get<0>(GetParam());
+ Options opts = CurrentOptions();
+ std::string expected_host_id = std::get<1>(GetParam());
+ if (expected_host_id == kHostnameForDbHostId) {
+ ASSERT_OK(env_->GetHostNameString(&expected_host_id));
+ } else {
+ opts.db_host_id = expected_host_id;
+ }
+ CreateAndReopenWithCF({"goku"}, opts);
+
+ for (uint32_t cf = 0; cf < 2; ++cf) {
+ ASSERT_OK(Put(cf, "key", "val"));
+ ASSERT_OK(Put(cf, "foo", "bar"));
+ ASSERT_OK(Flush(cf));
+
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[cf], &fname_to_props));
+ ASSERT_EQ(1U, fname_to_props.size());
+
+ ASSERT_EQ(fname_to_props.begin()->second->db_host_id, expected_host_id);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ DBTableHostnamePropertyTest, DBTableHostnamePropertyTest,
+ ::testing::Values(
+ // OptionConfig, override db_host_location
+ std::make_tuple(DBTestBase::OptionConfig::kDefault,
+ kHostnameForDbHostId),
+ std::make_tuple(DBTestBase::OptionConfig::kDefault, "foobar"),
+ std::make_tuple(DBTestBase::OptionConfig::kDefault, ""),
+ std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix,
+ kHostnameForDbHostId),
+ std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix,
+ "foobar"),
+ std::make_tuple(DBTestBase::OptionConfig::kPlainTableFirstBytePrefix,
+ "")));
+
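+// The deletion-triggered compaction tests below configure the collector via
+// the public factory. A minimal sketch, assuming the usual three-argument
+// overload (the values shown are illustrative only):
+//
+//   auto factory = NewCompactOnDeletionCollectorFactory(
+//       /*sliding_window_size=*/100, /*deletion_trigger=*/90,
+//       /*deletion_ratio=*/0.0);
+//   Options opts;
+//   opts.table_properties_collector_factories.emplace_back(factory);
+//   // Files whose collector observes enough deletions are marked for
+//   // compaction (CompactionReason::kFilesMarkedForCompaction).
+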
+class DeletionTriggeredCompactionTestListener : public EventListener {
+ public:
+ void OnCompactionBegin(DB*, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.compaction_reason,
+ CompactionReason::kFilesMarkedForCompaction);
+ }
+
+ void OnCompactionCompleted(DB*, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.compaction_reason,
+ CompactionReason::kFilesMarkedForCompaction);
+ }
+};
+
+TEST_P(DBTablePropertiesTest, DeletionTriggeredCompactionMarking) {
+ int kNumKeys = 1000;
+ int kWindowSize = 100;
+ int kNumDelsTrigger = 90;
+ std::shared_ptr<TablePropertiesCollectorFactory> compact_on_del =
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger);
+
+ Options opts = CurrentOptions();
+ opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ opts.table_properties_collector_factories.emplace_back(compact_on_del);
+
+ if (GetParam() == "kCompactionStyleUniversal") {
+ opts.compaction_style = kCompactionStyleUniversal;
+ }
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+
+ DeletionTriggeredCompactionTestListener* listener =
+ new DeletionTriggeredCompactionTestListener();
+ opts.listeners.emplace_back(listener);
+ Reopen(opts);
+
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ // Change the window size and deletion trigger and ensure new values take
+ // effect
+ kWindowSize = 50;
+ kNumDelsTrigger = 40;
+ static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+ ->SetWindowSize(kWindowSize);
+ static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+ ->SetDeletionTrigger(kNumDelsTrigger);
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ // Change the window size to disable delete triggered compaction
+ kWindowSize = 0;
+ static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+ ->SetWindowSize(kWindowSize);
+ static_cast<CompactOnDeletionCollectorFactory*>(compact_on_del.get())
+ ->SetDeletionTrigger(kNumDelsTrigger);
+ for (int i = 0; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_WRITE_BYTES_MARKED));
+ ASSERT_LT(0, opts.statistics->getTickerCount(COMPACT_READ_BYTES_MARKED));
+}
+
+TEST_P(DBTablePropertiesTest, RatioBasedDeletionTriggeredCompactionMarking) {
+ constexpr int kNumKeys = 1000;
+ constexpr int kWindowSize = 0;
+ constexpr int kNumDelsTrigger = 0;
+ constexpr double kDeletionRatio = 0.1;
+ std::shared_ptr<TablePropertiesCollectorFactory> compact_on_del =
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger,
+ kDeletionRatio);
+
+ Options opts = CurrentOptions();
+ opts.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ opts.table_properties_collector_factories.emplace_back(compact_on_del);
+
+ Reopen(opts);
+
+ // Add an L2 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(2);
+
+ auto* listener = new DeletionTriggeredCompactionTestListener();
+ opts.listeners.emplace_back(listener);
+ Reopen(opts);
+
+ // Generate one L0 file with kNumKeys Puts.
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(Put(Key(i), "not important"));
+ }
+ ASSERT_OK(Flush());
+
+ // Generate another L0 file with kNumKeys Deletes.
+ // This file, due to deletion ratio, will trigger compaction: 2@0 files to L1.
+ // The resulting L1 file has only one tombstone for user key 'Key(0)'.
+ // Again, due to deletion ratio, a compaction will be triggered: 1@1 + 1@2
+ // files to L2. However, the resulting file is empty because the tombstone
+ // and value are both dropped.
+ for (int i = 0; i < kNumKeys; ++i) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_EQ(0, NumTableFilesAtLevel(i));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(DBTablePropertiesTest, DBTablePropertiesTest,
+ ::testing::Values("kCompactionStyleLevel",
+ "kCompactionStyleUniversal"));
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_tailing_iter_test.cc b/src/rocksdb/db/db_tailing_iter_test.cc
new file mode 100644
index 000000000..af3194ac4
--- /dev/null
+++ b/src/rocksdb/db/db_tailing_iter_test.cc
@@ -0,0 +1,604 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// Introduction of SyncPoint effectively disabled building and running this
+// test in Release builds, which is a pity because it is a good test.
+#if !defined(ROCKSDB_LITE)
+
+#include "db/db_test_util.h"
+#include "db/forward_iterator.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTestTailingIterator : public DBTestBase,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ DBTestTailingIterator()
+ : DBTestBase("db_tailing_iterator_test", /*env_do_fsync=*/true) {}
+};
+
+INSTANTIATE_TEST_CASE_P(DBTestTailingIterator, DBTestTailingIterator,
+ ::testing::Bool());
+
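+// A minimal sketch of the tailing-iterator usage pattern exercised below,
+// assuming `db` is an open DB* (public API only):
+//
+//   ReadOptions ro;
+//   ro.tailing = true;  // the iterator can see writes made after creation
+//   std::unique_ptr<Iterator> it(db->NewIterator(ro));
+//   it->SeekToFirst();          // may be !Valid() if the DB is still empty
+//   // ... writes happen ...
+//   it->Seek("some_key");       // re-seek to observe the new data
+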
+TEST_P(DBTestTailingIterator, TailingIteratorSingle) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->SeekToFirst();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ // add a record and check that iter can see it
+ ASSERT_OK(db_->Put(WriteOptions(), "mirko", "fodor"));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "mirko");
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorKeepAdding) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+ std::string value(1024, 'a');
+
+ const int num_records = 10000;
+ for (int i = 0; i < num_records; ++i) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%016d", i);
+
+ Slice key(buf, 16);
+ ASSERT_OK(Put(1, key, value));
+
+ iter->Seek(key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorSeekToNext) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+ std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(itern->status());
+ std::string value(1024, 'a');
+
+ const int num_records = 1000;
+ for (int i = 1; i < num_records; ++i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ if (i == 1) {
+ itern->SeekToFirst();
+ } else {
+ itern->Next();
+ }
+ ASSERT_TRUE(itern->Valid());
+ ASSERT_EQ(itern->key().compare(key), 0);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ for (int i = 2 * num_records; i > 0; --i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorTrimSeekToNext) {
+ const uint64_t k150KB = 150 * 1024;
+ Options options;
+ options.write_buffer_size = k150KB;
+ options.max_write_buffer_number = 3;
+ options.min_write_buffer_number_to_merge = 2;
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ int num_iters, deleted_iters;
+
+ char bufe[32];
+ snprintf(bufe, sizeof(bufe), "00b0%016d", 0);
+ Slice keyu(bufe, 20);
+ read_options.iterate_upper_bound = &keyu;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+ std::unique_ptr<Iterator> itern(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(itern->status());
+ std::unique_ptr<Iterator> iterh(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iterh->status());
+ std::string value(1024, 'a');
+ bool file_iters_deleted = false;
+ bool file_iters_renewed_null = false;
+ bool file_iters_renewed_copy = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::SeekInternal:Return", [&](void* arg) {
+ ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+ ASSERT_TRUE(!file_iters_deleted ||
+ fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::Next:Return", [&](void* arg) {
+ ForwardIterator* fiter = reinterpret_cast<ForwardIterator*>(arg);
+ ASSERT_TRUE(!file_iters_deleted ||
+ fiter->TEST_CheckDeletedIters(&deleted_iters, &num_iters));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::RenewIterators:Null",
+ [&](void* /*arg*/) { file_iters_renewed_null = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::RenewIterators:Copy",
+ [&](void* /*arg*/) { file_iters_renewed_copy = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ const int num_records = 1000;
+ for (int i = 1; i < num_records; ++i) {
+ char buf1[32];
+ char buf2[32];
+ char buf3[32];
+ char buf4[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+ snprintf(buf3, sizeof(buf3), "00b0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+ Slice keyn(buf3, 20);
+ ASSERT_OK(Put(1, keyn, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ if (i == 299) {
+ file_iters_deleted = true;
+ }
+ snprintf(buf4, sizeof(buf4), "00a0%016d", i * 5 / 2);
+ Slice target(buf4, 20);
+ iterh->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ for (int j = (i + 1) * 5 / 2; j < i * 5; j += 5) {
+ iterh->Next();
+ ASSERT_TRUE(iterh->Valid());
+ }
+ if (i == 299) {
+ file_iters_deleted = false;
+ }
+ }
+
+ file_iters_deleted = true;
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ ASSERT_LE(num_iters, 1);
+ if (i == 1) {
+ itern->SeekToFirst();
+ } else {
+ itern->Next();
+ }
+ ASSERT_TRUE(itern->Valid());
+ ASSERT_EQ(itern->key().compare(key), 0);
+ ASSERT_LE(num_iters, 1);
+ file_iters_deleted = false;
+ }
+ ASSERT_TRUE(file_iters_renewed_null);
+ ASSERT_TRUE(file_iters_renewed_copy);
+ iter = nullptr;
+ itern = nullptr;
+ iterh = nullptr;
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ read_options.read_tier = kBlockCacheTier;
+ std::unique_ptr<Iterator> iteri(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iteri->status());
+ char buf5[32];
+ snprintf(buf5, sizeof(buf5), "00a0%016d", (num_records / 2) * 5 - 2);
+ Slice target1(buf5, 20);
+ iteri->Seek(target1);
+ ASSERT_TRUE(iteri->status().IsIncomplete());
+ iteri = nullptr;
+
+ read_options.read_tier = kReadAllTier;
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ iter.reset(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+ for (int i = 2 * num_records; i > 0; --i) {
+ char buf1[32];
+ char buf2[32];
+ snprintf(buf1, sizeof(buf1), "00a0%016d", i * 5);
+
+ Slice key(buf1, 20);
+ ASSERT_OK(Put(1, key, value));
+
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush(1));
+ }
+
+ snprintf(buf2, sizeof(buf2), "00a0%016d", i * 5 - 2);
+ Slice target(buf2, 20);
+ iter->Seek(target);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(key), 0);
+ }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorDeletes) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+
+ // write a single record, read it using the iterator, then delete it
+ ASSERT_OK(Put(1, "0test", "test"));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "0test");
+ ASSERT_OK(Delete(1, "0test"));
+
+ // write many more records
+ const int num_records = 10000;
+ std::string value(1024, 'A');
+
+ for (int i = 0; i < num_records; ++i) {
+ char buf[32];
+ snprintf(buf, sizeof(buf), "1%015d", i);
+
+ Slice key(buf, 16);
+ ASSERT_OK(Put(1, key, value));
+ }
+
+ // force a flush to make sure that no records are read from memtable
+ ASSERT_OK(Flush(1));
+
+ // skip "0test"
+ iter->Next();
+
+ // make sure we can read all new records using the existing iterator
+ int count = 0;
+ for (; iter->Valid(); iter->Next(), ++count)
+ ;
+
+ ASSERT_EQ(count, num_records);
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorPrefixSeek) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+ options.allow_concurrent_memtable_write = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(iter->status());
+ ASSERT_OK(Put(1, "0101", "test"));
+
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(Put(1, "0202", "test"));
+
+ // Seek(0102) shouldn't find any records since 0202 has a different prefix
+ iter->Seek("0102");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("0202");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "0202");
+
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorIncomplete) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ read_options.read_tier = kBlockCacheTier;
+
+ std::string key("key");
+ std::string value("value");
+
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+ iter->SeekToFirst();
+ // we either see the entry or it's not in cache
+ ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ iter->SeekToFirst();
+ // should still be true after compaction
+ ASSERT_TRUE(iter->Valid() || iter->status().IsIncomplete());
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorSeekToSame) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 1000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ const int NROWS = 10000;
+ // Write rows with keys 00000, 00002, 00004 etc.
+ for (int i = 0; i < NROWS; ++i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%05d", 2 * i);
+ std::string key(buf);
+ std::string value("value");
+ ASSERT_OK(db_->Put(WriteOptions(), key, value));
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+ // Seek to 00001. We expect to find 00002.
+ std::string start_key = "00001";
+ iter->Seek(start_key);
+ ASSERT_TRUE(iter->Valid());
+
+ std::string found = iter->key().ToString();
+ ASSERT_EQ("00002", found);
+
+ // Now seek to the same key. The iterator should remain in the same
+ // position.
+ iter->Seek(found);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(found, iter->key().ToString());
+}
+
+// Sets iterate_upper_bound and verifies that ForwardIterator doesn't call
+// Seek() on immutable iterators when target key is >= prev_key and all
+// iterators, including the memtable iterator, are over the upper bound.
+TEST_P(DBTestTailingIterator, TailingIteratorUpperBound) {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ const Slice upper_bound("20", 3);
+ ReadOptions read_options;
+ read_options.tailing = true;
+ read_options.iterate_upper_bound = &upper_bound;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ ASSERT_OK(Put(1, "11", "11"));
+ ASSERT_OK(Put(1, "12", "12"));
+ ASSERT_OK(Put(1, "22", "22"));
+ ASSERT_OK(Flush(1)); // flush all those keys to an immutable SST file
+
+ // Add another key to the memtable.
+ ASSERT_OK(Put(1, "21", "21"));
+
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+ ASSERT_OK(it->status());
+ it->Seek("12");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("12", it->key().ToString());
+
+ it->Next();
+ // Not valid since "21" is over the upper bound.
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ // This keeps track of the number of times NeedToSeekImmutable() was true.
+ int immutable_seeks = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ForwardIterator::SeekInternal:Immutable",
+ [&](void* /*arg*/) { ++immutable_seeks; });
+
+ // Seek to 13. This should not require any immutable seeks.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ it->Seek("13");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ if (GetParam()) {
+ ASSERT_EQ(1, immutable_seeks);
+ } else {
+ ASSERT_EQ(0, immutable_seeks);
+ }
+}
+
+TEST_P(DBTestTailingIterator, TailingIteratorGap) {
+ // level 1: [20, 25] [35, 40]
+ // level 2: [10 - 15] [45 - 50]
+ // level 3: [20, 30, 40]
+  // Previously there was a bug in the tailing iterator: if there was a gap in
+  // a lower level, a key would be skipped if it fell between the largest key
+  // of file n and the smallest key of file n+1 and both files fit in that
+  // gap. In this example, 25 < key < 35.
+ // https://github.com/facebook/rocksdb/issues/1372
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ ASSERT_OK(Put(1, "20", "20"));
+ ASSERT_OK(Put(1, "30", "30"));
+ ASSERT_OK(Put(1, "40", "40"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(3, 1);
+
+ ASSERT_OK(Put(1, "10", "10"));
+ ASSERT_OK(Put(1, "15", "15"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "45", "45"));
+ ASSERT_OK(Put(1, "50", "50"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ ASSERT_OK(Put(1, "20", "20"));
+ ASSERT_OK(Put(1, "25", "25"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "35", "35"));
+ ASSERT_OK(Put(1, "40", "40"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(1, 1);
+
+ ColumnFamilyMetaData meta;
+ db_->GetColumnFamilyMetaData(handles_[1], &meta);
+
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_options, handles_[1]));
+ it->Seek("30");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("30", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("35", it->key().ToString());
+
+ it->Next();
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("40", it->key().ToString());
+
+ ASSERT_OK(it->status());
+}
+
+TEST_P(DBTestTailingIterator, SeekWithUpperBoundBug) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ const Slice upper_bound("cc", 3);
+ read_options.iterate_upper_bound = &upper_bound;
+
+ // 1st L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
+ ASSERT_OK(Flush());
+
+ // 2nd L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
+ ASSERT_OK(Flush());
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ iter->Seek("aa");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+}
+
+TEST_P(DBTestTailingIterator, SeekToFirstWithUpperBoundBug) {
+ ReadOptions read_options;
+ read_options.tailing = true;
+ if (GetParam()) {
+ read_options.async_io = true;
+ }
+ const Slice upper_bound("cc", 3);
+ read_options.iterate_upper_bound = &upper_bound;
+
+ // 1st L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "aa", "SEEN"));
+ ASSERT_OK(Flush());
+
+ // 2nd L0 file
+ ASSERT_OK(db_->Put(WriteOptions(), "zz", "NOT-SEEN"));
+ ASSERT_OK(Flush());
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ ASSERT_OK(iter->status());
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().ToString(), "aa");
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_test.cc b/src/rocksdb/db/db_test.cc
new file mode 100644
index 000000000..9575248b4
--- /dev/null
+++ b/src/rocksdb/db/db_test.cc
@@ -0,0 +1,7397 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// The introduction of SyncPoint effectively disabled building and running
+// this test in Release builds, which is a pity, since it is a good test.
+#include <fcntl.h>
+
+#include <algorithm>
+#include <set>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+
+#ifndef OS_WIN
+#include <unistd.h>
+#endif
+#ifdef OS_SOLARIS
+#include <alloca.h>
+#endif
+
+#include "cache/lru_cache.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/snapshot.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "rocksdb/thread_status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "rocksdb/utilities/optimistic_transaction_db.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "table/mock_table.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Note that DBTest and all of its child classes disable fsync on files
+// and directories for speed.
+// If fsync needs to be covered in a test, put that test elsewhere.
+class DBTest : public DBTestBase {
+ public:
+ DBTest() : DBTestBase("db_test", /*env_do_fsync=*/false) {}
+};
+
+class DBTestWithParam
+ : public DBTest,
+ public testing::WithParamInterface<std::tuple<uint32_t, bool>> {
+ public:
+ DBTestWithParam() {
+ max_subcompactions_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+
+ // Required if inheriting from testing::WithParamInterface<>
+ static void SetUpTestCase() {}
+ static void TearDownTestCase() {}
+
+ uint32_t max_subcompactions_;
+ bool exclusive_manual_compaction_;
+};
+
+TEST_F(DBTest, MockEnvTest) {
+ std::unique_ptr<MockEnv> env{MockEnv::Create(Env::Default())};
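+  // MockEnv keeps all files in memory, so the "/dir/db" path below never
+  // touches the real filesystem.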
+ Options options;
+ options.create_if_missing = true;
+ options.env = env.get();
+ DB* db;
+
+ const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+ const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+ }
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ Iterator* iterator = db->NewIterator(ReadOptions());
+ iterator->SeekToFirst();
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_TRUE(keys[i] == iterator->key());
+ ASSERT_TRUE(vals[i] == iterator->value());
+ iterator->Next();
+ }
+ ASSERT_TRUE(!iterator->Valid());
+ delete iterator;
+
+// TEST_FlushMemTable() is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+#endif // ROCKSDB_LITE
+
+ delete db;
+}
+
+// NewMemEnv returns nullptr in ROCKSDB_LITE since class InMemoryEnv isn't
+// defined.
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, MemEnvTest) {
+ std::unique_ptr<Env> env{NewMemEnv(Env::Default())};
+ Options options;
+ options.create_if_missing = true;
+ options.env = env.get();
+ DB* db;
+
+ const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")};
+ const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")};
+
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i]));
+ }
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ Iterator* iterator = db->NewIterator(ReadOptions());
+ iterator->SeekToFirst();
+ for (size_t i = 0; i < 3; ++i) {
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_TRUE(keys[i] == iterator->key());
+ ASSERT_TRUE(vals[i] == iterator->value());
+ iterator->Next();
+ }
+ ASSERT_TRUE(!iterator->Valid());
+ delete iterator;
+
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db);
+ ASSERT_OK(dbi->TEST_FlushMemTable());
+
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+
+ delete db;
+
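+  // Reopen against the same in-memory env; the data written above should
+  // still be readable because the env, not the DB, owns the file contents.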
+ options.create_if_missing = false;
+ ASSERT_OK(DB::Open(options, "/dir/db", &db));
+ for (size_t i = 0; i < 3; ++i) {
+ std::string res;
+ ASSERT_OK(db->Get(ReadOptions(), keys[i], &res));
+ ASSERT_TRUE(res == vals[i]);
+ }
+ delete db;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, WriteEmptyBatch) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+ WriteBatch empty_batch;
+ ASSERT_OK(dbfull()->Write(wo, &empty_batch));
+
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+ ASSERT_EQ("bar", Get(1, "foo"));
+}
+
+TEST_F(DBTest, SkipDelay) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (bool sync : {true, false}) {
+ for (bool disableWAL : {true, false}) {
+ if (sync && disableWAL) {
+ // sync and disableWAL is incompatible.
+ continue;
+ }
+ // Use a small number to ensure a large delay that is still effective
+ // when we do Put
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Sleep",
+ [&](void* /*arg*/) { sleep_count.fetch_add(1); });
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { wait_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = sync;
+ wo.disableWAL = disableWAL;
+ wo.no_slowdown = true;
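+      // With no_slowdown set, a write that would otherwise be delayed or
+      // stalled is expected to fail immediately instead of blocking.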
+ // Large enough to exceed allowance for one time interval
+ std::string large_value(1024, 'x');
+ // Perhaps ideally this first write would fail because of delay, but
+ // the current implementation does not guarantee that.
+ dbfull()->Put(wo, "foo", large_value).PermitUncheckedError();
+ // We need the 2nd write to trigger delay. This is because delay is
+ // estimated based on the last write size which is 0 for the first write.
+ ASSERT_NOK(dbfull()->Put(wo, "foo2", large_value));
+ ASSERT_GE(sleep_count.load(), 0);
+ ASSERT_GE(wait_count.load(), 0);
+ token.reset();
+
+ token = dbfull()->TEST_write_controler().GetDelayToken(1000000);
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", large_value));
+ ASSERT_GE(sleep_count.load(), 1);
+ token.reset();
+ }
+ }
+}
+
+TEST_F(DBTest, MixedSlowdownOptions) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
+ // Use a small number to ensure a large delay that is still effective
+ // when we do Put
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:BeginWriteStallDone", [&](void* /*arg*/) {
+ sleep_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_slowdown_func);
+ }
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
+ // We need the 2nd write to trigger delay. This is because delay is
+ // estimated based on the last write size which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_GE(sleep_count.load(), 1);
+
+ wo.no_slowdown = true;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsInQueue) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
+ // Use a small number to ensure a large delay that is still effective
+ // when we do Put
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetDelayToken(1);
+ std::atomic<int> sleep_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Sleep", [&](void* /*arg*/) {
+ sleep_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+          // Sleep for 3s to allow the threads to insert themselves into the
+ // write queue
+ env_->SleepForMicroseconds(3000000ULL);
+ }
+ });
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { wait_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
+ // We need the 2nd write to trigger delay. This is because delay is
+ // estimated based on the last write size which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_EQ(sleep_count.load(), 1);
+ ASSERT_GE(wait_count.load(), 0);
+}
+
+TEST_F(DBTest, MixedSlowdownOptionsStop) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ ASSERT_NOK(dbfull()->Put(wo, key, "bar"));
+ };
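+  // Helper that signals the DB's background condition variable so that a
+  // writer stalled on the stop token wakes up once the token is released.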
+ std::function<void()> wakeup_writer = [&]() {
+ dbfull()->mutex_.Lock();
+ dbfull()->bg_cv_.SignalAll();
+ dbfull()->mutex_.Unlock();
+ };
+ // Use a small number to ensure a large delay that is still effective
+ // when we do Put
+ // TODO(myabandeh): this is time dependent and could potentially make
+ // the test flaky
+ auto token = dbfull()->TEST_write_controler().GetStopToken();
+ std::atomic<int> wait_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+ wait_count.fetch_add(1);
+ if (threads.empty()) {
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_slowdown_func);
+ }
+ for (int i = 0; i < 2; ++i) {
+ threads.emplace_back(write_no_slowdown_func);
+ }
+          // Sleep for 3s to allow the threads to insert themselves into the
+ // write queue
+ env_->SleepForMicroseconds(3000000ULL);
+ }
+ token.reset();
+ threads.emplace_back(wakeup_writer);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ WriteOptions wo;
+ wo.sync = false;
+ wo.disableWAL = false;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, "foo", "bar"));
+ // We need the 2nd write to trigger delay. This is because delay is
+ // estimated based on the last write size which is 0 for the first write.
+ ASSERT_OK(dbfull()->Put(wo, "foo2", "bar2"));
+ token.reset();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ASSERT_GE(wait_count.load(), 1);
+
+ wo.no_slowdown = true;
+ ASSERT_OK(dbfull()->Put(wo, "foo3", "bar"));
+}
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, LevelLimitReopen) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ const std::string value(1024 * 1024, ' ');
+ int i = 0;
+ while (NumTableFilesAtLevel(2, 1) == 0) {
+ ASSERT_OK(Put(1, Key(i++), value));
+ }
+
+ options.num_levels = 1;
+ options.max_bytes_for_level_multiplier_additional.resize(1, 1);
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(s.IsInvalidArgument(), true);
+ ASSERT_EQ(s.ToString(),
+ "Invalid argument: db has more levels than options.num_levels");
+
+ options.num_levels = 10;
+ options.max_bytes_for_level_multiplier_additional.resize(10, 1);
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, LevelReopenWithFIFO) {
+ const int kLevelCount = 4;
+ const int kKeyCount = 5;
+ const int kTotalSstFileCount = kLevelCount * kKeyCount;
+ const int kCF = 1;
+
+ Options options = CurrentOptions();
+  // Configure level0_file_num_compaction_trigger to prevent L0 files from
+  // being automatically compacted while we are constructing an LSM tree
+  // structure to test multi-level FIFO compaction.
+ options.level0_file_num_compaction_trigger = kKeyCount + 1;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // The expected number of files per level after each file creation.
+ const std::string expected_files_per_level[kLevelCount][kKeyCount] = {
+ {"0,0,0,1", "0,0,0,2", "0,0,0,3", "0,0,0,4", "0,0,0,5"},
+ {"0,0,1,5", "0,0,2,5", "0,0,3,5", "0,0,4,5", "0,0,5,5"},
+ {"0,1,5,5", "0,2,5,5", "0,3,5,5", "0,4,5,5", "0,5,5,5"},
+ {"1,5,5,5", "2,5,5,5", "3,5,5,5", "4,5,5,5", "5,5,5,5"},
+ };
+
+ const std::string expected_entries[kKeyCount][kLevelCount + 1] = {
+ {"[ ]", "[ a3 ]", "[ a2, a3 ]", "[ a1, a2, a3 ]", "[ a0, a1, a2, a3 ]"},
+ {"[ ]", "[ b3 ]", "[ b2, b3 ]", "[ b1, b2, b3 ]", "[ b0, b1, b2, b3 ]"},
+ {"[ ]", "[ c3 ]", "[ c2, c3 ]", "[ c1, c2, c3 ]", "[ c0, c1, c2, c3 ]"},
+ {"[ ]", "[ d3 ]", "[ d2, d3 ]", "[ d1, d2, d3 ]", "[ d0, d1, d2, d3 ]"},
+ {"[ ]", "[ e3 ]", "[ e2, e3 ]", "[ e1, e2, e3 ]", "[ e0, e1, e2, e3 ]"},
+ };
+
+  // The loop below creates the following LSM tree, where each (k, v) pair
+  // represents a file that contains that entry. Each time a file is created,
+  // the db is reopened with FIFO compaction and we verify that the LSM tree
+  // structure is still the same.
+  //
+  // The resulting LSM tree will contain 5 different keys. Each key has
+  // 4 different versions, each located in a different level.
+ //
+ // L0: (e, e0) (d, d0) (c, c0) (b, b0) (a, a0)
+ // L1: (a, a1) (b, b1) (c, c1) (d, d1) (e, e1)
+ // L2: (a, a2) (b, b2) (c, c2) (d, d2) (e, e2)
+ // L3: (a, a3) (b, b3) (c, c3) (d, d3) (e, e3)
+ for (int l = 0; l < kLevelCount; ++l) {
+ int level = kLevelCount - 1 - l;
+ for (int p = 0; p < kKeyCount; ++p) {
+ std::string put_key = std::string(1, char('a' + p));
+ ASSERT_OK(Put(kCF, put_key, put_key + std::to_string(level)));
+ ASSERT_OK(Flush(kCF));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ for (int g = 0; g < kKeyCount; ++g) {
+ int entry_count = (p >= g) ? l + 1 : l;
+ std::string get_key = std::string(1, char('a' + g));
+ CheckAllEntriesWithFifoReopen(expected_entries[g][entry_count], get_key,
+ kCF, {"pikachu"}, options);
+ }
+ if (level != 0) {
+ MoveFilesToLevel(level, kCF);
+ for (int g = 0; g < kKeyCount; ++g) {
+ int entry_count = (p >= g) ? l + 1 : l;
+ std::string get_key = std::string(1, char('a' + g));
+ CheckAllEntriesWithFifoReopen(expected_entries[g][entry_count],
+ get_key, kCF, {"pikachu"}, options);
+ }
+ }
+ ASSERT_EQ(expected_files_per_level[l][p], FilesPerLevel(kCF));
+ }
+ }
+
+ // The expected number of sst files in each level after each FIFO compaction
+ // that deletes the oldest sst file.
+ const std::string expected_files_per_level_after_fifo[] = {
+ "5,5,5,4", "5,5,5,3", "5,5,5,2", "5,5,5,1", "5,5,5", "5,5,4", "5,5,3",
+ "5,5,2", "5,5,1", "5,5", "5,4", "5,3", "5,2", "5,1",
+ "5", "4", "3", "2", "1", "",
+ };
+
+  // The expected value entries of each key after each FIFO compaction.
+  // This verifies whether FIFO removes the files with the smallest keys in
+  // the non-L0 levels first, and then the oldest files in L0.
+ const std::string expected_entries_after_fifo[kKeyCount][kLevelCount + 1] = {
+ {"[ a0, a1, a2, a3 ]", "[ a0, a1, a2 ]", "[ a0, a1 ]", "[ a0 ]", "[ ]"},
+ {"[ b0, b1, b2, b3 ]", "[ b0, b1, b2 ]", "[ b0, b1 ]", "[ b0 ]", "[ ]"},
+ {"[ c0, c1, c2, c3 ]", "[ c0, c1, c2 ]", "[ c0, c1 ]", "[ c0 ]", "[ ]"},
+ {"[ d0, d1, d2, d3 ]", "[ d0, d1, d2 ]", "[ d0, d1 ]", "[ d0 ]", "[ ]"},
+ {"[ e0, e1, e2, e3 ]", "[ e0, e1, e2 ]", "[ e0, e1 ]", "[ e0 ]", "[ ]"},
+ };
+
+  // In the 2nd phase, we reopen the DB with FIFO compaction. On each reopen,
+  // we configure max_table_files_size so that FIFO will remove exactly one
+  // file at a time upon compaction, and we use this to verify whether the
+  // sst files are deleted in the correct order.
+ for (int i = 0; i < kTotalSstFileCount; ++i) {
+ uint64_t total_sst_files_size = 0;
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ handles_[1], "rocksdb.total-sst-files-size", &total_sst_files_size));
+ ASSERT_TRUE(total_sst_files_size > 0);
+
+ Options fifo_options(options);
+ fifo_options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = false;
+ fifo_options.max_open_files = -1;
+ fifo_options.disable_auto_compactions = false;
+ // Config max_table_files_size to be total_sst_files_size - 1 so that
+ // FIFO will delete one file.
+ fifo_options.compaction_options_fifo.max_table_files_size =
+ total_sst_files_size - 1;
+ ASSERT_OK(
+ TryReopenWithColumnFamilies({"default", "pikachu"}, fifo_options));
+ // For FIFO to pick a compaction
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(false));
+ for (int g = 0; g < kKeyCount; ++g) {
+ std::string get_key = std::string(1, char('a' + g));
+ int status_index = i / kKeyCount;
+ if ((i % kKeyCount) >= g) {
+        // The sst file containing get_key in the current level has already
+        // been deleted, so advance status_index to check the expected value.
+ status_index++;
+ }
+ CheckAllEntriesWithFifoReopen(
+ expected_entries_after_fifo[g][status_index], get_key, kCF,
+ {"pikachu"}, options);
+ }
+ ASSERT_EQ(expected_files_per_level_after_fifo[i], FilesPerLevel(kCF));
+ }
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(DBTest, PutSingleDeleteGet) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo2", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo2"));
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBTest, ReadFromPersistedTier) {
+ do {
+ Random rnd(301);
+ Options options = CurrentOptions();
+ for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) {
+ CreateAndReopenWithCF({"pikachu"}, options);
+ WriteOptions wopt;
+ wopt.disableWAL = (disableWAL == 1);
+ // 1st round: put but not flush
+ ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first"));
+ ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one"));
+ ASSERT_EQ("first", Get(1, "foo"));
+ ASSERT_EQ("one", Get(1, "bar"));
+
+      // Read directly from persisted data.
+ ReadOptions ropt;
+ ropt.read_tier = kPersistedTier;
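+      // kPersistedTier only returns data that has been persisted: entries
+      // written with the WAL enabled or already flushed to SST files.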
+ std::string value;
+ if (wopt.disableWAL) {
+        // As the data has not yet been flushed, we expect NotFound.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ }
+
+ // Multiget
+ std::vector<ColumnFamilyHandle*> multiget_cfs;
+ multiget_cfs.push_back(handles_[1]);
+ multiget_cfs.push_back(handles_[1]);
+ std::vector<Slice> multiget_keys;
+ multiget_keys.push_back("foo");
+ multiget_keys.push_back("bar");
+ std::vector<std::string> multiget_values;
+ auto statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ } else {
+ ASSERT_OK(statuses[0]);
+ ASSERT_OK(statuses[1]);
+ }
+
+ // 2nd round: flush and put a new value in memtable.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello"));
+
+ // once the data has been flushed, we are able to get the
+ // data when kPersistedTier is used.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok());
+ ASSERT_EQ(value, "first");
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+ ASSERT_EQ(value, "one");
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(
+ db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound());
+ } else {
+ ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value));
+ ASSERT_EQ(value, "hello");
+ }
+
+ // Expect same result in multiget
+ multiget_cfs.push_back(handles_[1]);
+ multiget_keys.push_back("rocksdb");
+ statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ ASSERT_TRUE(statuses[0].ok());
+ ASSERT_EQ("first", multiget_values[0]);
+ ASSERT_TRUE(statuses[1].ok());
+ ASSERT_EQ("one", multiget_values[1]);
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[2].IsNotFound());
+ } else {
+ ASSERT_OK(statuses[2]);
+ }
+
+ // 3rd round: delete and flush
+ ASSERT_OK(db_->Delete(wopt, handles_[1], "foo"));
+      ASSERT_OK(Flush(1));
+ ASSERT_OK(db_->Delete(wopt, handles_[1], "bar"));
+
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound());
+ if (wopt.disableWAL) {
+        // Still expect to find the value, as its delete has not yet been
+        // flushed.
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok());
+ ASSERT_EQ(value, "one");
+ } else {
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound());
+ }
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok());
+ ASSERT_EQ(value, "hello");
+
+ statuses =
+ db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ if (wopt.disableWAL) {
+ ASSERT_TRUE(statuses[1].ok());
+ ASSERT_EQ("one", multiget_values[1]);
+ } else {
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ }
+ ASSERT_TRUE(statuses[2].ok());
+ ASSERT_EQ("hello", multiget_values[2]);
+ if (wopt.disableWAL == 0) {
+ DestroyAndReopen(options);
+ }
+ }
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, SingleDeleteFlush) {
+ // Test to check whether flushing preserves a single delete hidden
+ // behind a put.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+    // Put values on the second level (so that they will not be in the same
+    // compaction as the other operations).
+ ASSERT_OK(Put(1, "foo", "first"));
+ ASSERT_OK(Put(1, "bar", "one"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+
+ // (Single) delete hidden by a put
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "second"));
+ ASSERT_OK(Delete(1, "bar"));
+ ASSERT_OK(Put(1, "bar", "two"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_OK(Delete(1, "bar"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+
+ ASSERT_EQ("NOT_FOUND", Get(1, "bar"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+    // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+TEST_F(DBTest, SingleDeletePutFlush) {
+ // Single deletes that encounter the matching put in a flush should get
+ // removed.
+ do {
+ Random rnd(301);
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", Slice()));
+ ASSERT_OK(Put(1, "a", Slice()));
+ ASSERT_OK(SingleDelete(1, "a"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("[ ]", AllEntriesFor("a", 1));
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+// Disabled because not all platforms can run it.
+// It requires more than 9GB of memory to run, with a single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_SanitizeVeryVeryLargeValue) {
+ const size_t kValueSize = 4 * size_t{1024 * 1024 * 1024}; // 4GB value
+ std::string raw(kValueSize, 'v');
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.write_buffer_size = 100000; // Small write buffer
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("boo", "v1"));
+ ASSERT_TRUE(Put("foo", raw).IsInvalidArgument());
+ ASSERT_TRUE(Merge("foo", raw).IsInvalidArgument());
+
+ WriteBatch wb;
+ ASSERT_TRUE(wb.Put("foo", raw).IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge("foo", raw).IsInvalidArgument());
+
+ Slice value_slice = raw;
+ Slice key_slice = "foo";
+ SliceParts sp_key(&key_slice, 1);
+ SliceParts sp_value(&value_slice, 1);
+
+ ASSERT_TRUE(wb.Put(sp_key, sp_value).IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge(sp_key, sp_value).IsInvalidArgument());
+}
+
+// Disabled because not all platforms can run it.
+// It requires more than 9GB of memory to run, with a single allocation
+// of more than 3GB.
+TEST_F(DBTest, DISABLED_VeryLargeValue) {
+ const size_t kValueSize = 3221225472u; // 3GB value
+ const size_t kKeySize = 8388608u; // 8MB key
+ std::string raw(kValueSize, 'v');
+ std::string key1(kKeySize, 'c');
+ std::string key2(kKeySize, 'd');
+
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("boo", "v1"));
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put(key1, raw));
+ raw[0] = 'w';
+ ASSERT_OK(Put(key2, raw));
+  ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+#endif // !ROCKSDB_LITE
+
+ std::string value;
+ Status s = db_->Get(ReadOptions(), key1, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('v', value[0]);
+
+ s = db_->Get(ReadOptions(), key2, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('w', value[0]);
+
+ // Compact all files.
+  ASSERT_OK(Flush());
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Check DB is not in read-only state.
+ ASSERT_OK(Put("boo", "v1"));
+
+ s = db_->Get(ReadOptions(), key1, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('v', value[0]);
+
+ s = db_->Get(ReadOptions(), key2, &value);
+ ASSERT_OK(s);
+ ASSERT_EQ(kValueSize, value.size());
+ ASSERT_EQ('w', value[0]);
+}
+
+TEST_F(DBTest, GetFromImmutableLayer) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+
+ // Block sync calls
+ env_->delay_sstable_sync_.store(true, std::memory_order_release);
+ ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable
+ ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(0, "foo"));
+ // Release sync calls
+ env_->delay_sstable_sync_.store(false, std::memory_order_release);
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetLevel0Ordering) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Check that we process level-0 files in correct order. The code
+ // below generates two level-0 files where the earlier one comes
+ // before the later one in the level-0 file list since the earlier
+ // one has a smaller "smallest" key.
+ ASSERT_OK(Put(1, "bar", "b"));
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, WrongLevel0Config) {
+ Options options = CurrentOptions();
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options));
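+  // The L0 triggers below are intentionally misordered (stop < slowdown <
+  // compaction trigger); opening is still expected to succeed, presumably
+  // because the options are sanitized internally.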
+ options.level0_stop_writes_trigger = 1;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_file_num_compaction_trigger = 3;
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, GetOrderedByLevels) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ Compact(1, "a", "z");
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("v2", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetPicksCorrectFile) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ // Arrange to have multiple files in a non-level-0 level.
+ ASSERT_OK(Put(1, "a", "va"));
+ Compact(1, "a", "b");
+ ASSERT_OK(Put(1, "x", "vx"));
+ Compact(1, "x", "y");
+ ASSERT_OK(Put(1, "f", "vf"));
+ Compact(1, "f", "g");
+ ASSERT_EQ("va", Get(1, "a"));
+ ASSERT_EQ("vf", Get(1, "f"));
+ ASSERT_EQ("vx", Get(1, "x"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, GetEncountersEmptyLevel) {
+ do {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Arrange for the following to happen:
+ // * sstable A in level 0
+ // * nothing in level 1
+ // * sstable B in level 2
+ // Then do enough Get() calls to arrange for an automatic compaction
+ // of sstable A. A bug would cause the compaction to be marked as
+ // occurring at level 1 (instead of the correct level 0).
+
+ // Step 1: First place sstables in levels 0 and 2
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(2, 1), 0);
+
+ // Step 2: clear level 1 if necessary.
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 1), 1);
+
+ // Step 3: read a bunch of times
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_EQ("NOT_FOUND", Get(1, "missing"));
+ }
+
+ // Step 4: Wait for compaction to finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 1); // XXX
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FlushMultipleMemtable) {
+ do {
+ Options options = CurrentOptions();
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ options.max_write_buffer_number = 4;
+ options.min_write_buffer_number_to_merge = 3;
+ options.max_write_buffer_size_to_maintain = -1;
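+    // With min_write_buffer_number_to_merge = 3, several immutable memtables
+    // can be merged into a single L0 file when flushed.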
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+ ASSERT_OK(Flush(1));
+ } while (ChangeCompactOptions());
+}
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushSchedule) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.min_write_buffer_number_to_merge = 1;
+ options.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(options.write_buffer_size);
+ options.max_write_buffer_number = 2;
+ options.write_buffer_size = 120 * 1024;
+ auto flush_listener = std::make_shared<FlushCounterListener>();
+ flush_listener->expected_flush_reason = FlushReason::kWriteBufferFull;
+ options.listeners.push_back(flush_listener);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ std::vector<port::Thread> threads;
+
+ std::atomic<int> thread_num(0);
+  // Each column family will have 5 threads, each thread generating 2
+  // memtables. Each column family should end up with up to 10 table files.
+ std::function<void()> fill_memtable_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ Random rnd(a);
+ WriteOptions wo;
+ // this should fill up 2 memtables
+ for (int k = 0; k < 5000; ++k) {
+ ASSERT_OK(db_->Put(wo, handles_[a & 1], rnd.RandomString(13), ""));
+ }
+ };
+
+ for (int i = 0; i < 10; ++i) {
+ threads.emplace_back(fill_memtable_func);
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default");
+ auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu");
+ ASSERT_LE(default_tables, static_cast<uint64_t>(10));
+ ASSERT_GT(default_tables, static_cast<uint64_t>(0));
+ ASSERT_LE(pikachu_tables, static_cast<uint64_t>(10));
+ ASSERT_GT(pikachu_tables, static_cast<uint64_t>(0));
+}
+#endif // ROCKSDB_LITE
+
+namespace {
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false)
+ : check_context_(check_context) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+};
+
+class DelayFilter : public CompactionFilter {
+ public:
+ explicit DelayFilter(DBTestBase* d) : db_test(d) {}
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ db_test->env_->MockSleepForMicroseconds(1000);
+ return true;
+ }
+
+ const char* Name() const override { return "DelayFilter"; }
+
+ private:
+ DBTestBase* db_test;
+};
+
+class DelayFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit DelayFilterFactory(DBTestBase* d) : db_test(d) {}
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& /*context*/) override {
+ return std::unique_ptr<CompactionFilter>(new DelayFilter(db_test));
+ }
+
+ const char* Name() const override { return "DelayFilterFactory"; }
+
+ private:
+ DBTestBase* db_test;
+};
+} // anonymous namespace
+
+#ifndef ROCKSDB_LITE
+
+static std::string CompressibleString(Random* rnd, int len) {
+ std::string r;
+ test::CompressibleString(rnd, 0.8, len, &r);
+ return r;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FailMoreDbPaths) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 10000000);
+ options.db_paths.emplace_back(dbname_ + "_2", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_3", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_4", 1000000);
+ options.db_paths.emplace_back(dbname_ + "_5", 1000000);
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+void CheckColumnFamilyMeta(
+ const ColumnFamilyMetaData& cf_meta, const std::string& cf_name,
+ const std::vector<std::vector<FileMetaData>>& files_by_level,
+ uint64_t start_time, uint64_t end_time) {
+ ASSERT_EQ(cf_meta.name, cf_name);
+ ASSERT_EQ(cf_meta.levels.size(), files_by_level.size());
+
+ uint64_t cf_size = 0;
+ size_t file_count = 0;
+
+ for (size_t i = 0; i < cf_meta.levels.size(); ++i) {
+ const auto& level_meta_from_cf = cf_meta.levels[i];
+ const auto& level_meta_from_files = files_by_level[i];
+
+ ASSERT_EQ(level_meta_from_cf.level, i);
+ ASSERT_EQ(level_meta_from_cf.files.size(), level_meta_from_files.size());
+
+ file_count += level_meta_from_cf.files.size();
+
+ uint64_t level_size = 0;
+ for (size_t j = 0; j < level_meta_from_cf.files.size(); ++j) {
+ const auto& file_meta_from_cf = level_meta_from_cf.files[j];
+ const auto& file_meta_from_files = level_meta_from_files[j];
+
+ level_size += file_meta_from_cf.size;
+
+ ASSERT_EQ(file_meta_from_cf.file_number,
+ file_meta_from_files.fd.GetNumber());
+ ASSERT_EQ(file_meta_from_cf.file_number,
+ TableFileNameToNumber(file_meta_from_cf.name));
+ ASSERT_EQ(file_meta_from_cf.size, file_meta_from_files.fd.file_size);
+ ASSERT_EQ(file_meta_from_cf.smallest_seqno,
+ file_meta_from_files.fd.smallest_seqno);
+ ASSERT_EQ(file_meta_from_cf.largest_seqno,
+ file_meta_from_files.fd.largest_seqno);
+ ASSERT_EQ(file_meta_from_cf.smallestkey,
+ file_meta_from_files.smallest.user_key().ToString());
+ ASSERT_EQ(file_meta_from_cf.largestkey,
+ file_meta_from_files.largest.user_key().ToString());
+ ASSERT_EQ(file_meta_from_cf.oldest_blob_file_number,
+ file_meta_from_files.oldest_blob_file_number);
+ ASSERT_EQ(file_meta_from_cf.oldest_ancester_time,
+ file_meta_from_files.oldest_ancester_time);
+ ASSERT_EQ(file_meta_from_cf.file_creation_time,
+ file_meta_from_files.file_creation_time);
+ ASSERT_GE(file_meta_from_cf.file_creation_time, start_time);
+ ASSERT_LE(file_meta_from_cf.file_creation_time, end_time);
+ ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time);
+ ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time);
+ // More from FileStorageInfo
+ ASSERT_EQ(file_meta_from_cf.file_type, kTableFile);
+ ASSERT_EQ(file_meta_from_cf.name,
+ "/" + file_meta_from_cf.relative_filename);
+ ASSERT_EQ(file_meta_from_cf.directory, file_meta_from_cf.db_path);
+ }
+
+ ASSERT_EQ(level_meta_from_cf.size, level_size);
+ cf_size += level_size;
+ }
+
+ ASSERT_EQ(cf_meta.file_count, file_count);
+ ASSERT_EQ(cf_meta.size, cf_size);
+}
+
+void CheckLiveFilesMeta(
+ const std::vector<LiveFileMetaData>& live_file_meta,
+ const std::vector<std::vector<FileMetaData>>& files_by_level) {
+ size_t total_file_count = 0;
+ for (const auto& f : files_by_level) {
+ total_file_count += f.size();
+ }
+
+ ASSERT_EQ(live_file_meta.size(), total_file_count);
+
+ int level = 0;
+ int i = 0;
+
+ for (const auto& meta : live_file_meta) {
+ if (level != meta.level) {
+ level = meta.level;
+ i = 0;
+ }
+
+ ASSERT_LT(i, files_by_level[level].size());
+
+ const auto& expected_meta = files_by_level[level][i];
+
+ ASSERT_EQ(meta.column_family_name, kDefaultColumnFamilyName);
+ ASSERT_EQ(meta.file_number, expected_meta.fd.GetNumber());
+ ASSERT_EQ(meta.file_number, TableFileNameToNumber(meta.name));
+ ASSERT_EQ(meta.size, expected_meta.fd.file_size);
+ ASSERT_EQ(meta.smallest_seqno, expected_meta.fd.smallest_seqno);
+ ASSERT_EQ(meta.largest_seqno, expected_meta.fd.largest_seqno);
+ ASSERT_EQ(meta.smallestkey, expected_meta.smallest.user_key().ToString());
+ ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString());
+ ASSERT_EQ(meta.oldest_blob_file_number,
+ expected_meta.oldest_blob_file_number);
+
+ // More from FileStorageInfo
+ ASSERT_EQ(meta.file_type, kTableFile);
+ ASSERT_EQ(meta.name, "/" + meta.relative_filename);
+ ASSERT_EQ(meta.directory, meta.db_path);
+
+ ++i;
+ }
+}
+
+#ifndef ROCKSDB_LITE
+void AddBlobFile(const ColumnFamilyHandle* cfh, uint64_t blob_file_number,
+ uint64_t total_blob_count, uint64_t total_blob_bytes,
+ const std::string& checksum_method,
+ const std::string& checksum_value,
+ uint64_t garbage_blob_count = 0,
+ uint64_t garbage_blob_bytes = 0) {
+ ColumnFamilyData* cfd =
+ (static_cast<const ColumnFamilyHandleImpl*>(cfh))->cfd();
+ assert(cfd);
+
+ Version* const version = cfd->current();
+ assert(version);
+
+ VersionStorageInfo* const storage_info = version->storage_info();
+ assert(storage_info);
+
+ // Add a live blob file.
+
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+ checksum_value);
+
+ auto meta = BlobFileMetaData::Create(std::move(shared_meta),
+ BlobFileMetaData::LinkedSsts(),
+ garbage_blob_count, garbage_blob_bytes);
+
+ storage_info->AddBlobFile(std::move(meta));
+}
+
+static void CheckBlobMetaData(
+ const BlobMetaData& bmd, uint64_t blob_file_number,
+ uint64_t total_blob_count, uint64_t total_blob_bytes,
+ const std::string& checksum_method, const std::string& checksum_value,
+ uint64_t garbage_blob_count = 0, uint64_t garbage_blob_bytes = 0) {
+ ASSERT_EQ(bmd.blob_file_number, blob_file_number);
+ ASSERT_EQ(bmd.blob_file_name, BlobFileName("", blob_file_number));
+ ASSERT_EQ(bmd.blob_file_size,
+ total_blob_bytes + BlobLogHeader::kSize + BlobLogFooter::kSize);
+
+ ASSERT_EQ(bmd.total_blob_count, total_blob_count);
+ ASSERT_EQ(bmd.total_blob_bytes, total_blob_bytes);
+ ASSERT_EQ(bmd.garbage_blob_count, garbage_blob_count);
+ ASSERT_EQ(bmd.garbage_blob_bytes, garbage_blob_bytes);
+ ASSERT_EQ(bmd.checksum_method, checksum_method);
+ ASSERT_EQ(bmd.checksum_value, checksum_value);
+}
+
+TEST_F(DBTest, MetaDataTest) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+
+ int64_t temp_time = 0;
+ options.env->GetCurrentTime(&temp_time);
+ uint64_t start_time = static_cast<uint64_t>(temp_time);
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_index = 0;
+ for (int i = 0; i < 100; ++i) {
+ // Add a single blob reference to each file
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, /* blob_file_number */ i + 1000,
+ /* offset */ 1234, /* size */ 5678, kNoCompression);
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 0, Key(key_index),
+ blob_index));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ++key_index;
+
+ // Fill up the rest of the file with random values.
+ GenerateNewFile(&rnd, &key_index, /* nowait */ true);
+
+ ASSERT_OK(Flush());
+ }
+
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ dbfull()->TEST_GetFilesMetaData(db_->DefaultColumnFamily(), &files_by_level);
+
+ options.env->GetCurrentTime(&temp_time);
+ uint64_t end_time = static_cast<uint64_t>(temp_time);
+
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ CheckColumnFamilyMeta(cf_meta, kDefaultColumnFamilyName, files_by_level,
+ start_time, end_time);
+ std::vector<LiveFileMetaData> live_file_meta;
+ db_->GetLiveFilesMetaData(&live_file_meta);
+ CheckLiveFilesMeta(live_file_meta, files_by_level);
+}
+
+TEST_F(DBTest, AllMetaDataTest) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ constexpr uint64_t blob_file_number = 234;
+ constexpr uint64_t total_blob_count = 555;
+ constexpr uint64_t total_blob_bytes = 66666;
+ constexpr char checksum_method[] = "CRC32";
+ constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+
+ int64_t temp_time = 0;
+ options.env->GetCurrentTime(&temp_time).PermitUncheckedError();
+ uint64_t start_time = static_cast<uint64_t>(temp_time);
+
+ Random rnd(301);
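+  // AddBlobFile mutates the current Version's storage info, which is
+  // protected by the DB mutex, hence the explicit lock/unlock around it.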
+ dbfull()->TEST_LockMutex();
+ for (int cf = 0; cf < 2; cf++) {
+ AddBlobFile(handles_[cf], blob_file_number * (cf + 1),
+ total_blob_count * (cf + 1), total_blob_bytes * (cf + 1),
+ checksum_method, checksum_value);
+ }
+ dbfull()->TEST_UnlockMutex();
+
+ std::vector<ColumnFamilyMetaData> all_meta;
+ db_->GetAllColumnFamilyMetaData(&all_meta);
+
+ std::vector<std::vector<FileMetaData>> default_files_by_level;
+ std::vector<std::vector<FileMetaData>> pikachu_files_by_level;
+ dbfull()->TEST_GetFilesMetaData(handles_[0], &default_files_by_level);
+ dbfull()->TEST_GetFilesMetaData(handles_[1], &pikachu_files_by_level);
+
+ options.env->GetCurrentTime(&temp_time).PermitUncheckedError();
+ uint64_t end_time = static_cast<uint64_t>(temp_time);
+
+ ASSERT_EQ(all_meta.size(), 2);
+ for (int cf = 0; cf < 2; cf++) {
+ const auto& cfmd = all_meta[cf];
+ if (cf == 0) {
+ CheckColumnFamilyMeta(cfmd, "default", default_files_by_level, start_time,
+ end_time);
+ } else {
+ CheckColumnFamilyMeta(cfmd, "pikachu", pikachu_files_by_level, start_time,
+ end_time);
+ }
+ ASSERT_EQ(cfmd.blob_files.size(), 1U);
+ const auto& bmd = cfmd.blob_files[0];
+ ASSERT_EQ(cfmd.blob_file_count, 1U);
+ ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size);
+ ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_));
+ CheckBlobMetaData(bmd, blob_file_number * (cf + 1),
+ total_blob_count * (cf + 1), total_blob_bytes * (cf + 1),
+ checksum_method, checksum_value);
+ }
+}
+
+namespace {
+void MinLevelHelper(DBTest* self, Options& options) {
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 120KB (12 values, each 10K)
+ for (int i = 0; i < 12; i++) {
+ values.push_back(rnd.RandomString(10000));
+ ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+ }
+ ASSERT_OK(self->dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(self->NumTableFilesAtLevel(0), num + 1);
+ }
+
+  // Generate one more file in level-0, which should trigger a level-0
+  // compaction.
+ std::vector<std::string> values;
+ for (int i = 0; i < 12; i++) {
+ values.push_back(rnd.RandomString(10000));
+ ASSERT_OK(self->Put(DBTestBase::Key(i), values[i]));
+ }
+ ASSERT_OK(self->dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(self->NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(self->NumTableFilesAtLevel(1), 1);
+}
+
+// Returns false if the calling test should be skipped.
+bool MinLevelToCompress(CompressionType& type, Options& options, int wbits,
+ int lev, int strategy) {
+ fprintf(stderr,
+ "Test with compression options : window_bits = %d, level = %d, "
+          "strategy = %d\n",
+ wbits, lev, strategy);
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.arena_block_size = 4096;
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ options.create_if_missing = true;
+
+ if (Snappy_Supported()) {
+ type = kSnappyCompression;
+ fprintf(stderr, "using snappy\n");
+ } else if (Zlib_Supported()) {
+ type = kZlibCompression;
+ fprintf(stderr, "using zlib\n");
+ } else if (BZip2_Supported()) {
+ type = kBZip2Compression;
+ fprintf(stderr, "using bzip2\n");
+ } else if (LZ4_Supported()) {
+ type = kLZ4Compression;
+ fprintf(stderr, "using lz4\n");
+ } else if (XPRESS_Supported()) {
+ type = kXpressCompression;
+ fprintf(stderr, "using xpress\n");
+ } else if (ZSTD_Supported()) {
+ type = kZSTD;
+ fprintf(stderr, "using ZSTD\n");
+ } else {
+ fprintf(stderr, "skipping test, compression disabled\n");
+ return false;
+ }
+ options.compression_per_level.resize(options.num_levels);
+
+ // do not compress L0
+ for (int i = 0; i < 1; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 1; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ return true;
+}
+} // anonymous namespace
+
+TEST_F(DBTest, MinLevelToCompress1) {
+ Options options = CurrentOptions();
+ CompressionType type = kSnappyCompression;
+ if (!MinLevelToCompress(type, options, -14, -1, 0)) {
+ return;
+ }
+ Reopen(options);
+ MinLevelHelper(this, options);
+
+ // do not compress L0 and L1
+ for (int i = 0; i < 2; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 2; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ DestroyAndReopen(options);
+ MinLevelHelper(this, options);
+}
+
+TEST_F(DBTest, MinLevelToCompress2) {
+ Options options = CurrentOptions();
+ CompressionType type = kSnappyCompression;
+ if (!MinLevelToCompress(type, options, 15, -1, 0)) {
+ return;
+ }
+ Reopen(options);
+ MinLevelHelper(this, options);
+
+ // do not compress L0 and L1
+ for (int i = 0; i < 2; i++) {
+ options.compression_per_level[i] = kNoCompression;
+ }
+ for (int i = 2; i < options.num_levels; i++) {
+ options.compression_per_level[i] = type;
+ }
+ DestroyAndReopen(options);
+ MinLevelHelper(this, options);
+}
+
+// This test may fail because of a legitimate case in which multiple L0 files
+// are trivially moved to L1.
+TEST_F(DBTest, DISABLED_RepeatedWritesToSameKey) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // We must have at most one file per level except for level-0,
+ // which may have up to kL0_StopWritesTrigger files.
+ const int kMaxFiles =
+ options.num_levels + options.level0_stop_writes_trigger;
+
+ Random rnd(301);
+ std::string value =
+ rnd.RandomString(static_cast<int>(2 * options.write_buffer_size));
+ for (int i = 0; i < 5 * kMaxFiles; i++) {
+ ASSERT_OK(Put(1, "key", value));
+ ASSERT_LE(TotalTableFiles(1), kMaxFiles);
+ }
+ } while (ChangeCompactOptions());
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+static bool Between(uint64_t val, uint64_t low, uint64_t high) {
+ bool result = (val >= low) && (val <= high);
+ if (!result) {
+ fprintf(stderr, "Value %llu is not in range [%llu, %llu]\n",
+ (unsigned long long)(val), (unsigned long long)(low),
+ (unsigned long long)(high));
+ }
+ return result;
+}
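+// For example, Between(5, 1, 10) is true, while Between(0, 1, 10) prints a
+// diagnostic to stderr and returns false, which the ASSERT_TRUE callers below
+// turn into a test failure.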
+
+TEST_F(DBTest, ApproximateSizesMemTable) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ auto default_cf = db_->DefaultColumnFamily();
+
+ const int N = 128;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ uint64_t size;
+ std::string start = Key(50);
+ std::string end = Key(60);
+ Range r(start, end);
+ SizeApproximationOptions size_approx_options;
+ size_approx_options.include_memtables = true;
+ size_approx_options.include_files = true;
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_GT(size, 6000);
+ ASSERT_LT(size, 204800);
+ // Zero if not including mem table
+ ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024)));
+ }
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ start = Key(100);
+ end = Key(1020);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_GT(size, 6000);
+
+ options.max_write_buffer_number = 8;
+ options.min_write_buffer_number_to_merge = 5;
+ options.write_buffer_size = 1024 * N; // Not very large
+ DestroyAndReopen(options);
+ default_cf = db_->DefaultColumnFamily();
+
+ int keys[N * 3];
+ for (int i = 0; i < N; i++) {
+ keys[i * 3] = i * 5;
+ keys[i * 3 + 1] = i * 5 + 1;
+ keys[i * 3 + 2] = i * 5 + 2;
+ }
+  // MemTable entry counting is estimated and can vary greatly depending on
+  // layout. Thus, use a deterministic seed for test stability.
+ RandomShuffle(std::begin(keys), std::end(keys), rnd.Next());
+
+ for (int i = 0; i < N * 3; i++) {
+ ASSERT_OK(Put(Key(keys[i] + 1000), rnd.RandomString(1024)));
+ }
+
+ start = Key(100);
+ end = Key(300);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_GT(size, 6000);
+
+ start = Key(2100);
+ end = Key(2300);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ uint64_t size_with_mt, size_without_mt;
+ ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size_with_mt));
+ ASSERT_GT(size_with_mt, 6000);
+ ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt));
+ ASSERT_EQ(size_without_mt, 0);
+
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i + 1000), rnd.RandomString(1024)));
+ }
+
+ start = Key(1050);
+ end = Key(1080);
+ r = Range(start, end);
+ ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size_with_mt));
+ ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size_without_mt));
+ ASSERT_GT(size_with_mt, size_without_mt);
+ ASSERT_GT(size_without_mt, 6000);
+
+ // Check that include_memtables flag works as expected
+ size_approx_options.include_memtables = false;
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, size_without_mt);
+
+ // Check that files_size_error_margin works as expected, when the heuristic
+ // conditions are not met
+ start = Key(1);
+ end = Key(1000 + N - 2);
+ r = Range(start, end);
+ size_approx_options.files_size_error_margin = -1.0; // disabled
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ uint64_t size2;
+ size_approx_options.files_size_error_margin = 0.5; // enabled, but not used
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size2));
+ ASSERT_EQ(size, size2);
+}
+
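+// A note on files_size_error_margin (based on the assertions in the next
+// test): when the margin is enabled and the size heuristic applies, the
+// approximate answer is only required to satisfy
+//   size * (1 - margin) < approx < size * (1 + margin)
+// relative to the precise, margin-disabled result.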
+TEST_F(DBTest, ApproximateSizesFilesWithErrorMargin) {
+ // Roughly 4 keys per data block, 1000 keys per file,
+ // with filter substantially larger than a data block
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(16));
+ table_options.block_size = 100;
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.write_buffer_size = 24 * 1024;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.target_file_size_base = 24 * 1024;
+ DestroyAndReopen(options);
+ const auto default_cf = db_->DefaultColumnFamily();
+
+ const int N = 64000;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(24)));
+ }
+ // Flush everything to files
+ ASSERT_OK(Flush());
+ // Compact the entire key space into the next level
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), default_cf, nullptr, nullptr));
+
+ // Write more keys
+ for (int i = N; i < (N + N / 4); i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(24)));
+ }
+ // Flush everything to files again
+ ASSERT_OK(Flush());
+
+ // Wait for compaction to finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ {
+ const std::string start = Key(0);
+ const std::string end = Key(2 * N);
+ const Range r(start, end);
+
+ SizeApproximationOptions size_approx_options;
+ size_approx_options.include_memtables = false;
+ size_approx_options.include_files = true;
+ size_approx_options.files_size_error_margin = -1.0; // disabled
+
+ // Get the precise size without any approximation heuristic
+ uint64_t size;
+ ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size));
+ ASSERT_NE(size, 0);
+
+ // Get the size with an approximation heuristic
+ uint64_t size2;
+ const double error_margin = 0.2;
+ size_approx_options.files_size_error_margin = error_margin;
+ ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1,
+ &size2));
+ ASSERT_LT(size2, size * (1 + error_margin));
+ ASSERT_GT(size2, size * (1 - error_margin));
+ }
+
+ {
+    // Ensure that metadata is not falsely attributed only to the last data in
+    // the file. (In some applications, filters can be a large portion of the
+    // data size.)
+    // Perform many queries over a small range, enough to ensure crossing a
+    // file boundary, and make sure we never see a spike for a large filter.
+ for (int i = 0; i < 3000; i += 10) {
+ const std::string start = Key(i);
+ const std::string end = Key(i + 11); // overlap by 1 key
+ const Range r(start, end);
+ uint64_t size;
+ ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
+ ASSERT_LE(size, 11 * 100);
+ }
+ }
+}
+
+TEST_F(DBTest, GetApproximateMemTableStats) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ const int N = 128;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ uint64_t count;
+ uint64_t size;
+
+ std::string start = Key(50);
+ std::string end = Key(60);
+ Range r(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_GT(count, 0);
+ ASSERT_LE(count, N);
+ ASSERT_GT(size, 6000);
+ ASSERT_LT(size, 204800);
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_EQ(count, 0);
+ ASSERT_EQ(size, 0);
+
+ ASSERT_OK(Flush());
+
+ start = Key(50);
+ end = Key(60);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_EQ(count, 0);
+ ASSERT_EQ(size, 0);
+
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(Key(1000 + i), rnd.RandomString(1024)));
+ }
+
+ start = Key(100);
+ end = Key(1020);
+ r = Range(start, end);
+ db_->GetApproximateMemTableStats(r, &count, &size);
+ ASSERT_GT(count, 20);
+ ASSERT_GT(size, 6000);
+}
+
+TEST_F(DBTest, ApproximateSizes) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ uint64_t size;
+ ASSERT_OK(Size("", "xyz", 1, &size));
+ ASSERT_TRUE(Between(size, 0, 0));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_OK(Size("", "xyz", 1, &size));
+ ASSERT_TRUE(Between(size, 0, 0));
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ const int N = 80;
+ static const int S1 = 100000;
+ static const int S2 = 105000; // Allow some expansion from metadata
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(S1)));
+ }
+
+ // 0 because GetApproximateSizes() does not account for memtable space
+ ASSERT_OK(Size("", Key(50), 1, &size));
+ ASSERT_TRUE(Between(size, 0, 0));
+
+ // Check sizes across recovery by reopening a few times
+ for (int run = 0; run < 3; run++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ for (int compact_start = 0; compact_start < N; compact_start += 10) {
+ for (int i = 0; i < N; i += 10) {
+ ASSERT_OK(Size("", Key(i), 1, &size));
+ ASSERT_TRUE(Between(size, S1 * i, S2 * i));
+ ASSERT_OK(Size("", Key(i) + ".suffix", 1, &size));
+ ASSERT_TRUE(Between(size, S1 * (i + 1), S2 * (i + 1)));
+ ASSERT_OK(Size(Key(i), Key(i + 10), 1, &size));
+ ASSERT_TRUE(Between(size, S1 * 10, S2 * 10));
+ }
+ ASSERT_OK(Size("", Key(50), 1, &size));
+ ASSERT_TRUE(Between(size, S1 * 50, S2 * 50));
+ ASSERT_OK(Size("", Key(50) + ".suffix", 1, &size));
+ ASSERT_TRUE(Between(size, S1 * 50, S2 * 50));
+
+ std::string cstart_str = Key(compact_start);
+ std::string cend_str = Key(compact_start + 9);
+ Slice cstart = cstart_str;
+ Slice cend = cend_str;
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, &cstart, &cend, handles_[1]));
+ }
+
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+ }
+ // ApproximateOffsetOf() is not yet implemented in plain table format.
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+ kSkipPlainTable | kSkipHashIndex));
+}
+
+TEST_F(DBTest, ApproximateSizes_MixOfSmallAndLarge) {
+ do {
+ Options options = CurrentOptions();
+ options.compression = kNoCompression;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ std::string big1 = rnd.RandomString(100000);
+ ASSERT_OK(Put(1, Key(0), rnd.RandomString(10000)));
+ ASSERT_OK(Put(1, Key(1), rnd.RandomString(10000)));
+ ASSERT_OK(Put(1, Key(2), big1));
+ ASSERT_OK(Put(1, Key(3), rnd.RandomString(10000)));
+ ASSERT_OK(Put(1, Key(4), big1));
+ ASSERT_OK(Put(1, Key(5), rnd.RandomString(10000)));
+ ASSERT_OK(Put(1, Key(6), rnd.RandomString(300000)));
+ ASSERT_OK(Put(1, Key(7), rnd.RandomString(10000)));
+
+ // Check sizes across recovery by reopening a few times
+ uint64_t size;
+ for (int run = 0; run < 3; run++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ ASSERT_OK(Size("", Key(0), 1, &size));
+ ASSERT_TRUE(Between(size, 0, 0));
+ ASSERT_OK(Size("", Key(1), 1, &size));
+ ASSERT_TRUE(Between(size, 10000, 11000));
+ ASSERT_OK(Size("", Key(2), 1, &size));
+ ASSERT_TRUE(Between(size, 20000, 21000));
+ ASSERT_OK(Size("", Key(3), 1, &size));
+ ASSERT_TRUE(Between(size, 120000, 121000));
+ ASSERT_OK(Size("", Key(4), 1, &size));
+ ASSERT_TRUE(Between(size, 130000, 131000));
+ ASSERT_OK(Size("", Key(5), 1, &size));
+ ASSERT_TRUE(Between(size, 230000, 232000));
+ ASSERT_OK(Size("", Key(6), 1, &size));
+ ASSERT_TRUE(Between(size, 240000, 242000));
+ // Ensure some overhead is accounted for, even without including all
+ ASSERT_OK(Size("", Key(7), 1, &size));
+ ASSERT_TRUE(Between(size, 540500, 545000));
+ ASSERT_OK(Size("", Key(8), 1, &size));
+ ASSERT_TRUE(Between(size, 550500, 555000));
+
+ ASSERT_OK(Size(Key(3), Key(5), 1, &size));
+ ASSERT_TRUE(Between(size, 110100, 111000));
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+ }
+ // ApproximateOffsetOf() is not yet implemented in plain table format.
+ } while (ChangeOptions(kSkipPlainTable));
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, Snapshot) {
+ env_->SetMockSleep();
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override));
+ ASSERT_OK(Put(0, "foo", "0v1"));
+ ASSERT_OK(Put(1, "foo", "1v1"));
+
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_EQ(1U, GetNumSnapshots());
+ uint64_t time_snap1 = GetTimeOldestSnapshots();
+ ASSERT_GT(time_snap1, 0U);
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ(GetTimeOldestSnapshots(),
+ static_cast<uint64_t>(s1->GetUnixTime()));
+ ASSERT_OK(Put(0, "foo", "0v2"));
+ ASSERT_OK(Put(1, "foo", "1v2"));
+
+ env_->MockSleepForSeconds(1);
+
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ(GetTimeOldestSnapshots(),
+ static_cast<uint64_t>(s1->GetUnixTime()));
+ ASSERT_OK(Put(0, "foo", "0v3"));
+ ASSERT_OK(Put(1, "foo", "1v3"));
+
+ {
+ ManagedSnapshot s3(db_);
+ ASSERT_EQ(3U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ(GetTimeOldestSnapshots(),
+ static_cast<uint64_t>(s1->GetUnixTime()));
+
+ ASSERT_OK(Put(0, "foo", "0v4"));
+ ASSERT_OK(Put(1, "foo", "1v4"));
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot()));
+ ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot()));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ }
+
+ ASSERT_EQ(2U, GetNumSnapshots());
+ ASSERT_EQ(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber());
+ ASSERT_EQ(GetTimeOldestSnapshots(),
+ static_cast<uint64_t>(s1->GetUnixTime()));
+ ASSERT_EQ("0v1", Get(0, "foo", s1));
+ ASSERT_EQ("1v1", Get(1, "foo", s1));
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+
+ db_->ReleaseSnapshot(s1);
+ ASSERT_EQ("0v2", Get(0, "foo", s2));
+ ASSERT_EQ("1v2", Get(1, "foo", s2));
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ ASSERT_EQ(1U, GetNumSnapshots());
+ ASSERT_LT(time_snap1, GetTimeOldestSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber());
+ ASSERT_EQ(GetTimeOldestSnapshots(),
+ static_cast<uint64_t>(s2->GetUnixTime()));
+
+ db_->ReleaseSnapshot(s2);
+ ASSERT_EQ(0U, GetNumSnapshots());
+ ASSERT_EQ(GetSequenceOldestSnapshots(), 0);
+ ASSERT_EQ("0v4", Get(0, "foo"));
+ ASSERT_EQ("1v4", Get(1, "foo"));
+ } while (ChangeOptions());
+}
+
+TEST_F(DBTest, HiddenValuesAreRemoved) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ uint64_t size;
+ do {
+ Options options = CurrentOptions(options_override);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ FillLevels("a", "z", 1);
+
+ std::string big = rnd.RandomString(50000);
+ ASSERT_OK(Put(1, "foo", big));
+ ASSERT_OK(Put(1, "pastfoo", "v"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(Put(1, "foo", "tiny"));
+ ASSERT_OK(Put(1, "pastfoo2", "v2")); // Advance sequence number one more
+
+ ASSERT_OK(Flush(1));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 0);
+
+ ASSERT_EQ(big, Get(1, "foo", snapshot));
+ ASSERT_OK(Size("", "pastfoo", 1, &size));
+ ASSERT_TRUE(Between(size, 50000, 60000));
+ db_->ReleaseSnapshot(snapshot);
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny, " + big + " ]");
+ Slice x("x");
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, &x, handles_[1]));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ ASSERT_GE(NumTableFilesAtLevel(1, 1), 1);
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, &x, handles_[1]));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ tiny ]");
+
+ ASSERT_OK(Size("", "pastfoo", 1, &size));
+ ASSERT_TRUE(Between(size, 0, 1000));
+    // ApproximateOffsetOf(), which is used by Size(), is not yet implemented
+    // in plain table format.
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction |
+ kSkipPlainTable));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, UnremovableSingleDelete) {
+ // If we compact:
+ //
+ // Put(A, v1) Snapshot SingleDelete(A) Put(A, v2)
+ //
+ // We do not want to end up with:
+ //
+ // Put(A, v1) Snapshot Put(A, v2)
+ //
+  // A subsequent SingleDelete(A) would then delete the Put(A, v2) but not the
+  // Put(A, v1), so Get(A) would return v1.
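+  //
+  // Concretely (illustrative): if compaction dropped the SingleDelete while
+  // the snapshot still pinned v1, the entry stream for A would become
+  // "[ v2, v1 ]" instead of "[ v2, SDEL, v1 ]", and a later SingleDelete(A)
+  // would expose v1 again. The AllEntriesFor() assertion below checks for
+  // the "[ second, SDEL, first ]" form.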
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ do {
+ Options options = CurrentOptions(options_override);
+ options.disable_auto_compactions = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "first"));
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(SingleDelete(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "second"));
+ ASSERT_OK(Flush(1));
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("second", Get(1, "foo"));
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+ ASSERT_EQ("[ second, SDEL, first ]", AllEntriesFor("foo", 1));
+
+ ASSERT_OK(SingleDelete(1, "foo"));
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1],
+ nullptr, nullptr));
+
+ ASSERT_EQ("first", Get(1, "foo", snapshot));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo"));
+ db_->ReleaseSnapshot(snapshot);
+ // Skip FIFO and universal compaction because they do not apply to the test
+ // case. Skip MergePut because single delete does not get removed when it
+ // encounters a merge.
+ } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction |
+ kSkipMergePut));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, DeletionMarkers1) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ const int last = 2;
+ MoveFilesToLevel(last, 1);
+ // foo => v1 is now in last level
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(last - 1, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]");
+ ASSERT_OK(Flush(1)); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+ Slice z("z");
+ ASSERT_OK(dbfull()->TEST_CompactRange(last - 2, nullptr, &z, handles_[1]));
+ // DEL eliminated, but v1 remains because we aren't compacting that level
+ // (DEL can be eliminated because v2 hides v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]");
+ ASSERT_OK(
+ dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]));
+  // Merging last-1 with last makes this the base level for "foo", so the
+  // DEL is removed (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]");
+}
+
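+// Same setup as DeletionMarkers1, but "foo" is not re-inserted after the
+// Delete: the DEL marker must survive the intermediate compaction because the
+// "last"-level file still holds v1, and only disappears once the ranges merge.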
+TEST_F(DBTest, DeletionMarkers2) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Flush(1));
+ const int last = 2;
+ MoveFilesToLevel(last, 1);
+ // foo => v1 is now in last level
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+
+ // Place a table at level last-1 to prevent merging with preceding mutation
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(last - 1, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last, 1), 1);
+ ASSERT_EQ(NumTableFilesAtLevel(last - 1, 1), 1);
+
+ ASSERT_OK(Delete(1, "foo"));
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ ASSERT_OK(Flush(1)); // Moves to level last-2
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ ASSERT_OK(
+ dbfull()->TEST_CompactRange(last - 2, nullptr, nullptr, handles_[1]));
+ // DEL kept: "last" file overlaps
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v1 ]");
+ ASSERT_OK(
+ dbfull()->TEST_CompactRange(last - 1, nullptr, nullptr, handles_[1]));
+  // Merging last-1 with last makes this the base level for "foo", so the
+  // DEL is removed (as is v1).
+ ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]");
+}
+
+TEST_F(DBTest, OverlapInLevel0) {
+ do {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Fill levels 1 and 2 to disable the pushing of new memtables to levels >
+ // 0.
+ ASSERT_OK(Put(1, "100", "v100"));
+ ASSERT_OK(Put(1, "999", "v999"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(2, 1);
+ ASSERT_OK(Delete(1, "100"));
+ ASSERT_OK(Delete(1, "999"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(1, 1);
+ ASSERT_EQ("0,1,1", FilesPerLevel(1));
+
+ // Make files spanning the following ranges in level-0:
+ // files[0] 200 .. 900
+ // files[1] 300 .. 500
+ // Note that files are sorted by smallest key.
+ ASSERT_OK(Put(1, "300", "v300"));
+ ASSERT_OK(Put(1, "500", "v500"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "200", "v200"));
+ ASSERT_OK(Put(1, "600", "v600"));
+ ASSERT_OK(Put(1, "900", "v900"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("2,1,1", FilesPerLevel(1));
+
+ // BEGIN addition to existing test
+ // Take this opportunity to verify SST unique ids (including Plain table)
+ TablePropertiesCollection tbc;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &tbc));
+ VerifySstUniqueIds(tbc);
+ // END addition to existing test
+
+ // Compact away the placeholder files we created initially
+ ASSERT_OK(dbfull()->TEST_CompactRange(1, nullptr, nullptr, handles_[1]));
+ ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr, handles_[1]));
+ ASSERT_EQ("2", FilesPerLevel(1));
+
+    // Do a memtable compaction. Before the bug fix, the compaction would
+ // not detect the overlap with level-0 files and would incorrectly place
+ // the deletion in a deeper level.
+ ASSERT_OK(Delete(1, "600"));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ("3", FilesPerLevel(1));
+ ASSERT_EQ("NOT_FOUND", Get(1, "600"));
+ } while (ChangeOptions(kSkipUniversalCompaction | kSkipFIFOCompaction));
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, ComparatorCheck) {
+ class NewComparator : public Comparator {
+ public:
+ const char* Name() const override { return "rocksdb.NewComparator"; }
+ int Compare(const Slice& a, const Slice& b) const override {
+ return BytewiseComparator()->Compare(a, b);
+ }
+ void FindShortestSeparator(std::string* s, const Slice& l) const override {
+ BytewiseComparator()->FindShortestSeparator(s, l);
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ BytewiseComparator()->FindShortSuccessor(key);
+ }
+ };
+ Options new_options, options;
+ NewComparator cmp;
+ do {
+ options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ new_options = CurrentOptions();
+ new_options.comparator = &cmp;
+    // Only the non-default column family has a non-matching comparator.
+ Status s = TryReopenWithColumnFamilies(
+ {"default", "pikachu"}, std::vector<Options>({options, new_options}));
+ ASSERT_TRUE(!s.ok());
+ ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos)
+ << s.ToString();
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, CustomComparator) {
+ class NumberComparator : public Comparator {
+ public:
+ const char* Name() const override { return "test.NumberComparator"; }
+ int Compare(const Slice& a, const Slice& b) const override {
+ return ToNumber(a) - ToNumber(b);
+ }
+ void FindShortestSeparator(std::string* s, const Slice& l) const override {
+ ToNumber(*s); // Check format
+ ToNumber(l); // Check format
+ }
+ void FindShortSuccessor(std::string* key) const override {
+ ToNumber(*key); // Check format
+ }
+
+ private:
+ static int ToNumber(const Slice& x) {
+ // Check that there are no extra characters.
+ EXPECT_TRUE(x.size() >= 2 && x[0] == '[' && x[x.size() - 1] == ']')
+ << EscapeString(x);
+ int val;
+ char ignored;
+ EXPECT_TRUE(sscanf(x.ToString().c_str(), "[%i]%c", &val, &ignored) == 1)
+ << EscapeString(x);
+ return val;
+ }
+ };
+ Options new_options;
+ NumberComparator cmp;
+ do {
+ new_options = CurrentOptions();
+ new_options.create_if_missing = true;
+ new_options.comparator = &cmp;
+ new_options.write_buffer_size = 4096; // Compact more often
+ new_options.arena_block_size = 4096;
+ new_options = CurrentOptions(new_options);
+ DestroyAndReopen(new_options);
+ CreateAndReopenWithCF({"pikachu"}, new_options);
+ ASSERT_OK(Put(1, "[10]", "ten"));
+ ASSERT_OK(Put(1, "[0x14]", "twenty"));
+ for (int i = 0; i < 2; i++) {
+ ASSERT_EQ("ten", Get(1, "[10]"));
+ ASSERT_EQ("ten", Get(1, "[0xa]"));
+ ASSERT_EQ("twenty", Get(1, "[20]"));
+ ASSERT_EQ("twenty", Get(1, "[0x14]"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "[15]"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "[0xf]"));
+ Compact(1, "[0]", "[9999]");
+ }
+
+ for (int run = 0; run < 2; run++) {
+ for (int i = 0; i < 1000; i++) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "[%d]", i * 10);
+ ASSERT_OK(Put(1, buf, buf));
+ }
+ Compact(1, "[0]", "[1000000]");
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, DBOpen_Options) {
+ Options options = CurrentOptions();
+ std::string dbname = test::PerThreadDBPath("db_options_test");
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Does not exist, and create_if_missing == false: error
+ DB* db = nullptr;
+ options.create_if_missing = false;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does not exist, and create_if_missing == true: OK
+ options.create_if_missing = true;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+
+ // Does exist, and error_if_exists == true: error
+ options.create_if_missing = false;
+ options.error_if_exists = true;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr);
+ ASSERT_TRUE(db == nullptr);
+
+ // Does exist, and error_if_exists == false: OK
+ options.create_if_missing = true;
+ options.error_if_exists = false;
+ s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+ ASSERT_TRUE(db != nullptr);
+
+ delete db;
+ db = nullptr;
+}
+
+TEST_F(DBTest, DBOpen_Change_NumLevels) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_TRUE(db_ != nullptr);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "a", "123"));
+ ASSERT_OK(Put(1, "b", "234"));
+ ASSERT_OK(Flush(1));
+ MoveFilesToLevel(3, 1);
+ Close();
+
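+  // Reopening with fewer levels than the highest populated level (the file
+  // was moved to level 3 above) should fail with an InvalidArgument status.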
+ options.create_if_missing = false;
+ options.num_levels = 2;
+ Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
+ ASSERT_TRUE(db_ == nullptr);
+}
+
+TEST_F(DBTest, DestroyDBMetaDatabase) {
+ std::string dbname = test::PerThreadDBPath("db_meta");
+ ASSERT_OK(env_->CreateDirIfMissing(dbname));
+ std::string metadbname = MetaDatabaseName(dbname, 0);
+ ASSERT_OK(env_->CreateDirIfMissing(metadbname));
+ std::string metametadbname = MetaDatabaseName(metadbname, 0);
+ ASSERT_OK(env_->CreateDirIfMissing(metametadbname));
+
+  // Destroy previous versions if they exist, using the long way.
+ Options options = CurrentOptions();
+ ASSERT_OK(DestroyDB(metametadbname, options));
+ ASSERT_OK(DestroyDB(metadbname, options));
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Setup databases
+ DB* db = nullptr;
+ ASSERT_OK(DB::Open(options, dbname, &db));
+ delete db;
+ db = nullptr;
+ ASSERT_OK(DB::Open(options, metadbname, &db));
+ delete db;
+ db = nullptr;
+ ASSERT_OK(DB::Open(options, metametadbname, &db));
+ delete db;
+ db = nullptr;
+
+ // Delete databases
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ // Check if deletion worked.
+ options.create_if_missing = false;
+ ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok());
+ ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok());
+ ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, SnapshotFiles) {
+ do {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+
+ // Write 8MB (80 values, each 100K)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ for (int i = 0; i < 80; i++) {
+ values.push_back(rnd.RandomString(100000));
+ ASSERT_OK(Put((i < 40), Key(i), values[i]));
+ }
+
+    // Assert that nothing has made it to disk yet.
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+
+ // get a file snapshot
+ uint64_t manifest_number = 0;
+ uint64_t manifest_size = 0;
+ std::vector<std::string> files;
+ ASSERT_OK(dbfull()->DisableFileDeletions());
+ ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
+
+ // CURRENT, MANIFEST, OPTIONS, *.sst files (one for each CF)
+ ASSERT_EQ(files.size(), 5U);
+
+ uint64_t number = 0;
+ FileType type;
+
+ // copy these files to a new snapshot directory
+ std::string snapdir = dbname_ + ".snapdir/";
+ if (env_->FileExists(snapdir).ok()) {
+ ASSERT_OK(DestroyDir(env_, snapdir));
+ }
+ ASSERT_OK(env_->CreateDir(snapdir));
+
+ for (size_t i = 0; i < files.size(); i++) {
+      // Our clients require that GetLiveFiles returns
+      // file names with "/" as the first character.
+ ASSERT_EQ(files[i][0], '/');
+ std::string src = dbname_ + files[i];
+ std::string dest = snapdir + files[i];
+
+ uint64_t size;
+ ASSERT_OK(env_->GetFileSize(src, &size));
+
+ // record the number and the size of the
+ // latest manifest file
+ if (ParseFileName(files[i].substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ ASSERT_EQ(manifest_number, 0);
+ manifest_number = number;
+ ASSERT_GE(size, manifest_size);
+ size = manifest_size; // copy only valid MANIFEST data
+ }
+ }
+ CopyFile(src, dest, size);
+ }
+
+ // release file snapshot
+ ASSERT_OK(dbfull()->EnableFileDeletions(/*force*/ false));
+    // Overwrite one key; this key should not appear in the snapshot.
+ std::vector<std::string> extras;
+ for (unsigned int i = 0; i < 1; i++) {
+ extras.push_back(rnd.RandomString(100000));
+ ASSERT_OK(Put(0, Key(i), extras[i]));
+ }
+
+    // Verify that the data in the snapshot is correct.
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back("default", ColumnFamilyOptions());
+ column_families.emplace_back("pikachu", ColumnFamilyOptions());
+ std::vector<ColumnFamilyHandle*> cf_handles;
+ DB* snapdb;
+ DBOptions opts;
+ opts.env = env_;
+ opts.create_if_missing = false;
+ Status stat =
+ DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb);
+ ASSERT_OK(stat);
+
+ ReadOptions roptions;
+ std::string val;
+ for (unsigned int i = 0; i < 80; i++) {
+ ASSERT_OK(snapdb->Get(roptions, cf_handles[i < 40], Key(i), &val));
+ ASSERT_EQ(values[i].compare(val), 0);
+ }
+ for (auto cfh : cf_handles) {
+ delete cfh;
+ }
+ delete snapdb;
+
+ // look at the new live files after we added an 'extra' key
+ // and after we took the first snapshot.
+ uint64_t new_manifest_number = 0;
+ uint64_t new_manifest_size = 0;
+ std::vector<std::string> newfiles;
+ ASSERT_OK(dbfull()->DisableFileDeletions());
+ ASSERT_OK(dbfull()->GetLiveFiles(newfiles, &new_manifest_size));
+
+    // Find the new manifest file. Assert that this manifest file is
+    // the same one as in the previous snapshot, but its size should be
+    // larger because we added an extra key after taking the
+    // previous snapshot.
+ for (size_t i = 0; i < newfiles.size(); i++) {
+ std::string src = dbname_ + "/" + newfiles[i];
+      // Record the number and the size of the
+      // latest manifest file.
+ if (ParseFileName(newfiles[i].substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ ASSERT_EQ(new_manifest_number, 0);
+ uint64_t size;
+ new_manifest_number = number;
+ ASSERT_OK(env_->GetFileSize(src, &size));
+ ASSERT_GE(size, new_manifest_size);
+ }
+ }
+ }
+ ASSERT_EQ(manifest_number, new_manifest_number);
+ ASSERT_GT(new_manifest_size, manifest_size);
+
+ // Also test GetLiveFilesStorageInfo
+ std::vector<LiveFileStorageInfo> new_infos;
+ ASSERT_OK(db_->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(),
+ &new_infos));
+
+ // Close DB (while deletions disabled)
+ Close();
+
+ // Validate
+ for (auto& info : new_infos) {
+ std::string path = info.directory + "/" + info.relative_filename;
+ uint64_t size;
+ ASSERT_OK(env_->GetFileSize(path, &size));
+ if (info.trim_to_size) {
+ ASSERT_LE(info.size, size);
+ } else if (!info.replacement_contents.empty()) {
+ ASSERT_EQ(info.size, info.replacement_contents.size());
+ } else {
+ ASSERT_EQ(info.size, size);
+ }
+ if (info.file_type == kDescriptorFile) {
+ ASSERT_EQ(info.file_number, manifest_number);
+ }
+ }
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, ReadonlyDBGetLiveManifestSize) {
+ do {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+
+ uint64_t manifest_size = 0;
+ std::vector<std::string> files;
+ ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
+
+ for (const std::string& f : files) {
+ uint64_t number = 0;
+ FileType type;
+ if (ParseFileName(f.substr(1), &number, &type)) {
+ if (type == kDescriptorFile) {
+ uint64_t size_on_disk;
+ ASSERT_OK(env_->GetFileSize(dbname_ + "/" + f, &size_on_disk));
+ ASSERT_EQ(manifest_size, size_on_disk);
+ break;
+ }
+ }
+ }
+ Close();
+ } while (ChangeCompactOptions());
+}
+
+TEST_F(DBTest, GetLiveBlobFiles) {
+ // Note: the following prevents an otherwise harmless data race between the
+ // test setup code (AddBlobFile) below and the periodic stat dumping thread.
+ Options options = CurrentOptions();
+ options.stats_dump_period_sec = 0;
+
+ constexpr uint64_t blob_file_number = 234;
+ constexpr uint64_t total_blob_count = 555;
+ constexpr uint64_t total_blob_bytes = 66666;
+ constexpr char checksum_method[] = "CRC32";
+ constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+ constexpr uint64_t garbage_blob_count = 0;
+ constexpr uint64_t garbage_blob_bytes = 0;
+
+ Reopen(options);
+
+ AddBlobFile(db_->DefaultColumnFamily(), blob_file_number, total_blob_count,
+ total_blob_bytes, checksum_method, checksum_value,
+ garbage_blob_count, garbage_blob_bytes);
+ // Make sure it appears in the results returned by GetLiveFiles.
+ uint64_t manifest_size = 0;
+ std::vector<std::string> files;
+ ASSERT_OK(dbfull()->GetLiveFiles(files, &manifest_size));
+
+ ASSERT_FALSE(files.empty());
+ ASSERT_EQ(files[0], BlobFileName("", blob_file_number));
+
+ ColumnFamilyMetaData cfmd;
+
+ db_->GetColumnFamilyMetaData(&cfmd);
+ ASSERT_EQ(cfmd.blob_files.size(), 1);
+ const BlobMetaData& bmd = cfmd.blob_files[0];
+
+ CheckBlobMetaData(bmd, blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value, garbage_blob_count,
+ garbage_blob_bytes);
+ ASSERT_EQ(NormalizePath(bmd.blob_file_path), NormalizePath(dbname_));
+ ASSERT_EQ(cfmd.blob_file_count, 1U);
+ ASSERT_EQ(cfmd.blob_file_size, bmd.blob_file_size);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, PurgeInfoLogs) {
+ Options options = CurrentOptions();
+ options.keep_log_file_num = 5;
+ options.create_if_missing = true;
+ options.env = env_;
+ for (int mode = 0; mode <= 1; mode++) {
+ if (mode == 1) {
+ options.db_log_dir = dbname_ + "_logs";
+ ASSERT_OK(env_->CreateDirIfMissing(options.db_log_dir));
+ } else {
+ options.db_log_dir = "";
+ }
+ for (int i = 0; i < 8; i++) {
+ Reopen(options);
+ }
+
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(
+ options.db_log_dir.empty() ? dbname_ : options.db_log_dir, &files));
+ int info_log_count = 0;
+ for (std::string file : files) {
+ if (file.find("LOG") != std::string::npos) {
+ info_log_count++;
+ }
+ }
+ ASSERT_EQ(5, info_log_count);
+
+ Destroy(options);
+    // For mode 0, test that DestroyDB() deletes all the logs under the DB dir.
+    // For mode 1, no info log file should have been put under the DB dir.
+    // Since dbname_ has no children, there is no need to loop over db_files.
+ std::vector<std::string> db_files;
+ ASSERT_TRUE(env_->GetChildren(dbname_, &db_files).IsNotFound());
+ ASSERT_TRUE(db_files.empty());
+
+ if (mode == 1) {
+ // Cleaning up
+ ASSERT_OK(env_->GetChildren(options.db_log_dir, &files));
+ for (std::string file : files) {
+ ASSERT_OK(env_->DeleteFile(options.db_log_dir + "/" + file));
+ }
+ ASSERT_OK(env_->DeleteDir(options.db_log_dir));
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+// Multi-threaded test:
+namespace {
+
+static const int kColumnFamilies = 10;
+static const int kNumThreads = 10;
+static const int kTestSeconds = 10;
+static const int kNumKeys = 1000;
+
+struct MTState {
+ DBTest* test;
+ std::atomic<int> counter[kNumThreads];
+};
+
+struct MTThread {
+ MTState* state;
+ int id;
+ bool multiget_batched;
+};
+
+static void MTThreadBody(void* arg) {
+ MTThread* t = reinterpret_cast<MTThread*>(arg);
+ int id = t->id;
+ DB* db = t->state->test->db_;
+ int counter = 0;
+ std::shared_ptr<SystemClock> clock = SystemClock::Default();
+ auto end_micros = clock->NowMicros() + kTestSeconds * 1000000U;
+
+ fprintf(stderr, "... starting thread %d\n", id);
+ Random rnd(1000 + id);
+ char valbuf[1500];
+ while (clock->NowMicros() < end_micros) {
+ t->state->counter[id].store(counter, std::memory_order_release);
+
+ int key = rnd.Uniform(kNumKeys);
+ char keybuf[20];
+ snprintf(keybuf, sizeof(keybuf), "%016d", key);
+
+ if (rnd.OneIn(2)) {
+      // Write values of the form <key, my id, counter, cf, unique_id>
+      // into each of the CFs.
+      // We add some padding to force compactions.
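+      // (Illustrative: key 7 written by thread 2 at counter 13 into cf 4 with
+      // unique_id 99 is stored as "7.2.13.4.99", the last field left-justified
+      // and space-padded to width 1000 by the "%-1000d" format below.)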
+ int unique_id = rnd.Uniform(1000000);
+
+ // Half of the time directly use WriteBatch. Half of the time use
+ // WriteBatchWithIndex.
+ if (rnd.OneIn(2)) {
+ WriteBatch batch;
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+ static_cast<int>(counter), cf, unique_id);
+ ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf),
+ Slice(valbuf)));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+ } else {
+ WriteBatchWithIndex batch(db->GetOptions().comparator);
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ snprintf(valbuf, sizeof(valbuf), "%d.%d.%d.%d.%-1000d", key, id,
+ static_cast<int>(counter), cf, unique_id);
+ ASSERT_OK(batch.Put(t->state->test->handles_[cf], Slice(keybuf),
+ Slice(valbuf)));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), batch.GetWriteBatch()));
+ }
+ } else {
+ // Read a value and verify that it matches the pattern written above
+ // and that writes to all column families were atomic (unique_id is the
+ // same)
+ std::vector<Slice> keys(kColumnFamilies, Slice(keybuf));
+ std::vector<std::string> values;
+ std::vector<Status> statuses;
+ if (!t->multiget_batched) {
+ statuses = db->MultiGet(ReadOptions(), t->state->test->handles_, keys,
+ &values);
+ } else {
+ std::vector<PinnableSlice> pin_values(keys.size());
+ statuses.resize(keys.size());
+ const Snapshot* snapshot = db->GetSnapshot();
+ ReadOptions ro;
+ ro.snapshot = snapshot;
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ db->MultiGet(ro, t->state->test->handles_[cf], 1, &keys[cf],
+ &pin_values[cf], &statuses[cf]);
+ }
+ db->ReleaseSnapshot(snapshot);
+ values.resize(keys.size());
+ for (int cf = 0; cf < kColumnFamilies; ++cf) {
+ if (statuses[cf].ok()) {
+ values[cf].assign(pin_values[cf].data(), pin_values[cf].size());
+ }
+ }
+ }
+ Status s = statuses[0];
+ // all statuses have to be the same
+ for (size_t i = 1; i < statuses.size(); ++i) {
+ // they are either both ok or both not-found
+ ASSERT_TRUE((s.ok() && statuses[i].ok()) ||
+ (s.IsNotFound() && statuses[i].IsNotFound()));
+ }
+ if (s.IsNotFound()) {
+ // Key has not yet been written
+ } else {
+ // Check that the writer thread counter is >= the counter in the value
+ ASSERT_OK(s);
+ int unique_id = -1;
+ for (int i = 0; i < kColumnFamilies; ++i) {
+ int k, w, c, cf, u;
+ ASSERT_EQ(5, sscanf(values[i].c_str(), "%d.%d.%d.%d.%d", &k, &w, &c,
+ &cf, &u))
+ << values[i];
+ ASSERT_EQ(k, key);
+ ASSERT_GE(w, 0);
+ ASSERT_LT(w, kNumThreads);
+ ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire));
+ ASSERT_EQ(cf, i);
+ if (i == 0) {
+ unique_id = u;
+ } else {
+ // this checks that updates across column families happened
+ // atomically -- all unique ids are the same
+ ASSERT_EQ(u, unique_id);
+ }
+ }
+ }
+ }
+ counter++;
+ }
+ fprintf(stderr, "... stopping thread %d after %d ops\n", id, int(counter));
+}
+
+} // anonymous namespace
+
+class MultiThreadedDBTest
+ : public DBTest,
+ public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+ void SetUp() override {
+ std::tie(option_config_, multiget_batched_) = GetParam();
+ }
+
+ static std::vector<int> GenerateOptionConfigs() {
+ std::vector<int> optionConfigs;
+ for (int optionConfig = kDefault; optionConfig < kEnd; ++optionConfig) {
+ optionConfigs.push_back(optionConfig);
+ }
+ return optionConfigs;
+ }
+
+ bool multiget_batched_;
+};
+
+TEST_P(MultiThreadedDBTest, MultiThreaded) {
+ if (option_config_ == kPipelinedWrite) return;
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ std::vector<std::string> cfs;
+ for (int i = 1; i < kColumnFamilies; ++i) {
+ cfs.push_back(std::to_string(i));
+ }
+ Reopen(options);
+ CreateAndReopenWithCF(cfs, options);
+ // Initialize state
+ MTState mt;
+ mt.test = this;
+ for (int id = 0; id < kNumThreads; id++) {
+ mt.counter[id].store(0, std::memory_order_release);
+ }
+
+ // Start threads
+ MTThread thread[kNumThreads];
+ for (int id = 0; id < kNumThreads; id++) {
+ thread[id].state = &mt;
+ thread[id].id = id;
+ thread[id].multiget_batched = multiget_batched_;
+ env_->StartThread(MTThreadBody, &thread[id]);
+ }
+
+ env_->WaitForJoin();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ MultiThreaded, MultiThreadedDBTest,
+ ::testing::Combine(
+ ::testing::ValuesIn(MultiThreadedDBTest::GenerateOptionConfigs()),
+ ::testing::Bool()));
+#endif // ROCKSDB_LITE
+
+// Group commit test:
+#if !defined(OS_WIN)
+// Disable this test temporarily on Travis and AppVeyor as it fails
+// intermittently. GitHub issue: #4151
+namespace {
+
+static const int kGCNumThreads = 4;
+static const int kGCNumKeys = 1000;
+
+struct GCThread {
+ DB* db;
+ int id;
+ std::atomic<bool> done;
+};
+
+static void GCThreadBody(void* arg) {
+ GCThread* t = reinterpret_cast<GCThread*>(arg);
+ int id = t->id;
+ DB* db = t->db;
+ WriteOptions wo;
+
+ for (int i = 0; i < kGCNumKeys; ++i) {
+ std::string kv(std::to_string(i + id * kGCNumKeys));
+ ASSERT_OK(db->Put(wo, kv, kv));
+ }
+ t->done = true;
+}
+
+} // anonymous namespace
+
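+// The sync-point dependencies registered in this test are intended to hold
+// the batch-group leader back until other writers are waiting, so that some
+// writes are completed on their behalf and WRITE_DONE_BY_OTHER is non-zero.
+// (This is a sketch of the intent; the exact interleaving is up to the write
+// thread internals.)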
+TEST_F(DBTest, GroupCommitTest) {
+ do {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Reopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WriteThread::JoinBatchGroup:BeganWaiting",
+ "DBImpl::WriteImpl:BeforeLeaderEnters"},
+ {"WriteThread::AwaitState:BlockingWaiting",
+ "WriteThread::EnterAsBatchGroupLeader:End"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Start threads
+ GCThread thread[kGCNumThreads];
+ for (int id = 0; id < kGCNumThreads; id++) {
+ thread[id].id = id;
+ thread[id].db = db_;
+ thread[id].done = false;
+ env_->StartThread(GCThreadBody, &thread[id]);
+ }
+ env_->WaitForJoin();
+
+ ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0);
+
+ std::vector<std::string> expected_db;
+ for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) {
+ expected_db.push_back(std::to_string(i));
+ }
+ std::sort(expected_db.begin(), expected_db.end());
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ for (auto x : expected_db) {
+ ASSERT_TRUE(itr->Valid());
+ ASSERT_EQ(itr->key().ToString(), x);
+ ASSERT_EQ(itr->value().ToString(), x);
+ itr->Next();
+ }
+ ASSERT_TRUE(!itr->Valid());
+ delete itr;
+
+ HistogramData hist_data;
+ options.statistics->histogramData(DB_WRITE, &hist_data);
+ ASSERT_GT(hist_data.average, 0.0);
+ } while (ChangeOptions(kSkipNoSeekToLast));
+}
+#endif // OS_WIN
+
+namespace {
+using KVMap = std::map<std::string, std::string>;
+}  // anonymous namespace
+
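+// ModelDB is a minimal in-memory reference implementation backed by a
+// std::map (KVMap): writes are applied through a WriteBatch::Handler,
+// snapshots and iterators copy the map, and most other operations return
+// NotSupported or trivial values.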
+class ModelDB : public DB {
+ public:
+ class ModelSnapshot : public Snapshot {
+ public:
+ KVMap map_;
+
+ SequenceNumber GetSequenceNumber() const override {
+ // no need to call this
+ assert(false);
+ return 0;
+ }
+
+ int64_t GetUnixTime() const override {
+ // no need to call this
+ assert(false);
+ return 0;
+ }
+
+ uint64_t GetTimestamp() const override {
+ // no need to call this
+ assert(false);
+ return 0;
+ }
+ };
+
+ explicit ModelDB(const Options& options) : options_(options) {}
+ using DB::Put;
+ Status Put(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+ const Slice& v) override {
+ WriteBatch batch;
+ Status s = batch.Put(cf, k, v);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(o, &batch);
+ }
+ Status Put(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& /*k*/, const Slice& /*ts*/,
+ const Slice& /*v*/) override {
+ return Status::NotSupported();
+ }
+
+ using DB::PutEntity;
+ Status PutEntity(const WriteOptions& /* options */,
+ ColumnFamilyHandle* /* column_family */,
+ const Slice& /* key */,
+ const WideColumns& /* columns */) override {
+ return Status::NotSupported();
+ }
+
+ using DB::Close;
+ Status Close() override { return Status::OK(); }
+ using DB::Delete;
+ Status Delete(const WriteOptions& o, ColumnFamilyHandle* cf,
+ const Slice& key) override {
+ WriteBatch batch;
+ Status s = batch.Delete(cf, key);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(o, &batch);
+ }
+ Status Delete(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& /*key*/, const Slice& /*ts*/) override {
+ return Status::NotSupported();
+ }
+ using DB::SingleDelete;
+ Status SingleDelete(const WriteOptions& o, ColumnFamilyHandle* cf,
+ const Slice& key) override {
+ WriteBatch batch;
+ Status s = batch.SingleDelete(cf, key);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(o, &batch);
+ }
+ Status SingleDelete(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& /*key*/, const Slice& /*ts*/) override {
+ return Status::NotSupported();
+ }
+ using DB::Merge;
+ Status Merge(const WriteOptions& o, ColumnFamilyHandle* cf, const Slice& k,
+ const Slice& v) override {
+ WriteBatch batch;
+ Status s = batch.Merge(cf, k, v);
+ if (!s.ok()) {
+ return s;
+ }
+ return Write(o, &batch);
+ }
+ Status Merge(const WriteOptions& /*o*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& /*k*/, const Slice& /*ts*/,
+ const Slice& /*value*/) override {
+ return Status::NotSupported();
+ }
+ using DB::Get;
+ Status Get(const ReadOptions& /*options*/, ColumnFamilyHandle* /*cf*/,
+ const Slice& key, PinnableSlice* /*value*/) override {
+ return Status::NotSupported(key);
+ }
+
+ using DB::GetMergeOperands;
+ virtual Status GetMergeOperands(
+ const ReadOptions& /*options*/, ColumnFamilyHandle* /*column_family*/,
+ const Slice& key, PinnableSlice* /*slice*/,
+ GetMergeOperandsOptions* /*merge_operands_options*/,
+ int* /*number_of_operands*/) override {
+ return Status::NotSupported(key);
+ }
+
+ using DB::MultiGet;
+ std::vector<Status> MultiGet(
+ const ReadOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+ const std::vector<Slice>& keys,
+ std::vector<std::string>* /*values*/) override {
+ std::vector<Status> s(keys.size(),
+ Status::NotSupported("Not implemented."));
+ return s;
+ }
+
+#ifndef ROCKSDB_LITE
+ using DB::IngestExternalFile;
+ Status IngestExternalFile(
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*external_files*/,
+ const IngestExternalFileOptions& /*options*/) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::IngestExternalFiles;
+ Status IngestExternalFiles(
+ const std::vector<IngestExternalFileArg>& /*args*/) override {
+ return Status::NotSupported("Not implemented");
+ }
+
+ using DB::CreateColumnFamilyWithImport;
+ virtual Status CreateColumnFamilyWithImport(
+ const ColumnFamilyOptions& /*options*/,
+ const std::string& /*column_family_name*/,
+ const ImportColumnFamilyOptions& /*import_options*/,
+ const ExportImportFilesMetaData& /*metadata*/,
+ ColumnFamilyHandle** /*handle*/) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::VerifyChecksum;
+ Status VerifyChecksum(const ReadOptions&) override {
+ return Status::NotSupported("Not implemented.");
+ }
+
+ using DB::GetPropertiesOfAllTables;
+ Status GetPropertiesOfAllTables(
+ ColumnFamilyHandle* /*column_family*/,
+ TablePropertiesCollection* /*props*/) override {
+ return Status();
+ }
+
+ Status GetPropertiesOfTablesInRange(
+ ColumnFamilyHandle* /*column_family*/, const Range* /*range*/,
+ std::size_t /*n*/, TablePropertiesCollection* /*props*/) override {
+ return Status();
+ }
+#endif // ROCKSDB_LITE
+
+ using DB::KeyMayExist;
+ bool KeyMayExist(const ReadOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/, const Slice& /*key*/,
+ std::string* /*value*/,
+ bool* value_found = nullptr) override {
+ if (value_found != nullptr) {
+ *value_found = false;
+ }
+ return true; // Not Supported directly
+ }
+ using DB::NewIterator;
+ Iterator* NewIterator(const ReadOptions& options,
+ ColumnFamilyHandle* /*column_family*/) override {
+ if (options.snapshot == nullptr) {
+ KVMap* saved = new KVMap;
+ *saved = map_;
+ return new ModelIter(saved, true);
+ } else {
+ const KVMap* snapshot_state =
+ &(reinterpret_cast<const ModelSnapshot*>(options.snapshot)->map_);
+ return new ModelIter(snapshot_state, false);
+ }
+ }
+ Status NewIterators(const ReadOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_family*/,
+ std::vector<Iterator*>* /*iterators*/) override {
+ return Status::NotSupported("Not supported yet");
+ }
+ const Snapshot* GetSnapshot() override {
+ ModelSnapshot* snapshot = new ModelSnapshot;
+ snapshot->map_ = map_;
+ return snapshot;
+ }
+
+ void ReleaseSnapshot(const Snapshot* snapshot) override {
+ delete reinterpret_cast<const ModelSnapshot*>(snapshot);
+ }
+
+ Status Write(const WriteOptions& /*options*/, WriteBatch* batch) override {
+ class Handler : public WriteBatch::Handler {
+ public:
+ KVMap* map_;
+ void Put(const Slice& key, const Slice& value) override {
+ (*map_)[key.ToString()] = value.ToString();
+ }
+ void Merge(const Slice& /*key*/, const Slice& /*value*/) override {
+ // ignore merge for now
+ // (*map_)[key.ToString()] = value.ToString();
+ }
+ void Delete(const Slice& key) override { map_->erase(key.ToString()); }
+ };
+ Handler handler;
+ handler.map_ = &map_;
+ return batch->Iterate(&handler);
+ }
+
+ using DB::GetProperty;
+ bool GetProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/, std::string* /*value*/) override {
+ return false;
+ }
+ using DB::GetIntProperty;
+ bool GetIntProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/, uint64_t* /*value*/) override {
+ return false;
+ }
+ using DB::GetMapProperty;
+ bool GetMapProperty(ColumnFamilyHandle* /*column_family*/,
+ const Slice& /*property*/,
+ std::map<std::string, std::string>* /*value*/) override {
+ return false;
+ }
+ using DB::GetAggregatedIntProperty;
+ bool GetAggregatedIntProperty(const Slice& /*property*/,
+ uint64_t* /*value*/) override {
+ return false;
+ }
+ using DB::GetApproximateSizes;
+ Status GetApproximateSizes(const SizeApproximationOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Range* /*range*/, int n,
+ uint64_t* sizes) override {
+ for (int i = 0; i < n; i++) {
+ sizes[i] = 0;
+ }
+ return Status::OK();
+ }
+ using DB::GetApproximateMemTableStats;
+ void GetApproximateMemTableStats(ColumnFamilyHandle* /*column_family*/,
+ const Range& /*range*/,
+ uint64_t* const count,
+ uint64_t* const size) override {
+ *count = 0;
+ *size = 0;
+ }
+ using DB::CompactRange;
+ Status CompactRange(const CompactRangeOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*start*/, const Slice* /*end*/) override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status SetDBOptions(
+ const std::unordered_map<std::string, std::string>& /*new_options*/)
+ override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ using DB::CompactFiles;
+ Status CompactFiles(
+ const CompactionOptions& /*compact_options*/,
+ ColumnFamilyHandle* /*column_family*/,
+ const std::vector<std::string>& /*input_file_names*/,
+ const int /*output_level*/, const int /*output_path_id*/ = -1,
+ std::vector<std::string>* const /*output_file_names*/ = nullptr,
+ CompactionJobInfo* /*compaction_job_info*/ = nullptr) override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status PauseBackgroundWork() override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status ContinueBackgroundWork() override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ Status EnableAutoCompaction(
+ const std::vector<ColumnFamilyHandle*>& /*column_family_handles*/)
+ override {
+ return Status::NotSupported("Not supported operation.");
+ }
+
+ void EnableManualCompaction() override { return; }
+
+ void DisableManualCompaction() override { return; }
+
+ using DB::NumberLevels;
+ int NumberLevels(ColumnFamilyHandle* /*column_family*/) override { return 1; }
+
+ using DB::MaxMemCompactionLevel;
+ int MaxMemCompactionLevel(ColumnFamilyHandle* /*column_family*/) override {
+ return 1;
+ }
+
+ using DB::Level0StopWriteTrigger;
+ int Level0StopWriteTrigger(ColumnFamilyHandle* /*column_family*/) override {
+ return -1;
+ }
+
+ const std::string& GetName() const override { return name_; }
+
+ Env* GetEnv() const override { return nullptr; }
+
+ using DB::GetOptions;
+ Options GetOptions(ColumnFamilyHandle* /*column_family*/) const override {
+ return options_;
+ }
+
+ using DB::GetDBOptions;
+ DBOptions GetDBOptions() const override { return options_; }
+
+ using DB::Flush;
+ Status Flush(const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
+ ColumnFamilyHandle* /*column_family*/) override {
+ Status ret;
+ return ret;
+ }
+ Status Flush(
+ const ROCKSDB_NAMESPACE::FlushOptions& /*options*/,
+ const std::vector<ColumnFamilyHandle*>& /*column_families*/) override {
+ return Status::OK();
+ }
+
+ Status SyncWAL() override { return Status::OK(); }
+
+ Status DisableFileDeletions() override { return Status::OK(); }
+
+ Status EnableFileDeletions(bool /*force*/) override { return Status::OK(); }
+#ifndef ROCKSDB_LITE
+
+ Status GetLiveFiles(std::vector<std::string>&, uint64_t* /*size*/,
+ bool /*flush_memtable*/ = true) override {
+ return Status::OK();
+ }
+
+ Status GetLiveFilesChecksumInfo(
+ FileChecksumList* /*checksum_list*/) override {
+ return Status::OK();
+ }
+
+ Status GetLiveFilesStorageInfo(
+ const LiveFilesStorageInfoOptions& /*opts*/,
+ std::vector<LiveFileStorageInfo>* /*files*/) override {
+ return Status::OK();
+ }
+
+ Status GetSortedWalFiles(VectorLogPtr& /*files*/) override {
+ return Status::OK();
+ }
+
+ Status GetCurrentWalFile(
+ std::unique_ptr<LogFile>* /*current_log_file*/) override {
+ return Status::OK();
+ }
+
+ virtual Status GetCreationTimeOfOldestFile(
+ uint64_t* /*creation_time*/) override {
+ return Status::NotSupported();
+ }
+
+ Status DeleteFile(std::string /*name*/) override { return Status::OK(); }
+
+ Status GetUpdatesSince(
+ ROCKSDB_NAMESPACE::SequenceNumber,
+ std::unique_ptr<ROCKSDB_NAMESPACE::TransactionLogIterator>*,
+ const TransactionLogIterator::ReadOptions& /*read_options*/ =
+ TransactionLogIterator::ReadOptions()) override {
+ return Status::NotSupported("Not supported in Model DB");
+ }
+
+ void GetColumnFamilyMetaData(ColumnFamilyHandle* /*column_family*/,
+ ColumnFamilyMetaData* /*metadata*/) override {}
+#endif // ROCKSDB_LITE
+
+ Status GetDbIdentity(std::string& /*identity*/) const override {
+ return Status::OK();
+ }
+
+ Status GetDbSessionId(std::string& /*session_id*/) const override {
+ return Status::OK();
+ }
+
+ SequenceNumber GetLatestSequenceNumber() const override { return 0; }
+
+ Status IncreaseFullHistoryTsLow(ColumnFamilyHandle* /*cf*/,
+ std::string /*ts_low*/) override {
+ return Status::OK();
+ }
+
+ Status GetFullHistoryTsLow(ColumnFamilyHandle* /*cf*/,
+ std::string* /*ts_low*/) override {
+ return Status::OK();
+ }
+
+ ColumnFamilyHandle* DefaultColumnFamily() const override { return nullptr; }
+
+ private:
+ class ModelIter : public Iterator {
+ public:
+ ModelIter(const KVMap* map, bool owned)
+ : map_(map), owned_(owned), iter_(map_->end()) {}
+ ~ModelIter() override {
+ if (owned_) delete map_;
+ }
+ bool Valid() const override { return iter_ != map_->end(); }
+ void SeekToFirst() override { iter_ = map_->begin(); }
+ void SeekToLast() override {
+ if (map_->empty()) {
+ iter_ = map_->end();
+ } else {
+ iter_ = map_->find(map_->rbegin()->first);
+ }
+ }
+ void Seek(const Slice& k) override {
+ iter_ = map_->lower_bound(k.ToString());
+ }
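+    // Position at the last key <= k: take the first key greater than k, then
+    // step back once (Prev() wraps around to end() when no such key exists).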
+ void SeekForPrev(const Slice& k) override {
+ iter_ = map_->upper_bound(k.ToString());
+ Prev();
+ }
+ void Next() override { ++iter_; }
+ void Prev() override {
+ if (iter_ == map_->begin()) {
+ iter_ = map_->end();
+ return;
+ }
+ --iter_;
+ }
+
+ Slice key() const override { return iter_->first; }
+ Slice value() const override { return iter_->second; }
+ Status status() const override { return Status::OK(); }
+
+ private:
+ const KVMap* const map_;
+ const bool owned_; // Do we own map_
+ KVMap::const_iterator iter_;
+ };
+ const Options options_;
+ KVMap map_;
+ std::string name_ = "";
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
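+// Returns a random key of length >= minimum: length 1 about a third of the
+// time (to encourage collisions), otherwise usually a uniform length below 10,
+// occasionally longer via the skewed distribution.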
+static std::string RandomKey(Random* rnd, int minimum = 0) {
+ int len;
+ do {
+ len = (rnd->OneIn(3)
+ ? 1 // Short sometimes to encourage collisions
+ : (rnd->OneIn(100) ? rnd->Skewed(10) : rnd->Uniform(10)));
+ } while (len < minimum);
+ return test::RandomKey(rnd, len);
+}
+
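+// Walks the model DB and the real DB in lockstep under the given snapshots and
+// reports the first key or value mismatch to stderr; returns true iff the two
+// iterators agree completely.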
+static bool CompareIterators(int step, DB* model, DB* db,
+ const Snapshot* model_snap,
+ const Snapshot* db_snap) {
+ ReadOptions options;
+ options.snapshot = model_snap;
+ Iterator* miter = model->NewIterator(options);
+ options.snapshot = db_snap;
+ Iterator* dbiter = db->NewIterator(options);
+ bool ok = true;
+ int count = 0;
+ for (miter->SeekToFirst(), dbiter->SeekToFirst();
+ ok && miter->Valid() && dbiter->Valid(); miter->Next(), dbiter->Next()) {
+ count++;
+ if (miter->key().compare(dbiter->key()) != 0) {
+ fprintf(stderr, "step %d: Key mismatch: '%s' vs. '%s'\n", step,
+ EscapeString(miter->key()).c_str(),
+ EscapeString(dbiter->key()).c_str());
+ ok = false;
+ break;
+ }
+
+ if (miter->value().compare(dbiter->value()) != 0) {
+ fprintf(stderr, "step %d: Value mismatch for key '%s': '%s' vs. '%s'\n",
+ step, EscapeString(miter->key()).c_str(),
+ EscapeString(miter->value()).c_str(),
+ EscapeString(dbiter->value()).c_str());
+ ok = false;
+ }
+ }
+
+ if (ok) {
+ if (miter->Valid() != dbiter->Valid()) {
+ fprintf(stderr, "step %d: Mismatch at end of iterators: %d vs. %d\n",
+ step, miter->Valid(), dbiter->Valid());
+ ok = false;
+ }
+ }
+ delete miter;
+ delete dbiter;
+ return ok;
+}
+
+class DBTestRandomized : public DBTest,
+ public ::testing::WithParamInterface<int> {
+ public:
+ void SetUp() override { option_config_ = GetParam(); }
+
+ static std::vector<int> GenerateOptionConfigs() {
+ std::vector<int> option_configs;
+ // skip cuckoo hash as it does not support snapshot.
+ for (int option_config = kDefault; option_config < kEnd; ++option_config) {
+ if (!ShouldSkipOptions(option_config,
+ kSkipDeletesFilterFirst | kSkipNoSeekToLast)) {
+ option_configs.push_back(option_config);
+ }
+ }
+ option_configs.push_back(kBlockBasedTableWithIndexRestartInterval);
+ return option_configs;
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBTestRandomized, DBTestRandomized,
+ ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs()));
+
+TEST_P(DBTestRandomized, Randomized) {
+ anon::OptionsOverride options_override;
+ options_override.skip_policy = kSkipNoSnapshot;
+ Options options = CurrentOptions(options_override);
+ DestroyAndReopen(options);
+
+ Random rnd(test::RandomSeed() + GetParam());
+ ModelDB model(options);
+ const int N = 10000;
+ const Snapshot* model_snap = nullptr;
+ const Snapshot* db_snap = nullptr;
+ std::string k, v;
+ for (int step = 0; step < N; step++) {
+ // TODO(sanjay): Test Get() works
+ int p = rnd.Uniform(100);
+ int minimum = 0;
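+    // These configs use prefix extractors or hash indexes, which are assumed
+    // not to handle empty keys, so require at least one byte per key here.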
+ if (option_config_ == kHashSkipList || option_config_ == kHashLinkList ||
+ option_config_ == kPlainTableFirstBytePrefix ||
+ option_config_ == kBlockBasedTableWithWholeKeyHashIndex ||
+ option_config_ == kBlockBasedTableWithPrefixHashIndex) {
+ minimum = 1;
+ }
+ if (p < 45) { // Put
+ k = RandomKey(&rnd, minimum);
+ v = rnd.RandomString(rnd.OneIn(20) ? 100 + rnd.Uniform(100)
+ : rnd.Uniform(8));
+ ASSERT_OK(model.Put(WriteOptions(), k, v));
+ ASSERT_OK(db_->Put(WriteOptions(), k, v));
+ } else if (p < 90) { // Delete
+ k = RandomKey(&rnd, minimum);
+ ASSERT_OK(model.Delete(WriteOptions(), k));
+ ASSERT_OK(db_->Delete(WriteOptions(), k));
+ } else { // Multi-element batch
+ WriteBatch b;
+ const int num = rnd.Uniform(8);
+ for (int i = 0; i < num; i++) {
+ if (i == 0 || !rnd.OneIn(10)) {
+ k = RandomKey(&rnd, minimum);
+ } else {
+ // Periodically re-use the same key from the previous iter, so
+ // we have multiple entries in the write batch for the same key
+ }
+ if (rnd.OneIn(2)) {
+ v = rnd.RandomString(rnd.Uniform(10));
+ ASSERT_OK(b.Put(k, v));
+ } else {
+ ASSERT_OK(b.Delete(k));
+ }
+ }
+ ASSERT_OK(model.Write(WriteOptions(), &b));
+ ASSERT_OK(db_->Write(WriteOptions(), &b));
+ }
+
+ if ((step % 100) == 0) {
+      // For DB instances that use the hash index + block-based table, the
+      // iterator becomes invalid right away when seeking a non-existent key,
+      // rather than returning a key that is close to it.
+ if (option_config_ != kBlockBasedTableWithWholeKeyHashIndex &&
+ option_config_ != kBlockBasedTableWithPrefixHashIndex) {
+ ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+ ASSERT_TRUE(CompareIterators(step, &model, db_, model_snap, db_snap));
+ }
+
+ // Save a snapshot from each DB this time that we'll use next
+ // time we compare things, to make sure the current state is
+ // preserved with the snapshot
+ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+ if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+
+ Reopen(options);
+ ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr));
+
+ model_snap = model.GetSnapshot();
+ db_snap = db_->GetSnapshot();
+ }
+ }
+ if (model_snap != nullptr) model.ReleaseSnapshot(model_snap);
+ if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap);
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTest) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+ Reopen(options);
+ ASSERT_OK(Put("k1", "v1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("k2", "v2"));
+
+ // Reopen with different prefix extractor, make sure everything still works.
+ // RocksDB should just fall back to the binary index.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+
+ Reopen(options);
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+
+#ifndef ROCKSDB_LITE
+ // Back to original
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:1"}}));
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+#endif // !ROCKSDB_LITE
+
+  // Same if there's a problem initially loading the prefix transform
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::ForceNullTablePrefixExtractor",
+ [&](void* arg) { *static_cast<bool*>(arg) = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+
+#ifndef ROCKSDB_LITE
+ // Change again
+ ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:2"}}));
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+#endif // !ROCKSDB_LITE
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // Reopen with no prefix extractor, make sure everything still works.
+ // RocksDB should just fall back to the binary index.
+ table_options.index_type = BlockBasedTableOptions::kBinarySearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset();
+
+ Reopen(options);
+ ASSERT_EQ("v1", Get("k1"));
+ ASSERT_EQ("v2", Get("k2"));
+}
+
+TEST_F(DBTest, BlockBasedTablePrefixHashIndexTest) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewCappedPrefixTransform(2));
+
+ Reopen(options);
+ ASSERT_OK(Put("kk1", "v1"));
+ ASSERT_OK(Put("kk2", "v2"));
+ ASSERT_OK(Put("kk", "v3"));
+ ASSERT_OK(Put("k", "v4"));
+ Flush();
+
+ ASSERT_EQ("v1", Get("kk1"));
+ ASSERT_EQ("v2", Get("kk2"));
+
+ ASSERT_EQ("v3", Get("kk"));
+ ASSERT_EQ("v4", Get("k"));
+}
+
+TEST_F(DBTest, BlockBasedTablePrefixIndexTotalOrderSeek) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ options.max_open_files = 10;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+  // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 11;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+ ASSERT_OK(Put("k1", "v1"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 1;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // Force evict tables
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+  // Make the table cache keep one entry.
+ dbfull()->TEST_table_cache()->SetCapacity(1);
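+  // The evicted table will be re-opened lazily on the next read, so the seeks
+  // below exercise the table reload path.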
+
+ ReadOptions read_options;
+ read_options.total_order_seek = true;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("k1", iter->key().ToString());
+ }
+
+ // After total order seek, prefix index should still be used.
+ read_options.total_order_seek = false;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ iter->Seek("k1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("k1", iter->key().ToString());
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest, ChecksumTest) {
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("a", "b"));
+ ASSERT_OK(Put("c", "d"));
+ ASSERT_OK(Flush()); // table with crc checksum
+
+ table_options.checksum = kxxHash;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put("e", "f"));
+ ASSERT_OK(Put("g", "h"));
+ ASSERT_OK(Flush()); // table with xxhash checksum
+
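+  // The checksum type is stored per SST file, so a DB reopened with kCRC32c
+  // must still be able to read the table written with kxxHash.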
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_EQ("b", Get("a"));
+ ASSERT_EQ("d", Get("c"));
+ ASSERT_EQ("f", Get("e"));
+ ASSERT_EQ("h", Get("g"));
+
+ table_options.checksum = kCRC32c;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_EQ("b", Get("a"));
+ ASSERT_EQ("d", Get("c"));
+ ASSERT_EQ("f", Get("e"));
+ ASSERT_EQ("h", Get("g"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, FIFOCompactionTest) {
+ for (int iter = 0; iter < 2; ++iter) {
+ // first iteration -- auto compaction
+ // second iteration -- manual compaction
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.arena_block_size = 4096;
+ options.compaction_options_fifo.max_table_files_size = 500 << 10; // 500KB
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.max_subcompactions = max_subcompactions_;
+ if (iter == 1) {
+ options.disable_auto_compactions = true;
+ }
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 6; ++i) {
+ for (int j = 0; j < 110; ++j) {
+ ASSERT_OK(Put(std::to_string(i * 100 + j), rnd.RandomString(980)));
+ }
+ // flush should happen here
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ if (iter == 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ } else {
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
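+    // Each flushed file is roughly the 100KB write buffer, so with the 500KB
+    // cap FIFO compaction drops the oldest files.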
+ // only 5 files should survive
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+ for (int i = 0; i < 50; ++i) {
+      // these keys should have been deleted in the previous compaction
+ ASSERT_EQ("NOT_FOUND", Get(std::to_string(i)));
+ }
+ }
+}
+
+TEST_F(DBTest, FIFOCompactionTestWithCompaction) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 20 << 10; // 20K
+ options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10;  // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.level0_file_num_compaction_trigger = 6;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // It should be compacted to 10 files.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j + 2000), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+  // It should be compacted to more than 10 but fewer than 18 files.
+ ASSERT_GT(NumTableFilesAtLevel(0), 10);
+ ASSERT_LT(NumTableFilesAtLevel(0), 18);
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+}
+
+TEST_F(DBTest, FIFOCompactionStyleWithCompactionAndDelete) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 20 << 10; // 20K
+ options.arena_block_size = 4096;
+  options.compaction_options_fifo.max_table_files_size = 1500 << 10;  // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.level0_file_num_compaction_trigger = 3;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 3; i++) {
+ // Each file contains a different key which will be dropped later.
+ ASSERT_OK(Put("a" + std::to_string(i), rnd.RandomString(500)));
+ ASSERT_OK(Put("key" + std::to_string(i), ""));
+ ASSERT_OK(Put("z" + std::to_string(i), rnd.RandomString(500)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ for (int i = 0; i < 3; i++) {
+ ASSERT_EQ("", Get("key" + std::to_string(i)));
+ }
+ for (int i = 0; i < 3; i++) {
+ // Each file contains a different key which will be dropped later.
+ ASSERT_OK(Put("a" + std::to_string(i), rnd.RandomString(500)));
+ ASSERT_OK(Delete("key" + std::to_string(i)));
+ ASSERT_OK(Put("z" + std::to_string(i), rnd.RandomString(500)));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ for (int i = 0; i < 3; i++) {
+ ASSERT_EQ("NOT_FOUND", Get("key" + std::to_string(i)));
+ }
+}
+
+// Check that FIFO-with-TTL is not supported with max_open_files != -1.
+// Github issue #8014
+TEST_F(DBTest, FIFOCompactionWithTTLAndMaxOpenFilesTest) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = true;
+ options.ttl = 600; // seconds
+
+ // TTL is not supported with max_open_files != -1.
+ options.max_open_files = 0;
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+ options.max_open_files = 100;
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+ // TTL is supported with unlimited max_open_files
+ options.max_open_files = -1;
+ ASSERT_OK(TryReopen(options));
+}
+
+// Check that FIFO-with-TTL is supported only with BlockBasedTableFactory.
+TEST_F(DBTest, FIFOCompactionWithTTLAndVariousTableFormatsTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.create_if_missing = true;
+ options.ttl = 600; // seconds
+
+ options = CurrentOptions(options);
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ ASSERT_OK(TryReopen(options));
+
+ Destroy(options);
+ options.table_factory.reset(NewPlainTableFactory());
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+
+ Destroy(options);
+ options.table_factory.reset(NewAdaptiveTableFactory());
+ ASSERT_TRUE(TryReopen(options).IsNotSupported());
+}
+
+TEST_F(DBTest, FIFOCompactionWithTTLTest) {
+ Options options;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.arena_block_size = 4096;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // Test to make sure that all files with expired ttl are deleted on next
+ // manual compaction.
+ {
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->MockSleepForSeconds(2 * 60 * 60);
+
+ // Since no flushes and compactions have run, the db should still be in
+ // the same state even after considerable time has passed.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ }
+
+ // Test to make sure that all files with expired ttl are deleted on next
+ // automatic compaction.
+ {
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->MockSleepForSeconds(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ // Create 1 more file to trigger TTL compaction. The old files are dropped.
+ for (int i = 0; i < 1; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+    // Only the new file remains.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+  // Test that FIFO falls back to size-based compaction if TTL-based deletion
+  // doesn't bring the total size below max_table_files_size.
+ {
+ options.write_buffer_size = 10 << 10; // 10KB
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = false;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 3; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->MockSleepForSeconds(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 3);
+
+ for (int i = 0; i < 5; i++) {
+ for (int j = 0; j < 140; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+ // Test with TTL + Intra-L0 compactions.
+ {
+ options.compaction_options_fifo.max_table_files_size = 150 << 10; // 150KB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options.level0_file_num_compaction_trigger = 6;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ // Generate and flush a file about 10KB.
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // With Intra-L0 compaction, out of 10 files, 6 files will be compacted to 1
+ // (due to level0_file_num_compaction_trigger = 6).
+ // So total files = 1 + remaining 4 = 5.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+ // Sleep for 2 hours -- which is much greater than TTL.
+ env_->MockSleepForSeconds(2 * 60 * 60);
+ // Just to make sure that we are in the same state even after sleeping.
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+
+ // Create 10 more files. The old 5 files are dropped as their ttl expired.
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), 5);
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+
+ // Test with large TTL + Intra-L0 compactions.
+ // Files dropped based on size, as ttl doesn't kick in.
+ {
+ options.write_buffer_size = 20 << 10; // 20K
+ options.compaction_options_fifo.max_table_files_size = 1500 << 10; // 1.5MB
+ options.compaction_options_fifo.allow_compaction = true;
+ options.ttl = 1 * 60 * 60; // 1 hour
+ options.level0_file_num_compaction_trigger = 6;
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(Put(std::to_string(i * 20 + j), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ // It should be compacted to 10 files.
+ ASSERT_EQ(NumTableFilesAtLevel(0), 10);
+
+ for (int i = 0; i < 60; i++) {
+ // Generate and flush a file about 20KB.
+ for (int j = 0; j < 20; j++) {
+ ASSERT_OK(
+ Put(std::to_string(i * 20 + j + 2000), rnd.RandomString(980)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+    // It should be compacted to more than 10 but fewer than 18 files.
+ ASSERT_GT(NumTableFilesAtLevel(0), 10);
+ ASSERT_LT(NumTableFilesAtLevel(0), 18);
+ // Size limit is still guaranteed.
+ ASSERT_LE(SizeAtLevel(0),
+ options.compaction_options_fifo.max_table_files_size);
+ }
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+/*
+ * This test is not reliable enough as it heavily depends on disk behavior.
+ * Disabled as it is flaky.
+ */
+TEST_F(DBTest, DISABLED_RateLimitingTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 1 << 20; // 1MB
+ options.level0_file_num_compaction_trigger = 2;
+ options.target_file_size_base = 1 << 20; // 1MB
+ options.max_bytes_for_level_base = 4 << 20; // 4MB
+ options.max_bytes_for_level_multiplier = 4;
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.IncreaseParallelism(4);
+ DestroyAndReopen(options);
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ // # no rate limiting
+ Random rnd(301);
+ uint64_t start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+ }
+ uint64_t elapsed = env_->NowMicros() - start;
+ double raw_rate = env_->bytes_written_ * 1000000.0 / elapsed;
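+  // raw_rate is the unthrottled write throughput in bytes/sec; the rate
+  // limiters below are configured as fractions of it.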
+ uint64_t rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS);
+ ASSERT_EQ(0, rate_limiter_drains);
+ Close();
+
+  // # rate limiting with 0.7 x raw rate
+ options.rate_limiter.reset(
+ NewGenericRateLimiter(static_cast<int64_t>(0.7 * raw_rate)));
+ env_->bytes_written_ = 0;
+ DestroyAndReopen(options);
+
+ start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+ }
+ rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+ rate_limiter_drains;
+ elapsed = env_->NowMicros() - start;
+ Close();
+ ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+ // Most intervals should've been drained (interval time is 100ms, elapsed is
+ // micros)
+ ASSERT_GT(rate_limiter_drains, 0);
+ ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+ double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+ fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio);
+ ASSERT_TRUE(ratio < 0.8);
+
+ // # rate limiting with half of the raw_rate
+ options.rate_limiter.reset(
+ NewGenericRateLimiter(static_cast<int64_t>(raw_rate / 2)));
+ env_->bytes_written_ = 0;
+ DestroyAndReopen(options);
+
+ start = env_->NowMicros();
+ // Write ~96M data
+ for (int64_t i = 0; i < (96 << 10); ++i) {
+ ASSERT_OK(Put(rnd.RandomString(32), rnd.RandomString((1 << 10) + 1), wo));
+ }
+ elapsed = env_->NowMicros() - start;
+ rate_limiter_drains =
+ TestGetTickerCount(options, NUMBER_RATE_LIMITER_DRAINS) -
+ rate_limiter_drains;
+ Close();
+ ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_);
+ // Most intervals should've been drained (interval time is 100ms, elapsed is
+ // micros)
+ ASSERT_GT(rate_limiter_drains, elapsed / 100000 / 2);
+ ASSERT_LE(rate_limiter_drains, elapsed / 100000 + 1);
+ ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate;
+ fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio);
+ ASSERT_LT(ratio, 0.6);
+}
+
+// This is a mocked custom rate limiter that does not implement optional APIs
+// (e.g., RateLimiter::GetTotalPendingRequests()).
+class MockedRateLimiterWithNoOptionalAPIImpl : public RateLimiter {
+ public:
+ MockedRateLimiterWithNoOptionalAPIImpl() {}
+
+ ~MockedRateLimiterWithNoOptionalAPIImpl() override {}
+
+ void SetBytesPerSecond(int64_t bytes_per_second) override {
+ (void)bytes_per_second;
+ }
+
+ using RateLimiter::Request;
+ void Request(const int64_t bytes, const Env::IOPriority pri,
+ Statistics* stats) override {
+ (void)bytes;
+ (void)pri;
+ (void)stats;
+ }
+
+ int64_t GetSingleBurstBytes() const override { return 200; }
+
+ int64_t GetTotalBytesThrough(
+ const Env::IOPriority pri = Env::IO_TOTAL) const override {
+ (void)pri;
+ return 0;
+ }
+
+ int64_t GetTotalRequests(
+ const Env::IOPriority pri = Env::IO_TOTAL) const override {
+ (void)pri;
+ return 0;
+ }
+
+ int64_t GetBytesPerSecond() const override { return 0; }
+};
+
+// Tests that a custom rate limiter that does not implement optional APIs
+// (e.g., RateLimiter::GetTotalPendingRequests()) works fine with basic RocksDB
+// operations (e.g., Put, Get, Flush).
+TEST_F(DBTest, CustomedRateLimiterWithNoOptionalAPIImplTest) {
+ Options options = CurrentOptions();
+ options.rate_limiter.reset(new MockedRateLimiterWithNoOptionalAPIImpl());
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("abc", "def"));
+ ASSERT_EQ(Get("abc"), "def");
+ ASSERT_OK(Flush());
+ ASSERT_EQ(Get("abc"), "def");
+}
+
+TEST_F(DBTest, TableOptionsSanitizeTest) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false);
+
+ options.table_factory.reset(NewPlainTableFactory());
+ options.prefix_extractor.reset(NewNoopTransform());
+ Destroy(options);
+ ASSERT_TRUE(!TryReopen(options).IsNotSupported());
+
+  // Test the check of prefix_extractor when the hash index is used for
+  // block-based tables
+ BlockBasedTableOptions to;
+ to.index_type = BlockBasedTableOptions::kHashSearch;
+ options = CurrentOptions();
+ options.create_if_missing = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(to));
+ ASSERT_TRUE(TryReopen(options).IsInvalidArgument());
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ ASSERT_OK(TryReopen(options));
+}
+
+TEST_F(DBTest, ConcurrentMemtableNotSupported) {
+ Options options = CurrentOptions();
+ options.allow_concurrent_memtable_write = true;
+ options.soft_pending_compaction_bytes_limit = 0;
+ options.hard_pending_compaction_bytes_limit = 100;
+ options.create_if_missing = true;
+
+ DestroyDB(dbname_, options);
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ ASSERT_NOK(TryReopen(options));
+
+ options.memtable_factory.reset(new SkipListFactory);
+ ASSERT_OK(TryReopen(options));
+
+ ColumnFamilyOptions cf_options(options);
+ cf_options.memtable_factory.reset(
+ NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ ColumnFamilyHandle* handle;
+ ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle));
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, SanitizeNumThreads) {
+ for (int attempt = 0; attempt < 2; attempt++) {
+ const size_t kTotalTasks = 8;
+ test::SleepingBackgroundTask sleeping_tasks[kTotalTasks];
+
+ Options options = CurrentOptions();
+ if (attempt == 0) {
+ options.max_background_compactions = 3;
+ options.max_background_flushes = 2;
+ }
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+      // Insert 4 tasks into the low priority queue and 4 tasks into the high
+      // priority queue
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_tasks[i],
+ (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH);
+ }
+
+    // Wait up to 10s for them to be scheduled.
+ for (int i = 0; i < 10000; i++) {
+ if (options.env->GetThreadPoolQueueLen(Env::Priority::LOW) <= 1 &&
+ options.env->GetThreadPoolQueueLen(Env::Priority::HIGH) <= 2) {
+ break;
+ }
+ env_->SleepForMicroseconds(1000);
+ }
+
+    // Pool size is 3, total tasks 4, so the queue size should be 1.
+ ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW));
+ // pool size 2, total task 4. Queue size should be 2.
+ ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH));
+
+ for (size_t i = 0; i < kTotalTasks; i++) {
+ sleeping_tasks[i].WakeUp();
+ sleeping_tasks[i].WaitUntilDone();
+ }
+
+ ASSERT_OK(Put("abc", "def"));
+ ASSERT_EQ("def", Get("abc"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("def", Get("abc"));
+ }
+}
+
+TEST_F(DBTest, WriteSingleThreadEntry) {
+ std::vector<port::Thread> threads;
+ dbfull()->TEST_LockMutex();
+ auto w = dbfull()->TEST_BeginWrite();
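+  // With the DB mutex held and a writer registered, the Put and Flush threads
+  // below should queue up until TEST_EndWrite() releases the writer.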
+ threads.emplace_back([&] { ASSERT_OK(Put("a", "b")); });
+ env_->SleepForMicroseconds(10000);
+ threads.emplace_back([&] { ASSERT_OK(Flush()); });
+ env_->SleepForMicroseconds(10000);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->TEST_LockMutex();
+ dbfull()->TEST_EndWrite(w);
+ dbfull()->TEST_UnlockMutex();
+
+ for (auto& t : threads) {
+ t.join();
+ }
+}
+
+TEST_F(DBTest, ConcurrentFlushWAL) {
+ const size_t cnt = 100;
+ Options options;
+ options.env = env_;
+ WriteOptions wopt;
+ ReadOptions ropt;
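+  // Run concurrent writers and FlushWAL(false) calls under every combination
+  // of two_write_queues and manual_wal_flush, then reopen and verify that the
+  // writes are recovered from the WAL without corruption.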
+ for (bool two_write_queues : {false, true}) {
+ for (bool manual_wal_flush : {false, true}) {
+ options.two_write_queues = two_write_queues;
+ options.manual_wal_flush = manual_wal_flush;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&] {
+ for (size_t i = 0; i < cnt; i++) {
+ auto istr = std::to_string(i);
+ ASSERT_OK(db_->Put(wopt, db_->DefaultColumnFamily(), "a" + istr,
+ "b" + istr));
+ }
+ });
+ if (two_write_queues) {
+ threads.emplace_back([&] {
+ for (size_t i = cnt; i < 2 * cnt; i++) {
+ auto istr = std::to_string(i);
+ WriteBatch batch(0 /* reserved_bytes */, 0 /* max_bytes */,
+ wopt.protection_bytes_per_key,
+ 0 /* default_cf_ts_sz */);
+ ASSERT_OK(batch.Put("a" + istr, "b" + istr));
+ ASSERT_OK(
+ dbfull()->WriteImpl(wopt, &batch, nullptr, nullptr, 0, true));
+ }
+ });
+ }
+ threads.emplace_back([&] {
+ for (size_t i = 0; i < cnt * 100; i++) { // FlushWAL is faster than Put
+ ASSERT_OK(db_->FlushWAL(false));
+ }
+ });
+ for (auto& t : threads) {
+ t.join();
+ }
+ options.create_if_missing = false;
+ // Recover from the wal and make sure that it is not corrupted
+ Reopen(options);
+ for (size_t i = 0; i < cnt; i++) {
+ PinnableSlice pval;
+ auto istr = std::to_string(i);
+ ASSERT_OK(
+ db_->Get(ropt, db_->DefaultColumnFamily(), "a" + istr, &pval));
+ ASSERT_TRUE(pval == ("b" + istr));
+ }
+ }
+ }
+}
+
+// A failure in this test is only caught probabilistically, as it depends on
+// thread timing.
+TEST_F(DBTest, ManualFlushWalAndWriteRace) {
+ Options options;
+ options.env = env_;
+ options.manual_wal_flush = true;
+ options.create_if_missing = true;
+
+ DestroyAndReopen(options);
+
+ WriteOptions wopts;
+ wopts.sync = true;
+
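+  // Race FlushWAL(false) against synced writes; the reopen at the end verifies
+  // that the WAL was not corrupted by the race.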
+ port::Thread writeThread([&]() {
+ for (int i = 0; i < 100; i++) {
+ auto istr = std::to_string(i);
+ ASSERT_OK(dbfull()->Put(wopts, "key_" + istr, "value_" + istr));
+ }
+ });
+ port::Thread flushThread([&]() {
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(dbfull()->FlushWAL(false));
+ }
+ });
+
+ writeThread.join();
+ flushThread.join();
+ ASSERT_OK(dbfull()->Put(wopts, "foo1", "value1"));
+ ASSERT_OK(dbfull()->Put(wopts, "foo2", "value2"));
+ Reopen(options);
+ ASSERT_EQ("value1", Get("foo1"));
+ ASSERT_EQ("value2", Get("foo2"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, DynamicMemtableOptions) {
+ const uint64_t k64KB = 1 << 16;
+ const uint64_t k128KB = 1 << 17;
+ const uint64_t k5KB = 5 * 1024;
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.max_background_compactions = 1;
+ options.write_buffer_size = k64KB;
+ options.arena_block_size = 16 * 1024;
+ options.max_write_buffer_number = 2;
+ // Don't trigger compact/slowdown/stop
+ options.level0_file_num_compaction_trigger = 1024;
+ options.level0_slowdown_writes_trigger = 1024;
+ options.level0_stop_writes_trigger = 1024;
+ DestroyAndReopen(options);
+
+ auto gen_l0_kb = [this](int size) {
+ const int kNumPutsBeforeWaitForFlush = 64;
+ Random rnd(301);
+ for (int i = 0; i < size; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+
+ // The following condition prevents a race condition between flush jobs
+ // acquiring work and this thread filling up multiple memtables. Without
+ // this, the flush might produce less files than expected because
+ // multiple memtables are flushed into a single L0 file. This race
+ // condition affects assertion (A).
+ if (i % kNumPutsBeforeWaitForFlush == kNumPutsBeforeWaitForFlush - 1) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ };
+
+ // Test write_buffer_size
+ gen_l0_kb(64);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ ASSERT_LT(SizeAtLevel(0), k64KB + k5KB);
+ ASSERT_GT(SizeAtLevel(0), k64KB - k5KB * 2);
+
+ // Clean up L0
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // Increase buffer size
+ ASSERT_OK(dbfull()->SetOptions({
+ {"write_buffer_size", "131072"},
+ }));
+
+ // The existing memtable inflated 64KB->128KB when we invoked SetOptions().
+ // Write 192KB, we should have a 128KB L0 file and a memtable with 64KB data.
+ gen_l0_kb(192);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1); // (A)
+ ASSERT_LT(SizeAtLevel(0), k128KB + 2 * k5KB);
+ ASSERT_GT(SizeAtLevel(0), k128KB - 4 * k5KB);
+
+ // Decrease buffer size below current usage
+ ASSERT_OK(dbfull()->SetOptions({
+ {"write_buffer_size", "65536"},
+ }));
+ // The existing memtable became eligible for flush when we reduced its
+ // capacity to 64KB. Two keys need to be added to trigger flush: first causes
+ // memtable to be marked full, second schedules the flush. Then we should have
+ // a 128KB L0 file, a 64KB L0 file, and a memtable with just one key.
+ gen_l0_kb(2);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB);
+ ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 4 * k5KB);
+
+ // Test max_write_buffer_number
+ // Block compaction thread, which will also block the flushes because
+ // max_background_flushes == 0, so flushes are getting executed by the
+ // compaction thread
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ // Start from scratch and disable compaction/flush. Flush can only happen
+ // during compaction but trigger is pretty high
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ env_->SetBackgroundThreads(0, Env::HIGH);
+
+ // Put until writes are stopped, bounded by 256 puts. We should see stop at
+ // ~128KB
+ int count = 0;
+ Random rnd(301);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait",
+ [&](void* /*arg*/) { sleeping_task_low.WakeUp(); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
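+  // Once writes stall on full memtables, DelayWrite:Wait fires, the sleeping
+  // LOW-priority task is woken up, and the fill loop below exits.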
+ while (!sleeping_task_low.WokenUp() && count < 256) {
+ ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+ count++;
+ }
+ ASSERT_GT(static_cast<double>(count), 128 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 128 * 1.2);
+
+ sleeping_task_low.WaitUntilDone();
+
+ // Increase
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_write_buffer_number", "8"},
+ }));
+ // Clean up memtable and L0
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ count = 0;
+ while (!sleeping_task_low.WokenUp() && count < 1024) {
+ ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+ count++;
+ }
+// Windows fails this test. Will tune in the future and figure out an
+// appropriate number.
+#ifndef OS_WIN
+ ASSERT_GT(static_cast<double>(count), 512 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 512 * 1.2);
+#endif
+ sleeping_task_low.WaitUntilDone();
+
+ // Decrease
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_write_buffer_number", "4"},
+ }));
+ // Clean up memtable and L0
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ count = 0;
+ while (!sleeping_task_low.WokenUp() && count < 1024) {
+ ASSERT_OK(Put(Key(count), rnd.RandomString(1024), WriteOptions()));
+ count++;
+ }
+// Windows fails this test. Will tune in the future and figure out an
+// appropriate number.
+#ifndef OS_WIN
+ ASSERT_GT(static_cast<double>(count), 256 * 0.8);
+ ASSERT_LT(static_cast<double>(count), 266 * 1.2);
+#endif
+ sleeping_task_low.WaitUntilDone();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif // ROCKSDB_LITE
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+namespace {
+void VerifyOperationCount(Env* env, ThreadStatus::OperationType op_type,
+ int expected_count) {
+ int op_count = 0;
+ std::vector<ThreadStatus> thread_list;
+ ASSERT_OK(env->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ if (thread.operation_type == op_type) {
+ op_count++;
+ }
+ }
+ ASSERT_EQ(op_count, expected_count);
+}
+} // anonymous namespace
+
+TEST_F(DBTest, GetThreadStatus) {
+ Options options;
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ TryReopen(options);
+
+ std::vector<ThreadStatus> thread_list;
+ Status s = env_->GetThreadList(&thread_list);
+
+ for (int i = 0; i < 2; ++i) {
+    // repeat the test with different numbers of high / low priority threads
+ const int kTestCount = 3;
+ const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5};
+ const unsigned int kLowPriCounts[kTestCount] = {10, 15, 3};
+ const unsigned int kBottomPriCounts[kTestCount] = {2, 1, 4};
+ for (int test = 0; test < kTestCount; ++test) {
+ // Change the number of threads in high / low priority pool.
+ env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW);
+ env_->SetBackgroundThreads(kBottomPriCounts[test], Env::BOTTOM);
+      // Wait to ensure all threads have been registered
+ unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES];
+ // TODO(ajkr): it'd be better if SetBackgroundThreads returned only after
+ // all threads have been registered.
+ // Try up to 60 seconds.
+ for (int num_try = 0; num_try < 60000; num_try++) {
+ env_->SleepForMicroseconds(1000);
+ thread_list.clear();
+ s = env_->GetThreadList(&thread_list);
+ ASSERT_OK(s);
+ memset(thread_type_counts, 0, sizeof(thread_type_counts));
+ for (auto thread : thread_list) {
+ ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES);
+ thread_type_counts[thread.thread_type]++;
+ }
+ if (thread_type_counts[ThreadStatus::HIGH_PRIORITY] ==
+ kHighPriCounts[test] &&
+ thread_type_counts[ThreadStatus::LOW_PRIORITY] ==
+ kLowPriCounts[test] &&
+ thread_type_counts[ThreadStatus::BOTTOM_PRIORITY] ==
+ kBottomPriCounts[test]) {
+ break;
+ }
+ }
+ // Verify the number of high-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::HIGH_PRIORITY],
+ kHighPriCounts[test]);
+ // Verify the number of low-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::LOW_PRIORITY],
+ kLowPriCounts[test]);
+ // Verify the number of bottom-priority threads
+ ASSERT_EQ(thread_type_counts[ThreadStatus::BOTTOM_PRIORITY],
+ kBottomPriCounts[test]);
+ }
+ if (i == 0) {
+ // repeat the test with multiple column families
+ CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+ }
+ }
+ ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+ delete handles_[2];
+ handles_.erase(handles_.begin() + 2);
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+ Close();
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ true);
+}
+
+TEST_F(DBTest, DisableThreadStatus) {
+ Options options;
+ options.env = env_;
+ options.enable_thread_tracking = false;
+ TryReopen(options);
+ CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options);
+  // Verify none of the column family info exists
+ env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap(handles_,
+ false);
+}
+
+TEST_F(DBTest, ThreadStatusFlush) {
+ Options options;
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.enable_thread_tracking = true;
+ options = CurrentOptions(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"FlushJob::FlushJob()", "DBTest::ThreadStatusFlush:1"},
+ {"DBTest::ThreadStatusFlush:2", "FlushJob::WriteLevel0Table"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 0);
+
+ uint64_t num_running_flushes = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes,
+ &num_running_flushes));
+ ASSERT_EQ(num_running_flushes, 0);
+
+ ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable
+ ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush
+
+ // The first sync point is to make sure there's one flush job
+ // running when we perform VerifyOperationCount().
+ TEST_SYNC_POINT("DBTest::ThreadStatusFlush:1");
+ VerifyOperationCount(env_, ThreadStatus::OP_FLUSH, 1);
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningFlushes,
+ &num_running_flushes));
+ ASSERT_EQ(num_running_flushes, 1);
+ // This second sync point is to ensure the flush job will not
+ // be completed until we already perform VerifyOperationCount().
+ TEST_SYNC_POINT("DBTest::ThreadStatusFlush:2");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestWithParam, ThreadStatusSingleCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ const int kNumL0Files = 4;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_subcompactions = max_subcompactions_;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBTest::ThreadStatusSingleCompaction:0", "DBImpl::BGWorkCompaction"},
+ {"CompactionJob::Run():Start", "DBTest::ThreadStatusSingleCompaction:1"},
+ {"DBTest::ThreadStatusSingleCompaction:2", "CompactionJob::Run():End"},
+ });
+ for (int tests = 0; tests < 2; ++tests) {
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ // The Put Phase.
+ for (int file = 0; file < kNumL0Files; ++file) {
+ for (int key = 0; key < kEntriesPerBuffer; ++key) {
+ ASSERT_OK(Put(std::to_string(key + file * kEntriesPerBuffer),
+ rnd.RandomString(kTestValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+ // This makes sure a compaction won't be scheduled until
+    // we are done with the above Put phase.
+ uint64_t num_running_compactions = 0;
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+ &num_running_compactions));
+ ASSERT_EQ(num_running_compactions, 0);
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:0");
+ ASSERT_GE(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+
+ // This makes sure at least one compaction is running.
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:1");
+
+ if (options.enable_thread_tracking) {
+ // expecting one single L0 to L1 compaction
+ VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 1);
+ } else {
+ // If thread tracking is not enabled, compaction count should be 0.
+ VerifyOperationCount(env_, ThreadStatus::OP_COMPACTION, 0);
+ }
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumRunningCompactions,
+ &num_running_compactions));
+ ASSERT_EQ(num_running_compactions, 1);
+ // TODO(yhchiang): adding assert to verify each compaction stage.
+ TEST_SYNC_POINT("DBTest::ThreadStatusSingleCompaction:2");
+
+ // repeat the test with disabling thread tracking.
+ options.enable_thread_tracking = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_P(DBTestWithParam, PreShutdownManualCompaction) {
+ Options options = CurrentOptions();
+ options.max_subcompactions = max_subcompactions_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // iter - 0 with 7 levels
+ // iter - 1 with 3 levels
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeTables(3, "p", "q", 1);
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls before files
+ Compact(1, "", "c");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range falls after files
+ Compact(1, "r", "z");
+ ASSERT_EQ("1,1,1", FilesPerLevel(1));
+
+ // Compaction range overlaps files
+ Compact(1, "p", "q");
+ ASSERT_EQ("0,0,1", FilesPerLevel(1));
+
+ // Populate a different range
+ MakeTables(3, "c", "e", 1);
+ ASSERT_EQ("1,1,2", FilesPerLevel(1));
+
+ // Compact just the new range
+ Compact(1, "b", "f");
+ ASSERT_EQ("0,0,2", FilesPerLevel(1));
+
+ // Compact all
+ MakeTables(1, "a", "z", 1);
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+ CancelAllBackgroundWork(db_);
+ ASSERT_TRUE(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)
+ .IsShutdownInProgress());
+ ASSERT_EQ("1,0,2", FilesPerLevel(1));
+
+ if (iter == 0) {
+ options = CurrentOptions();
+ options.num_levels = 3;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ }
+ }
+}
+
+TEST_F(DBTest, PreShutdownFlush) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "key", "value"));
+ CancelAllBackgroundWork(db_);
+ Status s =
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
+ ASSERT_TRUE(s.IsShutdownInProgress());
+}
+
+TEST_P(DBTestWithParam, PreShutdownMultipleCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 40;
+ const int kNumL0Files = 4;
+
+ const int kHighPriCount = 3;
+ const int kLowPriCount = 5;
+ env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * kNumL0Files;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_background_compactions = kLowPriCount;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.max_subcompactions = max_subcompactions_;
+
+ TryReopen(options);
+ Random rnd(301);
+
+ std::vector<ThreadStatus> thread_list;
+ // Delay both flush and compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"FlushJob::FlushJob()", "CompactionJob::Run():Start"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownMultipleCompaction:Preshutdown"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownMultipleCompaction:VerifyCompaction"},
+ {"DBTest::PreShutdownMultipleCompaction:Preshutdown",
+ "CompactionJob::Run():End"},
+ {"CompactionJob::Run():End",
+ "DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown"}});
+
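+  // The dependencies ensure that at least one compaction has started, and
+  // cannot finish, before the test reaches the Preshutdown marker below.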
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Make rocksdb busy
+ int key = 0;
+ // check how many threads are doing compaction using GetThreadList
+ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+ for (int file = 0; file < 16 * kNumL0Files; ++file) {
+ for (int k = 0; k < kEntriesPerBuffer; ++k) {
+ ASSERT_OK(Put(std::to_string(key++), rnd.RandomString(kTestValueSize)));
+ }
+
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+
+ // Speed up the test
+ if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+ operation_count[ThreadStatus::OP_COMPACTION] >
+ 0.6 * options.max_background_compactions) {
+ break;
+ }
+ if (file == 15 * kNumL0Files) {
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+ }
+ }
+
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:Preshutdown");
+ ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+ CancelAllBackgroundWork(db_);
+ TEST_SYNC_POINT("DBTest::PreShutdownMultipleCompaction:VerifyPreshutdown");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Count running operations; no compactions should remain after shutdown.
+ for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+ operation_count[i] = 0;
+ }
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+ ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+TEST_P(DBTestWithParam, PreShutdownCompactionMiddle) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 40;
+ const int kNumL0Files = 4;
+
+ const int kHighPriCount = 3;
+ const int kLowPriCount = 5;
+ env_->SetBackgroundThreads(kHighPriCount, Env::HIGH);
+ env_->SetBackgroundThreads(kLowPriCount, Env::LOW);
+
+ Options options;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base =
+ options.target_file_size_base * kNumL0Files;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.env = env_;
+ options.enable_thread_tracking = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.max_bytes_for_level_multiplier = 2;
+ options.max_background_compactions = kLowPriCount;
+ options.level0_stop_writes_trigger = 1 << 10;
+ options.level0_slowdown_writes_trigger = 1 << 10;
+ options.max_subcompactions = max_subcompactions_;
+
+ TryReopen(options);
+ Random rnd(301);
+
+ std::vector<ThreadStatus> thread_list;
+ // Delay both flush and compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBTest::PreShutdownCompactionMiddle:Preshutdown",
+ "CompactionJob::Run():Inprogress"},
+ {"CompactionJob::Run():Start",
+ "DBTest::PreShutdownCompactionMiddle:VerifyCompaction"},
+ {"CompactionJob::Run():Inprogress", "CompactionJob::Run():End"},
+ {"CompactionJob::Run():End",
+ "DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Make rocksdb busy
+ int key = 0;
+ // check how many threads are doing compaction using GetThreadList
+ int operation_count[ThreadStatus::NUM_OP_TYPES] = {0};
+ for (int file = 0; file < 16 * kNumL0Files; ++file) {
+ for (int k = 0; k < kEntriesPerBuffer; ++k) {
+ ASSERT_OK(Put(std::to_string(key++), rnd.RandomString(kTestValueSize)));
+ }
+
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+
+ // Speed up the test
+ if (operation_count[ThreadStatus::OP_FLUSH] > 1 &&
+ operation_count[ThreadStatus::OP_COMPACTION] >
+ 0.6 * options.max_background_compactions) {
+ break;
+ }
+ if (file == 15 * kNumL0Files) {
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyCompaction");
+ }
+ }
+
+ ASSERT_GE(operation_count[ThreadStatus::OP_COMPACTION], 1);
+ CancelAllBackgroundWork(db_);
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:Preshutdown");
+ TEST_SYNC_POINT("DBTest::PreShutdownCompactionMiddle:VerifyPreshutdown");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+  // Re-count operations now that background work has been canceled.
+ for (int i = 0; i < ThreadStatus::NUM_OP_TYPES; ++i) {
+ operation_count[i] = 0;
+ }
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ for (auto thread : thread_list) {
+ operation_count[thread.operation_type]++;
+ }
+ ASSERT_EQ(operation_count[ThreadStatus::OP_COMPACTION], 0);
+}
+
+#endif // ROCKSDB_USING_THREAD_STATUS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, FlushOnDestroy) {
+ WriteOptions wo;
+ wo.disableWAL = true;
+ ASSERT_OK(Put("foo", "v1", wo));
+ CancelAllBackgroundWork(db_);
+}
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ const int kNKeys = 120;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ RandomShuffle(std::begin(keys), std::end(keys));
+
+ Random rnd(301);
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.db_write_buffer_size = 20480;
+ options.write_buffer_size = 20480;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.target_file_size_base = 20480;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 102400;
+ options.max_bytes_for_level_multiplier = 4;
+ options.max_background_compactions = 1;
+ options.num_levels = 5;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kNoCompression;
+ options.compression_per_level[2] = kSnappyCompression;
+
+ OnFileDeletionListener* listener = new OnFileDeletionListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+ // Insert more than 80K. L4 should be base level. Neither L0 nor L4 should
+ // be compressed, so total data size should be more than 80K.
+ for (int i = 0; i < 20; i++) {
+ ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+  // Assuming each file's metadata is at least 50 bytes.
+ ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(4), 20U * 4000U + 50U * 4);
+
+ // Insert 400KB. Some data will be compressed
+ for (int i = 21; i < 120; i++) {
+ ASSERT_OK(Put(Key(keys[i]), CompressibleString(&rnd, 4000)));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+
+ ASSERT_LT(SizeAtLevel(0) + SizeAtLevel(3) + SizeAtLevel(4),
+ 120U * 4000U + 50U * 24);
+  // Make sure data in the L3 files is not compacted, by removing all files in
+  // L4 and counting the number of remaining rows.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "true"},
+ }));
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ for (auto file : cf_meta.levels[4].files) {
+ listener->SetExpectedFileName(dbname_ + file.name);
+ ASSERT_OK(dbfull()->DeleteFile(file.name));
+ }
+ listener->VerifyMatchedCount(cf_meta.levels[4].files.size());
+
+ int num_keys = 0;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ num_keys++;
+ }
+ ASSERT_OK(iter->status());
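+  // After deleting the L4 files, the remaining keys live in L0 and L3, where
+  // data is stored uncompressed in this test, so the total size should exceed
+  // the raw ~4000-byte values plus a small per-key allowance.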
+ ASSERT_GT(SizeAtLevel(0) + SizeAtLevel(3), num_keys * 4000U + num_keys * 10U);
+}
+
+TEST_F(DBTest, DynamicLevelCompressionPerLevel2) {
+ if (!Snappy_Supported() || !LZ4_Supported() || !Zlib_Supported()) {
+ return;
+ }
+ const int kNKeys = 500;
+ int keys[kNKeys];
+ for (int i = 0; i < kNKeys; i++) {
+ keys[i] = i;
+ }
+ RandomShuffle(std::begin(keys), std::end(keys));
+
+ Random rnd(301);
+ Options options;
+ options.create_if_missing = true;
+ options.db_write_buffer_size = 6000000;
+ options.write_buffer_size = 600000;
+ options.max_write_buffer_number = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 2;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.target_file_size_base = 20;
+ options.env = env_;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.max_bytes_for_level_base = 200;
+ options.max_bytes_for_level_multiplier = 8;
+ options.max_background_compactions = 1;
+ options.num_levels = 5;
+ std::shared_ptr<mock::MockTableFactory> mtf(new mock::MockTableFactory);
+ options.table_factory = mtf;
+
+ options.compression_per_level.resize(3);
+ options.compression_per_level[0] = kNoCompression;
+ options.compression_per_level[1] = kLZ4Compression;
+ options.compression_per_level[2] = kZlibCompression;
+
+ DestroyAndReopen(options);
+ // When base level is L4, L4 is LZ4.
+ std::atomic<int> num_zlib(0);
+ std::atomic<int> num_lz4(0);
+ std::atomic<int> num_no(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ if (compaction->output_level() == 4) {
+ ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+ num_lz4.fetch_add(1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+ auto* compression = reinterpret_cast<CompressionType*>(arg);
+ ASSERT_TRUE(*compression == kNoCompression);
+ num_no.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 0; i < 100; i++) {
+ std::string value = rnd.RandomString(200);
+ ASSERT_OK(Put(Key(keys[i]), value));
+ if (i % 25 == 24) {
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ }
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(3), 0);
+ ASSERT_GT(NumTableFilesAtLevel(4), 0);
+ ASSERT_GT(num_no.load(), 2);
+ ASSERT_GT(num_lz4.load(), 0);
+ int prev_num_files_l4 = NumTableFilesAtLevel(4);
+
+ // After base level turn L4->L3, L3 becomes LZ4 and L4 becomes Zlib
+ num_lz4.store(0);
+ num_no.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ if (compaction->output_level() == 4 && compaction->start_level() == 3) {
+ ASSERT_TRUE(compaction->output_compression() == kZlibCompression);
+ num_zlib.fetch_add(1);
+ } else {
+ ASSERT_TRUE(compaction->output_compression() == kLZ4Compression);
+ num_lz4.fetch_add(1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FlushJob::WriteLevel0Table:output_compression", [&](void* arg) {
+ auto* compression = reinterpret_cast<CompressionType*>(arg);
+ ASSERT_TRUE(*compression == kNoCompression);
+ num_no.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 101; i < 500; i++) {
+ std::string value = rnd.RandomString(200);
+ ASSERT_OK(Put(Key(keys[i]), value));
+ if (i % 100 == 99) {
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2), 0);
+ ASSERT_GT(NumTableFilesAtLevel(3), 0);
+ ASSERT_GT(NumTableFilesAtLevel(4), prev_num_files_l4);
+ ASSERT_GT(num_no.load(), 2);
+ ASSERT_GT(num_lz4.load(), 0);
+ ASSERT_GT(num_zlib.load(), 0);
+}
+
+TEST_F(DBTest, DynamicCompactionOptions) {
+ // minimum write buffer size is enforced at 64KB
+ const uint64_t k32KB = 1 << 15;
+ const uint64_t k64KB = 1 << 16;
+ const uint64_t k128KB = 1 << 17;
+ const uint64_t k1MB = 1 << 20;
+ const uint64_t k4KB = 1 << 12;
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.compression = kNoCompression;
+ options.soft_pending_compaction_bytes_limit = 1024 * 1024;
+ options.write_buffer_size = k64KB;
+ options.arena_block_size = 4 * k4KB;
+ options.max_write_buffer_number = 2;
+ // Compaction related options
+ options.level0_file_num_compaction_trigger = 3;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 8;
+ options.target_file_size_base = k64KB;
+ options.max_compaction_bytes = options.target_file_size_base * 10;
+ options.target_file_size_multiplier = 1;
+ options.max_bytes_for_level_base = k128KB;
+ options.max_bytes_for_level_multiplier = 4;
+
+ // Block flush thread and disable compaction thread
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ DestroyAndReopen(options);
+
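+  // Helper: writes `size` entries of ~1KB each at keys start, start+stride,
+  // start+2*stride, ..., then waits for the memtable flush; each call is
+  // expected to add roughly one `size`-KB file to L0.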
+ auto gen_l0_kb = [this](int start, int size, int stride) {
+ Random rnd(301);
+ for (int i = 0; i < size; i++) {
+ ASSERT_OK(Put(Key(start + stride * i), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ };
+
+ // Write 3 files that have the same key range.
+ // Since level0_file_num_compaction_trigger is 3, compaction should be
+ // triggered. The compaction should result in one L1 file
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 1);
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+ gen_l0_kb(0, 64, 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,1", FilesPerLevel());
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(1U, metadata.size());
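+  // The single compacted L1 file should be close to target_file_size_base
+  // (64KB); the checks below allow about 4KB of slack.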
+ ASSERT_LE(metadata[0].size, k64KB + k4KB);
+ ASSERT_GE(metadata[0].size, k64KB - k4KB);
+
+ // Test compaction trigger and target_file_size_base
+  // Reduce the compaction trigger to 2, and reduce the L1 file size to 32KB.
+  // Writing two 64KB L0 files should trigger a compaction. Since these
+  // 2 L0 files have the same key range, the compaction merges them and should
+  // result in 2 32KB L1 files.
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+ {"target_file_size_base", std::to_string(k32KB)}}));
+
+ gen_l0_kb(0, 64, 1);
+ ASSERT_EQ("1,1", FilesPerLevel());
+ gen_l0_kb(0, 64, 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("0,2", FilesPerLevel());
+ metadata.clear();
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(2U, metadata.size());
+ ASSERT_LE(metadata[0].size, k32KB + k4KB);
+ ASSERT_GE(metadata[0].size, k32KB - k4KB);
+ ASSERT_LE(metadata[1].size, k32KB + k4KB);
+ ASSERT_GE(metadata[1].size, k32KB - k4KB);
+
+ // Test max_bytes_for_level_base
+  // Increase the level base size to 1MB and write enough data to fill L1 and
+  // L2. L1 size should be around 1MB while L2 size should be around 1MB x 4.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"max_bytes_for_level_base", std::to_string(k1MB)}}));
+
+ // writing 96 x 64KB => 6 * 1024KB
+ // (L1 + L2) = (1 + 4) * 1024KB
+ for (int i = 0; i < 96; ++i) {
+ gen_l0_kb(i, 64, 96);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GT(SizeAtLevel(1), k1MB / 2);
+ ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2);
+
+ // Within (0.5, 1.5) of 4MB.
+ ASSERT_GT(SizeAtLevel(2), 2 * k1MB);
+ ASSERT_LT(SizeAtLevel(2), 6 * k1MB);
+
+ // Test max_bytes_for_level_multiplier and
+  // max_bytes_for_level_base. Now reduce both the multiplier and the level
+  // base. After filling enough data to fit in L1 - L3, we should see L1's
+  // size drop to around 128KB from the ~1MB asserted previously; same for L2.
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"max_bytes_for_level_multiplier", "2"},
+ {"max_bytes_for_level_base", std::to_string(k128KB)}}));
+
+ // writing 20 x 64KB = 10 x 128KB
+ // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB
+ for (int i = 0; i < 20; ++i) {
+ gen_l0_kb(i, 64, 32);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ uint64_t total_size = SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3);
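+  // The combined L1-L3 target is (1 + 2 + 4) * 128KB = 7 * 128KB; the 1.5
+  // factor leaves some headroom.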
+ ASSERT_TRUE(total_size < k128KB * 7 * 1.5);
+
+ // Test level0_stop_writes_trigger.
+  // Clean up the memtable and L0. Block the compaction threads. If we keep
+  // writing and flushing memtables, puts should stop after 8 memtable
+  // flushes, since level0_stop_writes_trigger = 8.
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Block compaction
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ int count = 0;
+ Random rnd(301);
+ WriteOptions wo;
+ while (count < 64) {
+ ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ count++;
+ if (dbfull()->TEST_write_controler().IsStopped()) {
+ sleeping_task_low.WakeUp();
+ break;
+ }
+ }
+ // Stop trigger = 8
+ ASSERT_EQ(count, 8);
+ // Unblock
+ sleeping_task_low.WaitUntilDone();
+
+  // Now reduce level0_stop_writes_trigger to 6. Clean up memtables and L0.
+  // Block the compaction thread again and keep doing puts and memtable
+  // flushes until we see the stop, after 6 memtable flushes.
+ ASSERT_OK(dbfull()->SetOptions({{"level0_stop_writes_trigger", "6"}}));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ // Block compaction again
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+ count = 0;
+ while (count < 64) {
+ ASSERT_OK(Put(Key(count), rnd.RandomString(1024), wo));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ count++;
+ if (dbfull()->TEST_write_controler().IsStopped()) {
+ sleeping_task_low.WakeUp();
+ break;
+ }
+ }
+ ASSERT_EQ(count, 6);
+ // Unblock
+ sleeping_task_low.WaitUntilDone();
+
+ // Test disable_auto_compactions
+  // The compaction thread is unblocked but auto compaction is disabled. Write
+  // 4 L0 files, which would normally trigger a compaction. Since auto
+  // compaction is disabled, TEST_WaitForCompact has nothing to wait for, and
+  // the number of L0 files does not change after the call.
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "true"}}));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+ // Wait for compaction so that put won't stop
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumTableFilesAtLevel(0), 4);
+
+  // Enable auto compaction and repeat the same test; the number of L0 files
+  // should be reduced after compaction.
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+
+ for (int i = 0; i < 4; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+ // Wait for compaction so that put won't stop
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_LT(NumTableFilesAtLevel(0), 4);
+}
+
+// Test dynamic FIFO compaction options.
+// This test covers just option parsing and makes sure that the options are
+// correctly assigned. Also look at the DBOptionsTest.SetFIFOCompactionOptions
+// test, which makes sure that the FIFO compaction functionality works as
+// expected when the options are changed dynamically.
+// Even more FIFOCompactionTests are at DBTest.FIFOCompaction* .
+TEST_F(DBTest, DynamicFIFOCompactionOptions) {
+ Options options;
+ options.ttl = 0;
+ options.create_if_missing = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+
+ // Initial defaults
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 1024 * 1024 * 1024);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=23;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 0);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "97"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 97);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "203"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{allow_compaction=true;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 23);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo", "{max_table_files_size=31;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 31);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 203);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_fifo",
+ "{max_table_files_size=51;allow_compaction=true;}"}}));
+ ASSERT_OK(dbfull()->SetOptions({{"ttl", "49"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.max_table_files_size,
+ 51);
+ ASSERT_EQ(dbfull()->GetOptions().ttl, 49);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_fifo.allow_compaction,
+ true);
+}
+
+TEST_F(DBTest, DynamicUniversalCompactionOptions) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ DestroyAndReopen(options);
+
+ // Initial defaults
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 1U);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_universal", "{size_ratio=7;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ {{"compaction_options_universal", "{min_merge_width=11;}"}}));
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.size_ratio, 7u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.min_merge_width,
+ 11u);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.max_merge_width,
+ UINT_MAX);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.max_size_amplification_percent,
+ 200u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions()
+ .compaction_options_universal.compression_size_percent,
+ -1);
+ ASSERT_EQ(dbfull()->GetOptions().compaction_options_universal.stop_style,
+ kCompactionStopStyleTotalSize);
+ ASSERT_EQ(
+ dbfull()->GetOptions().compaction_options_universal.allow_trivial_move,
+ false);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, FileCreationRandomFailure) {
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.target_file_size_base = 200000;
+ options.max_bytes_for_level_base = 1000000;
+ options.max_bytes_for_level_multiplier = 2;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ constexpr int kCDTKeysPerBuffer = 4;
+ constexpr int kTestSize = kCDTKeysPerBuffer * 4096;
+ constexpr int kTotalIteration = 20;
+  // The second half of the test involves random failures of file creation.
+ constexpr int kRandomFailureTest = kTotalIteration / 2;
+
+ std::vector<std::string> values;
+ for (int i = 0; i < kTestSize; ++i) {
+ values.push_back("NOT_FOUND");
+ }
+ for (int j = 0; j < kTotalIteration; ++j) {
+ if (j == kRandomFailureTest) {
+ env_->non_writeable_rate_.store(90);
+ }
+ for (int k = 0; k < kTestSize; ++k) {
+      // Here we expect some of the Puts to fail.
+ std::string value = rnd.RandomString(100);
+ Status s = Put(Key(k), Slice(value));
+ if (s.ok()) {
+ // update the latest successful put
+ values[k] = value;
+ }
+      // But everything before the simulated failures begin should succeed.
+ if (j < kRandomFailureTest) {
+ ASSERT_OK(s);
+ }
+ }
+ }
+
+  // If RocksDB does not handle the failures correctly, an internal assert
+  // will fail here.
+ ASSERT_TRUE(dbfull()->TEST_WaitForFlushMemTable().IsIOError());
+ ASSERT_TRUE(dbfull()->TEST_WaitForCompact().IsIOError());
+
+ // verify we have the latest successful update
+ for (int k = 0; k < kTestSize; ++k) {
+ auto v = Get(Key(k));
+ ASSERT_EQ(v, values[k]);
+ }
+
+ // reopen and reverify we have the latest successful update
+ env_->non_writeable_rate_.store(0);
+ Reopen(options);
+ for (int k = 0; k < kTestSize; ++k) {
+ auto v = Get(Key(k));
+ ASSERT_EQ(v, values[k]);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, DynamicMiscOptions) {
+ // Test max_sequential_skip_in_iterations
+ Options options;
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_sequential_skip_in_iterations = 16;
+ options.compression = kNoCompression;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ auto assert_reseek_count = [this, &options](int key_start, int num_reseek) {
+ int key0 = key_start;
+ int key1 = key_start + 1;
+ int key2 = key_start + 2;
+ Random rnd(301);
+ ASSERT_OK(Put(Key(key0), rnd.RandomString(8)));
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(Put(Key(key1), rnd.RandomString(8)));
+ }
+ ASSERT_OK(Put(Key(key2), rnd.RandomString(8)));
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek(Key(key1));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Key(key1)), 0);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key().compare(Key(key2)), 0);
+ ASSERT_EQ(num_reseek,
+ TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION));
+ };
+ // No reseek
+ assert_reseek_count(100, 0);
+
+ ASSERT_OK(dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "4"}}));
+ // Clear memtable and make new option effective
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+ // Trigger reseek
+ assert_reseek_count(200, 1);
+
+ ASSERT_OK(
+ dbfull()->SetOptions({{"max_sequential_skip_in_iterations", "16"}}));
+ // Clear memtable and make new option effective
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true));
+  // No new reseek; the cumulative ticker count stays at 1
+ assert_reseek_count(300, 1);
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // Test soft_pending_compaction_bytes_limit,
+ // hard_pending_compaction_bytes_limit
+ ASSERT_OK(dbfull()->SetOptions(
+ handles_[1], {{"soft_pending_compaction_bytes_limit", "200"},
+ {"hard_pending_compaction_bytes_limit", "300"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(200, mutable_cf_options.soft_pending_compaction_bytes_limit);
+ ASSERT_EQ(300, mutable_cf_options.hard_pending_compaction_bytes_limit);
+ // Test report_bg_io_stats
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"report_bg_io_stats", "true"}}));
+ // sanity check
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+ // Test compression
+ // sanity check
+ ASSERT_OK(dbfull()->SetOptions({{"compression", "kNoCompression"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+ &mutable_cf_options));
+ ASSERT_EQ(CompressionType::kNoCompression, mutable_cf_options.compression);
+
+ if (Snappy_Supported()) {
+ ASSERT_OK(dbfull()->SetOptions({{"compression", "kSnappyCompression"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[0],
+ &mutable_cf_options));
+ ASSERT_EQ(CompressionType::kSnappyCompression,
+ mutable_cf_options.compression);
+ }
+
+ // Test paranoid_file_checks already done in db_block_cache_test
+ ASSERT_OK(
+ dbfull()->SetOptions(handles_[1], {{"paranoid_file_checks", "true"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_TRUE(mutable_cf_options.report_bg_io_stats);
+ ASSERT_TRUE(mutable_cf_options.check_flush_compaction_key_order);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ handles_[1], {{"check_flush_compaction_key_order", "false"}}));
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_FALSE(mutable_cf_options.check_flush_compaction_key_order);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, L0L1L2AndUpHitCounter) {
+ const int kNumLevels = 3;
+ const int kNumKeysPerLevel = 10000;
+ const int kNumKeysPerDb = kNumLevels * kNumKeysPerLevel;
+
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ Reopen(options);
+
+ // After the below loop there will be one file on each of L0, L1, and L2.
+ int key = 0;
+ for (int output_level = kNumLevels - 1; output_level >= 0; --output_level) {
+ for (int i = 0; i < kNumKeysPerLevel; ++i) {
+ ASSERT_OK(Put(Key(key), "val"));
+ key++;
+ }
+ ASSERT_OK(Flush());
+ for (int input_level = 0; input_level < output_level; ++input_level) {
+ // `TEST_CompactRange(input_level, ...)` compacts from `input_level` to
+ // `input_level + 1`.
+ ASSERT_OK(dbfull()->TEST_CompactRange(input_level, nullptr, nullptr));
+ }
+ }
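+  // Each iteration above wrote a disjoint key range and compacted it down to
+  // `output_level`, starting with the deepest level, so L0, L1, and L2 each
+  // hold one batch of keys and every Get below hits exactly one level.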
+ assert(key == kNumKeysPerDb);
+
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1));
+ ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+ for (int i = 0; i < kNumKeysPerDb; i++) {
+ ASSERT_EQ(Get(Key(i)), "val");
+ }
+
+ ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L0));
+ ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L1));
+ ASSERT_EQ(kNumKeysPerLevel, TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+
+ ASSERT_EQ(kNumKeysPerDb, TestGetTickerCount(options, GET_HIT_L0) +
+ TestGetTickerCount(options, GET_HIT_L1) +
+ TestGetTickerCount(options, GET_HIT_L2_AND_UP));
+}
+
+TEST_F(DBTest, EncodeDecompressedBlockSizeTest) {
+ // iter 0 -- zlib
+ // iter 1 -- bzip2
+ // iter 2 -- lz4
+ // iter 3 -- lz4HC
+ // iter 4 -- xpress
+ CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+ kLZ4Compression, kLZ4HCCompression,
+ kXpressCompression};
+ for (auto comp : compressions) {
+ if (!CompressionTypeSupported(comp)) {
+ continue;
+ }
+ // first_table_version 1 -- generate with table_version == 1, read with
+ // table_version == 2
+ // first_table_version 2 -- generate with table_version == 2, read with
+ // table_version == 1
+ for (int first_table_version = 1; first_table_version <= 2;
+ ++first_table_version) {
+ BlockBasedTableOptions table_options;
+ table_options.format_version = first_table_version;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.create_if_missing = true;
+ options.compression = comp;
+ DestroyAndReopen(options);
+
+ int kNumKeysWritten = 1000;
+
+ Random rnd(301);
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ // compressible string
+ ASSERT_OK(Put(Key(i), rnd.RandomString(128) + std::string(128, 'a')));
+ }
+
+ table_options.format_version = first_table_version == 1 ? 2 : 1;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
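+      // Reads should still succeed after reopening with the other
+      // format_version; only the deterministic 'a' suffix is checked because
+      // the 128-byte prefix is random.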
+ for (int i = 0; i < kNumKeysWritten; ++i) {
+ auto r = Get(Key(i));
+ ASSERT_EQ(r.substr(128), std::string(128, 'a'));
+ }
+ }
+ }
+}
+
+TEST_F(DBTest, CloseSpeedup) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 4;
+ options.max_bytes_for_level_base = 400 * 1024;
+ options.max_write_buffer_number = 16;
+
+ // Block background threads
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+  // On Windows, the LOCK file cannot be deleted because it is locked by
+  // db_test; after db_test closes the DB, the LOCK file is unlocked and can
+  // be deleted.
+  // Delete archival files.
+ bool deleteDir = true;
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ Status s = env_->DeleteFile(dbname_ + "/" + filenames[i]);
+ if (!s.ok()) {
+ deleteDir = false;
+ }
+ }
+ if (deleteDir) {
+ ASSERT_OK(env_->DeleteDir(dbname_));
+ }
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to level 2
+ // After that, (100K, 200K)
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ }
+
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ Close();
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // Unblock background threads
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ Destroy(options);
+}
+
+class DelayedMergeOperator : public MergeOperator {
+ private:
+ DBTest* db_test_;
+
+ public:
+ explicit DelayedMergeOperator(DBTest* d) : db_test_(d) {}
+
+ bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ db_test_->env_->MockSleepForMicroseconds(1000 *
+ merge_in.operand_list.size());
+ merge_out->new_value = "";
+ return true;
+ }
+
+ const char* Name() const override { return "DelayedMergeOperator"; }
+};
+
+TEST_F(DBTest, MergeTestTime) {
+ std::string one, two, three;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ PutFixed64(&three, 3);
+
+ // Enable time profiling
+ SetPerfLevel(kEnableTime);
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator.reset(new DelayedMergeOperator(this));
+ SetTimeElapseOnlySleepOnReopen(&options);
+ DestroyAndReopen(options);
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ ASSERT_EQ(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0);
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", one));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", two));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", three));
+ ASSERT_OK(Flush());
+
+ ReadOptions opt;
+ opt.verify_checksums = true;
+ opt.snapshot = nullptr;
+ std::string result;
+ ASSERT_OK(db_->Get(opt, "foo", &result));
+
+ ASSERT_EQ(2000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+
+ ReadOptions read_options;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+ int count = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ ++count;
+ }
+
+ ASSERT_EQ(1, count);
+ ASSERT_EQ(4000000, TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ ASSERT_GT(TestGetTickerCount(options, FLUSH_WRITE_BYTES), 0);
+#endif // ROCKSDB_USING_THREAD_STATUS
+}
+
+#ifndef ROCKSDB_LITE
+TEST_P(DBTestWithParam, MergeCompactionTimeTest) {
+ SetPerfLevel(kEnableTime);
+ Options options = CurrentOptions();
+ options.compaction_filter_factory = std::make_shared<KeepFilterFactory>();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.merge_operator.reset(new DelayedMergeOperator(this));
+ options.disable_auto_compactions = true;
+ options.max_subcompactions = max_subcompactions_;
+ SetTimeElapseOnlySleepOnReopen(&options);
+ DestroyAndReopen(options);
+
+ constexpr unsigned n = 1000;
+ for (unsigned i = 0; i < n; i++) {
+ ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_EQ(uint64_t{n} * 1000000U,
+ TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME));
+}
+
+TEST_P(DBTestWithParam, FilterCompactionTimeTest) {
+ Options options = CurrentOptions();
+ options.compaction_filter_factory =
+ std::make_shared<DelayFilterFactory>(this);
+ options.disable_auto_compactions = true;
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(kExceptTimeForMutex);
+ options.max_subcompactions = max_subcompactions_;
+ SetTimeElapseOnlySleepOnReopen(&options);
+ DestroyAndReopen(options);
+
+ unsigned n = 0;
+ // put some data
+ for (int table = 0; table < 4; ++table) {
+ for (int i = 0; i < 10 + table; ++i) {
+ ASSERT_OK(Put(std::to_string(table * 100 + i), "val"));
+ ++n;
+ }
+ ASSERT_OK(Flush());
+ }
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ(0U, CountLiveFiles());
+
+ Reopen(options);
+
+ Iterator* itr = db_->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ ASSERT_OK(itr->status());
+ ASSERT_EQ(uint64_t{n} * 1000000U,
+ TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME));
+ delete itr;
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, TestLogCleanup) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 64 * 1024; // very small
+ // only two memtables allowed ==> only two log files
+ options.max_write_buffer_number = 2;
+ Reopen(options);
+
+ for (int i = 0; i < 100000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+    // Only 2 memtables will be alive at a time, so the number of logs to free
+    // should never exceed 2.
+ ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast<size_t>(3));
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, EmptyCompactedDB) {
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ Status s = Put("new", "value");
+ ASSERT_TRUE(s.IsNotSupported());
+ Close();
+}
+#endif // ROCKSDB_LITE
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, SuggestCompactRangeTest) {
+ class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
+ public:
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ saved_context = context;
+ std::unique_ptr<CompactionFilter> empty_filter;
+ return empty_filter;
+ }
+ const char* Name() const override {
+ return "CompactionFilterFactoryGetContext";
+ }
+ static bool IsManual(CompactionFilterFactory* compaction_filter_factory) {
+ return reinterpret_cast<CompactionFilterFactoryGetContext*>(
+ compaction_filter_factory)
+ ->saved_context.is_manual_compaction;
+ }
+ CompactionFilter::Context saved_context;
+ };
+
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+ options.compaction_style = kCompactionStyleLevel;
+ options.compaction_filter_factory.reset(
+ new CompactionFilterFactoryGetContext());
+ options.write_buffer_size = 200 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_compaction_bytes = static_cast<uint64_t>(1) << 60; // inf
+
+ Reopen(options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < 10; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+ options.compaction_filter_factory.get()));
+
+  // make sure either L0 or L1 has a file
+ while (NumTableFilesAtLevel(0) == 0 && NumTableFilesAtLevel(1) == 0) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ // compact it three times
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(experimental::SuggestCompactRange(db_, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ // All files are compacted
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // nonoverlapping with the file on level 0
+ Slice start("a"), end("b");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // should not compact the level 0 file
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ start = Slice("j");
+ end = Slice("m");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // SuggestCompactRange() is not going to be reported as manual compaction
+ ASSERT_TRUE(!CompactionFilterFactoryGetContext::IsManual(
+ options.compaction_filter_factory.get()));
+
+  // Now it should compact the level 0 file; since that is a trivial move to
+  // L1, it triggers another compaction down to L2.
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBTest, SuggestCompactRangeUniversal) {
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 200 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_compaction_bytes = static_cast<uint64_t>(1) << 60; // inf
+
+ Reopen(options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < 10; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ ASSERT_EQ("1,2,3,4", FilesPerLevel());
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(
+ db_->SuggestCompactRange(db_->DefaultColumnFamily(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ // All files are compacted
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_EQ(0, NumTableFilesAtLevel(2));
+
+ GenerateNewRandomFile(&rnd);
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ // nonoverlapping with the file on level 0
+ Slice start("a"), end("b");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // should not compact the level 0 file
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+ start = Slice("j");
+ end = Slice("m");
+ ASSERT_OK(experimental::SuggestCompactRange(db_, &start, &end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // now it should compact the level 0 file to the last level
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+}
+
+TEST_F(DBTest, PromoteL0) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ DestroyAndReopen(options);
+
+  // non-overlapping ranges
+ std::vector<std::pair<int32_t, int32_t>> ranges = {
+ {81, 160}, {0, 80}, {161, 240}, {241, 320}};
+
+ int32_t value_size = 10 * 1024; // 10 KB
+
+ Random rnd(301);
+ std::map<int32_t, std::string> values;
+ for (const auto& range : ranges) {
+ for (int32_t j = range.first; j < range.second; j++) {
+ values[j] = rnd.RandomString(value_size);
+ ASSERT_OK(Put(Key(j), values[j]));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ int32_t level0_files = NumTableFilesAtLevel(0, 0);
+ ASSERT_EQ(level0_files, ranges.size());
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 0); // No files in L1
+
+ // Promote L0 level to L2.
+ ASSERT_OK(experimental::PromoteL0(db_, db_->DefaultColumnFamily(), 2));
+ // We expect that all the files were trivially moved from L0 to L2
+ ASSERT_EQ(NumTableFilesAtLevel(0, 0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(2, 0), level0_files);
+
+ for (const auto& kv : values) {
+ ASSERT_EQ(Get(Key(kv.first)), kv.second);
+ }
+}
+
+TEST_F(DBTest, PromoteL0Failure) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 10 * 1024 * 1024;
+ DestroyAndReopen(options);
+
+ // Produce two L0 files with overlapping ranges.
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_OK(Put(Key(3), ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), ""));
+ ASSERT_OK(Flush());
+
+ Status status;
+ // Fails because L0 has overlapping files.
+ status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // Now there is a file in L1.
+ ASSERT_GE(NumTableFilesAtLevel(1, 0), 1);
+
+ ASSERT_OK(Put(Key(5), ""));
+ ASSERT_OK(Flush());
+ // Fails because L1 is non-empty.
+ status = experimental::PromoteL0(db_, db_->DefaultColumnFamily());
+ ASSERT_TRUE(status.IsInvalidArgument());
+}
+
+// Github issue #596
+TEST_F(DBTest, CompactRangeWithEmptyBottomLevel) {
+ const int kNumLevels = 2;
+ const int kNumL0Files = 2;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < kNumL0Files; ++i) {
+ ASSERT_OK(Put(Key(0), rnd.RandomString(1024)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(NumTableFilesAtLevel(0), kNumL0Files);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 0);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), kNumL0Files);
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, AutomaticConflictsWithManualCompaction) {
+ const int kNumL0Files = 50;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ // never slowdown / stop
+ options.level0_slowdown_writes_trigger = 999999;
+ options.level0_stop_writes_trigger = 999999;
+ options.max_background_compactions = 10;
+ DestroyAndReopen(options);
+
+ // schedule automatic compactions after the manual one starts, but before it
+ // finishes to ensure conflict.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCompaction:Start",
+ "DBTest::AutomaticConflictsWithManualCompaction:PrePuts"},
+ {"DBTest::AutomaticConflictsWithManualCompaction:PostPuts",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"}});
+ std::atomic<int> callback_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::MaybeScheduleFlushOrCompaction:Conflict",
+ [&](void* /*arg*/) { callback_count.fetch_add(1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ port::Thread manual_compaction_thread([this]() {
+ CompactRangeOptions croptions;
+ croptions.exclusive_manual_compaction = true;
+ ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PrePuts");
+ for (int i = 0; i < kNumL0Files; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ TEST_SYNC_POINT("DBTest::AutomaticConflictsWithManualCompaction:PostPuts");
+
+ ASSERT_GE(callback_count.load(), 1);
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_NE("NOT_FOUND", Get(Key(i)));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ manual_compaction_thread.join();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, CompactFilesShouldTriggerAutoCompaction) {
+ Options options = CurrentOptions();
+ options.max_background_compactions = 1;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 36;
+ options.level0_stop_writes_trigger = 36;
+ DestroyAndReopen(options);
+
+ // generate files for manual compaction
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ // put two keys to ensure no trivial move
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ROCKSDB_NAMESPACE::ColumnFamilyMetaData cf_meta_data;
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+
+ std::vector<std::string> input_files;
+ input_files.push_back(cf_meta_data.levels[0].files[0].name);
+
+ SyncPoint::GetInstance()->LoadDependency({
+ {"CompactFilesImpl:0",
+ "DBTest::CompactFilesShouldTriggerAutoCompaction:Begin"},
+ {"DBTest::CompactFilesShouldTriggerAutoCompaction:End",
+ "CompactFilesImpl:1"},
+ });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread manual_compaction_thread([&]() {
+ auto s = db_->CompactFiles(CompactionOptions(), db_->DefaultColumnFamily(),
+ input_files, 0);
+ ASSERT_OK(s);
+ });
+
+ TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:Begin");
+ // generate enough files to trigger compaction
+ for (int i = 0; i < 20; ++i) {
+ for (int j = 0; j < 2; ++j) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(1024)));
+ }
+ ASSERT_OK(Flush());
+ }
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+ ASSERT_GT(cf_meta_data.levels[0].files.size(),
+ options.level0_file_num_compaction_trigger);
+ TEST_SYNC_POINT("DBTest::CompactFilesShouldTriggerAutoCompaction:End");
+
+ manual_compaction_thread.join();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ db_->GetColumnFamilyMetaData(db_->DefaultColumnFamily(), &cf_meta_data);
+ ASSERT_LE(cf_meta_data.levels[0].files.size(),
+ options.level0_file_num_compaction_trigger);
+}
+#endif // ROCKSDB_LITE
+
+// Github issue #595
+// Large write batch with column families
+TEST_F(DBTest, LargeBatchWithColumnFamilies) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ CreateAndReopenWithCF({"pikachu"}, options);
+ int64_t j = 0;
+ for (int i = 0; i < 5; i++) {
+ for (int pass = 1; pass <= 3; pass++) {
+ WriteBatch batch;
+ size_t write_size = 1024 * 1024 * (5 + i);
+ fprintf(stderr, "prepare: %" ROCKSDB_PRIszt " MB, pass:%d\n",
+ (write_size / 1024 / 1024), pass);
+ for (;;) {
+ std::string data(3000, j++ % 127 + 20);
+ data += std::to_string(j);
+ ASSERT_OK(batch.Put(handles_[0], Slice(data), Slice(data)));
+ if (batch.GetDataSize() > write_size) {
+ break;
+ }
+ }
+ fprintf(stderr, "write: %" ROCKSDB_PRIszt " MB\n",
+ (batch.GetDataSize() / 1024 / 1024));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ fprintf(stderr, "done\n");
+ }
+ }
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+// Make sure that Flushes can proceed in parallel with CompactRange()
+TEST_F(DBTest, FlushesInParallelWithCompactRange) {
+ // iter == 0 -- leveled
+ // iter == 1 -- leveled, but throw in a flush between two levels compacting
+ // iter == 2 -- universal
+ for (int iter = 0; iter < 3; ++iter) {
+ Options options = CurrentOptions();
+ if (iter < 2) {
+ options.compaction_style = kCompactionStyleLevel;
+ } else {
+ options.compaction_style = kCompactionStyleUniversal;
+ }
+ options.write_buffer_size = 110 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 98 << 10;
+ options.max_write_buffer_number = 2;
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int num = 0; num < 14; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+
+ if (iter == 1) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::RunManualCompaction()::1",
+ "DBTest::FlushesInParallelWithCompactRange:1"},
+ {"DBTest::FlushesInParallelWithCompactRange:2",
+ "DBImpl::RunManualCompaction()::2"}});
+ } else {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactionJob::Run():Start",
+ "DBTest::FlushesInParallelWithCompactRange:1"},
+ {"DBTest::FlushesInParallelWithCompactRange:2",
+ "CompactionJob::Run():End"}});
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<port::Thread> threads;
+ threads.emplace_back([&]() { Compact("a", "z"); });
+
+ TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:1");
+
+    // This has to start a flush. If flushes are blocked, this will try to
+    // create 3 memtables, and that will fail because max_write_buffer_number
+    // is 2.
+ for (int num = 0; num < 3; num++) {
+ GenerateNewRandomFile(&rnd, /* nowait */ true);
+ }
+
+ TEST_SYNC_POINT("DBTest::FlushesInParallelWithCompactRange:2");
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_F(DBTest, DelayedWriteRate) {
+ const int kEntriesPerMemTable = 100;
+ const int kTotalFlushes = 12;
+
+ Options options = CurrentOptions();
+ env_->SetBackgroundThreads(1, Env::LOW);
+ options.env = env_;
+ options.write_buffer_size = 100000000;
+ options.max_write_buffer_number = 256;
+ options.max_background_compactions = 1;
+ options.level0_file_num_compaction_trigger = 3;
+ options.level0_slowdown_writes_trigger = 3;
+ options.level0_stop_writes_trigger = 999999;
+  options.delayed_write_rate = 20000000;  // Start with 20MB/s
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kEntriesPerMemTable));
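+  // Note: the special skiplist memtable factory is expected to make each
+  // memtable fill up after kEntriesPerMemTable entries, so flushes happen at
+  // predictable points.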
+
+ SetTimeElapseOnlySleepOnReopen(&options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Block compactions
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put(Key(i), std::string(10000, 'x')));
+ ASSERT_OK(Flush());
+ }
+
+ // These writes will be slowed down to 1KB/s
+ uint64_t estimated_sleep_time = 0;
+ Random rnd(301);
+ ASSERT_OK(Put("", ""));
+ uint64_t cur_rate = options.delayed_write_rate;
+ for (int i = 0; i < kTotalFlushes; i++) {
+ uint64_t size_memtable = 0;
+ for (int j = 0; j < kEntriesPerMemTable; j++) {
+ auto rand_num = rnd.Uniform(20);
+      // Spread the entry sizes over a wider range.
+ size_t entry_size = rand_num * rand_num * rand_num;
+ WriteOptions wo;
+ ASSERT_OK(Put(Key(i), std::string(entry_size, 'x'), wo));
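+      // Track an estimate of bytes added to the memtable; the extra 18 bytes
+      // per entry is a rough allowance for the key and write overhead.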
+ size_memtable += entry_size + 18;
+ // Occasionally sleep a while
+ if (rnd.Uniform(20) == 6) {
+ env_->SleepForMicroseconds(2666);
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
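+    // Writing size_memtable bytes at cur_rate bytes/sec should translate into
+    // roughly size_memtable * 1e6 / cur_rate microseconds of write delay.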
+ estimated_sleep_time += size_memtable * 1000000u / cur_rate;
+    // The write rate is slowed down twice: once for the memtable switch and
+    // once when the flush finishes.
+ cur_rate = static_cast<uint64_t>(static_cast<double>(cur_rate) *
+ kIncSlowdownRatio * kIncSlowdownRatio);
+ }
+  // Check that the total sleep time falls into the estimated rough range.
+ ASSERT_GT(env_->NowMicros(), estimated_sleep_time / 2);
+ ASSERT_LT(env_->NowMicros(), estimated_sleep_time * 2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, HardLimit) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ env_->SetBackgroundThreads(1, Env::LOW);
+ options.max_write_buffer_number = 256;
+ options.write_buffer_size = 110 << 10; // 110KB
+ options.arena_block_size = 4 * 1024;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 999999;
+ options.level0_stop_writes_trigger = 999999;
+ options.hard_pending_compaction_bytes_limit = 800 << 10;
+ options.max_bytes_for_level_base = 10000000000u;
+ options.max_background_compactions = 1;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ std::atomic<int> callback_count(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DelayWrite:Wait", [&](void* /*arg*/) {
+ callback_count.fetch_add(1);
+ sleeping_task_low.WakeUp();
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ int key_idx = 0;
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+
+ ASSERT_EQ(0, callback_count.load());
+
+ for (int num = 0; num < 5; num++) {
+ GenerateNewFile(&rnd, &key_idx, true);
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_GE(callback_count.load(), 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WaitUntilDone();
+}
+
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+class WriteStallListener : public EventListener {
+ public:
+ WriteStallListener() : condition_(WriteStallCondition::kNormal) {}
+ void OnStallConditionsChanged(const WriteStallInfo& info) override {
+ MutexLock l(&mutex_);
+ condition_ = info.condition.cur;
+ }
+ bool CheckCondition(WriteStallCondition expected) {
+ MutexLock l(&mutex_);
+ return expected == condition_;
+ }
+
+ private:
+ port::Mutex mutex_;
+ WriteStallCondition condition_;
+};
+
+TEST_F(DBTest, SoftLimit) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.max_write_buffer_number = 256;
+ options.level0_file_num_compaction_trigger = 1;
+ options.level0_slowdown_writes_trigger = 3;
+ options.level0_stop_writes_trigger = 999999;
+ options.delayed_write_rate = 20000; // About 200KB/s limited rate
+ options.soft_pending_compaction_bytes_limit = 160000;
+ options.target_file_size_base = 99999999; // All into one file
+ options.max_bytes_for_level_base = 50000;
+ options.max_bytes_for_level_multiplier = 10;
+ options.max_background_compactions = 1;
+ options.compression = kNoCompression;
+ WriteStallListener* listener = new WriteStallListener();
+ options.listeners.emplace_back(listener);
+
+ // FlushMemtable with opt.wait=true does not wait for
+ // `OnStallConditionsChanged` to be called. The event listener is triggered
+ // on `JobContext::Clean`, which happens after the flush result is installed.
+ // We use a sync point to build a custom WaitForFlush that waits for the
+ // context cleanup.
+ port::Mutex flush_mutex;
+ port::CondVar flush_cv(&flush_mutex);
+ bool flush_finished = false;
+ auto InstallFlushCallback = [&]() {
+ {
+ MutexLock l(&flush_mutex);
+ flush_finished = false;
+ }
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCallFlush:ContextCleanedUp", [&](void*) {
+ {
+ MutexLock l(&flush_mutex);
+ flush_finished = true;
+ }
+ flush_cv.SignalAll();
+ });
+ };
+ auto WaitForFlush = [&]() {
+ {
+ MutexLock l(&flush_mutex);
+ while (!flush_finished) {
+ flush_cv.Wait();
+ }
+ }
+ SyncPoint::GetInstance()->ClearCallBack(
+ "DBImpl::BackgroundCallFlush:ContextCleanedUp");
+ };
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+
+ // Generating 360KB in Level 3
+ for (int i = 0; i < 72; i++) {
+ ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+ if (i % 10 == 0) {
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ MoveFilesToLevel(3);
+
+ // Generating 360KB in Level 2
+ for (int i = 0; i < 72; i++) {
+ ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+ if (i % 10 == 0) {
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ MoveFilesToLevel(2);
+
+ ASSERT_OK(Put(Key(0), ""));
+
+ test::SleepingBackgroundTask sleeping_task_low;
+ // Block compactions
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+
+ // Create 3 L0 files, making score of L0 to be 3.
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put(Key(i), std::string(5000, 'x')));
+ ASSERT_OK(Put(Key(100 - i), std::string(5000, 'x')));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ WaitForFlush();
+ }
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+ sleeping_task_low.Reset();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Now there is one L1 file, but it doesn't trigger soft_rate_limit
+ //
+ // TODO: soft_rate_limit is deprecated. If this test
+ // relies on soft_rate_limit, then we need to change the test.
+ //
+ // The L1 file size is around 30KB.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+ // Only allow one compaction to go through.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void* /*arg*/) {
+ // Schedule a sleeping task.
+ sleeping_task_low.Reset();
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_low, Env::Priority::LOW);
+ });
+
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+ sleeping_task_low.WaitUntilSleeping();
+ // Create 3 L0 files, making score of L0 to be 3
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put(Key(10 + i), std::string(5000, 'x')));
+ ASSERT_OK(Put(Key(90 - i), std::string(5000, 'x')));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ WaitForFlush();
+ }
+
+ // Wake up the sleeping task so compaction can run, and wait for it to go
+ // back to sleep to make sure one compaction goes through.
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+ // Now there is one L1 file (around 60KB) which exceeds the 50KB base by
+ // 10KB. Given the level multiplier of 10, the estimated pending compaction
+ // is around 100KB, which doesn't trigger soft_pending_compaction_bytes_limit.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+ // Create 3 L0 files, making the score of L0 3, higher than L1's.
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put(Key(20 + i), std::string(5000, 'x')));
+ ASSERT_OK(Put(Key(80 - i), std::string(5000, 'x')));
+ // Flush the file. File size is around 30KB.
+ InstallFlushCallback();
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(true, true));
+ WaitForFlush();
+ }
+ // Wake up the sleeping task so compaction can run, and wait for it to go
+ // back to sleep to make sure one compaction goes through.
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+ // Now there is one L1 file (around 90KB) which exceeds the 50KB base by
+ // 40KB. L2 size is 360KB, so the estimated level fanout is 4 and the
+ // estimated pending compaction is around 200KB, triggering
+ // soft_pending_compaction_bytes_limit.
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
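+
+ // Rough arithmetic behind the two estimates above: with a 50KB
+ // max_bytes_for_level_base, a ~60KB L1 is ~10KB over target, and with a
+ // level multiplier of 10 the estimated pending compaction bytes are about
+ // 10KB * 10 = 100KB, below the 160000-byte soft limit. A ~90KB L1 is ~40KB
+ // over target and the observed L1->L2 fanout is 360KB / 90KB = 4, giving
+ // roughly 40KB * (1 + 4) = 200KB, which exceeds the soft limit and turns
+ // the delay on.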
+
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilSleeping();
+
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kNormal));
+
+ // Shrink the level base so L2 will hit the soft limit more easily.
+ ASSERT_OK(dbfull()->SetOptions({
+ {"max_bytes_for_level_base", "5000"},
+ }));
+
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+ ASSERT_TRUE(listener->CheckCondition(WriteStallCondition::kDelayed));
+
+ sleeping_task_low.WaitUntilSleeping();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+}
+
+TEST_F(DBTest, LastWriteBufferDelay) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.write_buffer_size = 100000;
+ options.max_write_buffer_number = 4;
+ options.delayed_write_rate = 20000;
+ options.compression = kNoCompression;
+ options.disable_auto_compactions = true;
+ int kNumKeysPerMemtable = 3;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerMemtable));
+
+ Reopen(options);
+ test::SleepingBackgroundTask sleeping_task;
+ // Block flushes
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task,
+ Env::Priority::HIGH);
+ sleeping_task.WaitUntilSleeping();
+
+ // Create 3 L0 files, making score of L0 to be 3.
+ for (int i = 0; i < 3; i++) {
+ // Fill one mem table
+ for (int j = 0; j < kNumKeysPerMemtable; j++) {
+ ASSERT_OK(Put(Key(j), ""));
+ }
+ ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay());
+ }
+ // Inserting a new entry would create a new mem table, triggering slow down.
+ ASSERT_OK(Put(Key(0), ""));
+ ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay());
+
+ sleeping_task.WakeUp();
+ sleeping_task.WaitUntilDone();
+}
+#endif // !defined(ROCKSDB_LITE) &&
+ // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+
+TEST_F(DBTest, FailWhenCompressionNotSupportedTest) {
+ CompressionType compressions[] = {kZlibCompression, kBZip2Compression,
+ kLZ4Compression, kLZ4HCCompression,
+ kXpressCompression};
+ for (auto comp : compressions) {
+ if (!CompressionTypeSupported(comp)) {
+ // Not supported, so Open() should fail.
+ Options options = CurrentOptions();
+ options.compression = comp;
+ ASSERT_TRUE(!TryReopen(options).ok());
+ // Check that CreateColumnFamily also fails.
+ options.compression = kNoCompression;
+ ASSERT_OK(TryReopen(options));
+ ColumnFamilyOptions cf_options(options);
+ cf_options.compression = comp;
+ ColumnFamilyHandle* handle;
+ ASSERT_TRUE(!db_->CreateColumnFamily(cf_options, "name", &handle).ok());
+ }
+ }
+}
+
+TEST_F(DBTest, CreateColumnFamilyShouldFailOnIncompatibleOptions) {
+ Options options = CurrentOptions();
+ options.max_open_files = 100;
+ Reopen(options);
+
+ ColumnFamilyOptions cf_options(options);
+ // ttl is now supported even when max_open_files is not -1.
+ cf_options.ttl = 3600;
+ ColumnFamilyHandle* handle;
+ ASSERT_OK(db_->CreateColumnFamily(cf_options, "pikachu", &handle));
+ delete handle;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest, RowCache) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+}
+
+TEST_F(DBTest, PinnableSliceAndRowCache) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+
+ {
+ PinnableSlice pin_slice;
+ ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+ ASSERT_EQ(pin_slice.ToString(), "bar");
+ // The entry is already in the cache; the lookup removes it from the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 0);
+ }
+ // After the PinnableSlice is destroyed, the element is added back to the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+}
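+
+// A minimal sketch (not exercised by the tests here) of the pinning pattern
+// the assertions above rely on: while a PinnableSlice returned by Get() is
+// alive, the row-cache entry it references stays pinned and off the LRU list;
+// Reset() or destruction unpins it. The helper name is illustrative only.
+namespace {
+[[maybe_unused]] void RowCachePinningSketch(DB* db) {
+ PinnableSlice value;
+ Status s = db->Get(ReadOptions(), db->DefaultColumnFamily(), "foo", &value);
+ if (s.ok()) {
+ // While `value` is alive, the cached row remains pinned.
+ value.Reset(); // Unpin explicitly; going out of scope also unpins.
+ }
+}
+} // anonymous namespace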
+
+TEST_F(DBTest, ReusePinnableSlice) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("foo"), "bar");
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+
+ {
+ PinnableSlice pin_slice;
+ ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+ ASSERT_EQ(Get("foo", &pin_slice), Status::OK());
+ ASSERT_EQ(pin_slice.ToString(), "bar");
+
+ // The entry is already in the cache; the lookup removes it from the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 0);
+ }
+ // After the PinnableSlice is destroyed, the element is added back to the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+
+ {
+ std::vector<Slice> multiget_keys;
+ multiget_keys.push_back("foo");
+ std::vector<PinnableSlice> multiget_values(1);
+ std::vector<Status> statuses({Status::NotFound()});
+ ReadOptions ropt;
+ dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
+ multiget_keys.size(), multiget_keys.data(),
+ multiget_values.data(), statuses.data());
+ ASSERT_EQ(Status::OK(), statuses[0]);
+ dbfull()->MultiGet(ropt, dbfull()->DefaultColumnFamily(),
+ multiget_keys.size(), multiget_keys.data(),
+ multiget_values.data(), statuses.data());
+ ASSERT_EQ(Status::OK(), statuses[0]);
+
+ // The entry is already in the cache; the lookup removes it from the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 0);
+ }
+ // After the PinnableSlice is destroyed, the element is added back to the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+
+ {
+ std::vector<ColumnFamilyHandle*> multiget_cfs;
+ multiget_cfs.push_back(dbfull()->DefaultColumnFamily());
+ std::vector<Slice> multiget_keys;
+ multiget_keys.push_back("foo");
+ std::vector<PinnableSlice> multiget_values(1);
+ std::vector<Status> statuses({Status::NotFound()});
+ ReadOptions ropt;
+ dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
+ multiget_keys.data(), multiget_values.data(),
+ statuses.data());
+ ASSERT_EQ(Status::OK(), statuses[0]);
+ dbfull()->MultiGet(ropt, multiget_keys.size(), multiget_cfs.data(),
+ multiget_keys.data(), multiget_values.data(),
+ statuses.data());
+ ASSERT_EQ(Status::OK(), statuses[0]);
+
+ // The entry is already in the cache; the lookup removes it from the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 0);
+ }
+ // After the PinnableSlice is destroyed, the element is added back to the LRU list.
+ ASSERT_EQ(
+ reinterpret_cast<LRUCache*>(options.row_cache.get())->TEST_GetLRUSize(),
+ 1);
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest, DeletingOldWalAfterDrop) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"Test:AllowFlushes", "DBImpl::BGWorkFlush"},
+ {"DBImpl::BGWorkFlush:done", "Test:WaitForFlush"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Options options = CurrentOptions();
+ options.max_total_wal_size = 8192;
+ options.compression = kNoCompression;
+ options.write_buffer_size = 1 << 20;
+ options.level0_file_num_compaction_trigger = (1 << 30);
+ options.level0_slowdown_writes_trigger = (1 << 30);
+ options.level0_stop_writes_trigger = (1 << 30);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateColumnFamilies({"cf1", "cf2"}, options);
+ ASSERT_OK(Put(0, "key1", DummyString(8192)));
+ ASSERT_OK(Put(0, "key2", DummyString(8192)));
+ // the oldest wal should now be getting_flushed
+ ASSERT_OK(db_->DropColumnFamily(handles_[0]));
+ // all flushes should now do nothing because their CF is dropped
+ TEST_SYNC_POINT("Test:AllowFlushes");
+ TEST_SYNC_POINT("Test:WaitForFlush");
+ uint64_t lognum1 = dbfull()->TEST_LogfileNumber();
+ ASSERT_OK(Put(1, "key3", DummyString(8192)));
+ ASSERT_OK(Put(1, "key4", DummyString(8192)));
+ // new wal should have been created
+ uint64_t lognum2 = dbfull()->TEST_LogfileNumber();
+ EXPECT_GT(lognum2, lognum1);
+}
+
+TEST_F(DBTest, UnsupportedManualSync) {
+ DestroyAndReopen(CurrentOptions());
+ env_->is_wal_sync_thread_safe_.store(false);
+ Status s = db_->SyncWAL();
+ ASSERT_TRUE(s.IsNotSupported());
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam,
+ ::testing::Combine(::testing::Values(1, 4),
+ ::testing::Bool()));
+
+TEST_F(DBTest, PauseBackgroundWorkTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000; // Small write buffer
+ Reopen(options);
+
+ std::vector<port::Thread> threads;
+ std::atomic<bool> done(false);
+ ASSERT_OK(db_->PauseBackgroundWork());
+ threads.emplace_back([&]() {
+ Random rnd(301);
+ for (int i = 0; i < 10000; ++i) {
+ ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+ }
+ done.store(true);
+ });
+ env_->SleepForMicroseconds(200000);
+ // make sure the thread is not done
+ ASSERT_FALSE(done.load());
+ ASSERT_OK(db_->ContinueBackgroundWork());
+ for (auto& t : threads) {
+ t.join();
+ }
+ // now it's done
+ ASSERT_TRUE(done.load());
+}
+
+// Keep spawning short-living threads that create an iterator and quit.
+// Meanwhile in another thread keep flushing memtables.
+// This used to cause a deadlock.
+TEST_F(DBTest, ThreadLocalPtrDeadlock) {
+ std::atomic<int> flushes_done{0};
+ std::atomic<int> threads_destroyed{0};
+ auto done = [&] { return flushes_done.load() > 10; };
+
+ port::Thread flushing_thread([&] {
+ for (int i = 0; !done(); ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), Slice("hi"),
+ Slice(std::to_string(i).c_str())));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ int cnt = ++flushes_done;
+ fprintf(stderr, "Flushed %d times\n", cnt);
+ }
+ });
+
+ std::vector<port::Thread> thread_spawning_threads(10);
+ for (auto& t : thread_spawning_threads) {
+ t = port::Thread([&] {
+ while (!done()) {
+ {
+ port::Thread tmp_thread([&] {
+ auto it = db_->NewIterator(ReadOptions());
+ ASSERT_OK(it->status());
+ delete it;
+ });
+ tmp_thread.join();
+ }
+ ++threads_destroyed;
+ }
+ });
+ }
+
+ for (auto& t : thread_spawning_threads) {
+ t.join();
+ }
+ flushing_thread.join();
+ fprintf(stderr, "Done. Flushed %d times, destroyed %d threads\n",
+ flushes_done.load(), threads_destroyed.load());
+}
+
+TEST_F(DBTest, LargeBlockSizeTest) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(0, "foo", "bar"));
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 8LL * 1024 * 1024 * 1024LL;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest, CreationTimeOfOldestFile) {
+ const int kNumKeysPerFile = 32;
+ const int kNumLevelFiles = 2;
+ const int kValueSize = 100;
+
+ Options options = CurrentOptions();
+ options.max_open_files = -1;
+ env_->SetMockSleep();
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ bool set_file_creation_time_to_zero = true;
+ int idx = 0;
+
+ int64_t time_1 = 0;
+ env_->GetCurrentTime(&time_1);
+ const uint64_t uint_time_1 = static_cast<uint64_t>(time_1);
+
+ // Add 50 hours
+ env_->MockSleepForSeconds(50 * 60 * 60);
+
+ int64_t time_2 = 0;
+ env_->GetCurrentTime(&time_2);
+ const uint64_t uint_time_2 = static_cast<uint64_t>(time_2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* arg) {
+ TableProperties* props = reinterpret_cast<TableProperties*>(arg);
+ if (set_file_creation_time_to_zero) {
+ if (idx == 0) {
+ props->file_creation_time = 0;
+ idx++;
+ } else if (idx == 1) {
+ props->file_creation_time = uint_time_1;
+ idx = 0;
+ }
+ } else {
+ if (idx == 0) {
+ props->file_creation_time = uint_time_1;
+ idx++;
+ } else if (idx == 1) {
+ props->file_creation_time = uint_time_2;
+ }
+ }
+ });
+ // Set all file creation times in the manifest to 0.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "FileMetaData::FileMetaData", [&](void* arg) {
+ FileMetaData* meta = static_cast<FileMetaData*>(arg);
+ meta->file_creation_time = 0;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // At this point there should be 2 files, one with file_creation_time = 0 and
+ // the other non-zero. GetCreationTimeOfOldestFile API should return 0.
+ uint64_t creation_time;
+ Status s1 = dbfull()->GetCreationTimeOfOldestFile(&creation_time);
+ ASSERT_EQ(0, creation_time);
+ ASSERT_EQ(s1, Status::OK());
+
+ // Testing with non-zero file creation time.
+ set_file_creation_time_to_zero = false;
+ options = CurrentOptions();
+ options.max_open_files = -1;
+ options.env = env_;
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumLevelFiles; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(
+ Put(Key(i * kNumKeysPerFile + j), rnd.RandomString(kValueSize)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // At this point there should be 2 files with non-zero file creation time.
+ // GetCreationTimeOfOldestFile API should return non-zero value.
+ uint64_t ctime;
+ Status s2 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+ ASSERT_EQ(uint_time_1, ctime);
+ ASSERT_EQ(s2, Status::OK());
+
+ // Testing with max_open_files != -1
+ options = CurrentOptions();
+ options.max_open_files = 10;
+ DestroyAndReopen(options);
+ Status s3 = dbfull()->GetCreationTimeOfOldestFile(&ctime);
+ ASSERT_EQ(s3, Status::NotSupported());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest, MemoryUsageWithMaxWriteBufferSizeToMaintain) {
+ Options options = CurrentOptions();
+ options.max_write_buffer_size_to_maintain = 10000;
+ options.write_buffer_size = 160000;
+ Reopen(options);
+ Random rnd(301);
+ bool memory_limit_exceeded = false;
+
+ ColumnFamilyData* cfd =
+ static_cast<ColumnFamilyHandleImpl*>(db_->DefaultColumnFamily())->cfd();
+
+ for (int i = 0; i < 1000; i++) {
+ std::string value = rnd.RandomString(1000);
+ ASSERT_OK(Put("keykey_" + std::to_string(i), value));
+
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ const uint64_t cur_active_mem = cfd->mem()->ApproximateMemoryUsage();
+ const uint64_t size_all_mem_table =
+ cur_active_mem + cfd->imm()->ApproximateMemoryUsage();
+
+ // Errors out if memory usage keeps increasing beyond the limit.
+ // Once the memory limit is exceeded, memory_limit_exceeded is set, and if
+ // size_all_mem_table doesn't drop back down on the next write the test
+ // errors out (not the expected behaviour). If memory usage drops,
+ // memory_limit_exceeded is reset to false.
+ if ((size_all_mem_table > cur_active_mem) &&
+ (cur_active_mem >=
+ static_cast<uint64_t>(options.max_write_buffer_size_to_maintain)) &&
+ (size_all_mem_table >
+ static_cast<uint64_t>(options.max_write_buffer_size_to_maintain) +
+ options.write_buffer_size)) {
+ ASSERT_FALSE(memory_limit_exceeded);
+ memory_limit_exceeded = true;
+ } else {
+ memory_limit_exceeded = false;
+ }
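+ // With the options in this test the check above flags the case where an
+ // immutable memtable exists, the active memtable alone has reached
+ // max_write_buffer_size_to_maintain (10000 bytes), and the total memtable
+ // footprint exceeds 10000 + 160000 = 170000 bytes; hitting that state on
+ // two consecutive iterations fails the test.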
+ }
+}
+
+TEST_F(DBTest, ShuttingDownNotBlockStalledWrites) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ Random rnd(403);
+
+ for (int i = 0; i < 20; i++) {
+ ASSERT_OK(Put("key_" + std::to_string(i), rnd.RandomString(10)));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(GetSstFileCount(dbname_), 20);
+
+ // We need !disable_auto_compactions for writes to stall, but we also want to
+ // delay compaction so that stalled writes are unblocked due to
+ // kShutdownInProgress. The BG compaction first waits for the sync point
+ // DBTest::ShuttingDownNotBlockStalledWrites, then waits an extra 2 sec to
+ // allow CancelAllBackgroundWork() to set shutting_down_.
+ SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0",
+ [&](void* /* arg */) { env_->SleepForMicroseconds(2 * 1000 * 1000); });
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::DelayWrite:Wait", "DBTest::ShuttingDownNotBlockStalledWrites"},
+ {"DBTest::ShuttingDownNotBlockStalledWrites",
+ "BackgroundCallCompaction:0"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ options.level0_stop_writes_trigger = 20;
+ options.disable_auto_compactions = false;
+ Reopen(options);
+
+ std::thread thd([&]() {
+ Status s = Put("key_" + std::to_string(101), "101");
+ ASSERT_EQ(s.code(), Status::kShutdownInProgress);
+ });
+
+ TEST_SYNC_POINT("DBTest::ShuttingDownNotBlockStalledWrites");
+ CancelAllBackgroundWork(db_, true);
+
+ thd.join();
+}
+#endif
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test2.cc b/src/rocksdb/db/db_test2.cc
new file mode 100644
index 000000000..8adde3680
--- /dev/null
+++ b/src/rocksdb/db/db_test2.cc
@@ -0,0 +1,7652 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <atomic>
+#include <cstdlib>
+#include <functional>
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "db/read_callback.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/experimental.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/persistent_cache.h"
+#include "rocksdb/trace_record.h"
+#include "rocksdb/trace_record_result.h"
+#include "rocksdb/utilities/replayer.h"
+#include "rocksdb/wal_filter.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBTest2 : public DBTestBase {
+ public:
+ DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {}
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, OpenForReadOnly) {
+ DB* db_ptr = nullptr;
+ std::string dbname = test::PerThreadDBPath("db_readonly");
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // OpenForReadOnly should fail but will create <dbname> in the file system
+ ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
+ // Since <dbname> is created, we should be able to delete the dir
+ // We first get the list of files under <dbname>
+ // There should not be any subdirectories -- this is not checked here
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname, &files));
+ for (auto& f : files) {
+ ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
+ }
+ // <dbname> should be empty now and we should be able to delete it
+ ASSERT_OK(env_->DeleteDir(dbname));
+ options.create_if_missing = false;
+ // OpenForReadOnly should fail since <dbname> was successfully deleted
+ ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr));
+ // With create_if_missing false, there should not be a dir in the file system
+ ASSERT_NOK(env_->FileExists(dbname));
+}
+
+TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) {
+ DB* db_ptr = nullptr;
+ std::string dbname = test::PerThreadDBPath("db_readonly");
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ column_families.push_back(ColumnFamilyDescriptor("goku", cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+ // OpenForReadOnly should fail but will create <dbname> in the file system
+ ASSERT_NOK(
+ DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
+ // Since <dbname> is created, we should be able to delete the dir
+ // We first get the list of files under <dbname>
+ // There should not be any subdirectories -- this is not checked here
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname, &files));
+ for (auto& f : files) {
+ ASSERT_OK(env_->DeleteFile(dbname + "/" + f));
+ }
+ // <dbname> should be empty now and we should be able to delete it
+ ASSERT_OK(env_->DeleteDir(dbname));
+ options.create_if_missing = false;
+ // OpenForReadOnly should fail since <dbname> was successfully deleted
+ ASSERT_NOK(
+ DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr));
+ // With create_if_missing false, there should not be a dir in the file system
+ ASSERT_NOK(env_->FileExists(dbname));
+}
+
+class TestReadOnlyWithCompressedCache
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+ TestReadOnlyWithCompressedCache()
+ : DBTestBase("test_readonly_with_compressed_cache",
+ /*env_do_fsync=*/true) {
+ max_open_files_ = std::get<0>(GetParam());
+ use_mmap_ = std::get<1>(GetParam());
+ }
+ int max_open_files_;
+ bool use_mmap_;
+};
+
+TEST_P(TestReadOnlyWithCompressedCache, ReadOnlyWithCompressedCache) {
+ if (use_mmap_ && !IsMemoryMappedAccessSupported()) {
+ ROCKSDB_GTEST_SKIP("Test requires MMAP support");
+ return;
+ }
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("foo2", "barbarbarbarbarbarbarbar"));
+ ASSERT_OK(Flush());
+
+ DB* db_ptr = nullptr;
+ Options options = CurrentOptions();
+ options.allow_mmap_reads = use_mmap_;
+ options.max_open_files = max_open_files_;
+ options.compression = kSnappyCompression;
+ BlockBasedTableOptions table_options;
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.statistics = CreateDBStatistics();
+
+ ASSERT_OK(DB::OpenForReadOnly(options, dbname_, &db_ptr));
+
+ std::string v;
+ ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("bar", v);
+ ASSERT_EQ(0, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+ ASSERT_OK(db_ptr->Get(ReadOptions(), "foo", &v));
+ ASSERT_EQ("bar", v);
+ if (Snappy_Supported()) {
+ if (use_mmap_) {
+ ASSERT_EQ(0,
+ options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+ } else {
+ ASSERT_EQ(1,
+ options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_HIT));
+ }
+ }
+
+ delete db_ptr;
+}
+
+INSTANTIATE_TEST_CASE_P(TestReadOnlyWithCompressedCache,
+ TestReadOnlyWithCompressedCache,
+ ::testing::Combine(::testing::Values(-1, 100),
+ ::testing::Bool()));
+
+class PartitionedIndexTestListener : public EventListener {
+ public:
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ ASSERT_GT(info.table_properties.index_partitions, 1);
+ ASSERT_EQ(info.table_properties.index_key_is_user_key, 0);
+ }
+};
+
+TEST_F(DBTest2, PartitionedIndexUserToInternalKey) {
+ const int kValueSize = 10500;
+ const int kNumEntriesPerFile = 1000;
+ const int kNumFiles = 3;
+ const int kNumDistinctKeys = 30;
+
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+ PartitionedIndexTestListener* listener = new PartitionedIndexTestListener();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.listeners.emplace_back(listener);
+ std::vector<const Snapshot*> snapshots;
+ Reopen(options);
+ Random rnd(301);
+
+ for (int i = 0; i < kNumFiles; i++) {
+ for (int j = 0; j < kNumEntriesPerFile; j++) {
+ int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys;
+ std::string value = rnd.RandomString(kValueSize);
+ ASSERT_OK(Put("keykey_" + std::to_string(key_id), value));
+ snapshots.push_back(db_->GetSnapshot());
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (auto s : snapshots) {
+ db_->ReleaseSnapshot(s);
+ }
+}
+
+#endif // ROCKSDB_LITE
+
+class PrefixFullBloomWithReverseComparator
+ : public DBTestBase,
+ public ::testing::WithParamInterface<bool> {
+ public:
+ PrefixFullBloomWithReverseComparator()
+ : DBTestBase("prefix_bloom_reverse", /*env_do_fsync=*/true) {}
+ void SetUp() override { if_cache_filter_ = GetParam(); }
+ bool if_cache_filter_;
+};
+
+TEST_P(PrefixFullBloomWithReverseComparator,
+ PrefixFullBloomWithReverseComparator) {
+ Options options = last_options_;
+ options.comparator = ReverseBytewiseComparator();
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ if (if_cache_filter_) {
+ bbto.no_block_cache = false;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.block_cache = NewLRUCache(1);
+ }
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3"));
+
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+
+ if (bbto.block_cache) {
+ bbto.block_cache->EraseUnRefEntries();
+ }
+
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->Seek("bar345");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar234", iter->key().ToString());
+ ASSERT_EQ("foo2", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bar123", iter->key().ToString());
+ ASSERT_EQ("foo", iter->value().ToString());
+
+ iter->Seek("foo234");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("foo123", iter->key().ToString());
+ ASSERT_EQ("foo3", iter->value().ToString());
+
+ iter->Seek("bar");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+}
+
+INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator,
+ PrefixFullBloomWithReverseComparator, testing::Bool());
+
+TEST_F(DBTest2, IteratorPropertyVersionNumber) {
+ ASSERT_OK(Put("", ""));
+ Iterator* iter1 = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter1->status());
+ std::string prop_value;
+ ASSERT_OK(
+ iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number1 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+
+ Iterator* iter2 = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter2->status());
+ ASSERT_OK(
+ iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number2 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ ASSERT_GT(version_number2, version_number1);
+
+ ASSERT_OK(Put("", ""));
+
+ Iterator* iter3 = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter3->status());
+ ASSERT_OK(
+ iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number3 =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+
+ ASSERT_EQ(version_number2, version_number3);
+
+ iter1->SeekToFirst();
+ ASSERT_OK(
+ iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value));
+ uint64_t version_number1_new =
+ static_cast<uint64_t>(std::atoi(prop_value.c_str()));
+ ASSERT_EQ(version_number1, version_number1_new);
+
+ delete iter1;
+ delete iter2;
+ delete iter3;
+}
+
+TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ std::string value;
+ value = Get(1, "a");
+}
+
+TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.max_successive_merges = 3;
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("poi", "Finch"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Reese"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Shaw"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Root"));
+ options.max_successive_merges = 2;
+ Reopen(options);
+}
+
+#ifndef ROCKSDB_LITE
+class DBTestSharedWriteBufferAcrossCFs
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ DBTestSharedWriteBufferAcrossCFs()
+ : DBTestBase("db_test_shared_write_buffer", /*env_do_fsync=*/true) {}
+ void SetUp() override {
+ use_old_interface_ = std::get<0>(GetParam());
+ cost_cache_ = std::get<1>(GetParam());
+ }
+ bool use_old_interface_;
+ bool cost_cache_;
+};
+
+TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ auto flush_listener = std::make_shared<FlushCounterListener>();
+ options.listeners.push_back(flush_listener);
+ // Don't trip the listener at shutdown.
+ options.avoid_flush_during_shutdown = true;
+
+ // Avoid non-deterministic values from malloc_usable_size();
+ // force the arena block size to 1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::Arena:0", [&](void* arg) {
+ size_t* block_size = static_cast<size_t*>(arg);
+ *block_size = 1;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::AllocateNewBlock:0", [&](void* arg) {
+ std::pair<size_t*, size_t*>* pair =
+ static_cast<std::pair<size_t*, size_t*>*>(arg);
+ *std::get<0>(*pair) = *std::get<1>(*pair);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // The total soft write buffer size is about 105000
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+
+ if (use_old_interface_) {
+ options.db_write_buffer_size = 120000; // this is the real limit
+ } else if (!cost_cache_) {
+ options.write_buffer_manager.reset(new WriteBufferManager(114285));
+ } else {
+ options.write_buffer_manager.reset(new WriteBufferManager(114285, cache));
+ }
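+ // Rough numbers behind the limits above (assuming the write buffer manager
+ // triggers flushes at about 7/8 of its configured size): 7/8 * 120000 is
+ // 105000, which matches the soft write buffer size mentioned above, and
+ // 7/8 * 114285 is about 100000.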
+ options.write_buffer_size = 500000; // this is never hit
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ std::function<void()> wait_flush = [&]() {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+ };
+
+ // Create some data and flush "default" and "nikitich" so that they are the
+ // more recently flushed CFs.
+ flush_listener->expected_flush_reason = FlushReason::kManualFlush;
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ Flush(3);
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ Flush(0);
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+
+ flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+ }
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(60000), wo));
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024);
+ }
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ // No flush should trigger
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+
+ // Trigger a flush. Flushing "nikitich".
+ ASSERT_OK(Put(3, Key(2), DummyString(30000), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ // Without hitting the threshold, no flush should trigger.
+ ASSERT_OK(Put(2, Key(1), DummyString(30000), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ // Hit the write buffer limit again. "default"
+ // will have been flushed.
+ ASSERT_OK(Put(2, Key(2), DummyString(10000), wo));
+ wait_flush();
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ // Trigger another flush. This time "dobrynia" gets flushed; "pikachu"
+ // should not be flushed, even though it has never been flushed.
+ ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(80000), wo));
+ wait_flush();
+ ASSERT_OK(Put(1, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+ if (cost_cache_) {
+ ASSERT_GE(cache->GetUsage(), 256 * 1024);
+ Close();
+ options.write_buffer_manager.reset();
+ last_options_.write_buffer_manager.reset();
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs,
+ DBTestSharedWriteBufferAcrossCFs,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, false),
+ std::make_tuple(false, true)));
+
+TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) {
+ std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2");
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ auto flush_listener = std::make_shared<FlushCounterListener>();
+ options.listeners.push_back(flush_listener);
+ // Don't trip the listener at shutdown.
+ options.avoid_flush_during_shutdown = true;
+ // Avoid non-deterministic values from malloc_usable_size();
+ // force the arena block size to 1.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::Arena:0", [&](void* arg) {
+ size_t* block_size = static_cast<size_t*>(arg);
+ *block_size = 1;
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Arena::AllocateNewBlock:0", [&](void* arg) {
+ std::pair<size_t*, size_t*>* pair =
+ static_cast<std::pair<size_t*, size_t*>*>(arg);
+ *std::get<0>(*pair) = *std::get<1>(*pair);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ options.write_buffer_size = 500000; // this is never hit
+ // Use a write buffer total size so that the soft limit is about
+ // 105000.
+ options.write_buffer_manager.reset(new WriteBufferManager(120000));
+ CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+ ASSERT_OK(DestroyDB(dbname2, options));
+ DB* db2 = nullptr;
+ ASSERT_OK(DB::Open(options, dbname2, &db2));
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ std::function<void()> wait_flush = [&]() {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+ ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db2)->TEST_WaitForBackgroundWork());
+ };
+
+ // Trigger a flush on cf2
+ flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager;
+ ASSERT_OK(Put(2, Key(1), DummyString(70000), wo));
+ wait_flush();
+ ASSERT_OK(Put(0, Key(1), DummyString(20000), wo));
+ wait_flush();
+
+ // Insert to DB2
+ ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000)));
+ wait_flush();
+
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") +
+ GetNumberOfSstFilesForColumnFamily(db_, "cf1") +
+ GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(0));
+ }
+
+ // Trigger a flush of another CF in DB1.
+ ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000)));
+ wait_flush();
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ wait_flush();
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(0));
+ }
+
+ // Trigger a flush in DB2.
+ ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000)));
+ wait_flush();
+ ASSERT_OK(db2->Put(wo, Key(1), DummyString(1)));
+ wait_flush();
+ ASSERT_OK(static_cast<DBImpl*>(db2)->TEST_WaitForFlushMemTable());
+ {
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"),
+ static_cast<uint64_t>(0));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"),
+ static_cast<uint64_t>(1));
+ }
+
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ std::shared_ptr<Cache> cache = NewLRUCache(LRUCacheOptions(
+ 10000000 /* capacity */, 1 /* num_shard_bits */,
+ false /* strict_capacity_limit */, 0.0 /* high_pri_pool_ratio */,
+ nullptr /* memory_allocator */, kDefaultToAdaptiveMutex,
+ kDontChargeCacheMetadata));
+
+ options.write_buffer_size = 50000; // this is never hit
+ // A write buffer manager created with size 0 imposes no total limit; it is
+ // used here only so that memtable memory is charged to the block cache.
+ options.write_buffer_manager.reset(new WriteBufferManager(0, cache));
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ // One dummy entry is 256KB.
+ ASSERT_GT(cache->GetUsage(), 128000);
+}
+
+namespace {
+void ValidateKeyExistence(DB* db, const std::vector<Slice>& keys_must_exist,
+ const std::vector<Slice>& keys_must_not_exist) {
+ // Ensure that expected keys exist
+ std::vector<std::string> values;
+ if (keys_must_exist.size() > 0) {
+ std::vector<Status> status_list =
+ db->MultiGet(ReadOptions(), keys_must_exist, &values);
+ for (size_t i = 0; i < keys_must_exist.size(); i++) {
+ ASSERT_OK(status_list[i]);
+ }
+ }
+
+ // Ensure that given keys don't exist
+ if (keys_must_not_exist.size() > 0) {
+ std::vector<Status> status_list =
+ db->MultiGet(ReadOptions(), keys_must_not_exist, &values);
+ for (size_t i = 0; i < keys_must_not_exist.size(); i++) {
+ ASSERT_TRUE(status_list[i].IsNotFound());
+ }
+ }
+}
+
+} // anonymous namespace
+
+TEST_F(DBTest2, WalFilterTest) {
+ class TestWalFilter : public WalFilter {
+ private:
+ // Processing option that is requested to be applied at the given index
+ WalFilter::WalProcessingOption wal_processing_option_;
+ // Index at which to apply wal_processing_option_
+ // At other indexes default wal_processing_option::kContinueProcessing is
+ // returned.
+ size_t apply_option_at_record_index_;
+ // Current record index, incremented with each record encountered.
+ size_t current_record_index_;
+
+ public:
+ TestWalFilter(WalFilter::WalProcessingOption wal_processing_option,
+ size_t apply_option_for_record_index)
+ : wal_processing_option_(wal_processing_option),
+ apply_option_at_record_index_(apply_option_for_record_index),
+ current_record_index_(0) {}
+
+ WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
+ WriteBatch* /*new_batch*/,
+ bool* /*batch_changed*/) const override {
+ WalFilter::WalProcessingOption option_to_return;
+
+ if (current_record_index_ == apply_option_at_record_index_) {
+ option_to_return = wal_processing_option_;
+ } else {
+ option_to_return = WalProcessingOption::kContinueProcessing;
+ }
+
+ // The filter is passed as a const object so that RocksDB does not modify
+ // it; however, we modify it for our own purposes here and hence cast the
+ // constness away.
+ (const_cast<TestWalFilter*>(this)->current_record_index_)++;
+
+ return option_to_return;
+ }
+
+ const char* Name() const override { return "TestWalFilter"; }
+ };
+
+ // Create 3 batches with two keys each
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ // Test with all WAL processing options
+ for (int option = 0;
+ option < static_cast<int>(
+ WalFilter::WalProcessingOption::kWalProcessingOptionMax);
+ option++) {
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+ }
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ }
+
+ WalFilter::WalProcessingOption wal_processing_option =
+ static_cast<WalFilter::WalProcessingOption>(option);
+
+ // Create a test filter that applies wal_processing_option at record
+ // index 1 (i.e. the second record).
+ size_t apply_option_for_record_index = 1;
+ TestWalFilter test_wal_filter(wal_processing_option,
+ apply_option_for_record_index);
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter;
+ Status status =
+ TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ if (wal_processing_option ==
+ WalFilter::WalProcessingOption::kCorruptedRecord) {
+ ASSERT_NOK(status);
+ // In case of corruption we can turn off paranoid_checks to reopen
+ // the database.
+ options.paranoid_checks = false;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ } else {
+ ASSERT_OK(status);
+ }
+
+ // Compute which keys we expect to be found
+ // and which we expect not to be found after recovery.
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist;
+ switch (wal_processing_option) {
+ case WalFilter::WalProcessingOption::kCorruptedRecord:
+ case WalFilter::WalProcessingOption::kContinueProcessing: {
+ fprintf(stderr, "Testing with complete WAL processing\n");
+ // we expect all records to be processed
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ break;
+ }
+ case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: {
+ fprintf(stderr,
+ "Testing with ignoring record %" ROCKSDB_PRIszt " only\n",
+ apply_option_for_record_index);
+ // We expect the record with apply_option_for_record_index to be not
+ // found.
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i == apply_option_for_record_index) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ } else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+ break;
+ }
+ case WalFilter::WalProcessingOption::kStopReplay: {
+ fprintf(stderr,
+ "Testing with stopping replay from record %" ROCKSDB_PRIszt
+ "\n",
+ apply_option_for_record_index);
+ // We expect records beyond apply_option_for_record_index to be not
+ // found.
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i >= apply_option_for_record_index) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ } else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+ break;
+ }
+ default:
+ FAIL(); // unhandled case
+ }
+
+ bool checked_after_reopen = false;
+
+ while (true) {
+ // Ensure that the expected keys exist
+ // and the unexpected keys don't exist after recovery
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+ if (checked_after_reopen) {
+ break;
+ }
+
+ // Reopen the database again to make sure the previous log(s) are not used
+ // (even if they were skipped).
+ // Reopen the database with the option to use the WAL filter.
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ checked_after_reopen = true;
+ }
+ }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatch) {
+ class ChangeBatchHandler : public WriteBatch::Handler {
+ private:
+ // Batch to insert keys in
+ WriteBatch* new_write_batch_;
+ // Number of keys to add in the new batch
+ size_t num_keys_to_add_in_new_batch_;
+ // Number of keys added to new batch
+ size_t num_keys_added_;
+
+ public:
+ ChangeBatchHandler(WriteBatch* new_write_batch,
+ size_t num_keys_to_add_in_new_batch)
+ : new_write_batch_(new_write_batch),
+ num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+ num_keys_added_(0) {}
+ void Put(const Slice& key, const Slice& value) override {
+ if (num_keys_added_ < num_keys_to_add_in_new_batch_) {
+ ASSERT_OK(new_write_batch_->Put(key, value));
+ ++num_keys_added_;
+ }
+ }
+ };
+
+ class TestWalFilterWithChangeBatch : public WalFilter {
+ private:
+ // Index at which to start changing records
+ size_t change_records_from_index_;
+ // Number of keys to add in the new batch
+ size_t num_keys_to_add_in_new_batch_;
+ // Current record index, incremented with each record encountered.
+ size_t current_record_index_;
+
+ public:
+ TestWalFilterWithChangeBatch(size_t change_records_from_index,
+ size_t num_keys_to_add_in_new_batch)
+ : change_records_from_index_(change_records_from_index),
+ num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch),
+ current_record_index_(0) {}
+
+ WalProcessingOption LogRecord(const WriteBatch& batch,
+ WriteBatch* new_batch,
+ bool* batch_changed) const override {
+ if (current_record_index_ >= change_records_from_index_) {
+ ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_);
+ Status s = batch.Iterate(&handler);
+ if (s.ok()) {
+ *batch_changed = true;
+ } else {
+ assert(false);
+ }
+ }
+
+ // The filter is passed as a const object so that RocksDB does not modify
+ // it; however, we modify it for our own purposes here and hence cast the
+ // constness away.
+ (const_cast<TestWalFilterWithChangeBatch*>(this)
+ ->current_record_index_)++;
+
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ const char* Name() const override { return "TestWalFilterWithChangeBatch"; }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+ }
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ }
+
+  // Create a test filter that rewrites batches starting at
+  // change_records_from_index, keeping only num_keys_to_add_in_new_batch keys
+  // per rewritten batch
+ size_t change_records_from_index = 1;
+ size_t num_keys_to_add_in_new_batch = 1;
+ TestWalFilterWithChangeBatch test_wal_filter_with_change_batch(
+ change_records_from_index, num_keys_to_add_in_new_batch);
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_with_change_batch;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+  // Ensure that all keys exist before change_records_from_index_, and that
+  // from that index onward only a single key per batch exists, since our
+  // filter keeps only one key for each rewritten batch
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist;
+
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) {
+ keys_must_not_exist.push_back(Slice(batch_keys[i][j]));
+ } else {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+ }
+
+ bool checked_after_reopen = false;
+
+ while (true) {
+    // Ensure that the expected keys exist and the unexpected keys do not
+    // exist after recovery
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+
+ if (checked_after_reopen) {
+ break;
+ }
+
+    // Reopen the database again, this time without the WAL filter, to make
+    // sure the previous log(s) are not reused (even if they were skipped)
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ checked_after_reopen = true;
+ }
+}
+
+TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) {
+ class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter {
+ public:
+ WalProcessingOption LogRecord(const WriteBatch& batch,
+ WriteBatch* new_batch,
+ bool* batch_changed) const override {
+ *new_batch = batch;
+ Status s = new_batch->Put("key_extra", "value_extra");
+ if (s.ok()) {
+ *batch_changed = true;
+ } else {
+ assert(false);
+ }
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ const char* Name() const override {
+ return "WalFilterTestWithChangeBatchExtraKeys";
+ }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys(3);
+
+ batch_keys[0].push_back("key1");
+ batch_keys[0].push_back("key2");
+ batch_keys[1].push_back("key3");
+ batch_keys[1].push_back("key4");
+ batch_keys[2].push_back("key5");
+ batch_keys[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024)));
+ }
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ }
+
+ // Create a test filter that would add extra keys
+ TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys;
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_extra_keys;
+ Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(status.IsNotSupported());
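+  // Recovery rejects WAL filters that add records (the new batch may not
+  // contain more entries than the original), so the reopen above fails with
+  // NotSupported and must leave the DB unchanged.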
+
+  // Reopen without the filter; this time the reopen should succeed, and the
+  // previous failed attempt must not have altered the db.
+ options = OptionsForLogIterTest();
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ std::vector<Slice> keys_must_exist;
+ std::vector<Slice> keys_must_not_exist; // empty vector
+
+ for (size_t i = 0; i < batch_keys.size(); i++) {
+ for (size_t j = 0; j < batch_keys[i].size(); j++) {
+ keys_must_exist.push_back(Slice(batch_keys[i][j]));
+ }
+ }
+
+ ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist);
+}
+
+TEST_F(DBTest2, WalFilterTestWithColumnFamilies) {
+ class TestWalFilterWithColumnFamilies : public WalFilter {
+ private:
+ // column_family_id -> log_number map (provided to WALFilter)
+ std::map<uint32_t, uint64_t> cf_log_number_map_;
+ // column_family_name -> column_family_id map (provided to WALFilter)
+ std::map<std::string, uint32_t> cf_name_id_map_;
+    // column_family_id -> keys_found_in_wal map
+ // We store keys that are applicable to the column_family
+ // during recovery (i.e. aren't already flushed to SST file(s))
+ // for verification against the keys we expect.
+ std::map<uint32_t, std::vector<std::string>> cf_wal_keys_;
+
+ public:
+ void ColumnFamilyLogNumberMap(
+ const std::map<uint32_t, uint64_t>& cf_lognumber_map,
+ const std::map<std::string, uint32_t>& cf_name_id_map) override {
+ cf_log_number_map_ = cf_lognumber_map;
+ cf_name_id_map_ = cf_name_id_map;
+ }
+
+ WalProcessingOption LogRecordFound(unsigned long long log_number,
+ const std::string& /*log_file_name*/,
+ const WriteBatch& batch,
+ WriteBatch* /*new_batch*/,
+ bool* /*batch_changed*/) override {
+ class LogRecordBatchHandler : public WriteBatch::Handler {
+ private:
+ const std::map<uint32_t, uint64_t>& cf_log_number_map_;
+ std::map<uint32_t, std::vector<std::string>>& cf_wal_keys_;
+ unsigned long long log_number_;
+
+ public:
+ LogRecordBatchHandler(
+ unsigned long long current_log_number,
+ const std::map<uint32_t, uint64_t>& cf_log_number_map,
+ std::map<uint32_t, std::vector<std::string>>& cf_wal_keys)
+ : cf_log_number_map_(cf_log_number_map),
+ cf_wal_keys_(cf_wal_keys),
+ log_number_(current_log_number) {}
+
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& /*value*/) override {
+ auto it = cf_log_number_map_.find(column_family_id);
+ assert(it != cf_log_number_map_.end());
+ unsigned long long log_number_for_cf = it->second;
+ // If the current record is applicable for column_family_id
+ // (i.e. isn't flushed to SST file(s) for column_family_id)
+ // add it to the cf_wal_keys_ map for verification.
+ if (log_number_ >= log_number_for_cf) {
+ cf_wal_keys_[column_family_id].push_back(
+ std::string(key.data(), key.size()));
+ }
+ return Status::OK();
+ }
+ } handler(log_number, cf_log_number_map_, cf_wal_keys_);
+
+ Status s = batch.Iterate(&handler);
+ if (!s.ok()) {
+ // TODO(AR) is this ok?
+ return WalProcessingOption::kCorruptedRecord;
+ }
+
+ return WalProcessingOption::kContinueProcessing;
+ }
+
+ const char* Name() const override {
+ return "WalFilterTestWithColumnFamilies";
+ }
+
+ const std::map<uint32_t, std::vector<std::string>>& GetColumnFamilyKeys() {
+ return cf_wal_keys_;
+ }
+
+ const std::map<std::string, uint32_t>& GetColumnFamilyNameIdMap() {
+ return cf_name_id_map_;
+ }
+ };
+
+ std::vector<std::vector<std::string>> batch_keys_pre_flush(3);
+
+ batch_keys_pre_flush[0].push_back("key1");
+ batch_keys_pre_flush[0].push_back("key2");
+ batch_keys_pre_flush[1].push_back("key3");
+ batch_keys_pre_flush[1].push_back("key4");
+ batch_keys_pre_flush[2].push_back("key5");
+ batch_keys_pre_flush[2].push_back("key6");
+
+ Options options = OptionsForLogIterTest();
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+ ASSERT_OK(batch.Put(handles_[0], batch_keys_pre_flush[i][j],
+ DummyString(1024)));
+ ASSERT_OK(batch.Put(handles_[1], batch_keys_pre_flush[i][j],
+ DummyString(1024)));
+ }
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ }
+
+ // Flush default column-family
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[0]));
+
+ // Do some more writes
+ std::vector<std::vector<std::string>> batch_keys_post_flush(3);
+
+ batch_keys_post_flush[0].push_back("key7");
+ batch_keys_post_flush[0].push_back("key8");
+ batch_keys_post_flush[1].push_back("key9");
+ batch_keys_post_flush[1].push_back("key10");
+ batch_keys_post_flush[2].push_back("key11");
+ batch_keys_post_flush[2].push_back("key12");
+
+ // Write given keys in given batches
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ WriteBatch batch;
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ ASSERT_OK(batch.Put(handles_[0], batch_keys_post_flush[i][j],
+ DummyString(1024)));
+ ASSERT_OK(batch.Put(handles_[1], batch_keys_post_flush[i][j],
+ DummyString(1024)));
+ }
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+ }
+
+  // On recovery, only the post-flush batches should be applicable to the
+  // default CF, while both pre-flush and post-flush batches should be
+  // applicable to the pikachu CF
+
+  // Create a test filter that records the WAL keys it sees per column family
+ TestWalFilterWithColumnFamilies test_wal_filter_column_families;
+
+ // Reopen database with option to use WAL filter
+ options = OptionsForLogIterTest();
+ options.wal_filter = &test_wal_filter_column_families;
+ Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_TRUE(status.ok());
+
+  // Verify that handles_[0] (default) only sees post-flush keys in the WAL,
+  // while handles_[1] (pikachu) sees both pre-flush and post-flush keys
+ auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys();
+ auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap();
+ size_t index = 0;
+ auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]];
+ // default column-family, only post_flush keys are expected
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_post_flush[i][j]);
+ ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+ }
+ }
+ ASSERT_EQ(index, keys_cf.size());
+
+ index = 0;
+ keys_cf = cf_wal_keys[name_id_map["pikachu"]];
+ // pikachu column-family, all keys are expected
+ for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_pre_flush[i][j]);
+ ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+ }
+ }
+
+ for (size_t i = 0; i < batch_keys_post_flush.size(); i++) {
+ for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) {
+ Slice key_from_the_log(keys_cf[index++]);
+ Slice batch_key(batch_keys_post_flush[i][j]);
+ ASSERT_EQ(key_from_the_log.compare(batch_key), 0);
+ }
+ }
+ ASSERT_EQ(index, keys_cf.size());
+}
+
+TEST_F(DBTest2, PresetCompressionDict) {
+ // Verifies that compression ratio improves when dictionary is enabled, and
+ // improves even further when the dictionary is trained by ZSTD.
+ const size_t kBlockSizeBytes = 4 << 10;
+ const size_t kL0FileBytes = 128 << 10;
+ const size_t kApproxPerBlockOverheadBytes = 50;
+ const int kNumL0Files = 5;
+
+ Options options;
+ // Make sure to use any custom env that the test is configured with.
+ options.env = CurrentOptions().env;
+ options.allow_concurrent_memtable_write = false;
+ options.arena_block_size = kBlockSizeBytes;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes));
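+  // NewSpecialSkipListFactory(N) forces the memtable to report itself full
+  // after roughly N entries, so each memtable flushes at about kL0FileBytes.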
+ options.num_levels = 2;
+ options.target_file_size_base = kL0FileBytes;
+ options.target_file_size_multiplier = 2;
+ options.write_buffer_size = kL0FileBytes;
+ BlockBasedTableOptions table_options;
+ table_options.block_size = kBlockSizeBytes;
+ std::vector<CompressionType> compression_types;
+ if (Zlib_Supported()) {
+ compression_types.push_back(kZlibCompression);
+ }
+#if LZ4_VERSION_NUMBER >= 10400 // r124+
+ compression_types.push_back(kLZ4Compression);
+ compression_types.push_back(kLZ4HCCompression);
+#endif // LZ4_VERSION_NUMBER >= 10400
+ if (ZSTD_Supported()) {
+ compression_types.push_back(kZSTD);
+ }
+
+ enum DictionaryTypes : int {
+ kWithoutDict,
+ kWithDict,
+ kWithZSTDfinalizeDict,
+ kWithZSTDTrainedDict,
+ kDictEnd,
+ };
+
+ for (auto compression_type : compression_types) {
+ options.compression = compression_type;
+ size_t bytes_without_dict = 0;
+ size_t bytes_with_dict = 0;
+ size_t bytes_with_zstd_finalize_dict = 0;
+ size_t bytes_with_zstd_trained_dict = 0;
+ for (int i = kWithoutDict; i < kDictEnd; i++) {
+      // First iteration: compress without preset dictionary
+      // Second iteration: compress with preset dictionary
+      // Third iteration (zstd only): compress with a finalized dictionary
+      // Fourth iteration (zstd only): compress with a zstd-trained dictionary
+      //
+      // To make sure the compression dictionary has the intended effect, we
+      // verify the compressed size is smaller in successive iterations. Also
+      // in the non-first iterations, verify the data we get out is the same
+      // data we put in.
+ switch (i) {
+ case kWithoutDict:
+ options.compression_opts.max_dict_bytes = 0;
+ options.compression_opts.zstd_max_train_bytes = 0;
+ break;
+ case kWithDict:
+ options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+ options.compression_opts.zstd_max_train_bytes = 0;
+ break;
+ case kWithZSTDfinalizeDict:
+ if (compression_type != kZSTD ||
+ !ZSTD_FinalizeDictionarySupported()) {
+ continue;
+ }
+ options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+ options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+ options.compression_opts.use_zstd_dict_trainer = false;
+ break;
+ case kWithZSTDTrainedDict:
+ if (compression_type != kZSTD || !ZSTD_TrainDictionarySupported()) {
+ continue;
+ }
+ options.compression_opts.max_dict_bytes = kBlockSizeBytes;
+ options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
+ options.compression_opts.use_zstd_dict_trainer = true;
+ break;
+ default:
+ assert(false);
+ }
+
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ std::string seq_datas[10];
+ for (int j = 0; j < 10; ++j) {
+ seq_datas[j] =
+ rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
+ }
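+      // Each value is roughly one block long, and runs of ten consecutive
+      // keys share the same value, giving a preset dictionary repeated
+      // content to exploit.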
+
+ ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+ for (int j = 0; j < kNumL0Files; ++j) {
+ for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
+ auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
+ ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
+ seq_datas[(key_num / 10) % 10]));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
+ }
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
+ true /* disallow_trivial_move */));
+ ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);
+
+ // Get the live sst files size
+ size_t total_sst_bytes = TotalSize(1);
+ if (i == kWithoutDict) {
+ bytes_without_dict = total_sst_bytes;
+ } else if (i == kWithDict) {
+ bytes_with_dict = total_sst_bytes;
+ } else if (i == kWithZSTDfinalizeDict) {
+ bytes_with_zstd_finalize_dict = total_sst_bytes;
+ } else if (i == kWithZSTDTrainedDict) {
+ bytes_with_zstd_trained_dict = total_sst_bytes;
+ }
+
+ for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
+ j++) {
+ ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
+ }
+ if (i == kWithDict) {
+ ASSERT_GT(bytes_without_dict, bytes_with_dict);
+      } else if (i == kWithZSTDfinalizeDict) {
+ // In zstd compression, it is sometimes possible that using a finalized
+ // dictionary does not get as good a compression ratio as raw content
+ // dictionary. But using a dictionary should always get better
+ // compression ratio than not using one.
+ ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict ||
+ bytes_without_dict > bytes_with_zstd_finalize_dict);
+ } else if (i == kWithZSTDTrainedDict) {
+ // In zstd compression, it is sometimes possible that using a trained
+ // dictionary does not get as good a compression ratio as without
+ // training.
+ // But using a dictionary (with or without training) should always get
+ // better compression ratio than not using one.
+ ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
+ bytes_without_dict > bytes_with_zstd_trained_dict);
+ }
+
+ DestroyAndReopen(options);
+ }
+ }
+}
+
+TEST_F(DBTest2, PresetCompressionDictLocality) {
+ if (!ZSTD_Supported()) {
+ return;
+ }
+ // Verifies that compression dictionary is generated from local data. The
+ // verification simply checks all output SSTs have different compression
+ // dictionaries. We do not verify effectiveness as that'd likely be flaky in
+ // the future.
+  const int kNumEntriesPerFile = 1 << 10;  // 1K entries
+ const int kNumBytesPerEntry = 1 << 10; // 1KB
+ const int kNumFiles = 4;
+ Options options = CurrentOptions();
+ options.compression = kZSTD;
+ options.compression_opts.max_dict_bytes = 1 << 14; // 16KB
+ options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry;
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+
+ Random rnd(301);
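+    // Once the accumulated size exceeds max_size_, evict the first entry in
+    // key order; a single eviction per insert is enough for this mock.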
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kNumEntriesPerFile; ++j) {
+ ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j),
+ rnd.RandomString(kNumBytesPerEntry)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(1);
+ ASSERT_EQ(NumTableFilesAtLevel(1), i + 1);
+ }
+
+ // Store all the dictionaries generated during a full compaction.
+ std::vector<std::string> compression_dicts;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+ [&](void* arg) {
+ compression_dicts.emplace_back(static_cast<Slice*>(arg)->ToString());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ CompactRangeOptions compact_range_opts;
+ compact_range_opts.bottommost_level_compaction =
+ BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+ // Dictionary compression should not be so good as to compress four totally
+ // random files into one. If it does then there's probably something wrong
+ // with the test.
+ ASSERT_GT(NumTableFilesAtLevel(1), 1);
+
+ // Furthermore, there should be one compression dictionary generated per file.
+ // And they should all be different from each other.
+ ASSERT_EQ(NumTableFilesAtLevel(1),
+ static_cast<int>(compression_dicts.size()));
+ for (size_t i = 1; i < compression_dicts.size(); ++i) {
+ std::string& a = compression_dicts[i - 1];
+ std::string& b = compression_dicts[i];
+ size_t alen = a.size();
+ size_t blen = b.size();
+ ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0);
+ }
+}
+
+class PresetCompressionDictTest
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<CompressionType, bool>> {
+ public:
+ PresetCompressionDictTest()
+ : DBTestBase("db_test2", false /* env_do_fsync */),
+ compression_type_(std::get<0>(GetParam())),
+ bottommost_(std::get<1>(GetParam())) {}
+
+ protected:
+ const CompressionType compression_type_;
+ const bool bottommost_;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBTest2, PresetCompressionDictTest,
+ ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()),
+ ::testing::Bool()));
+
+TEST_P(PresetCompressionDictTest, Flush) {
+ // Verifies that dictionary is generated and written during flush only when
+ // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the
+ // size of the dictionary is within expectations according to the limit on
+ // buffering set by `CompressionOptions::max_dict_buffer_bytes`.
+ const size_t kValueLen = 256;
+ const size_t kKeysPerFile = 1 << 10;
+ const size_t kDictLen = 16 << 10;
+ const size_t kBlockLen = 4 << 10;
+
+ Options options = CurrentOptions();
+ if (bottommost_) {
+ options.bottommost_compression = compression_type_;
+ options.bottommost_compression_opts.enabled = true;
+ options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+ options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+ } else {
+ options.compression = compression_type_;
+ options.compression_opts.max_dict_bytes = kDictLen;
+ options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+ }
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile));
+ options.statistics = CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.block_size = kBlockLen;
+ bbto.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ Random rnd(301);
+ for (size_t i = 0; i <= kKeysPerFile; ++i) {
+ ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(kValueLen)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+ // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+ // compression dictionary exists since dictionaries would be preloaded when
+ // the flush finishes.
+ if (bottommost_) {
+ // Flush is never considered bottommost. This should change in the future
+ // since flushed files may have nothing underneath them, like the one in
+ // this test case.
+ ASSERT_EQ(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ 0);
+ } else {
+ ASSERT_GT(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ 0);
+ // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+ // number of bytes needs to be adjusted in case the cached block is in
+ // ZSTD's digested dictionary format.
+ if (compression_type_ != kZSTD &&
+ compression_type_ != kZSTDNotFinalCompression) {
+ // Although we limited buffering to `kBlockLen`, there may be up to two
+ // blocks of data included in the dictionary since we only check limit
+ // after each block is built.
+ ASSERT_LE(TestGetTickerCount(options,
+ BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ 2 * kBlockLen);
+ }
+ }
+}
+
+TEST_P(PresetCompressionDictTest, CompactNonBottommost) {
+ // Verifies that dictionary is generated and written during compaction to
+ // non-bottommost level only when `ColumnFamilyOptions::compression` enables
+ // dictionary. Also verifies the size of the dictionary is within expectations
+ // according to the limit on buffering set by
+ // `CompressionOptions::max_dict_buffer_bytes`.
+ const size_t kValueLen = 256;
+ const size_t kKeysPerFile = 1 << 10;
+ const size_t kDictLen = 16 << 10;
+ const size_t kBlockLen = 4 << 10;
+
+ Options options = CurrentOptions();
+ if (bottommost_) {
+ options.bottommost_compression = compression_type_;
+ options.bottommost_compression_opts.enabled = true;
+ options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+ options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+ } else {
+ options.compression = compression_type_;
+ options.compression_opts.max_dict_bytes = kDictLen;
+ options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+ }
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.block_size = kBlockLen;
+ bbto.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ Random rnd(301);
+ for (size_t j = 0; j <= kKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(2);
+
+ for (int i = 0; i < 2; ++i) {
+ for (size_t j = 0; j <= kKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+ }
+ ASSERT_OK(Flush());
+ }
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,0,1", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+
+ uint64_t prev_compression_dict_bytes_inserted =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+ // This L0->L1 compaction merges the two L0 files into L1. The produced L1
+ // file is not bottommost due to the existing L2 file covering the same key-
+ // range.
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1,1", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+ // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a
+ // compression dictionary exists since dictionaries would be preloaded when
+ // the compaction finishes.
+ if (bottommost_) {
+ ASSERT_EQ(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ prev_compression_dict_bytes_inserted);
+ } else {
+ ASSERT_GT(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ prev_compression_dict_bytes_inserted);
+ // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+ // number of bytes needs to be adjusted in case the cached block is in
+ // ZSTD's digested dictionary format.
+ if (compression_type_ != kZSTD &&
+ compression_type_ != kZSTDNotFinalCompression) {
+ // Although we limited buffering to `kBlockLen`, there may be up to two
+ // blocks of data included in the dictionary since we only check limit
+ // after each block is built.
+ ASSERT_LE(TestGetTickerCount(options,
+ BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ prev_compression_dict_bytes_inserted + 2 * kBlockLen);
+ }
+ }
+}
+
+TEST_P(PresetCompressionDictTest, CompactBottommost) {
+  // Verifies that dictionary is generated and written during compaction to the
+  // bottommost level when either `ColumnFamilyOptions::compression` or
+ // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also
+ // verifies the size of the dictionary is within expectations according to the
+ // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`.
+ const size_t kValueLen = 256;
+ const size_t kKeysPerFile = 1 << 10;
+ const size_t kDictLen = 16 << 10;
+ const size_t kBlockLen = 4 << 10;
+
+ Options options = CurrentOptions();
+ if (bottommost_) {
+ options.bottommost_compression = compression_type_;
+ options.bottommost_compression_opts.enabled = true;
+ options.bottommost_compression_opts.max_dict_bytes = kDictLen;
+ options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen;
+ } else {
+ options.compression = compression_type_;
+ options.compression_opts.max_dict_bytes = kDictLen;
+ options.compression_opts.max_dict_buffer_bytes = kBlockLen;
+ }
+ options.disable_auto_compactions = true;
+ options.statistics = CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.block_size = kBlockLen;
+ bbto.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (size_t j = 0; j <= kKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(static_cast<int>(j)), rnd.RandomString(kValueLen)));
+ }
+ ASSERT_OK(Flush());
+ }
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+
+ uint64_t prev_compression_dict_bytes_inserted =
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT);
+ CompactRangeOptions cro;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+ ASSERT_GT(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ prev_compression_dict_bytes_inserted);
+ // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on
+ // number of bytes needs to be adjusted in case the cached block is in ZSTD's
+ // digested dictionary format.
+ if (compression_type_ != kZSTD &&
+ compression_type_ != kZSTDNotFinalCompression) {
+ // Although we limited buffering to `kBlockLen`, there may be up to two
+ // blocks of data included in the dictionary since we only check limit after
+ // each block is built.
+ ASSERT_LE(
+ TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT),
+ prev_compression_dict_bytes_inserted + 2 * kBlockLen);
+ }
+}
+
+class CompactionCompressionListener : public EventListener {
+ public:
+ explicit CompactionCompressionListener(Options* db_options)
+ : db_options_(db_options) {}
+
+ void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+ // Figure out last level with files
+ int bottommost_level = 0;
+ for (int level = 0; level < db->NumberLevels(); level++) {
+ std::string files_at_level;
+ ASSERT_TRUE(
+ db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level),
+ &files_at_level));
+ if (files_at_level != "0") {
+ bottommost_level = level;
+ }
+ }
+
+ if (db_options_->bottommost_compression != kDisableCompressionOption &&
+ ci.output_level == bottommost_level) {
+ ASSERT_EQ(ci.compression, db_options_->bottommost_compression);
+ } else if (db_options_->compression_per_level.size() != 0) {
+ ASSERT_EQ(ci.compression,
+ db_options_->compression_per_level[ci.output_level]);
+ } else {
+ ASSERT_EQ(ci.compression, db_options_->compression);
+ }
+ max_level_checked = std::max(max_level_checked, ci.output_level);
+ }
+
+ int max_level_checked = 0;
+ const Options* db_options_;
+};
+
+enum CompressionFailureType {
+ kTestCompressionFail,
+ kTestDecompressionFail,
+ kTestDecompressionCorruption
+};
+
+class CompressionFailuresTest
+ : public DBTest2,
+ public testing::WithParamInterface<std::tuple<
+ CompressionFailureType, CompressionType, uint32_t, uint32_t>> {
+ public:
+ CompressionFailuresTest() {
+ std::tie(compression_failure_type_, compression_type_,
+ compression_max_dict_bytes_, compression_parallel_threads_) =
+ GetParam();
+ }
+
+ CompressionFailureType compression_failure_type_ = kTestCompressionFail;
+ CompressionType compression_type_ = kNoCompression;
+ uint32_t compression_max_dict_bytes_ = 0;
+ uint32_t compression_parallel_threads_ = 0;
+};
+
+INSTANTIATE_TEST_CASE_P(
+ DBTest2, CompressionFailuresTest,
+ ::testing::Combine(::testing::Values(kTestCompressionFail,
+ kTestDecompressionFail,
+ kTestDecompressionCorruption),
+ ::testing::ValuesIn(GetSupportedCompressions()),
+ ::testing::Values(0, 10), ::testing::Values(1, 4)));
+
+TEST_P(CompressionFailuresTest, CompressionFailures) {
+ if (compression_type_ == kNoCompression) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_bytes_for_level_base = 1024;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 7;
+ options.max_background_compactions = 1;
+ options.target_file_size_base = 512;
+
+ BlockBasedTableOptions table_options;
+ table_options.block_size = 512;
+ table_options.verify_compression = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ options.compression = compression_type_;
+ options.compression_opts.parallel_threads = compression_parallel_threads_;
+ options.compression_opts.max_dict_bytes = compression_max_dict_bytes_;
+ options.bottommost_compression_opts.parallel_threads =
+ compression_parallel_threads_;
+ options.bottommost_compression_opts.max_dict_bytes =
+ compression_max_dict_bytes_;
+
+ if (compression_failure_type_ == kTestCompressionFail) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompressData:TamperWithReturnValue", [](void* arg) {
+ bool* ret = static_cast<bool*>(arg);
+ *ret = false;
+ });
+ } else if (compression_failure_type_ == kTestDecompressionFail) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UncompressBlockData:TamperWithReturnValue", [](void* arg) {
+ Status* ret = static_cast<Status*>(arg);
+ ASSERT_OK(*ret);
+ *ret = Status::Corruption("kTestDecompressionFail");
+ });
+ } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UncompressBlockData:"
+ "TamperWithDecompressionOutput",
+ [](void* arg) {
+ BlockContents* contents = static_cast<BlockContents*>(arg);
+ // Ensure uncompressed data != original data
+ const size_t len = contents->data.size() + 1;
+ std::unique_ptr<char[]> fake_data(new char[len]());
+ *contents = BlockContents(std::move(fake_data), len);
+ });
+ }
+
+ std::map<std::string, std::string> key_value_written;
+
+ const int kKeySize = 5;
+ const int kValUnitSize = 16;
+ const int kValSize = 256;
+ Random rnd(405);
+
+ Status s = Status::OK();
+
+ DestroyAndReopen(options);
+ // Write 10 random files
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 5; j++) {
+ std::string key = rnd.RandomString(kKeySize);
+ // Ensure good compression ratio
+ std::string valueUnit = rnd.RandomString(kValUnitSize);
+ std::string value;
+ for (int k = 0; k < kValSize; k += kValUnitSize) {
+ value += valueUnit;
+ }
+ s = Put(key, value);
+ if (compression_failure_type_ == kTestCompressionFail) {
+ key_value_written[key] = value;
+ ASSERT_OK(s);
+ }
+ }
+ s = Flush();
+ if (compression_failure_type_ == kTestCompressionFail) {
+ ASSERT_OK(s);
+ }
+ s = dbfull()->TEST_WaitForCompact();
+ if (compression_failure_type_ == kTestCompressionFail) {
+ ASSERT_OK(s);
+ }
+ if (i == 4) {
+      // Start injecting failures midway through writing the files
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ }
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ if (compression_failure_type_ == kTestCompressionFail) {
+    // Compression failures fall back to storing blocks uncompressed; check
+    // content consistency
+ std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ std::string key = db_iter->key().ToString();
+ std::string value = db_iter->value().ToString();
+ ASSERT_NE(key_value_written.find(key), key_value_written.end());
+ ASSERT_EQ(key_value_written[key], value);
+ key_value_written.erase(key);
+ }
+ ASSERT_EQ(0, key_value_written.size());
+ } else if (compression_failure_type_ == kTestDecompressionFail) {
+ ASSERT_EQ(std::string(s.getState()),
+ "Could not decompress: kTestDecompressionFail");
+ } else if (compression_failure_type_ == kTestDecompressionCorruption) {
+ ASSERT_EQ(std::string(s.getState()),
+ "Decompressed block did not match pre-compression block");
+ }
+}
+
+TEST_F(DBTest2, CompressionOptions) {
+ if (!Zlib_Supported() || !Snappy_Supported()) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.max_bytes_for_level_base = 100;
+ options.max_bytes_for_level_multiplier = 2;
+ options.num_levels = 7;
+ options.max_background_compactions = 1;
+
+ CompactionCompressionListener* listener =
+ new CompactionCompressionListener(&options);
+ options.listeners.emplace_back(listener);
+
+ const int kKeySize = 5;
+ const int kValSize = 20;
+ Random rnd(301);
+
+ std::vector<uint32_t> compression_parallel_threads = {1, 4};
+
+ std::map<std::string, std::string> key_value_written;
+
+ for (int iter = 0; iter <= 2; iter++) {
+ listener->max_level_checked = 0;
+
+ if (iter == 0) {
+ // Use different compression algorithms for different levels but
+ // always use Zlib for bottommost level
+ options.compression_per_level = {kNoCompression, kNoCompression,
+ kNoCompression, kSnappyCompression,
+ kSnappyCompression, kSnappyCompression,
+ kZlibCompression};
+ options.compression = kNoCompression;
+ options.bottommost_compression = kZlibCompression;
+ } else if (iter == 1) {
+      // Use Snappy everywhere except the bottommost level, which uses Zlib
+ options.compression_per_level = {};
+ options.compression = kSnappyCompression;
+ options.bottommost_compression = kZlibCompression;
+ } else if (iter == 2) {
+ // Use Snappy everywhere
+ options.compression_per_level = {};
+ options.compression = kSnappyCompression;
+ options.bottommost_compression = kDisableCompressionOption;
+ }
+
+ for (auto num_threads : compression_parallel_threads) {
+ options.compression_opts.parallel_threads = num_threads;
+ options.bottommost_compression_opts.parallel_threads = num_threads;
+
+ DestroyAndReopen(options);
+ // Write 10 random files
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 5; j++) {
+ std::string key = rnd.RandomString(kKeySize);
+ std::string value = rnd.RandomString(kValSize);
+ key_value_written[key] = value;
+ ASSERT_OK(Put(key, value));
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ // Make sure that we wrote enough to check all 7 levels
+ ASSERT_EQ(listener->max_level_checked, 6);
+
+ // Make sure database content is the same as key_value_written
+ std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
+ for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
+ std::string key = db_iter->key().ToString();
+ std::string value = db_iter->value().ToString();
+ ASSERT_NE(key_value_written.find(key), key_value_written.end());
+ ASSERT_EQ(key_value_written[key], value);
+ key_value_written.erase(key);
+ }
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ(0, key_value_written.size());
+ }
+ }
+}
+
+class CompactionStallTestListener : public EventListener {
+ public:
+ CompactionStallTestListener()
+ : compacting_files_cnt_(0), compacted_files_cnt_(0) {}
+
+ void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+ compacting_files_cnt_ += ci.input_files.size();
+ }
+
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum);
+ compacted_files_cnt_ += ci.input_files.size();
+ }
+
+ std::atomic<size_t> compacting_files_cnt_;
+ std::atomic<size_t> compacted_files_cnt_;
+};
+
+TEST_F(DBTest2, CompactionStall) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"},
+ {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"},
+ {"DBTest2::CompactionStall:2",
+ "DBImpl::NotifyOnCompactionBegin::UnlockMutex"},
+ {"DBTest2::CompactionStall:3",
+ "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 4;
+ options.max_background_compactions = 40;
+ CompactionStallTestListener* listener = new CompactionStallTestListener();
+ options.listeners.emplace_back(listener);
+ DestroyAndReopen(options);
+ // make sure all background compaction jobs can be scheduled
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ Random rnd(301);
+
+ // 4 Files in L0
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for compaction to be triggered
+ TEST_SYNC_POINT("DBTest2::CompactionStall:0");
+
+  // Clear the "DBImpl::BGWorkCompaction" sync point trace since we want to
+  // hold it again at DBTest2::CompactionStall:1
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Another 6 L0 files to trigger compaction again
+ for (int i = 0; i < 6; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for another compaction to be triggered
+ TEST_SYNC_POINT("DBTest2::CompactionStall:1");
+
+ // Hold NotifyOnCompactionBegin in the unlock mutex section
+ TEST_SYNC_POINT("DBTest2::CompactionStall:2");
+
+ // Hold NotifyOnCompactionCompleted in the unlock mutex section
+ TEST_SYNC_POINT("DBTest2::CompactionStall:3");
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_LT(NumTableFilesAtLevel(0),
+ options.level0_file_num_compaction_trigger);
+ ASSERT_GT(listener->compacted_files_cnt_.load(),
+ 10 - options.level0_file_num_compaction_trigger);
+ ASSERT_EQ(listener->compacting_files_cnt_.load(),
+ listener->compacted_files_cnt_.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, FirstSnapshotTest) {
+ Options options;
+ options.write_buffer_size = 100000; // Small write buffer
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+  // This snapshot will have sequence number 0, which is the expected
+  // behaviour.
+ const Snapshot* s1 = db_->GetSnapshot();
+
+ ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable
+ ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush
+
+ db_->ReleaseSnapshot(s1);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, DuplicateSnapshot) {
+ Options options;
+ options = CurrentOptions(options);
+ std::vector<const Snapshot*> snapshots;
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ SequenceNumber oldest_ww_snap, first_ww_snap;
+
+ ASSERT_OK(Put("k", "v")); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+ snapshots.push_back(db_->GetSnapshot());
+ ASSERT_OK(Put("k", "v")); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+ snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+ first_ww_snap = snapshots.back()->GetSequenceNumber();
+ ASSERT_OK(Put("k", "v")); // inc seq
+ snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary());
+ snapshots.push_back(db_->GetSnapshot());
+ ASSERT_OK(Put("k", "v")); // inc seq
+ snapshots.push_back(db_->GetSnapshot());
+
+ {
+ InstrumentedMutexLock l(dbi->mutex());
+ auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap);
+ ASSERT_EQ(seqs.size(), 4); // duplicates are not counted
+ ASSERT_EQ(oldest_ww_snap, first_ww_snap);
+ }
+
+ for (auto s : snapshots) {
+ db_->ReleaseSnapshot(s);
+ }
+}
+#endif // ROCKSDB_LITE
+
+class PinL0IndexAndFilterBlocksTest
+ : public DBTestBase,
+ public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ PinL0IndexAndFilterBlocksTest()
+ : DBTestBase("db_pin_l0_index_bloom_test", /*env_do_fsync=*/true) {}
+ void SetUp() override {
+ infinite_max_files_ = std::get<0>(GetParam());
+ disallow_preload_ = std::get<1>(GetParam());
+ }
+
+ void CreateTwoLevels(Options* options, bool close_afterwards) {
+ if (infinite_max_files_) {
+ options->max_open_files = -1;
+ }
+ options->create_if_missing = true;
+ options->statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options->table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, *options);
+
+ ASSERT_OK(Put(1, "a", "begin"));
+ ASSERT_OK(Put(1, "z", "end"));
+ ASSERT_OK(Flush(1));
+ // move this table to L1
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]));
+
+ // reset block cache
+ table_options.block_cache = NewLRUCache(64 * 1024);
+ options->table_factory.reset(NewBlockBasedTableFactory(table_options));
+ TryReopenWithColumnFamilies({"default", "pikachu"}, *options);
+ // create new table at L0
+ ASSERT_OK(Put(1, "a2", "begin2"));
+ ASSERT_OK(Put(1, "z2", "end2"));
+ ASSERT_OK(Flush(1));
+
+ if (close_afterwards) {
+ Close(); // This ensures that there is no ref to block cache entries
+ }
+ table_options.block_cache->EraseUnRefEntries();
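+    // Drop all entries not currently referenced so that only blocks pinned by
+    // open table readers (if any) remain before the test takes its ticker
+    // baselines.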
+ }
+
+ bool infinite_max_files_;
+ bool disallow_preload_;
+};
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+ IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) {
+ Options options = CurrentOptions();
+ if (infinite_max_files_) {
+ options.max_open_files = -1;
+ }
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ table_options.pin_l0_filter_and_index_blocks_in_cache = true;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(20));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "key", "val"));
+ // Create a new table.
+ ASSERT_OK(Flush(1));
+
+ // index/filter blocks added to block cache right after table creation.
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // only index/filter were added
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
+
+ std::string value;
+  // Miss and hit counts should remain the same; these blocks are all pinned.
+ ASSERT_TRUE(db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // Miss and hit counts should remain the same; these blocks are all pinned.
+ value = Get(1, "key");
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest,
+ MultiLevelIndexAndFilterBlocksCachedWithPinning) {
+ Options options = CurrentOptions();
+ PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, false);
+ // get base cache values
+ uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+ std::string value;
+ // this should be read from L0
+ // so cache values don't change
+ value = Get(1, "a2");
+ ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+  // This should be read from L1: the file is opened and prefetching results
+  // in a cache filter miss, the block is loaded and added to the cache, and
+  // then the Get results in a cache hit for L1. Even with infinite
+  // max_open_files there is still a cache miss because we have reset the
+  // block cache.
+ value = Get(1, "a");
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+}
+
+TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) {
+ Options options = CurrentOptions();
+ // This ensures that db does not ref anything in the block cache, so
+ // EraseUnRefEntries could clear them up.
+ bool close_afterwards = true;
+ PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, close_afterwards);
+
+ // Get base cache values
+ uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS);
+ uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
+ uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS);
+ uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT);
+
+ if (disallow_preload_) {
+ // Now we have two files. We narrow the max open files to allow 3 entries
+ // so that preloading SST files won't happen.
+ options.max_open_files = 13;
+    // RocksDB sanitizes max_open_files to at least 20; modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = 13;
+ });
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Reopen database. If max_open_files is set as -1, table readers will be
+ // preloaded. This will trigger a BlockBasedTable::Open() and prefetch
+ // L0 index and filter. Level 1's prefetching is disabled in DB::Open()
+ TryReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ if (!disallow_preload_) {
+    // After reopen, cache misses are increased by one because we read (and
+    // only read) filter and index on L0
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ // If max_open_files is not -1, we do not preload table readers, so there is
+ // no change.
+ ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+ std::string value;
+ // this should be read from L0
+ value = Get(1, "a2");
+ // If max_open_files is -1, we have pinned index and filter in Rep, so there
+ // will not be changes in index and filter misses or hits. If max_open_files
+ // is not -1, Get() will open a TableReader and prefetch index and filter.
+ ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+
+ // this should be read from L1
+ value = Get(1, "a");
+ if (!disallow_preload_) {
+    // In the infinite max-open-files case, there's a cache miss when executing
+    // Get() because index and filter were not prefetched before.
+ ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+    // In this case, cache misses increase by one in BlockBasedTable::Open()
+    // because this is not the DB::Open() code path, so L1's index and filter
+    // are prefetched here. Cache hits also increase by one because Get() reads
+    // the index and filter from the block cache prefetched by the previous
+    // Open() call.
+ ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+
+  // Force a full compaction into one single file. There will be a block cache
+  // read for both the index and the filter. If prefetching doesn't happen
+  // explicitly, it will happen when verifying the file.
+ Compact(1, "a", "zzzzz");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ if (!disallow_preload_) {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+
+ // Bloom and index hit will happen when a Get() happens.
+ value = Get(1, "a");
+ if (!disallow_preload_) {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ } else {
+ ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
+ ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
+ ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
+ ASSERT_EQ(ih + 5, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest,
+ PinL0IndexAndFilterBlocksTest,
+ ::testing::Values(std::make_tuple(true, false),
+ std::make_tuple(false, false),
+ std::make_tuple(false, true)));
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, MaxCompactionBytesTest) {
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+ options.compaction_style = kCompactionStyleLevel;
+ options.write_buffer_size = 200 << 10;
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ options.compression = kNoCompression;
+ options.max_bytes_for_level_base = 450 << 10;
+ options.target_file_size_base = 100 << 10;
+ // Infinite for full compaction.
+ options.max_compaction_bytes = options.target_file_size_base * 100;
+
+ Reopen(options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < 8; num++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_EQ("0,0,8", FilesPerLevel(0));
+
+ // When compact from Ln -> Ln+1, cut a file if the file overlaps with
+ // more than three files in Ln+1.
+ options.max_compaction_bytes = options.target_file_size_base * 3;
+ Reopen(options);
+
+ GenerateNewRandomFile(&rnd);
+ // Add three more small files that overlap with the previous file
+ for (int i = 0; i < 3; i++) {
+ ASSERT_OK(Put("a", "z"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+  // Output files to L1 are cut into 4 pieces according to
+  // options.max_compaction_bytes (300K).
+  // There are 8 files on L2 (the grandparents level), each one 100K. The first
+  // output file overlaps with a and b, which keeps the compaction under the
+  // 300K max_compaction_bytes; the second overlaps with d and e, which is also
+  // under 300K. Including any extra grandparent file would make a future
+  // compaction larger than 300K.
+ // L1: [ 1 ] [ 2 ] [ 3 ] [ 4 ]
+ // L2: [a] [b] [c] [d] [e] [f] [g] [h]
+ ASSERT_EQ("0,4,8", FilesPerLevel(0));
+}
+
+static void UniqueIdCallback(void* arg) {
+ int* result = reinterpret_cast<int*>(arg);
+ if (*result == -1) {
+ *result = 0;
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+}
+
+class MockPersistentCache : public PersistentCache {
+ public:
+ explicit MockPersistentCache(const bool is_compressed, const size_t max_size)
+ : is_compressed_(is_compressed), max_size_(max_size) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
+ }
+
+ ~MockPersistentCache() override {}
+
+ PersistentCache::StatsType Stats() override {
+ return PersistentCache::StatsType();
+ }
+
+ uint64_t NewId() override {
+ return last_id_.fetch_add(1, std::memory_order_relaxed);
+ }
+
+ Status Insert(const Slice& page_key, const char* data,
+ const size_t size) override {
+ MutexLock _(&lock_);
+
+ if (size_ > max_size_) {
+ size_ -= data_.begin()->second.size();
+ data_.erase(data_.begin());
+ }
+
+ data_.insert(std::make_pair(page_key.ToString(), std::string(data, size)));
+ size_ += size;
+ return Status::OK();
+ }
+
+ Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
+ size_t* size) override {
+ MutexLock _(&lock_);
+ auto it = data_.find(page_key.ToString());
+ if (it == data_.end()) {
+ return Status::NotFound();
+ }
+
+ assert(page_key.ToString() == it->first);
+ data->reset(new char[it->second.size()]);
+ memcpy(data->get(), it->second.c_str(), it->second.size());
+ *size = it->second.size();
+ return Status::OK();
+ }
+
+ bool IsCompressed() override { return is_compressed_; }
+
+ std::string GetPrintableOptions() const override {
+ return "MockPersistentCache";
+ }
+
+ port::Mutex lock_;
+ std::map<std::string, std::string> data_;
+ const bool is_compressed_ = true;
+ size_t size_ = 0;
+ const size_t max_size_ = 10 * 1024; // 10KiB
+ std::atomic<uint64_t> last_id_{1};
+};
+
+#ifdef OS_LINUX
+ // Make sure that the CPU time perf context counters use Env::NowCPUNanos()
+ // rather than the wall-clock Env::NowNanos().
+TEST_F(DBTest2, TestPerfContextGetCpuTime) {
+ // Force resizing of the table cache so the table handle is not preloaded,
+ // letting us measure find_table_nanos during Get().
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ env_->now_cpu_count_.store(0);
+ env_->SetMockSleep();
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ // CPU timing is not enabled with kEnableTimeExceptForMutex
+ SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ(0, get_perf_context()->get_cpu_nanos);
+ ASSERT_EQ(0, env_->now_cpu_count_.load());
+
+ constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
+ constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;
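+
+ // MockSleepForSeconds() below advances only the mocked wall clock, so
+ // NowNanos()-based counters such as find_table_nanos should exceed
+ // kDummyAddonNanos while the CPU-time counters should stay far below it.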
+
+ // Add time to NowNanos() reading.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0",
+ [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_GT(env_->now_cpu_count_.load(), 2);
+ ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonNanos);
+ ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);
+
+ SetPerfLevel(PerfLevel::kDisable);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestPerfContextIterCpuTime) {
+ DestroyAndReopen(CurrentOptions());
+ // Force resizing of the table cache so the table handle is not preloaded,
+ // letting us measure find_table_nanos during iteration.
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+
+ const size_t kNumEntries = 10;
+ for (size_t i = 0; i < kNumEntries; ++i) {
+ ASSERT_OK(Put("k" + std::to_string(i), "v" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+ for (size_t i = 0; i < kNumEntries; ++i) {
+ ASSERT_EQ("v" + std::to_string(i), Get("k" + std::to_string(i)));
+ }
+ std::string last_key = "k" + std::to_string(kNumEntries - 1);
+ std::string last_value = "v" + std::to_string(kNumEntries - 1);
+ env_->now_cpu_count_.store(0);
+ env_->SetMockSleep();
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ // CPU timing is not enabled with kEnableTimeExceptForMutex
+ SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex);
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos);
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos);
+ ASSERT_EQ(0, env_->now_cpu_count_.load());
+ delete iter;
+
+ constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000};
+ constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds;
+
+ // Add time to NowNanos() reading.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TableCache::FindTable:0",
+ [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+ iter = db_->NewIterator(ReadOptions());
+ iter->Seek("k0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonNanos);
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonNanos);
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ("v0", iter->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0);
+ ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonNanos);
+ ASSERT_GE(env_->now_cpu_count_.load(), 12);
+ ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos);
+
+ SetPerfLevel(PerfLevel::kDisable);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ delete iter;
+}
+#endif // OS_LINUX
+
+#if !defined OS_SOLARIS
+TEST_F(DBTest2, PersistentCache) {
+ int num_iter = 80;
+
+ Options options;
+ options.write_buffer_size = 64 * 1024; // small write buffer
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options = CurrentOptions(options);
+
+ auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024};
+ auto types = {/*compressed*/ 1, /*uncompressed*/ 0};
+ for (auto bsize : bsizes) {
+ for (auto type : types) {
+ BlockBasedTableOptions table_options;
+ table_options.persistent_cache.reset(
+ new MockPersistentCache(type, 10 * 1024));
+ table_options.no_block_cache = true;
+ table_options.block_cache = bsize ? NewLRUCache(bsize) : nullptr;
+ table_options.block_cache_compressed = nullptr;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ // default column family doesn't have block cache
+ Options no_block_cache_opts;
+ no_block_cache_opts.statistics = options.statistics;
+ no_block_cache_opts = CurrentOptions(no_block_cache_opts);
+ BlockBasedTableOptions table_options_no_bc;
+ table_options_no_bc.no_block_cache = true;
+ no_block_cache_opts.table_factory.reset(
+ NewBlockBasedTableFactory(table_options_no_bc));
+ ReopenWithColumnFamilies(
+ {"default", "pikachu"},
+ std::vector<Options>({no_block_cache_opts, options}));
+
+ Random rnd(301);
+
+ // Write 80 values of ~1KB each (every group of 4 shares the same payload)
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ std::vector<std::string> values;
+ std::string str;
+ for (int i = 0; i < num_iter; i++) {
+ if (i % 4 == 0) { // high compression ratio
+ str = rnd.RandomString(1000);
+ }
+ values.push_back(str);
+ ASSERT_OK(Put(1, Key(i), values[i]));
+ }
+
+ // Flush all data from the memtable so that reads go through the table files
+ // (and thus the persistent cache)
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < num_iter; i++) {
+ ASSERT_EQ(Get(1, Key(i)), values[i]);
+ }
+
+ auto hit = options.statistics->getTickerCount(PERSISTENT_CACHE_HIT);
+ auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS);
+
+ ASSERT_GT(hit, 0);
+ ASSERT_GT(miss, 0);
+ }
+ }
+}
+#endif // !defined OS_SOLARIS
+
+namespace {
+void CountSyncPoint() {
+ TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */);
+}
+} // anonymous namespace
+
+TEST_F(DBTest2, SyncPointMarker) {
+ std::atomic<int> sync_point_called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTest2::MarkedPoint",
+ [&](void* /*arg*/) { sync_point_called.fetch_add(1); });
+
+ // The first dependency enforces that Marker is processed before MarkedPoint.
+ // The second (the marker) ensures that thread 1's MarkedPoint is disabled here.
+ // Execution order:
+ // | Thread 1 | Thread 2 |
+ // | | Marker |
+ // | MarkedPoint | |
+ // | Thread1First | |
+ // | | MarkedPoint |
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers(
+ {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}},
+ {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::function<void()> func1 = [&]() {
+ CountSyncPoint();
+ TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First");
+ };
+
+ std::function<void()> func2 = [&]() {
+ TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker");
+ CountSyncPoint();
+ };
+
+ auto thread1 = port::Thread(func1);
+ auto thread2 = port::Thread(func2);
+ thread1.join();
+ thread2.join();
+
+ // Callback is only executed once
+ ASSERT_EQ(sync_point_called.load(), 1);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif
+
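+ // Approximate encoded size of one data-block entry:
+ // <shared=0><key_size><value_size> varints plus the key and value bytes.
+ // This mirrors the block layout only because the tests below disable delta
+ // encoding (so the shared-key length is always 0).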
+size_t GetEncodedEntrySize(size_t key_size, size_t value_size) {
+ std::string buffer;
+
+ PutVarint32(&buffer, static_cast<uint32_t>(0));
+ PutVarint32(&buffer, static_cast<uint32_t>(key_size));
+ PutVarint32(&buffer, static_cast<uint32_t>(value_size));
+
+ return buffer.size() + key_size + value_size;
+}
+
+TEST_F(DBTest2, ReadAmpBitmap) {
+ Options options = CurrentOptions();
+ BlockBasedTableOptions bbto;
+ uint32_t bytes_per_bit[2] = {1, 16};
+ for (size_t k = 0; k < 2; k++) {
+ // Disable delta encoding to make it easier to calculate read amplification
+ bbto.use_delta_encoding = false;
+ // Huge block cache to make it easier to calculate read amplification
+ bbto.block_cache = NewLRUCache(1024 * 1024 * 1024);
+ bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
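+ // With 1 byte per bit the useful-bytes estimate should be exact; with 16
+ // bytes per bit it is an approximation, hence the exact vs. ASSERT_NEAR
+ // checks at the end of the loop.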
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ const size_t kNumEntries = 10000;
+
+ Random rnd(301);
+ for (size_t i = 0; i < kNumEntries; i++) {
+ ASSERT_OK(Put(Key(static_cast<int>(i)), rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ Close();
+ Reopen(options);
+
+ // Read keys/values randomly and verify that reported read amp error
+ // is less than 2%
+ uint64_t total_useful_bytes = 0;
+ std::set<int> read_keys;
+ std::string value;
+ for (size_t i = 0; i < kNumEntries * 5; i++) {
+ int key_idx = rnd.Next() % kNumEntries;
+ std::string key = Key(key_idx);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(key_idx) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ total_useful_bytes +=
+ GetEncodedEntrySize(internal_key.size(), value.size());
+ read_keys.insert(key_idx);
+ }
+
+ double expected_read_amp =
+ static_cast<double>(total_useful_bytes) /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ double read_amp =
+ static_cast<double>(options.statistics->getTickerCount(
+ READ_AMP_ESTIMATE_USEFUL_BYTES)) /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ double error_pct = fabs(expected_read_amp - read_amp) * 100;
+ // Error between reported read amp and real read amp should be less than
+ // 2%
+ EXPECT_LE(error_pct, 2);
+ }
+
+ // Make sure we read everything in the DB (which is smaller than our cache)
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString()));
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+
+ // Read amp is on average 100% since we read everything we loaded into memory
+ if (k == 0) {
+ ASSERT_EQ(
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES),
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES));
+ } else {
+ ASSERT_NEAR(
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) *
+ 1.0f /
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES),
+ 1, .01);
+ }
+ }
+}
+
+#ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented
+TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) {
+ {
+ const int kIdBufLen = 100;
+ char id_buf[kIdBufLen];
+ Status s = Status::NotSupported();
+#ifndef OS_WIN
+ // A directory cannot be opened as a random access file on Windows
+ std::unique_ptr<RandomAccessFile> file;
+ s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions());
+ if (s.ok()) {
+ if (file->GetUniqueId(id_buf, kIdBufLen) == 0) {
+ // The filesystem holding the db directory doesn't support unique file
+ // IDs, so this test would fail: lru_cache would load the blocks again
+ // even though they are already in the cache.
+ return;
+ }
+ }
+#endif
+ if (!s.ok()) {
+ std::unique_ptr<Directory> dir;
+ ASSERT_OK(env_->NewDirectory(dbname_, &dir));
+ if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) {
+ // The filesystem holding the db directory doesn't support unique file
+ // IDs, so this test would fail: lru_cache would load the blocks again
+ // even though they are already in the cache.
+ return;
+ }
+ }
+ }
+ uint32_t bytes_per_bit[2] = {1, 16};
+ for (size_t k = 0; k < 2; k++) {
+ std::shared_ptr<Cache> lru_cache = NewLRUCache(1024 * 1024 * 1024);
+ std::shared_ptr<Statistics> stats = ROCKSDB_NAMESPACE::CreateDBStatistics();
+
+ Options options = CurrentOptions();
+ BlockBasedTableOptions bbto;
+ // Disable delta encoding to make it easier to calculate read amplification
+ bbto.use_delta_encoding = false;
+ // Huge block cache to make it easier to calculate read amplification
+ bbto.block_cache = lru_cache;
+ bbto.read_amp_bytes_per_bit = bytes_per_bit[k];
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.statistics = stats;
+ DestroyAndReopen(options);
+
+ const int kNumEntries = 10000;
+
+ Random rnd(301);
+ for (int i = 0; i < kNumEntries; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ Close();
+ Reopen(options);
+
+ std::set<int> read_keys;
+ std::string value;
+ // Iter1: Read half the DB, Read even keys
+ // Key(0), Key(2), Key(4), Key(6), Key(8), ...
+ for (int i = 0; i < kNumEntries; i += 2) {
+ std::string key = Key(i);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(i) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ read_keys.insert(i);
+ }
+ }
+
+ size_t total_useful_bytes_iter1 =
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+ size_t total_loaded_bytes_iter1 =
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ Close();
+ std::shared_ptr<Statistics> new_statistics =
+ ROCKSDB_NAMESPACE::CreateDBStatistics();
+ // Destroy the old statistics object that the blocks in lru_cache point to
+ options.statistics.reset();
+ // Use the statistics object that we just created
+ options.statistics = new_statistics;
+ Reopen(options);
+
+ // Iter2: Read half the DB, Read odd keys
+ // Key(1), Key(3), Key(5), Key(7), Key(9), ...
+ for (int i = 1; i < kNumEntries; i += 2) {
+ std::string key = Key(i);
+ ASSERT_OK(db_->Get(ReadOptions(), key, &value));
+
+ if (read_keys.find(i) == read_keys.end()) {
+ auto internal_key = InternalKey(key, 0, ValueType::kTypeValue);
+ read_keys.insert(i);
+ }
+ }
+
+ size_t total_useful_bytes_iter2 =
+ options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES);
+ size_t total_loaded_bytes_iter2 =
+ options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES);
+
+ // Read amp is on average 100% since we read everything we loaded into memory
+ if (k == 0) {
+ ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2,
+ total_loaded_bytes_iter1 + total_loaded_bytes_iter2);
+ } else {
+ ASSERT_NEAR((total_useful_bytes_iter1 + total_useful_bytes_iter2) * 1.0f /
+ (total_loaded_bytes_iter1 + total_loaded_bytes_iter2),
+ 1, .01);
+ }
+ }
+}
+#endif // !OS_SOLARIS
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.IncreaseParallelism(20);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ auto get_stat = [](std::string level_str, LevelStatType type,
+ std::map<std::string, std::string> props) {
+ auto prop_str =
+ "compaction." + level_str + "." +
+ InternalStats::compaction_level_stats.at(type).property_name.c_str();
+ auto prop_item = props.find(prop_str);
+ return prop_item == props.end() ? 0 : std::stod(prop_item->second);
+ };
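+
+ // get_stat() extracts a single per-level counter (e.g. the NUM_FILES entry
+ // for level "L2") from the map returned by GetMapProperty("rocksdb.cfstats").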
+
+ // Trivial move 2 files to L2
+ ASSERT_EQ("0,0,2", FilesPerLevel());
+ // Also test that the stats GetMapProperty API reports the same result
+ {
+ std::map<std::string, std::string> prop;
+ ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(2, get_stat("L2", LevelStatType::NUM_FILES, prop));
+ ASSERT_EQ(2, get_stat("Sum", LevelStatType::NUM_FILES, prop));
+ }
+
+ // While the compaction is running, we will create 2 new files that
+ // can fit in L2; these 2 files will be moved to L2, overlap with
+ // the running compaction, and break the LSM consistency.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ ASSERT_OK(
+ dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"},
+ {"max_bytes_for_level_base", "1"}}));
+ ASSERT_OK(Put(Key(6), "a"));
+ ASSERT_OK(Put(Key(7), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(8), "a"));
+ ASSERT_OK(Put(Key(9), "a"));
+ ASSERT_OK(Flush());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Run a manual compaction that will compact the 2 files in L2
+ // into 1 file in L2
+ cro.exclusive_manual_compaction = false;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // Test that the stats GetMapProperty API reports 1 file in L2
+ {
+ std::map<std::string, std::string> prop;
+ ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop));
+ ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop));
+ }
+}
+
+TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ options.IncreaseParallelism(20);
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Trivial move 2 files to L1
+ ASSERT_EQ("0,2", FilesPerLevel());
+
+ std::function<void()> bg_manual_compact = [&]() {
+ std::string k1 = Key(6);
+ std::string k2 = Key(9);
+ Slice k1s(k1);
+ Slice k2s(k2);
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(cro, &k1s, &k2s));
+ };
+ ROCKSDB_NAMESPACE::port::Thread bg_thread;
+
+ // While the compaction is running, we will create 2 new files that
+ // can fit in L1; these 2 files will be moved to L1, overlap with
+ // the running compaction, and break the LSM consistency.
+ std::atomic<bool> flag(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ if (flag.exchange(true)) {
+ // We want to make sure to call this callback only once
+ return;
+ }
+ ASSERT_OK(Put(Key(6), "a"));
+ ASSERT_OK(Put(Key(7), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(8), "a"));
+ ASSERT_OK(Put(Key(9), "a"));
+ ASSERT_OK(Flush());
+
+ // Start a non-exclusive manual compaction in a bg thread
+ bg_thread = port::Thread(bg_manual_compact);
+ // This manual compaction conflicts with the other manual compaction,
+ // so it should wait until the first compaction finishes
+ env_->SleepForMicroseconds(1000000);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Run a manual compaction that will compact the 2 files in L1
+ // into 1 file in L1
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ bg_thread.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction1) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+ // Generate a file containing 10 keys.
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+
+ // Generate another file containing same keys
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+
+ int manual_compactions_paused = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) {
+ auto canceled = static_cast<std::atomic<bool>*>(arg);
+ // CompactRange triggers a manual compaction; cancel the compaction by
+ // setting *canceled to true
+ if (canceled != nullptr) {
+ canceled->store(true, std::memory_order_release);
+ }
+ manual_compactions_paused += 1;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
+ auto paused = static_cast<std::atomic<int>*>(arg);
+ // CompactFiles() relies on manual_compactions_paused to
+ // determine whether this compaction should be paused or not
+ ASSERT_EQ(0, paused->load(std::memory_order_acquire));
+ paused->fetch_add(1, std::memory_order_release);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<std::string> files_before_compact, files_after_compact;
+ // Remember file names before compaction is triggered
+ std::vector<LiveFileMetaData> files_meta;
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_before_compact.push_back(file.name);
+ }
+
+ // OK, now trigger a manual compaction
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
+ .IsManualCompactionPaused());
+
+ // Wait for compactions to get scheduled and stopped
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // Get file names after compaction is stopped
+ files_meta.clear();
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_after_compact.push_back(file.name);
+ }
+
+ // Like nothing happened
+ ASSERT_EQ(files_before_compact, files_after_compact);
+ ASSERT_EQ(manual_compactions_paused, 1);
+
+ manual_compactions_paused = 0;
+ // Now make sure CompactFiles also does not run
+ ASSERT_TRUE(dbfull()
+ ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
+ files_before_compact, 0)
+ .IsManualCompactionPaused());
+ // Wait for manual compaction to get scheduled and finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ files_meta.clear();
+ files_after_compact.clear();
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ for (auto file : files_meta) {
+ files_after_compact.push_back(file.name);
+ }
+
+ ASSERT_EQ(files_before_compact, files_after_compact);
+ // CompactFiles returns at its entry point
+ ASSERT_EQ(manual_compactions_paused, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// PausingManualCompaction does not affect auto compaction
+TEST_F(DBTest2, PausingManualCompaction2) {
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = false;
+
+ DestroyAndReopen(options);
+ dbfull()->DisableManualCompaction();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; i++) {
+ // Generate a file containing 100 keys.
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(j), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ std::vector<LiveFileMetaData> files_meta;
+ dbfull()->GetLiveFilesMetaData(&files_meta);
+ ASSERT_EQ(files_meta.size(), 1);
+}
+
+TEST_F(DBTest2, PausingManualCompaction3) {
+ CompactRangeOptions compact_options;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ int run_manual_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ [&](void* /*arg*/) { run_manual_compactions++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ dbfull()->DisableManualCompaction();
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ // Since manual compaction is disabled, the sync point is not even reached
+ ASSERT_EQ(run_manual_compactions, 0);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1");
+ dbfull()->EnableManualCompaction();
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, PausingManualCompaction4) {
+ CompactRangeOptions compact_options;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ int run_manual_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) {
+ auto canceled = static_cast<std::atomic<bool>*>(arg);
+ // CompactRange triggers a manual compaction; cancel the compaction by
+ // setting *canceled to true
+ if (canceled != nullptr) {
+ canceled->store(true, std::memory_order_release);
+ }
+ run_manual_compactions++;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
+ auto paused = static_cast<std::atomic<int>*>(arg);
+ // CompactFiles() relies on manual_compactions_paused to
+ // determine whether this compaction should be paused or not
+ ASSERT_EQ(0, paused->load(std::memory_order_acquire));
+ paused->fetch_add(1, std::memory_order_release);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+ ASSERT_EQ(run_manual_compactions, 1);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:2");
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CancelManualCompaction1) {
+ CompactRangeOptions compact_options;
+ auto canceledPtr =
+ std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+ compact_options.canceled = canceledPtr.get();
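+ // canceled starts out true, so the first CompactRange() below is expected to
+ // return early with a manual-compaction-paused status without running any
+ // compaction (see the counters asserted below).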
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ int run_manual_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1",
+ [&](void* /*arg*/) { run_manual_compactions++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Set up a callback to disable compactions after a couple of levels have
+ // been compacted
+ int compactions_run = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RunManualCompaction()::1",
+ [&](void* /*arg*/) { ++compactions_run; });
+
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // Since the manual compaction was canceled up front, we shouldn't start
+ // compacting at all, i.e. the compaction function should never be called.
+ ASSERT_EQ(compactions_run, 0);
+ ASSERT_EQ(run_manual_compactions, 0);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ compactions_run = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "DBImpl::RunManualCompaction()::1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
+ ++compactions_run;
+ // After 3 compactions, cancel the manual compaction
+ if (compactions_run == 3) {
+ compact_options.canceled->store(true, std::memory_order_release);
+ }
+ });
+
+ compact_options.canceled->store(false, std::memory_order_release);
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_EQ(compactions_run, 3);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "DBImpl::RunManualCompaction()::1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1");
+
+ // Compactions should work again if we re-enable them.
+ compact_options.canceled->store(false, std::memory_order_relaxed);
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CancelManualCompaction2) {
+ CompactRangeOptions compact_options;
+ auto canceledPtr =
+ std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+ compact_options.canceled = canceledPtr.get();
+ compact_options.max_subcompactions = 1;
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+
+ Random rnd(301);
+ auto generate_files = [&]() {
+ for (int i = 0; i < options.num_levels; i++) {
+ for (int j = 0; j < options.num_levels - i + 1; j++) {
+ for (int k = 0; k < 1000; k++) {
+ ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ for (int l = 1; l < options.num_levels - i; l++) {
+ MoveFilesToLevel(l);
+ }
+ }
+ };
+
+ DestroyAndReopen(options);
+ generate_files();
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ int compactions_run = 0;
+ std::atomic<int> kv_compactions{0};
+ int compactions_stopped_at = 0;
+ int kv_compactions_stopped_at = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
+ ++compactions_run;
+ // Just count here; the cancellation happens in the ProcessKV callback below
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
+ int kv_compactions_run =
+ kv_compactions.fetch_add(1, std::memory_order_release);
+ if (kv_compactions_run == 5) {
+ compact_options.canceled->store(true, std::memory_order_release);
+ kv_compactions_stopped_at = kv_compactions_run;
+ compactions_stopped_at = compactions_run;
+ }
+ });
+
+ compact_options.canceled->store(false, std::memory_order_release);
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // NOTE: as we set compact_options.max_subcompactions = 1 and store true to
+ // the canceled variable from the single compacting thread (via the callback),
+ // this value is deterministically kv_compactions_stopped_at + 1.
+ ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1);
+ ASSERT_EQ(compactions_run, compactions_stopped_at);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionIterator::ProcessKV");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "DBImpl::RunManualCompaction()::1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionJob::Run():PausingManualCompaction:1");
+
+ // Compactions should work again if we re-enable them.
+ compact_options.canceled->store(false, std::memory_order_relaxed);
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
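+ // Listener shared by the cancellation tests: it counts compaction begin/end
+ // notifications and verifies that each completed job reports the status
+ // code/subcode the test expects (code_ and subcode_ are set by the test
+ // before each run).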
+class CancelCompactionListener : public EventListener {
+ public:
+ CancelCompactionListener()
+ : num_compaction_started_(0), num_compaction_ended_(0) {}
+
+ void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ num_compaction_started_++;
+ }
+
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ ASSERT_EQ(ci.cf_name, "default");
+ ASSERT_EQ(ci.base_input_level, 0);
+ ASSERT_EQ(ci.status.code(), code_);
+ ASSERT_EQ(ci.status.subcode(), subcode_);
+ num_compaction_ended_++;
+ }
+
+ std::atomic<size_t> num_compaction_started_;
+ std::atomic<size_t> num_compaction_ended_;
+ Status::Code code_;
+ Status::SubCode subcode_;
+};
+
+TEST_F(DBTest2, CancelManualCompactionWithListener) {
+ CompactRangeOptions compact_options;
+ auto canceledPtr =
+ std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
+ compact_options.canceled = canceledPtr.get();
+ compact_options.max_subcompactions = 1;
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ CancelCompactionListener* listener = new CancelCompactionListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ for (int i = 0; i < 10; i++) {
+ for (int j = 0; j < 10; j++) {
+ ASSERT_OK(Put(Key(i + j * 10), rnd.RandomString(50)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
+ compact_options.canceled->store(true, std::memory_order_release);
+ });
+
+ int running_compaction = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::FinishCompactionOutputFile1",
+ [&](void* /*arg*/) { running_compaction++; });
+
+ // Case I: 1) notify compaction begin, 2) the callback sets *canceled to true,
+ // disabling the manual compaction, 3) the compaction does not run,
+ // 4) notify compaction end.
+ listener->code_ = Status::kIncomplete;
+ listener->subcode_ = Status::SubCode::kManualCompactionPaused;
+
+ compact_options.canceled->store(false, std::memory_order_release);
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_GT(listener->num_compaction_started_, 0);
+ ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+ ASSERT_EQ(running_compaction, 0);
+
+ listener->num_compaction_started_ = 0;
+ listener->num_compaction_ended_ = 0;
+
+ // Case II: 1) *canceled was already left set to true by the previous run's
+ // callback, disabling manual compaction, 2) the begin notification returns
+ // without notifying, 3) the end notification returns without notifying.
+ ASSERT_TRUE(dbfull()
+ ->CompactRange(compact_options, nullptr, nullptr)
+ .IsManualCompactionPaused());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_EQ(listener->num_compaction_started_, 0);
+ ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+ ASSERT_EQ(running_compaction, 0);
+
+ // Case III: 1) notify compaction begin, 2) the compaction runs, 3) the
+ // callback sets *canceled to true only after the work is done (too late to
+ // disable it), 4) notify compaction end.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "CompactionIterator:ProcessKV");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run:BeforeVerify", [&](void* /*arg*/) {
+ compact_options.canceled->store(true, std::memory_order_release);
+ });
+
+ listener->code_ = Status::kOk;
+ listener->subcode_ = Status::SubCode::kNone;
+
+ compact_options.canceled->store(false, std::memory_order_release);
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ ASSERT_GT(listener->num_compaction_started_, 0);
+ ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+
+ // Compaction job will succeed.
+ ASSERT_GT(running_compaction, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, CompactionOnBottomPriorityWithListener) {
+ int num_levels = 3;
+ const int kNumFilesTrigger = 4;
+
+ Options options = CurrentOptions();
+ env_->SetBackgroundThreads(0, Env::Priority::HIGH);
+ env_->SetBackgroundThreads(0, Env::Priority::LOW);
+ env_->SetBackgroundThreads(1, Env::Priority::BOTTOM);
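+ // Only the BOTTOM pool gets a worker thread, so the size-amplification
+ // compaction is expected to be forwarded to the bottom-priority pool
+ // (counted via the ForwardToBottomPriPool sync point below).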
+ options.env = env_;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+
+ CancelCompactionListener* listener = new CancelCompactionListener();
+ options.listeners.emplace_back(listener);
+
+ DestroyAndReopen(options);
+
+ int num_bottom_thread_compaction_scheduled = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+ [&](void* /*arg*/) { num_bottom_thread_compaction_scheduled++; });
+
+ int num_compaction_jobs = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():End",
+ [&](void* /*arg*/) { num_compaction_jobs++; });
+
+ listener->code_ = Status::kOk;
+ listener->subcode_ = Status::SubCode::kNone;
+
+ Random rnd(301);
+ for (int i = 0; i < 1; ++i) {
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
+ // Use no_wait above because the default variant waits for both flush and
+ // compaction. We don't want to wait for compaction here because the full
+ // compaction is intentionally blocked while more files are flushed.
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GT(num_bottom_thread_compaction_scheduled, 0);
+ ASSERT_EQ(num_compaction_jobs, 1);
+ ASSERT_GT(listener->num_compaction_started_, 0);
+ ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, OptimizeForPointLookup) {
+ Options options = CurrentOptions();
+ Close();
+ options.OptimizeForPointLookup(2);
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("v1", Get("foo"));
+}
+
+TEST_F(DBTest2, OptimizeForSmallDB) {
+ Options options = CurrentOptions();
+ Close();
+ options.OptimizeForSmallDb();
+
+ // Find the cache object
+ ASSERT_TRUE(options.table_factory->IsInstanceOf(
+ TableFactory::kBlockBasedTableName()));
+ auto table_options =
+ options.table_factory->GetOptions<BlockBasedTableOptions>();
+
+ ASSERT_TRUE(table_options != nullptr);
+ std::shared_ptr<Cache> cache = table_options->block_cache;
+
+ ASSERT_EQ(0, cache->GetUsage());
+ ASSERT_OK(DB::Open(options, dbname_, &db_));
+ ASSERT_OK(Put("foo", "v1"));
+
+ // memtable memory is charged to the block cache
+ ASSERT_NE(0, cache->GetUsage());
+
+ ASSERT_EQ("v1", Get("foo"));
+ ASSERT_OK(Flush());
+
+ size_t prev_size = cache->GetUsage();
+ // Remember the block cache usage so that we can verify it grows after Get().
+ // Use a PinnableSlice so that the block stays pinned and is not evicted
+ // before we check its size.
+ PinnableSlice value;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value));
+ ASSERT_GT(cache->GetUsage(), prev_size);
+ value.Reset();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, IterRaceFlush1) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::NewIterator:1", "DBTest2::IterRaceFlush:1"},
+ {"DBTest2::IterRaceFlush:2", "DBImpl::NewIterator:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::IterRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT("DBTest2::IterRaceFlush:2");
+ });
+
+ // The iterator is created after the first Put(), and its snapshot sequence is
+ // assigned after the second Put(), so it must see v2.
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->Seek("foo");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ("foo", it->key().ToString());
+ ASSERT_EQ("v2", it->value().ToString());
+ }
+
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, IterRaceFlush2) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::NewIterator:3", "DBTest2::IterRaceFlush2:1"},
+ {"DBTest2::IterRaceFlush2:2", "DBImpl::NewIterator:4"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::IterRaceFlush2:1");
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT("DBTest2::IterRaceFlush2:2");
+ });
+
+ // The iterator is created after the first Put(), and its snapshot sequence is
+ // assigned before the second Put(), so it must see v1.
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ it->Seek("foo");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ("foo", it->key().ToString());
+ ASSERT_EQ("v1", it->value().ToString());
+ }
+
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, IterRefreshRaceFlush) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"ArenaWrappedDBIter::Refresh:1", "DBTest2::IterRefreshRaceFlush:1"},
+ {"DBTest2::IterRefreshRaceFlush:2", "ArenaWrappedDBIter::Refresh:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:2");
+ });
+
+ // The iterator is refreshed after the first Put(), and its sequence number is
+ // assigned after the second Put(), so it must see v2.
+ {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ ASSERT_OK(it->status());
+ ASSERT_OK(it->Refresh());
+ it->Seek("foo");
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ("foo", it->key().ToString());
+ ASSERT_EQ("v2", it->value().ToString());
+ }
+
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, GetRaceFlush1) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::GetImpl:1", "DBTest2::GetRaceFlush:1"},
+ {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:2"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+ });
+
+ // Get() is issued after the first Put(), so it should see either
+ // "v1" or "v2".
+ ASSERT_NE("NOT_FOUND", Get("foo"));
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, GetRaceFlush2) {
+ ASSERT_OK(Put("foo", "v1"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::GetImpl:3", "DBTest2::GetRaceFlush:1"},
+ {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:4"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread t1([&] {
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:1");
+ ASSERT_OK(Put("foo", "v2"));
+ ASSERT_OK(Flush());
+ TEST_SYNC_POINT("DBTest2::GetRaceFlush:2");
+ });
+
+ // Get() is issued after the first Put(), so it should see either
+ // "v1" or "v2".
+ ASSERT_NE("NOT_FOUND", Get("foo"));
+ t1.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, DirectIO) {
+ if (!IsDirectIOSupported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
+ true;
+ options.allow_mmap_reads = options.allow_mmap_writes = false;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "a"));
+ ASSERT_OK(Put(Key(5), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(10), "a"));
+ ASSERT_OK(Put(Key(15), "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ Reopen(options);
+}
+
+TEST_F(DBTest2, MemtableOnlyIterator) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "first"));
+ ASSERT_OK(Put(1, "bar", "second"));
+
+ ReadOptions ropt;
+ ropt.read_tier = kMemtableTier;
+ std::string value;
+ Iterator* it = nullptr;
+
+ // Before flushing
+ // point lookups
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_EQ("first", value);
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ ASSERT_EQ("second", value);
+
+ // Memtable-only iterator (read_tier=kMemtableTier); data not flushed yet.
+ it = db_->NewIterator(ropt, handles_[1]);
+ int count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(2, count);
+ delete it;
+
+ ASSERT_OK(Flush(1));
+
+ // After flushing
+ // point lookups
+ ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value));
+ ASSERT_EQ("first", value);
+ ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value));
+ ASSERT_EQ("second", value);
+ // Nothing should be returned using the memtable-only iterator after flushing.
+ it = db_->NewIterator(ropt, handles_[1]);
+ ASSERT_OK(it->status());
+ count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(0, count);
+ ASSERT_OK(it->status());
+ delete it;
+
+ // Add a key to memtable
+ ASSERT_OK(Put(1, "foobar", "third"));
+ it = db_->NewIterator(ropt, handles_[1]);
+ ASSERT_OK(it->status());
+ count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ("foobar", it->key().ToString());
+ ASSERT_EQ("third", it->value().ToString());
+ count++;
+ }
+ ASSERT_TRUE(!it->Valid());
+ ASSERT_EQ(1, count);
+ ASSERT_OK(it->status());
+ delete it;
+}
+
+TEST_F(DBTest2, LowPriWrite) {
+ Options options = CurrentOptions();
+ // Compaction pressure should kick in once 6 L0 files have accumulated
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 12;
+ options.level0_stop_writes_trigger = 30;
+ options.delayed_write_rate = 8 * 1024 * 1024;
+ Reopen(options);
+
+ std::atomic<int> rate_limit_count(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GenericRateLimiter::Request:1", [&](void* arg) {
+ rate_limit_count.fetch_add(1);
+ int64_t* rate_bytes_per_sec = static_cast<int64_t*>(arg);
+ ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec);
+ });
+ // Block compaction
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
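+ // With compaction blocked by the dependency above, L0 files pile up and
+ // build compaction pressure; only writes with low_pri set should then be
+ // throttled (see the rate_limit_count checks below).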
+ WriteOptions wo;
+ for (int i = 0; i < 6; i++) {
+ wo.low_pri = false;
+ ASSERT_OK(Put("", "", wo));
+ wo.low_pri = true;
+ ASSERT_OK(Put("", "", wo));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_EQ(0, rate_limit_count.load());
+ wo.low_pri = true;
+ ASSERT_OK(Put("", "", wo));
+ ASSERT_EQ(1, rate_limit_count.load());
+ wo.low_pri = false;
+ ASSERT_OK(Put("", "", wo));
+ ASSERT_EQ(1, rate_limit_count.load());
+
+ TEST_SYNC_POINT("DBTest.LowPriWrite:0");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ wo.low_pri = true;
+ ASSERT_OK(Put("", "", wo));
+ ASSERT_EQ(1, rate_limit_count.load());
+ wo.low_pri = false;
+ ASSERT_OK(Put("", "", wo));
+ ASSERT_EQ(1, rate_limit_count.load());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RateLimitedCompactionReads) {
+ // compaction input has 512KB data
+ const int kNumKeysPerFile = 128;
+ const int kBytesPerKey = 1024;
+ const int kNumL0Files = 4;
+
+ for (int compaction_readahead_size : {0, 32 << 10}) {
+ for (auto use_direct_io : {false, true}) {
+ if (use_direct_io && !IsDirectIOSupported()) {
+ continue;
+ }
+ Options options = CurrentOptions();
+ options.compaction_readahead_size = compaction_readahead_size;
+ options.compression = kNoCompression;
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ // takes roughly one second, split into 100 x 10ms intervals. Each
+ // interval permits 5.12KB, which is smaller than the block size, so this
+ // test exercises the code for chunking reads.
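+ // (Rate = kNumL0Files * kNumKeysPerFile * kBytesPerKey = 4 * 128 * 1 KB
+ // = 512 KB/s, i.e. about 5.12 KB per 10ms refill interval.)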
+ options.rate_limiter.reset(NewGenericRateLimiter(
+ static_cast<int64_t>(kNumL0Files * kNumKeysPerFile *
+ kBytesPerKey) /* rate_bytes_per_sec */,
+ 10 * 1000 /* refill_period_us */, 10 /* fairness */,
+ RateLimiter::Mode::kReadsOnly));
+ options.use_direct_reads =
+ options.use_direct_io_for_flush_and_compaction = use_direct_io;
+ BlockBasedTableOptions bbto;
+ bbto.block_size = 16384;
+ bbto.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < kNumL0Files; ++i) {
+ for (int j = 0; j <= kNumKeysPerFile; ++j) {
+ ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ if (i + 1 < kNumL0Files) {
+ ASSERT_EQ(i + 1, NumTableFilesAtLevel(0));
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+
+ // Should be slightly above 512KB due to non-data blocks read. 1MB was
+ // arbitrarily chosen as the upper bound on the total bytes read.
+ size_t rate_limited_bytes = static_cast<size_t>(
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL));
+ // The charges can exist for `IO_LOW` and `IO_USER` priorities.
+ size_t rate_limited_bytes_by_pri =
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
+ ASSERT_EQ(rate_limited_bytes,
+ static_cast<size_t>(rate_limited_bytes_by_pri));
+ // Include the explicit prefetch of the footer in direct I/O case.
+ size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
+ ASSERT_GE(
+ rate_limited_bytes,
+ static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files));
+ ASSERT_LT(
+ rate_limited_bytes,
+ static_cast<size_t>(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files +
+ direct_io_extra));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey));
+ }
+ delete iter;
+ // Bytes read by the user iterator shouldn't count against the rate limit.
+ rate_limited_bytes_by_pri =
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) +
+ options.rate_limiter->GetTotalBytesThrough(Env::IO_USER);
+ ASSERT_EQ(rate_limited_bytes,
+ static_cast<size_t>(rate_limited_bytes_by_pri));
+ }
+ }
+}
+#endif // ROCKSDB_LITE
+
+ // Make sure the DB can be reopened with a reduced number of levels, given
+ // that no file is on a level higher than the new num_levels.
+TEST_F(DBTest2, ReduceLevel) {
+ Options options;
+ options.env = env_;
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+ Reopen(options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 1;
+ ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ options.num_levels = 3;
+ Reopen(options);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+}
+
+ // Test that ReadCallback is actually used in both the memtable and SST files
+TEST_F(DBTest2, ReadCallbackTest) {
+ Options options;
+ options.disable_auto_compactions = true;
+ options.num_levels = 7;
+ options.env = env_;
+ Reopen(options);
+ std::vector<const Snapshot*> snapshots;
+ // Try to create a db with multiple layers and a memtable
+ const std::string key = "foo";
+ const std::string value = "bar";
+ // This test assumes that the seq starts at 1 and is increased by 1 after
+ // each write batch of size 1. If that behavior changes, the test needs to
+ // be updated as well.
+ // TODO(myabandeh): update this test to use the seq number that is returned by
+ // the DB instead of assuming what seq the DB used.
+ int i = 1;
+ for (; i < 10; i++) {
+ ASSERT_OK(Put(key, value + std::to_string(i)));
+ // Take a snapshot to avoid the value being removed during compaction
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ ASSERT_OK(Flush());
+ for (; i < 20; i++) {
+ ASSERT_OK(Put(key, value + std::to_string(i)));
+ // Take a snapshot to avoid the value being removed during compaction
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ ASSERT_OK(Flush());
+ MoveFilesToLevel(6);
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ for (; i < 30; i++) {
+ ASSERT_OK(Put(key, value + std::to_string(i)));
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+ ASSERT_OK(Flush());
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel());
+#endif // !ROCKSDB_LITE
+ // And also add some values to the memtable
+ for (; i < 40; i++) {
+ ASSERT_OK(Put(key, value + std::to_string(i)));
+ auto snapshot = dbfull()->GetSnapshot();
+ snapshots.push_back(snapshot);
+ }
+
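+  // A ReadCallback that makes only sequence numbers up to `snapshot_` visible,
+  // emulating a read at a fixed snapshot; the GetImpl() calls below use it to
+  // pin each lookup to a specific sequence number.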
+ class TestReadCallback : public ReadCallback {
+ public:
+ explicit TestReadCallback(SequenceNumber snapshot)
+ : ReadCallback(snapshot), snapshot_(snapshot) {}
+ bool IsVisibleFullCheck(SequenceNumber seq) override {
+ return seq <= snapshot_;
+ }
+
+ private:
+ SequenceNumber snapshot_;
+ };
+
+ for (int seq = 1; seq < i; seq++) {
+ PinnableSlice pinnable_val;
+ ReadOptions roptions;
+ TestReadCallback callback(seq);
+ bool dont_care = true;
+ DBImpl::GetImplOptions get_impl_options;
+ get_impl_options.column_family = dbfull()->DefaultColumnFamily();
+ get_impl_options.value = &pinnable_val;
+ get_impl_options.value_found = &dont_care;
+ get_impl_options.callback = &callback;
+ Status s = dbfull()->GetImpl(roptions, key, get_impl_options);
+ ASSERT_TRUE(s.ok());
+    // Assuming that the DB increases seq by one after each Put, the value
+    // suffix and the seq number must be equal since we also increment the
+    // value by 1 after each Put.
+ ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString());
+ }
+
+ for (auto snapshot : snapshots) {
+ dbfull()->ReleaseSnapshot(snapshot);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+
+TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) {
+  // Regression test for a race condition where an obsolete file is returned
+  // to the user as a "live file" but then deleted, all while file deletions
+  // are disabled.
+ //
+ // It happened like this:
+ //
+ // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles
+  // 2. [user thread] DisableFileDeletions and GetSortedWalFiles were called,
+  //    and the latter returned "x.log"
+ // 3. [flush thread] PurgeObsoleteFiles deleted "x.log"
+ // 4. [user thread] Reading "x.log" failed
+ //
+ // Unfortunately the only regression test I can come up with involves sleep.
+ // We cannot set SyncPoints to repro since, once the fix is applied, the
+ // SyncPoints would cause a deadlock as the repro's sequence of events is now
+ // prohibited.
+ //
+ // Instead, if we sleep for a second between Find and Purge, and ensure the
+ // read attempt happens after purge, then the sequence of events will almost
+ // certainly happen on the old code.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BackgroundCallFlush:FilesFound",
+ "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"},
+ {"DBImpl::PurgeObsoleteFiles:End",
+ "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::PurgeObsoleteFiles:Begin",
+ [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("key", "val"));
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ db_->Flush(flush_opts);
+ TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered");
+
+ ASSERT_OK(db_->DisableFileDeletions());
+ VectorLogPtr log_files;
+ ASSERT_OK(db_->GetSortedWalFiles(log_files));
+ TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured");
+ for (const auto& log_file : log_files) {
+ ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber())));
+ }
+
+ ASSERT_OK(db_->EnableFileDeletions());
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestNumPread) {
+ Options options = CurrentOptions();
+ bool prefetch_supported =
+ test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+ // disable block cache
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ env_->count_random_reads_ = true;
+ env_->random_file_open_counter_.store(0);
+ ASSERT_OK(Put("bar", "foo"));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ if (prefetch_supported) {
+ // After flush, we'll open the file and read footer, meta block,
+ // property block and index block.
+ ASSERT_EQ(4, env_->random_read_counter_.Read());
+ } else {
+ // With prefetch not supported, we will do a single read into a buffer
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ }
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ // All files are already opened.
+ ASSERT_EQ(0, env_->random_file_open_counter_.load());
+
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_OK(Put("bar2", "foo2"));
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Flush());
+ if (prefetch_supported) {
+ // After flush, we'll open the file and read footer, meta block,
+ // property block and index block.
+ ASSERT_EQ(4, env_->random_read_counter_.Read());
+ } else {
+ // With prefetch not supported, we will do a single read into a buffer
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ }
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ if (prefetch_supported) {
+ // Compaction needs two input blocks, which requires 2 preads, and
+ // generate a new SST file which needs 4 preads (footer, meta block,
+ // property block and index block). In total 6.
+ ASSERT_EQ(6, env_->random_read_counter_.Read());
+ } else {
+ // With prefetch off, compaction needs two input blocks,
+ // followed by a single buffered read. In total 3.
+ ASSERT_EQ(3, env_->random_read_counter_.Read());
+ }
+ // All compaction input files should have already been opened.
+ ASSERT_EQ(1, env_->random_file_open_counter_.load());
+
+  // One pread per normal data block read
+ env_->random_file_open_counter_.store(0);
+ env_->random_read_counter_.Reset();
+ ASSERT_EQ("foo2", Get("bar2"));
+ ASSERT_EQ(1, env_->random_read_counter_.Read());
+ // SST files are already opened.
+ ASSERT_EQ(0, env_->random_file_open_counter_.load());
+}
+
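+// Tallies the results produced while replaying a trace: Accept()ing a
+// TraceRecordResult dispatches to the Handle() overload for its record type,
+// which validates the timestamps, bumps the per-type counter and accumulates
+// the latency. The replay tests below use it to verify how many writes, gets,
+// seeks and multigets were actually executed.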
+class TraceExecutionResultHandler : public TraceRecordResult::Handler {
+ public:
+ TraceExecutionResultHandler() {}
+ ~TraceExecutionResultHandler() override {}
+
+ virtual Status Handle(const StatusOnlyTraceExecutionResult& result) override {
+ if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+ return Status::InvalidArgument("Invalid timestamps.");
+ }
+ result.GetStatus().PermitUncheckedError();
+ switch (result.GetTraceType()) {
+ case kTraceWrite: {
+ total_latency_ += result.GetLatency();
+ cnt_++;
+ writes_++;
+ break;
+ }
+ default:
+ return Status::Corruption("Type mismatch.");
+ }
+ return Status::OK();
+ }
+
+ virtual Status Handle(
+ const SingleValueTraceExecutionResult& result) override {
+ if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+ return Status::InvalidArgument("Invalid timestamps.");
+ }
+ result.GetStatus().PermitUncheckedError();
+ switch (result.GetTraceType()) {
+ case kTraceGet: {
+ total_latency_ += result.GetLatency();
+ cnt_++;
+ gets_++;
+ break;
+ }
+ default:
+ return Status::Corruption("Type mismatch.");
+ }
+ return Status::OK();
+ }
+
+ virtual Status Handle(
+ const MultiValuesTraceExecutionResult& result) override {
+ if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+ return Status::InvalidArgument("Invalid timestamps.");
+ }
+ for (const Status& s : result.GetMultiStatus()) {
+ s.PermitUncheckedError();
+ }
+ switch (result.GetTraceType()) {
+ case kTraceMultiGet: {
+ total_latency_ += result.GetLatency();
+ cnt_++;
+ multigets_++;
+ break;
+ }
+ default:
+ return Status::Corruption("Type mismatch.");
+ }
+ return Status::OK();
+ }
+
+ virtual Status Handle(const IteratorTraceExecutionResult& result) override {
+ if (result.GetStartTimestamp() > result.GetEndTimestamp()) {
+ return Status::InvalidArgument("Invalid timestamps.");
+ }
+ result.GetStatus().PermitUncheckedError();
+ switch (result.GetTraceType()) {
+ case kTraceIteratorSeek:
+ case kTraceIteratorSeekForPrev: {
+ total_latency_ += result.GetLatency();
+ cnt_++;
+ seeks_++;
+ break;
+ }
+ default:
+ return Status::Corruption("Type mismatch.");
+ }
+ return Status::OK();
+ }
+
+ void Reset() {
+ total_latency_ = 0;
+ cnt_ = 0;
+ writes_ = 0;
+ gets_ = 0;
+ seeks_ = 0;
+ multigets_ = 0;
+ }
+
+ double GetAvgLatency() const {
+ return cnt_ == 0 ? 0.0 : 1.0 * total_latency_ / cnt_;
+ }
+
+ int GetNumWrites() const { return writes_; }
+
+ int GetNumGets() const { return gets_; }
+
+ int GetNumIterSeeks() const { return seeks_; }
+
+ int GetNumMultiGets() const { return multigets_; }
+
+ private:
+ std::atomic<uint64_t> total_latency_{0};
+ std::atomic<uint32_t> cnt_{0};
+ std::atomic<int> writes_{0};
+ std::atomic<int> gets_{0};
+ std::atomic<int> seeks_{0};
+ std::atomic<int> multigets_{0};
+};
+
+TEST_F(DBTest2, TraceAndReplay) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ Iterator* single_iter = nullptr;
+
+ ASSERT_TRUE(db_->EndTrace().IsIOError());
+
+ std::string trace_filename = dbname_ + "/rocksdb.trace";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+ // 5 Writes
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Merge(0, "b", "2"));
+ ASSERT_OK(Delete(0, "c"));
+ ASSERT_OK(SingleDelete(0, "d"));
+ ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+ // 6th Write
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("f", "11"));
+ ASSERT_OK(batch.Merge("g", "12"));
+ ASSERT_OK(batch.Delete("h"));
+ ASSERT_OK(batch.SingleDelete("i"));
+ ASSERT_OK(batch.DeleteRange("j", "k"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ // 2 Seek(ForPrev)s
+ single_iter = db_->NewIterator(ro);
+ single_iter->Seek("f"); // Seek 1
+ single_iter->SeekForPrev("g");
+ ASSERT_OK(single_iter->status());
+ delete single_iter;
+
+ // 2 Gets
+ ASSERT_EQ("1", Get(0, "a"));
+ ASSERT_EQ("12", Get(0, "g"));
+
+ // 7th and 8th Write, 3rd Get
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "rocksdb", "rocks"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+ // Total Write x 8, Get x 3, Seek x 2.
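+  // (The 8 writes are the 5 single operations + 1 WriteBatch + 2 Puts to
+  // "pikachu"; the 3 Gets include the NOT_FOUND lookup.)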
+ ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file since they come after EndTrace.
+ ASSERT_OK(Put("hello", "world"));
+ ASSERT_OK(Merge("foo", "bar"));
+
+ // Open another db, replay, and verify the data
+ std::string value;
+ std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ DBOptions db_opts;
+ db_opts.env = env_;
+ ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ std::unique_ptr<Replayer> replayer;
+ ASSERT_OK(
+ db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+
+ TraceExecutionResultHandler res_handler;
+ std::function<void(Status, std::unique_ptr<TraceRecordResult> &&)> res_cb =
+ [&res_handler](Status exec_s, std::unique_ptr<TraceRecordResult>&& res) {
+ ASSERT_TRUE(exec_s.ok() || exec_s.IsNotSupported());
+ if (res != nullptr) {
+ ASSERT_OK(res->Accept(&res_handler));
+ res.reset();
+ }
+ };
+
+ // Unprepared replay should fail with Status::Incomplete()
+ ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
+ ASSERT_OK(replayer->Prepare());
+ // Ok to repeatedly Prepare().
+ ASSERT_OK(replayer->Prepare());
+ // Replay using 1 thread, 1x speed.
+ ASSERT_OK(replayer->Replay(ReplayOptions(1, 1.0), res_cb));
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 8);
+ ASSERT_EQ(res_handler.GetNumGets(), 3);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
+ ASSERT_EQ("1", value);
+ ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
+ ASSERT_EQ("12", value);
+ ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+
+ ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
+ ASSERT_EQ("bar", value);
+ ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
+ ASSERT_EQ("rocks", value);
+
+ // Re-replay should fail with Status::Incomplete() if Prepare() was not
+ // called. Currently we don't distinguish between unprepared and trace end.
+ ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete());
+
+ // Re-replay using 2 threads, 2x speed.
+ ASSERT_OK(replayer->Prepare());
+ ASSERT_OK(replayer->Replay(ReplayOptions(2, 2.0), res_cb));
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 8);
+ ASSERT_EQ(res_handler.GetNumGets(), 3);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ // Re-replay using 2 threads, 1/2 speed.
+ ASSERT_OK(replayer->Prepare());
+ ASSERT_OK(replayer->Replay(ReplayOptions(2, 0.5), res_cb));
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 8);
+ ASSERT_EQ(res_handler.GetNumGets(), 3);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 2);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ replayer.reset();
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceAndManualReplay) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ Iterator* single_iter = nullptr;
+
+ ASSERT_TRUE(db_->EndTrace().IsIOError());
+
+ std::string trace_filename = dbname_ + "/rocksdb.trace";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Merge(0, "b", "2"));
+ ASSERT_OK(Delete(0, "c"));
+ ASSERT_OK(SingleDelete(0, "d"));
+ ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("f", "11"));
+ ASSERT_OK(batch.Merge("g", "12"));
+ ASSERT_OK(batch.Delete("h"));
+ ASSERT_OK(batch.SingleDelete("i"));
+ ASSERT_OK(batch.DeleteRange("j", "k"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ single_iter = db_->NewIterator(ro);
+ single_iter->Seek("f");
+ single_iter->SeekForPrev("g");
+ ASSERT_OK(single_iter->status());
+ delete single_iter;
+
+ // Write some sequenced keys for testing lower/upper bounds of iterator.
+ batch.Clear();
+ ASSERT_OK(batch.Put("iter-0", "iter-0"));
+ ASSERT_OK(batch.Put("iter-1", "iter-1"));
+ ASSERT_OK(batch.Put("iter-2", "iter-2"));
+ ASSERT_OK(batch.Put("iter-3", "iter-3"));
+ ASSERT_OK(batch.Put("iter-4", "iter-4"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ ReadOptions bounded_ro = ro;
+ Slice lower_bound("iter-1");
+ Slice upper_bound("iter-3");
+ bounded_ro.iterate_lower_bound = &lower_bound;
+ bounded_ro.iterate_upper_bound = &upper_bound;
+ single_iter = db_->NewIterator(bounded_ro);
+ single_iter->Seek("iter-0");
+ ASSERT_EQ(single_iter->key().ToString(), "iter-1");
+ single_iter->Seek("iter-2");
+ ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+ single_iter->Seek("iter-4");
+ ASSERT_FALSE(single_iter->Valid());
+ single_iter->SeekForPrev("iter-0");
+ ASSERT_FALSE(single_iter->Valid());
+ single_iter->SeekForPrev("iter-2");
+ ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+ single_iter->SeekForPrev("iter-4");
+ ASSERT_EQ(single_iter->key().ToString(), "iter-2");
+ ASSERT_OK(single_iter->status());
+ delete single_iter;
+
+ ASSERT_EQ("1", Get(0, "a"));
+ ASSERT_EQ("12", Get(0, "g"));
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "rocksdb", "rocks"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+ // Same as TraceAndReplay, Write x 8, Get x 3, Seek x 2.
+ // Plus 1 WriteBatch for iterator with lower/upper bounds, and 6
+ // Seek(ForPrev)s.
+ // Total Write x 9, Get x 3, Seek x 8
+ ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file since they come after EndTrace.
+ ASSERT_OK(Put("hello", "world"));
+ ASSERT_OK(Merge("foo", "bar"));
+
+ // Open another db, replay, and verify the data
+ std::string value;
+ std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay");
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ DBOptions db_opts;
+ db_opts.env = env_;
+ ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ std::unique_ptr<Replayer> replayer;
+ ASSERT_OK(
+ db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+
+ TraceExecutionResultHandler res_handler;
+
+  // Manually replay the trace twice. The 2nd run checks that the replay can
+  // restart.
+ std::unique_ptr<TraceRecord> record;
+ std::unique_ptr<TraceRecordResult> result;
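+  // Drive the replay by hand: Prepare(), then pull records one at a time with
+  // Next() and run each through Execute(), skipping record types that are not
+  // supported. For iterator seek records, additionally verify that the
+  // lower/upper bounds recorded at trace time round-trip correctly.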
+ for (int i = 0; i < 2; i++) {
+ // Next should fail if unprepared.
+ ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
+ ASSERT_OK(replayer->Prepare());
+ Status s = Status::OK();
+ // Looping until trace end.
+ while (s.ok()) {
+ s = replayer->Next(&record);
+ // Skip unsupported operations.
+ if (s.IsNotSupported()) {
+ continue;
+ }
+ if (s.ok()) {
+ ASSERT_OK(replayer->Execute(record, &result));
+ if (result != nullptr) {
+ ASSERT_OK(result->Accept(&res_handler));
+ if (record->GetTraceType() == kTraceIteratorSeek ||
+ record->GetTraceType() == kTraceIteratorSeekForPrev) {
+ IteratorSeekQueryTraceRecord* iter_rec =
+ dynamic_cast<IteratorSeekQueryTraceRecord*>(record.get());
+ IteratorTraceExecutionResult* iter_res =
+ dynamic_cast<IteratorTraceExecutionResult*>(result.get());
+ // Check if lower/upper bounds are correctly saved and decoded.
+ std::string lower_str = iter_rec->GetLowerBound().ToString();
+ std::string upper_str = iter_rec->GetUpperBound().ToString();
+ std::string iter_key = iter_res->GetKey().ToString();
+ std::string iter_value = iter_res->GetValue().ToString();
+ if (!lower_str.empty() && !upper_str.empty()) {
+ ASSERT_EQ(lower_str, "iter-1");
+ ASSERT_EQ(upper_str, "iter-3");
+ if (iter_res->GetValid()) {
+ // If iterator is valid, then lower_bound <= key < upper_bound.
+ ASSERT_GE(iter_key, lower_str);
+ ASSERT_LT(iter_key, upper_str);
+ } else {
+ // If iterator is invalid, then
+ // key < lower_bound or key >= upper_bound.
+ ASSERT_TRUE(iter_key < lower_str || iter_key >= upper_str);
+ }
+ }
+ // If iterator is invalid, the key and value should be empty.
+ if (!iter_res->GetValid()) {
+ ASSERT_TRUE(iter_key.empty());
+ ASSERT_TRUE(iter_value.empty());
+ }
+ }
+ result.reset();
+ }
+ }
+ }
+    // Status::Incomplete() will be returned when the trace end is reached
+    // during manual reading, or when Prepare() was not called.
+ ASSERT_TRUE(s.IsIncomplete());
+ ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete());
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 9);
+ ASSERT_EQ(res_handler.GetNumGets(), 3);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 8);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+ }
+
+ ASSERT_OK(db2->Get(ro, handles[0], "a", &value));
+ ASSERT_EQ("1", value);
+ ASSERT_OK(db2->Get(ro, handles[0], "g", &value));
+ ASSERT_EQ("12", value);
+ ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+
+ ASSERT_OK(db2->Get(ro, handles[1], "foo", &value));
+ ASSERT_EQ("bar", value);
+ ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value));
+ ASSERT_EQ("rocks", value);
+
+ // Test execution of artificially created TraceRecords.
+ uint64_t fake_ts = 1U;
+ // Write
+ batch.Clear();
+ ASSERT_OK(batch.Put("trace-record-write1", "write1"));
+ ASSERT_OK(batch.Put("trace-record-write2", "write2"));
+ record.reset(new WriteQueryTraceRecord(batch.Data(), fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // Write x 1
+ ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write1", &value));
+ ASSERT_EQ("write1", value);
+ ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write2", &value));
+ ASSERT_EQ("write2", value);
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 1);
+ ASSERT_EQ(res_handler.GetNumGets(), 0);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ // Get related
+ // Get an existing key.
+ record.reset(new GetQueryTraceRecord(handles[0]->GetID(),
+ "trace-record-write1", fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // Get x 1
+  // Get a non-existing key; it should still return Status::OK().
+ record.reset(new GetQueryTraceRecord(handles[0]->GetID(), "trace-record-get",
+ fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // Get x 2
+ // Get from an invalid (non-existing) cf_id.
+ uint32_t invalid_cf_id = handles[1]->GetID() + 1;
+ record.reset(new GetQueryTraceRecord(invalid_cf_id, "whatever", fake_ts++));
+ ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+ ASSERT_TRUE(result == nullptr);
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 0);
+ ASSERT_EQ(res_handler.GetNumGets(), 2);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ // Iteration related
+ for (IteratorSeekQueryTraceRecord::SeekType seekType :
+ {IteratorSeekQueryTraceRecord::kSeek,
+ IteratorSeekQueryTraceRecord::kSeekForPrev}) {
+ // Seek to an existing key.
+ record.reset(new IteratorSeekQueryTraceRecord(
+ seekType, handles[0]->GetID(), "trace-record-write1", fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // Seek x 1 in one iteration
+    // Seek to a non-existing key; it should still return Status::OK().
+ record.reset(new IteratorSeekQueryTraceRecord(
+ seekType, handles[0]->GetID(), "trace-record-get", fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // Seek x 2 in one iteration
+ // Seek from an invalid cf_id.
+ record.reset(new IteratorSeekQueryTraceRecord(seekType, invalid_cf_id,
+ "whatever", fake_ts++));
+ ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+ ASSERT_TRUE(result == nullptr);
+ }
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 0);
+ ASSERT_EQ(res_handler.GetNumGets(), 0);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 4); // Seek x 2 in two iterations
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 0);
+ res_handler.Reset();
+
+ // MultiGet related
+ // Get existing keys.
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+ std::vector<std::string>({"a", "foo"}), fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 1
+ // Get all non-existing keys, should still return Status::OK().
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+ std::vector<std::string>({"no1", "no2"}), fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 2
+  // Get a mix of existing and non-existing keys; it should still return
+  // Status::OK().
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+ std::vector<std::string>({"a", "no2"}), fake_ts++));
+ ASSERT_OK(replayer->Execute(record, &result));
+ ASSERT_TRUE(result != nullptr);
+ MultiValuesTraceExecutionResult* mvr =
+ dynamic_cast<MultiValuesTraceExecutionResult*>(result.get());
+ ASSERT_TRUE(mvr != nullptr);
+ ASSERT_OK(mvr->GetMultiStatus()[0]);
+ ASSERT_TRUE(mvr->GetMultiStatus()[1].IsNotFound());
+ ASSERT_EQ(mvr->GetValues()[0], "1");
+ ASSERT_EQ(mvr->GetValues()[1], "");
+ ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 3
+ // Get from an invalid (non-existing) cf_id.
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>(
+ {handles[0]->GetID(), handles[1]->GetID(), invalid_cf_id}),
+ std::vector<std::string>({"a", "foo", "whatever"}), fake_ts++));
+ ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption());
+ ASSERT_TRUE(result == nullptr);
+ // Empty MultiGet
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>(), std::vector<std::string>(), fake_ts++));
+ ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
+ ASSERT_TRUE(result == nullptr);
+ // MultiGet size mismatch
+ record.reset(new MultiGetQueryTraceRecord(
+ std::vector<uint32_t>({handles[0]->GetID(), handles[1]->GetID()}),
+ std::vector<std::string>({"a"}), fake_ts++));
+ ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument());
+ ASSERT_TRUE(result == nullptr);
+ ASSERT_GT(res_handler.GetAvgLatency(), 0.0);
+ ASSERT_EQ(res_handler.GetNumWrites(), 0);
+ ASSERT_EQ(res_handler.GetNumGets(), 0);
+ ASSERT_EQ(res_handler.GetNumIterSeeks(), 0);
+ ASSERT_EQ(res_handler.GetNumMultiGets(), 3);
+ res_handler.Reset();
+
+ replayer.reset();
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithLimit) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+
+ // test the max trace file size options
+ trace_opts.max_trace_file_size = 5;
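+  // With a 5-byte cap, the trace file reaches its size limit right away, so
+  // the Puts below are not recorded; the checks after Replay() further down
+  // verify that nothing gets restored into db2.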
+ std::string trace_filename = dbname_ + "/rocksdb.trace1";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Put(0, "b", "1"));
+ ASSERT_OK(Put(0, "c", "1"));
+ ASSERT_OK(db_->EndTrace());
+
+ std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay2");
+ std::string value;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ DBOptions db_opts;
+ db_opts.env = env_;
+ ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ std::unique_ptr<Replayer> replayer;
+ ASSERT_OK(
+ db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+ ASSERT_OK(replayer->Prepare());
+ ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
+ replayer.reset();
+
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithSampling) {
+ Options options = CurrentOptions();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+
+ // test the trace file sampling options
+ trace_opts.sampling_frequency = 2;
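+  // A sampling frequency of 2 records only every other operation; the checks
+  // after Replay() further down expect the sampled Puts ("b" and "d") to be
+  // restored while the skipped ones remain NotFound.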
+ std::string trace_filename = dbname_ + "/rocksdb.trace_sampling";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Put(0, "b", "2"));
+ ASSERT_OK(Put(0, "c", "3"));
+ ASSERT_OK(Put(0, "d", "4"));
+ ASSERT_OK(Put(0, "e", "5"));
+ ASSERT_OK(db_->EndTrace());
+
+ std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay_sampling");
+ std::string value;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ DBOptions db_opts;
+ db_opts.env = env_;
+ ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ std::unique_ptr<Replayer> replayer;
+ ASSERT_OK(
+ db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+ ASSERT_OK(replayer->Prepare());
+ ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
+ replayer.reset();
+
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound());
+ ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+}
+
+TEST_F(DBTest2, TraceWithFilter) {
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ ReadOptions ro;
+ WriteOptions wo;
+ TraceOptions trace_opts;
+ EnvOptions env_opts;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ Random rnd(301);
+ Iterator* single_iter = nullptr;
+
+ trace_opts.filter = TraceFilterType::kTraceFilterWrite;
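+  // Filter write operations out of the trace. Only the reads below get
+  // recorded, so replaying into db2 is expected to restore none of the
+  // key-values.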
+
+ std::string trace_filename = dbname_ + "/rocksdb.trace";
+ std::unique_ptr<TraceWriter> trace_writer;
+ ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer));
+ ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer)));
+
+ ASSERT_OK(Put(0, "a", "1"));
+ ASSERT_OK(Merge(0, "b", "2"));
+ ASSERT_OK(Delete(0, "c"));
+ ASSERT_OK(SingleDelete(0, "d"));
+ ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f"));
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("f", "11"));
+ ASSERT_OK(batch.Merge("g", "12"));
+ ASSERT_OK(batch.Delete("h"));
+ ASSERT_OK(batch.SingleDelete("i"));
+ ASSERT_OK(batch.DeleteRange("j", "k"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ single_iter = db_->NewIterator(ro);
+ single_iter->Seek("f");
+ single_iter->SeekForPrev("g");
+ delete single_iter;
+
+ ASSERT_EQ("1", Get(0, "a"));
+ ASSERT_EQ("12", Get(0, "g"));
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Put(1, "rocksdb", "rocks"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "leveldb"));
+
+ ASSERT_OK(db_->EndTrace());
+  // These should not get into the trace file since they come after EndTrace.
+ ASSERT_OK(Put("hello", "world"));
+ ASSERT_OK(Merge("foo", "bar"));
+
+ // Open another db, replay, and verify the data
+ std::string value;
+ std::string dbname2 = test::PerThreadDBPath(env_, "db_replay");
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Using a different name than db2, to pacify infer's use-after-lifetime
+ // warnings (http://fbinfer.com).
+ DB* db2_init = nullptr;
+ options.create_if_missing = true;
+ ASSERT_OK(DB::Open(options, dbname2, &db2_init));
+ ColumnFamilyHandle* cf;
+ ASSERT_OK(
+ db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf));
+ delete cf;
+ delete db2_init;
+
+ DB* db2 = nullptr;
+ std::vector<ColumnFamilyDescriptor> column_families;
+ ColumnFamilyOptions cf_options;
+ cf_options.merge_operator = MergeOperators::CreatePutOperator();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ std::vector<ColumnFamilyHandle*> handles;
+ DBOptions db_opts;
+ db_opts.env = env_;
+ ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ std::unique_ptr<TraceReader> trace_reader;
+ ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader));
+ std::unique_ptr<Replayer> replayer;
+ ASSERT_OK(
+ db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer));
+ ASSERT_OK(replayer->Prepare());
+ ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr));
+ replayer.reset();
+
+  // None of the key-values should be present since we filtered out the
+  // WRITE ops.
+ ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound());
+ ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db2;
+ ASSERT_OK(DestroyDB(dbname2, options));
+
+ // Set up a new db.
+ std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read");
+ ASSERT_OK(DestroyDB(dbname3, options));
+
+ DB* db3_init = nullptr;
+ options.create_if_missing = true;
+ ColumnFamilyHandle* cf3;
+ ASSERT_OK(DB::Open(options, dbname3, &db3_init));
+ ASSERT_OK(
+ db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3));
+ delete cf3;
+ delete db3_init;
+
+ column_families.clear();
+ column_families.push_back(ColumnFamilyDescriptor("default", cf_options));
+ column_families.push_back(
+ ColumnFamilyDescriptor("pikachu", ColumnFamilyOptions()));
+ handles.clear();
+
+ DB* db3 = nullptr;
+ ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3));
+
+ env_->SleepForMicroseconds(100);
+ // Verify that the keys don't already exist
+ ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound());
+ ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound());
+
+ // The tracer will not record the READ ops.
+ trace_opts.filter = TraceFilterType::kTraceFilterGet;
+ std::string trace_filename3 = dbname_ + "/rocksdb.trace_3";
+ std::unique_ptr<TraceWriter> trace_writer3;
+ ASSERT_OK(
+ NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3));
+ ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3)));
+
+ ASSERT_OK(db3->Put(wo, handles[0], "a", "1"));
+ ASSERT_OK(db3->Merge(wo, handles[0], "b", "2"));
+ ASSERT_OK(db3->Delete(wo, handles[0], "c"));
+ ASSERT_OK(db3->SingleDelete(wo, handles[0], "d"));
+
+ ASSERT_OK(db3->Get(ro, handles[0], "a", &value));
+ ASSERT_EQ(value, "1");
+ ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound());
+
+ ASSERT_OK(db3->EndTrace());
+
+ for (auto handle : handles) {
+ delete handle;
+ }
+ delete db3;
+ ASSERT_OK(DestroyDB(dbname3, options));
+
+ std::unique_ptr<TraceReader> trace_reader3;
+ ASSERT_OK(
+ NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3));
+
+  // Count the number of records in the trace file.
+ int count = 0;
+ std::string data;
+ Status s;
+ while (true) {
+ s = trace_reader3->Read(&data);
+ if (!s.ok()) {
+ break;
+ }
+ count += 1;
+ }
+ // We also need to count the header and footer
+ // 4 WRITE + HEADER + FOOTER = 6
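+  // (The 4 writes are the Put/Merge/Delete/SingleDelete issued on db3 above;
+  // the two Gets were dropped by kTraceFilterGet.)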
+ ASSERT_EQ(count, 6);
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, PinnableSliceAndMmapReads) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ if (!IsMemoryMappedAccessSupported()) {
+ ROCKSDB_GTEST_SKIP("Test requires default environment");
+ return;
+ }
+ options.allow_mmap_reads = true;
+ options.max_open_files = 100;
+ options.compression = kNoCompression;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+ PinnableSlice pinned_value;
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+ // It is not safe to pin mmap files as they might disappear by compaction
+ ASSERT_FALSE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+ ASSERT_OK(dbfull()->TEST_CompactRange(
+ 0 /* level */, nullptr /* begin */, nullptr /* end */,
+ nullptr /* column_family */, true /* disallow_trivial_move */));
+
+ // Ensure pinned_value doesn't rely on memory munmap'd by the above
+ // compaction. It crashes if it does.
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+#ifndef ROCKSDB_LITE
+ pinned_value.Reset();
+ // Unsafe to pin mmap files when they could be kicked out of table cache
+ Close();
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+ ASSERT_FALSE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+
+ pinned_value.Reset();
+  // In read-only mode with infinite table cache capacity, it should pin the
+  // value and avoid the memcpy.
+ Close();
+ options.max_open_files = -1;
+ ASSERT_OK(ReadOnlyReopen(options));
+ ASSERT_EQ(Get("foo", &pinned_value), Status::OK());
+ ASSERT_TRUE(pinned_value.IsPinned());
+ ASSERT_EQ(pinned_value.ToString(), "bar");
+#endif
+}
+
+TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ BlockBasedTableOptions bbto;
+ bbto.no_block_cache = false;
+ bbto.cache_index_and_filter_blocks = false;
+ bbto.block_cache = NewLRUCache(100000);
+ bbto.block_size = 400; // small block size
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ Reopen(options);
+
+ Random rnd(301);
+ std::string v = rnd.RandomString(400);
+
+ // Since v is the size of a block, each key should take a block
+ // of 400+ bytes.
+ ASSERT_OK(Put("1", v));
+ ASSERT_OK(Put("3", v));
+ ASSERT_OK(Put("5", v));
+ ASSERT_OK(Put("7", v));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
+
+  // Verify that iterators don't pin more than one data block in the block
+  // cache at a time.
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+ iter->SeekToFirst();
+
+ for (int i = 0; i < 4; i++) {
+ ASSERT_TRUE(iter->Valid());
+ // Block cache should contain exactly one block.
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+ iter->Next();
+ }
+ ASSERT_FALSE(iter->Valid());
+
+ iter->Seek("4");
+ ASSERT_TRUE(iter->Valid());
+
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+
+ iter->Seek("3");
+ ASSERT_TRUE(iter->Valid());
+
+ ASSERT_OK(iter->status());
+
+ ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0);
+ ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800);
+ }
+ ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage());
+
+ // Test compaction case
+ ASSERT_OK(Put("2", v));
+ ASSERT_OK(Put("5", v));
+ ASSERT_OK(Put("6", v));
+ ASSERT_OK(Put("8", v));
+ ASSERT_OK(Flush());
+
+ // Clear existing data in block cache
+ bbto.block_cache->SetCapacity(0);
+ bbto.block_cache->SetCapacity(100000);
+
+  // Verify that compaction input iterators don't hold more than one data
+  // block at a time.
+ std::atomic<bool> finished(false);
+ std::atomic<int> block_newed(0);
+ std::atomic<int> block_destroyed(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Block::Block:0", [&](void* /*arg*/) {
+ if (finished) {
+ return;
+ }
+ // Two iterators. At most 2 outstanding blocks.
+ EXPECT_GE(block_newed.load(), block_destroyed.load());
+ EXPECT_LE(block_newed.load(), block_destroyed.load() + 1);
+ block_newed.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "Block::~Block", [&](void* /*arg*/) {
+ if (finished) {
+ return;
+ }
+ // Two iterators. At most 2 outstanding blocks.
+ EXPECT_GE(block_newed.load(), block_destroyed.load() + 1);
+ EXPECT_LE(block_newed.load(), block_destroyed.load() + 2);
+ block_destroyed.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run:BeforeVerify",
+ [&](void* /*arg*/) { finished = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Two input files. Each of them has 4 data blocks.
+ ASSERT_EQ(8, block_newed.load());
+ ASSERT_EQ(8, block_destroyed.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, TestBBTTailPrefetch) {
+ std::atomic<bool> called(false);
+ size_t expected_lower_bound = 512 * 1024;
+ size_t expected_higher_bound = 512 * 1024;
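+  // Bounds for the tail prefetch size reported by BlockBasedTable::Open. The
+  // first table open below has no prefetch history, so the test expects the
+  // fixed 512KB default; later opens are expected to shrink toward the small
+  // tails actually observed (0..8KB for these tiny files).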
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+ size_t* prefetch_size = static_cast<size_t*>(arg);
+ EXPECT_LE(expected_lower_bound, *prefetch_size);
+ EXPECT_GE(expected_higher_bound, *prefetch_size);
+ called = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put("1", "1"));
+ ASSERT_OK(Put("9", "1"));
+ ASSERT_OK(Flush());
+
+ expected_lower_bound = 0;
+ expected_higher_bound = 8 * 1024;
+
+ ASSERT_OK(Put("1", "1"));
+ ASSERT_OK(Put("9", "1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("1", "1"));
+ ASSERT_OK(Put("9", "1"));
+ ASSERT_OK(Flush());
+
+ // Full compaction to make sure there is no L0 file after the open.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_TRUE(called.load());
+ called = false;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ std::atomic<bool> first_call(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) {
+ size_t* prefetch_size = static_cast<size_t*>(arg);
+ if (first_call) {
+ EXPECT_EQ(4 * 1024, *prefetch_size);
+ first_call = false;
+ } else {
+ EXPECT_GE(4 * 1024, *prefetch_size);
+ }
+ called = true;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.max_file_opening_threads = 1; // one thread
+ BlockBasedTableOptions table_options;
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.max_open_files = -1;
+ Reopen(options);
+
+ ASSERT_OK(Put("1", "1"));
+ ASSERT_OK(Put("9", "1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("1", "1"));
+ ASSERT_OK(Put("9", "1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_TRUE(called.load());
+ called = false;
+
+ // Parallel loading SST files
+ options.max_file_opening_threads = 16;
+ Reopen(options);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_TRUE(called.load());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) {
+ // Setup sync point dependency to reproduce the race condition of
+ // DBImpl::GetColumnFamilyHandleUnlocked
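+  // Enforced ordering: thread 1 looks up handle 0, then thread 2 looks up
+  // handle 1, and only then does thread 1 re-read the ID of the handle it was
+  // given. If the unlocked lookup handed out state that the second lookup
+  // overwrote, the final check would observe the wrong ID.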
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1",
+ "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"},
+ {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2",
+ "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateColumnFamilies({"test1", "test2"}, Options());
+ ASSERT_EQ(handles_.size(), 2);
+
+ DBImpl* dbi = static_cast_with_check<DBImpl>(db_);
+ port::Thread user_thread1([&]() {
+ auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID());
+ ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+ TEST_SYNC_POINT(
+ "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1");
+ TEST_SYNC_POINT(
+ "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1");
+ ASSERT_EQ(cfh->GetID(), handles_[0]->GetID());
+ });
+
+ port::Thread user_thread2([&]() {
+ TEST_SYNC_POINT(
+ "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2");
+ auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID());
+ ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+ TEST_SYNC_POINT(
+ "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2");
+ ASSERT_EQ(cfh->GetID(), handles_[1]->GetID());
+ });
+
+ user_thread1.join();
+ user_thread2.join();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, TestCompactFiles) {
+  // Setup sync point dependency to reproduce the race condition between
+  // CompactFiles() and IngestExternalFile().
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"TestCompactFiles::IngestExternalFile1",
+ "TestCompactFiles::IngestExternalFile2"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options;
+ options.env = env_;
+ options.num_levels = 2;
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ auto* handle = db_->DefaultColumnFamily();
+ ASSERT_EQ(db_->NumberLevels(handle), 2);
+
+ ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{
+ ROCKSDB_NAMESPACE::EnvOptions(), options};
+ std::string external_file1 = dbname_ + "/test_compact_files1.sst_t";
+ std::string external_file2 = dbname_ + "/test_compact_files2.sst_t";
+ std::string external_file3 = dbname_ + "/test_compact_files3.sst_t";
+
+ ASSERT_OK(sst_file_writer.Open(external_file1));
+ ASSERT_OK(sst_file_writer.Put("1", "1"));
+ ASSERT_OK(sst_file_writer.Put("2", "2"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(sst_file_writer.Open(external_file2));
+ ASSERT_OK(sst_file_writer.Put("3", "3"));
+ ASSERT_OK(sst_file_writer.Put("4", "4"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(sst_file_writer.Open(external_file3));
+ ASSERT_OK(sst_file_writer.Put("5", "5"));
+ ASSERT_OK(sst_file_writer.Put("6", "6"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3},
+ IngestExternalFileOptions()));
+ ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2);
+ std::vector<std::string> files;
+ GetSstFiles(env_, dbname_, &files);
+ ASSERT_EQ(files.size(), 2);
+
+ Status user_thread1_status;
+ port::Thread user_thread1([&]() {
+ user_thread1_status =
+ db_->CompactFiles(CompactionOptions(), handle, files, 1);
+ });
+
+ Status user_thread2_status;
+ port::Thread user_thread2([&]() {
+ user_thread2_status = db_->IngestExternalFile(handle, {external_file2},
+ IngestExternalFileOptions());
+ TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1");
+ });
+
+ user_thread1.join();
+ user_thread2.join();
+
+ ASSERT_OK(user_thread1_status);
+ ASSERT_OK(user_thread2_status);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // ROCKSDB_LITE
+
+TEST_F(DBTest2, MultiDBParallelOpenTest) {
+ const int kNumDbs = 2;
+ Options options = CurrentOptions();
+ std::vector<std::string> dbnames;
+ for (int i = 0; i < kNumDbs; ++i) {
+ dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + std::to_string(i)));
+ ASSERT_OK(DestroyDB(dbnames.back(), options));
+ }
+
+ // Verify empty DBs can be created in parallel
+ std::vector<std::thread> open_threads;
+ std::vector<DB*> dbs{static_cast<unsigned int>(kNumDbs), nullptr};
+ options.create_if_missing = true;
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads.emplace_back(
+ [&](int dbnum) {
+ ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+ },
+ i);
+ }
+
+ // Now add some data and close, so next we can verify non-empty DBs can be
+ // recovered in parallel
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads[i].join();
+ ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua"));
+ delete dbs[i];
+ }
+
+ // Verify non-empty DBs can be recovered in parallel
+ open_threads.clear();
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads.emplace_back(
+ [&](int dbnum) {
+ ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum]));
+ },
+ i);
+ }
+
+ // Wait and cleanup
+ for (int i = 0; i < kNumDbs; ++i) {
+ open_threads[i].join();
+ delete dbs[i];
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ }
+}
+
+namespace {
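+// A Statistics implementation that counts calls through the old virtual
+// interface (recordTick() / measureTime()); OldStatsInterface below checks
+// that DB operations still reach these entry points.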
+class DummyOldStats : public Statistics {
+ public:
+ const char* Name() const override { return "DummyOldStats"; }
+ uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; }
+ void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override {
+ num_rt++;
+ }
+ void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {}
+ uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override {
+ return 0;
+ }
+ void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override {
+ num_mt++;
+ }
+ void histogramData(
+ uint32_t /*histogram_type*/,
+ ROCKSDB_NAMESPACE::HistogramData* const /*data*/) const override {}
+ std::string getHistogramString(uint32_t /*type*/) const override {
+ return "";
+ }
+ bool HistEnabledForType(uint32_t /*type*/) const override { return false; }
+ std::string ToString() const override { return ""; }
+ std::atomic<int> num_rt{0};
+ std::atomic<int> num_mt{0};
+};
+} // anonymous namespace
+
+TEST_F(DBTest2, OldStatsInterface) {
+ DummyOldStats* dos = new DummyOldStats();
+ std::shared_ptr<Statistics> stats(dos);
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.statistics = stats;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_EQ("bar", Get("foo"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("foo"));
+
+ ASSERT_GT(dos->num_rt, 0);
+ ASSERT_GT(dos->num_mt, 0);
+}
+
+TEST_F(DBTest2, CloseWithUnreleasedSnapshot) {
+ const Snapshot* ss = db_->GetSnapshot();
+
+ for (auto h : handles_) {
+ db_->DestroyColumnFamilyHandle(h);
+ }
+ handles_.clear();
+
+ ASSERT_NOK(db_->Close());
+ db_->ReleaseSnapshot(ss);
+ ASSERT_OK(db_->Close());
+ delete db_;
+ db_ = nullptr;
+}
+
+TEST_F(DBTest2, PrefixBloomReseek) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ // Construct two L1 files with keys:
+ // f1:[aaa1 ccc1] f2:[ddd0]
+ ASSERT_OK(Put("aaa1", ""));
+ ASSERT_OK(Put("ccc1", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ddd0", ""));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_OK(Put("bbb1", ""));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+
+  // Seeking into f1, the iterator checks the bloom filter, which causes the
+  // file iterator to be invalidated, so the cursor is placed into f2, with
+  // the next key being "ddd0".
+ iter->Seek("bbb1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("bbb1", iter->key().ToString());
+
+  // Reseek to ccc1; the L1 iterator needs to go back to f1 and reseek.
+ iter->Seek("ccc1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("ccc1", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_F(DBTest2, PrefixBloomFilteredOut) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ // Construct two L1 files with keys:
+ // f1:[aaa1 ccc1] f2:[ddd0]
+ ASSERT_OK(Put("aaa1", ""));
+ ASSERT_OK(Put("ccc1", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("ddd0", ""));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+
+  // The seek key is filtered out by f1's bloom filter.
+  // This is just one of several valid positions following the contract.
+  // Positioning to ccc1 or ddd0 is also valid. This is just to validate
+  // the behavior of the current implementation. If the underlying
+  // implementation changes, the test might fail here.
+ iter->Seek("bbb1");
+ ASSERT_OK(iter->status());
+ ASSERT_FALSE(iter->Valid());
+
+ delete iter;
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, RowCacheSnapshot) {
+ Options options = CurrentOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.row_cache = NewLRUCache(8 * 8192);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "bar1"));
+
+ const Snapshot* s1 = db_->GetSnapshot();
+
+ ASSERT_OK(Put("foo", "bar2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("foo2", "bar"));
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_OK(Put("foo3", "bar"));
+ const Snapshot* s3 = db_->GetSnapshot();
+
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0);
+ ASSERT_EQ(Get("foo"), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo"), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1);
+ ASSERT_EQ(Get("foo", s1), "bar1");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s2), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s1), "bar1");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+ ASSERT_EQ(Get("foo", s3), "bar2");
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4);
+ ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2);
+
+ db_->ReleaseSnapshot(s1);
+ db_->ReleaseSnapshot(s2);
+ db_->ReleaseSnapshot(s3);
+}
+#endif // ROCKSDB_LITE
+
+// When the DB is reopened with multiple column families, the manifest file
+// is written after the first CF is flushed, and again after each subsequent
+// flush. If the DB crashes between those flushes, the already-flushed CF has
+// advanced past the latest log file, and recovery then requires that log not
+// to be corrupted, otherwise it triggers a corruption report.
+// We need to fix the bug and enable the test.
+TEST_F(DBTest2, CrashInRecoveryMultipleCF) {
+ const std::vector<std::string> sync_points = {
+ "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable",
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"};
+ for (const auto& test_sync_point : sync_points) {
+ Options options = CurrentOptions();
+ // First destroy original db to ensure a clean start.
+ DestroyAndReopen(options);
+ options.create_if_missing = true;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(1, "foo", "bar"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put(1, "foo", "bar"));
+ // The value is large enough to be split across two blocks.
+ std::string large_value(400, ' ');
+ ASSERT_OK(Put("foo1", large_value));
+ ASSERT_OK(Put("foo2", large_value));
+ Close();
+
+ // Corrupt the log file in the middle, so that the corruption is not in
+ // the tail.
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ for (const auto& f : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) {
+ std::string fname = dbname_ + "/" + f;
+ std::string file_content;
+ ASSERT_OK(ReadFileToString(env_, fname, &file_content));
+ file_content[400] = 'h';
+ file_content[401] = 'a';
+ ASSERT_OK(WriteStringToFile(env_, file_content, fname));
+ break;
+ }
+ }
+
+ // Reopen and freeze the file system after the first manifest write.
+ FaultInjectionTestEnv fit_env(options.env);
+ options.env = &fit_env;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ test_sync_point,
+ [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_NOK(TryReopenWithColumnFamilies(
+ {kDefaultColumnFamilyName, "pikachu"}, options));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ fit_env.SetFilesystemActive(true);
+ // If we keep using the fault-injection Env, it complains about something
+ // when renaming the CURRENT file, which is not expected. Need to
+ // investigate why.
+ options.env = env_;
+ ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+ options));
+ }
+}
+
+TEST_F(DBTest2, SeekFileRangeDeleteTail) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewCappedPrefixTransform(1));
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a", "a"));
+ const Snapshot* s1 = db_->GetSnapshot();
+ ASSERT_OK(
+ db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f"));
+ ASSERT_OK(Put("b", "a"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("x", "a"));
+ ASSERT_OK(Put("z", "a"));
+ ASSERT_OK(Flush());
+
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ {
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ro));
+ ASSERT_OK(iter->status());
+ iter->Seek("e");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("x", iter->key().ToString());
+ }
+ db_->ReleaseSnapshot(s1);
+}
+
+TEST_F(DBTest2, BackgroundPurgeTest) {
+ Options options = CurrentOptions();
+ options.write_buffer_manager =
+ std::make_shared<ROCKSDB_NAMESPACE::WriteBufferManager>(1 << 20);
+ options.avoid_unnecessary_blocking_io = true;
+ DestroyAndReopen(options);
+ size_t base_value = options.write_buffer_manager->memory_usage();
+
+ ASSERT_OK(Put("a", "a"));
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ ASSERT_OK(iter->status());
+ ASSERT_OK(Flush());
+ size_t value = options.write_buffer_manager->memory_usage();
+ ASSERT_GT(value, base_value);
+
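+ // With avoid_unnecessary_blocking_io set, deleting the iterator defers the
+ // memtable/superversion cleanup to a background purge on the HIGH-priority
+ // pool; occupy that single thread with a sleeping task so the purge cannot
+ // run yet.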
+ db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH);
+ test::SleepingBackgroundTask sleeping_task_after;
+ db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+ delete iter;
+
+ Env::Default()->SleepForMicroseconds(100000);
+ value = options.write_buffer_manager->memory_usage();
+ ASSERT_GT(value, base_value);
+
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+
+ test::SleepingBackgroundTask sleeping_task_after2;
+ db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after2, Env::Priority::HIGH);
+ sleeping_task_after2.WakeUp();
+ sleeping_task_after2.WaitUntilDone();
+
+ value = options.write_buffer_manager->memory_usage();
+ ASSERT_EQ(base_value, value);
+}
+
+TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
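+ // A tiny max_manifest_file_size forces a new manifest file to be created on
+ // almost every version edit, so the concurrent default-CF flush can race
+ // with the creation of a new manifest.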
+ options.max_manifest_file_size = 10;
+ options.create_if_missing = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(2, handles_.size());
+
+ ASSERT_OK(Put("foo", "value"));
+ const int kL0Files = options.level0_file_num_compaction_trigger;
+ for (int i = 0; i < kL0Files; ++i) {
+ ASSERT_OK(Put(/*cf=*/1, "a", std::to_string(i)));
+ ASSERT_OK(Flush(/*cf=*/1));
+ }
+
+ port::Thread thread([&]() { ASSERT_OK(Flush()); });
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ thread.join();
+}
+
+TEST_F(DBTest2, SameSmallestInSameLevel) {
+ // This test validates the fractional cascading logic when several files at
+ // one level contain only the same user key.
+ Options options = CurrentOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("key", "1"));
+ ASSERT_OK(Put("key", "2"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "3"));
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "4"));
+ ASSERT_OK(Flush());
+ CompactRangeOptions cro;
+ cro.change_level = true;
+ cro.target_level = 2;
+ ASSERT_OK(dbfull()->CompactRange(cro, db_->DefaultColumnFamily(), nullptr,
+ nullptr));
+
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "5"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "6"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "7"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->Merge(WriteOptions(), "key", "8"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+#ifndef ROCKSDB_LITE
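+ // Expect four L1 files, each containing only the user key "key", plus the
+ // single L2 file created by the earlier CompactRange.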
+ ASSERT_EQ("0,4,1", FilesPerLevel());
+#endif // ROCKSDB_LITE
+
+ ASSERT_EQ("2,3,4,5,6,7,8", Get("key"));
+}
+
+TEST_F(DBTest2, FileConsistencyCheckInOpen) {
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
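+ // Inject a failure into the version builder's consistency check so that
+ // reopening with force_consistency_checks must fail.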
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) {
+ Status* ret_s = static_cast<Status*>(arg);
+ *ret_s = Status::Corruption("fcc");
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.force_consistency_checks = true;
+ ASSERT_NOK(TryReopen(options));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.block_size = 300;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+ Reopen(options);
+
+ Random rnd(301);
+ std::string large_value = rnd.RandomString(500);
+
+ ASSERT_OK(Put("a1", large_value));
+ ASSERT_OK(Put("x1", large_value));
+ ASSERT_OK(Put("y1", large_value));
+ ASSERT_OK(Flush());
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ ASSERT_OK(iterator->status());
+ iterator->SeekForPrev("x3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+
+ iterator->SeekForPrev("a3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+
+ iterator->SeekForPrev("y3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+
+ // Query more than one non-existing prefix to cover both the empty hash
+ // bucket case and the hash bucket conflict case.
+ iterator->SeekForPrev("b1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("c1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("d1");
+ // Result should be not valid or "a1".
+ if (iterator->Valid()) {
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ iterator->SeekForPrev("y3");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+ }
+}
+
+TEST_F(DBTest2, PartitionedIndexPrefetchFailure) {
+ Options options = last_options_;
+ options.env = env_;
+ options.max_open_files = 20;
+ BlockBasedTableOptions bbto;
+ bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ bbto.metadata_block_size = 128;
+ bbto.block_size = 128;
+ bbto.block_cache = NewLRUCache(16777216);
+ bbto.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ // Force no table cache so every read will preload the SST file.
+ dbfull()->TEST_table_cache()->SetCapacity(0);
+ bbto.block_cache->SetCapacity(0);
+
+ Random rnd(301);
+ for (int i = 0; i < 4096; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(32)));
+ }
+ ASSERT_OK(Flush());
+
+ // Try different random failures in table open for 300 times.
+ for (int i = 0; i < 300; i++) {
+ env_->num_reads_fails_ = 0;
+ env_->rand_reads_fail_odd_ = 8;
+
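+ // The test Env now fails roughly one in eight random reads, so Get() is
+ // expected to fail exactly when a read failure was injected.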
+ std::string value;
+ Status s = dbfull()->Get(ReadOptions(), Key(1), &value);
+ if (env_->num_reads_fails_ > 0) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ }
+
+ env_->rand_reads_fail_odd_ = 0;
+}
+
+TEST_F(DBTest2, ChangePrefixExtractor) {
+ for (bool use_partitioned_filter : {true, false}) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+
+ // Sometimes the filter is checked based on the upper bound; assert the
+ // counters for that case. Otherwise, only check data correctness.
+#ifndef ROCKSDB_LITE
+ bool expect_filter_check = !use_partitioned_filter;
+#else
+ bool expect_filter_check = false;
+#endif
+ table_options.partition_filters = use_partitioned_filter;
+ if (use_partitioned_filter) {
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ }
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.statistics = CreateDBStatistics();
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+
+ ASSERT_OK(Put("aa", ""));
+ ASSERT_OK(Put("xb", ""));
+ ASSERT_OK(Put("xx1", ""));
+ ASSERT_OK(Put("xz1", ""));
+ ASSERT_OK(Put("zz", ""));
+ ASSERT_OK(Flush());
+
+ // After reopening the DB with the prefix size changed from 2 to 1, the
+ // prefix extractor won't take effect unless using it cannot change the
+ // result, given the upper bound and the seek key.
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ Reopen(options);
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+ ASSERT_OK(iterator->status());
+ iterator->Seek("xa");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ // It's a bug that the counter BLOOM_FILTER_PREFIX_CHECKED is not correct
+ // in the partitioned-filter case, so counters are only checked when
+ // expect_filter_check is true.
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xz");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xz1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ std::string ub_str = "xg9";
+ Slice ub(ub_str);
+ ReadOptions ro;
+ ro.iterate_upper_bound = &ub;
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ ASSERT_OK(iterator->status());
+
+ // SeekForPrev() never uses prefix bloom if it is changed.
+ iterator->SeekForPrev("xg0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ ub_str = "xx9";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ ASSERT_OK(iterator->status());
+
+ iterator->Seek("x");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(0, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(1, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ }
+
+ CompactRangeOptions compact_range_opts;
+ compact_range_opts.bottommost_level_compaction =
+ BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+ ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr));
+
+ // Re-execute similar queries after a full compaction
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ReadOptions()));
+
+ iterator->Seek("x");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(2, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xg");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(3, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xz");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xz1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(4, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ ASSERT_OK(iterator->status());
+ }
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+ iterator->SeekForPrev("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(5, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ iterator->Seek("xx0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xx1", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(6, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+
+ ASSERT_OK(iterator->status());
+ }
+
+ ub_str = "xg9";
+ ub = Slice(ub_str);
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->SeekForPrev("xg0");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("xb", iterator->key().ToString());
+ if (expect_filter_check) {
+ ASSERT_EQ(7, TestGetTickerCount(options, BLOOM_FILTER_PREFIX_CHECKED));
+ }
+ ASSERT_OK(iterator->status());
+ }
+ }
+}
+
+TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) {
+ // create a DB with block prefix index
+ BlockBasedTableOptions table_options;
+ Options options = CurrentOptions();
+ table_options.block_size = 300;
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ table_options.index_shortening =
+ BlockBasedTableOptions::IndexShorteningMode::kNoShortening;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.level0_file_num_compaction_trigger = 8;
+
+ Reopen(options);
+
+ ASSERT_OK(Put("b1", "ok"));
+ ASSERT_OK(Flush());
+
+ // Flush several files so that the chance that the hash bucket for "b" is
+ // empty in at least one of the files is high.
+ ASSERT_OK(Put("a1", ""));
+ ASSERT_OK(Put("c1", ""));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("a2", ""));
+ ASSERT_OK(Put("c2", ""));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("a3", ""));
+ ASSERT_OK(Put("c3", ""));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("a4", ""));
+ ASSERT_OK(Put("c4", ""));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("a5", ""));
+ ASSERT_OK(Put("c5", ""));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("ok", Get("b1"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, AutoPrefixMode1) {
+ do {
+ // create a DB with block prefix index
+ Options options = CurrentOptions();
+ BlockBasedTableOptions table_options =
+ *options.table_factory->GetOptions<BlockBasedTableOptions>();
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.statistics = CreateDBStatistics();
+
+ Reopen(options);
+
+ Random rnd(301);
+ std::string large_value = rnd.RandomString(500);
+
+ ASSERT_OK(Put("a1", large_value));
+ ASSERT_OK(Put("x1", large_value));
+ ASSERT_OK(Put("y1", large_value));
+ ASSERT_OK(Flush());
+
+ ReadOptions ro;
+ ro.total_order_seek = false;
+ ro.auto_prefix_mode = true;
+
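+ // auto_prefix_mode lets the iterator use the prefix bloom filter only when
+ // doing so cannot change the result, given the iterate_upper_bound.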
+ const auto stat = BLOOM_FILTER_PREFIX_CHECKED;
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+
+ Slice ub;
+ ro.iterate_upper_bound = &ub;
+
+ ub = "b9";
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+
+ ub = "z";
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+
+ ub = "c";
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+
+ ub = "c1";
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+
+ // The same queries without recreating iterator
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+ ub = "b9";
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+
+ ub = "z";
+ iterator->Seek("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("x1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "c";
+ iterator->Seek("b1");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+
+ ub = "b9";
+ iterator->SeekForPrev("b1");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "zz";
+ iterator->SeekToLast();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+
+ iterator->SeekToFirst();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+ }
+
+ // Similar, now with reverse comparator
+ // Technically, we are violating axiom 2 of prefix_extractors, but
+ // it should be revised because of major use-cases using
+ // ReverseBytewiseComparator with capped/fixed prefix Seek. (FIXME)
+ options.comparator = ReverseBytewiseComparator();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("a1", large_value));
+ ASSERT_OK(Put("x1", large_value));
+ ASSERT_OK(Put("y1", large_value));
+ ASSERT_OK(Flush());
+
+ {
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+
+ ub = "b1";
+ iterator->Seek("b9");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+
+ ub = "b1";
+ iterator->Seek("z");
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "b1";
+ iterator->Seek("c");
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "b";
+ iterator->Seek("c9");
+ ASSERT_FALSE(iterator->Valid());
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+ // is "correctly" implemented.
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "a";
+ iterator->Seek("b9");
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+ // is "correctly" implemented.
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "b";
+ iterator->Seek("a");
+ ASSERT_FALSE(iterator->Valid());
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+ // matches BytewiseComparator::IsSameLengthImmediateSuccessor. The upper
+ // bound comparing before the seek key prevents a real bug from surfacing.
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "b1";
+ iterator->SeekForPrev("b9");
+ ASSERT_TRUE(iterator->Valid());
+ // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor
+ // is "correctly" implemented.
+ ASSERT_EQ("x1", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+
+ ub = "a";
+ iterator->SeekToLast();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("a1", iterator->key().ToString());
+
+ iterator->SeekToFirst();
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("y1", iterator->key().ToString());
+ }
+
+ // Now something a bit different, related to "short" keys that
+ // auto_prefix_mode can omit. See "BUG" section of auto_prefix_mode.
+ options.comparator = BytewiseComparator();
+ for (const auto config : {"fixed:2", "capped:2"}) {
+ ASSERT_OK(SliceTransform::CreateFromString(ConfigOptions(), config,
+ &options.prefix_extractor));
+
+ // FIXME: kHashSearch, etc. requires all keys be InDomain
+ if (StartsWith(config, "fixed") &&
+ (table_options.index_type == BlockBasedTableOptions::kHashSearch ||
+ StartsWith(options.memtable_factory->Name(), "Hash"))) {
+ continue;
+ }
+ DestroyAndReopen(options);
+
+ const char* a_end_stuff = "a\xffXYZ";
+ const char* b_begin_stuff = "b\x00XYZ";
+ ASSERT_OK(Put("a", large_value));
+ ASSERT_OK(Put("b", large_value));
+ ASSERT_OK(Put(Slice(b_begin_stuff, 3), large_value));
+ ASSERT_OK(Put("c", large_value));
+ ASSERT_OK(Flush());
+
+ // Control case showing a valid optimization with auto_prefix_mode
+ ub = Slice(a_end_stuff, 4);
+ ro.iterate_upper_bound = &ub;
+
+ std::unique_ptr<Iterator> iterator(db_->NewIterator(ro));
+ iterator->Seek(Slice(a_end_stuff, 2));
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+
+ // Test case that cannot be validly optimized with auto_prefix_mode
+ ub = Slice(b_begin_stuff, 2);
+ ro.iterate_upper_bound = &ub;
+
+ iterator->Seek(Slice(a_end_stuff, 2));
+ // !!! BUG !!! See "BUG" section of auto_prefix_mode.
+ ASSERT_FALSE(iterator->Valid());
+ EXPECT_EQ(1, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+
+ // To prove that is the wrong result, now use total order seek
+ ReadOptions tos_ro = ro;
+ tos_ro.total_order_seek = true;
+ tos_ro.auto_prefix_mode = false;
+ iterator.reset(db_->NewIterator(tos_ro));
+ iterator->Seek(Slice(a_end_stuff, 2));
+ ASSERT_TRUE(iterator->Valid());
+ ASSERT_EQ("b", iterator->key().ToString());
+ EXPECT_EQ(0, TestGetAndResetTickerCount(options, stat));
+ ASSERT_OK(iterator->status());
+ }
+ } while (ChangeOptions(kSkipPlainTable));
+}
+
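+// Simulates failures while renaming the CURRENT file (as can happen on a
+// distributed file system) by injecting an IO error either before or after
+// the rename, depending on the test parameter.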
+class RenameCurrentTest : public DBTestBase,
+ public testing::WithParamInterface<std::string> {
+ public:
+ RenameCurrentTest()
+ : DBTestBase("rename_current_test", /*env_do_fsync=*/true),
+ sync_point_(GetParam()) {}
+
+ ~RenameCurrentTest() override {}
+
+ void SetUp() override {
+ env_->no_file_overwrite_.store(true, std::memory_order_release);
+ }
+
+ void TearDown() override {
+ env_->no_file_overwrite_.store(false, std::memory_order_release);
+ }
+
+ void SetupSyncPoints() {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) {
+ Status* s = reinterpret_cast<Status*>(arg);
+ assert(s);
+ *s = Status::IOError("Injected IO error.");
+ });
+ }
+
+ const std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest,
+ ::testing::Values("SetCurrentFile:BeforeRename",
+ "SetCurrentFile:AfterRename"));
+
+TEST_P(RenameCurrentTest, Open) {
+ Destroy(last_options_);
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ SetupSyncPoints();
+ SyncPoint::GetInstance()->EnableProcessing();
+ Status s = TryReopen(options);
+ ASSERT_NOK(s);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ Reopen(options);
+}
+
+TEST_P(RenameCurrentTest, Flush) {
+ Destroy(last_options_);
+ Options options = GetDefaultOptions();
+ options.max_manifest_file_size = 1;
+ options.create_if_missing = true;
+ Reopen(options);
+ ASSERT_OK(Put("key", "value"));
+ SetupSyncPoints();
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_NOK(Flush());
+
+ ASSERT_NOK(Put("foo", "value"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ Reopen(options);
+ ASSERT_EQ("value", Get("key"));
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+}
+
+TEST_P(RenameCurrentTest, Compaction) {
+ Destroy(last_options_);
+ Options options = GetDefaultOptions();
+ options.max_manifest_file_size = 1;
+ options.create_if_missing = true;
+ Reopen(options);
+ ASSERT_OK(Put("a", "a_value"));
+ ASSERT_OK(Put("c", "c_value"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("b", "b_value"));
+ ASSERT_OK(Put("d", "d_value"));
+ ASSERT_OK(Flush());
+
+ SetupSyncPoints();
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr,
+ /*end=*/nullptr));
+
+ ASSERT_NOK(Put("foo", "value"));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ Reopen(options);
+ ASSERT_EQ("NOT_FOUND", Get("foo"));
+ ASSERT_EQ("d_value", Get("d"));
+}
+
+TEST_F(DBTest2, LastLevelTemperature) {
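+ // Listener that records the temperature reported for each table file by
+ // file IO events, for comparison against the column family metadata.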
+ class TestListener : public EventListener {
+ public:
+ void OnFileReadFinish(const FileOperationInfo& info) override {
+ UpdateFileTemperature(info);
+ }
+
+ void OnFileWriteFinish(const FileOperationInfo& info) override {
+ UpdateFileTemperature(info);
+ }
+
+ void OnFileFlushFinish(const FileOperationInfo& info) override {
+ UpdateFileTemperature(info);
+ }
+
+ void OnFileSyncFinish(const FileOperationInfo& info) override {
+ UpdateFileTemperature(info);
+ }
+
+ void OnFileCloseFinish(const FileOperationInfo& info) override {
+ UpdateFileTemperature(info);
+ }
+
+ bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+ std::unordered_map<uint64_t, Temperature> file_temperatures;
+
+ private:
+ void UpdateFileTemperature(const FileOperationInfo& info) {
+ auto filename = GetFileName(info.path);
+ uint64_t number;
+ FileType type;
+ ASSERT_TRUE(ParseFileName(filename, &number, &type));
+ if (type == kTableFile) {
+ MutexLock l(&mutex_);
+ auto ret = file_temperatures.insert({number, info.temperature});
+ if (!ret.second) {
+ // the temperature of a given file should be the same across all events
+ ASSERT_TRUE(ret.first->second == info.temperature);
+ }
+ }
+ }
+
+ std::string GetFileName(const std::string& fname) {
+ auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
+ // Workaround for Windows only: the file path could contain both the
+ // Windows FilePathSeparator and '/'
+ filename = filename.substr(filename.find_last_of('/') + 1);
+ return filename;
+ }
+
+ port::Mutex mutex_;
+ };
+
+ const int kNumLevels = 7;
+ const int kLastLevel = kNumLevels - 1;
+
+ auto* listener = new TestListener();
+
+ Options options = CurrentOptions();
+ options.bottommost_temperature = Temperature::kWarm;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ options.listeners.emplace_back(listener);
+ Reopen(options);
+
+ auto size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ get_iostats_context()->Reset();
+ IOStatsContext* iostats = get_iostats_context();
+
+ ColumnFamilyMetaData metadata;
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ SstFileMetaData meta = metadata.levels[kLastLevel].files[0];
+ ASSERT_EQ(Temperature::kWarm, meta.temperature);
+ uint64_t number;
+ FileType type;
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+
+ ASSERT_EQ("bar", Get("foo"));
+
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+ ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+ // non-bottommost file still has unknown temperature
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("bar"));
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+ ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ meta = metadata.levels[0].files[0];
+ ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+ meta = metadata.levels[kLastLevel].files[0];
+ ASSERT_EQ(Temperature::kWarm, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ // reopen and check the information is persisted
+ Reopen(options);
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ meta = metadata.levels[0].files[0];
+ ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+ meta = metadata.levels[kLastLevel].files[0];
+ ASSERT_EQ(Temperature::kWarm, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ // check other non-exist temperatures
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+ std::string prop;
+ ASSERT_TRUE(dbfull()->GetProperty(
+ DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
+ &prop));
+ ASSERT_EQ(std::atoi(prop.c_str()), 0);
+
+ Reopen(options);
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ meta = metadata.levels[0].files[0];
+ ASSERT_EQ(Temperature::kUnknown, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+
+ meta = metadata.levels[kLastLevel].files[0];
+ ASSERT_EQ(Temperature::kWarm, meta.temperature);
+ ASSERT_TRUE(ParseFileName(meta.name, &number, &type));
+ ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature);
+}
+
+TEST_F(DBTest2, LastLevelTemperatureUniversal) {
+ const int kTriggerNum = 3;
+ const int kNumLevels = 5;
+ const int kBottommostLevel = kNumLevels - 1;
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = kTriggerNum;
+ options.num_levels = kNumLevels;
+ options.statistics = CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ auto size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ get_iostats_context()->Reset();
+ IOStatsContext* iostats = get_iostats_context();
+
+ for (int i = 0; i < kTriggerNum; i++) {
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ColumnFamilyMetaData metadata;
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown,
+ metadata.levels[kBottommostLevel].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+ ASSERT_EQ("bar", Get("foo"));
+
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+
+ // Update bottommost temperature
+ options.bottommost_temperature = Temperature::kWarm;
+ Reopen(options);
+ db_->GetColumnFamilyMetaData(&metadata);
+ // Should not impact existing ones
+ ASSERT_EQ(Temperature::kUnknown,
+ metadata.levels[kBottommostLevel].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+
+ // new generated file should have the new settings
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ ASSERT_EQ(Temperature::kWarm,
+ metadata.levels[kBottommostLevel].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0);
+ ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0);
+
+ // non-bottommost file still has unknown temperature
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ // check other non-exist temperatures
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+ std::string prop;
+ ASSERT_TRUE(dbfull()->GetProperty(
+ DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
+ &prop));
+ ASSERT_EQ(std::atoi(prop.c_str()), 0);
+
+ // Update bottommost temperature dynamically with SetOptions
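+ // ("last_level_temperature" is the newer name for this option, so reading
+ // back bottommost_temperature should reflect the update.)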
+ auto s = db_->SetOptions({{"last_level_temperature", "kCold"}});
+ ASSERT_OK(s);
+ ASSERT_EQ(db_->GetOptions().bottommost_temperature, Temperature::kCold);
+ db_->GetColumnFamilyMetaData(&metadata);
+ // Should not impact the existing files
+ ASSERT_EQ(Temperature::kWarm,
+ metadata.levels[kBottommostLevel].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+
+ // new generated files should have the new settings
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ ASSERT_EQ(Temperature::kCold,
+ metadata.levels[kBottommostLevel].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_GT(size, 0);
+
+ // kLastTemperature is an invalid temperature
+ options.bottommost_temperature = Temperature::kLastTemperature;
+ s = TryReopen(options);
+ ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBTest2, LastLevelStatistics) {
+ Options options = CurrentOptions();
+ options.bottommost_temperature = Temperature::kWarm;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.statistics = CreateDBStatistics();
+ Reopen(options);
+
+ // generate 1 sst on level 0
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("bar"));
+
+ ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), 0);
+ ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), 0);
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), 0);
+
+ // 2nd flush to trigger compaction
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ("bar", Get("bar"));
+
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES),
+ options.statistics->getTickerCount(WARM_FILE_READ_BYTES));
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
+ options.statistics->getTickerCount(WARM_FILE_READ_COUNT));
+
+ auto pre_bytes =
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES);
+ auto pre_count =
+ options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT);
+
+ // 3rd flush to generate 1 sst on level 0
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("bar", Get("bar"));
+
+ ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES),
+ pre_bytes);
+ ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT),
+ pre_count);
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES),
+ options.statistics->getTickerCount(WARM_FILE_READ_BYTES));
+ ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT),
+ options.statistics->getTickerCount(WARM_FILE_READ_COUNT));
+}
+
+TEST_F(DBTest2, CheckpointFileTemperature) {
+ class NoLinkTestFS : public FileTemperatureTestFS {
+ using FileTemperatureTestFS::FileTemperatureTestFS;
+
+ IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&,
+ IODebugContext*) override {
+ // Return NotSupported to force the checkpoint to copy the file instead
+ // of just linking it.
+ return IOStatus::NotSupported();
+ }
+ };
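+ // Wrap the default Env with a temperature-tracking FS that refuses hard
+ // links, so the checkpoint must copy the SST files and therefore request
+ // them with a source temperature hint.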
+ auto test_fs = std::make_shared<NoLinkTestFS>(env_->GetFileSystem());
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
+ Options options = CurrentOptions();
+ options.bottommost_temperature = Temperature::kWarm;
+ // Set level_compaction_dynamic_level_bytes to true so compaction writes
+ // the data directly to the last level, which will have the
+ // last_level_temperature
+ options.level_compaction_dynamic_level_bytes = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.env = env.get();
+ Reopen(options);
+
+ // generate a bottommost file and a non-bottommost file
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ auto size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ std::map<uint64_t, Temperature> temperatures;
+ std::vector<LiveFileStorageInfo> infos;
+ ASSERT_OK(
+ dbfull()->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &infos));
+ for (auto info : infos) {
+ temperatures.emplace(info.file_number, info.temperature);
+ }
+
+ test_fs->PopRequestedSstFileTemperatures();
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(
+ checkpoint->CreateCheckpoint(dbname_ + kFilePathSeparator + "tempcp"));
+
+ // Check the source-file temperature hints: 2 SST files, one kWarm and the
+ // other kUnknown.
+ std::vector<std::pair<uint64_t, Temperature>> requested_temps;
+ test_fs->PopRequestedSstFileTemperatures(&requested_temps);
+ // Two requests
+ ASSERT_EQ(requested_temps.size(), 2);
+ std::set<uint64_t> distinct_requests;
+ for (const auto& requested_temp : requested_temps) {
+ // Matching manifest temperatures
+ ASSERT_EQ(temperatures.at(requested_temp.first), requested_temp.second);
+ distinct_requests.insert(requested_temp.first);
+ }
+ // Each request to distinct file
+ ASSERT_EQ(distinct_requests.size(), requested_temps.size());
+
+ delete checkpoint;
+ Close();
+}
+
+TEST_F(DBTest2, FileTemperatureManifestFixup) {
+ auto test_fs = std::make_shared<FileTemperatureTestFS>(env_->GetFileSystem());
+ std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
+ Options options = CurrentOptions();
+ options.bottommost_temperature = Temperature::kWarm;
+ // Set level_compaction_dynamic_level_bytes to true so compaction writes
+ // the data directly to the last level, which will have the
+ // last_level_temperature
+ options.level_compaction_dynamic_level_bytes = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.env = env.get();
+ std::vector<std::string> cfs = {/*"default",*/ "test1", "test2"};
+ CreateAndReopenWithCF(cfs, options);
+ // Needed for later re-opens (weird)
+ cfs.insert(cfs.begin(), kDefaultColumnFamilyName);
+
+ // Generate a bottommost file in all CFs
+ for (int cf = 0; cf < 3; ++cf) {
+ ASSERT_OK(Put(cf, "a", "val"));
+ ASSERT_OK(Put(cf, "c", "val"));
+ ASSERT_OK(Flush(cf));
+ ASSERT_OK(Put(cf, "b", "val"));
+ ASSERT_OK(Put(cf, "d", "val"));
+ ASSERT_OK(Flush(cf));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // verify
+ ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+ // Generate a non-bottommost file in all CFs
+ for (int cf = 0; cf < 3; ++cf) {
+ ASSERT_OK(Put(cf, "e", "val"));
+ ASSERT_OK(Flush(cf));
+ }
+
+ // re-verify
+ ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
+ // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+ // Now change FS temperature on bottommost file(s) to kCold
+ std::map<uint64_t, Temperature> current_temps;
+ test_fs->CopyCurrentSstFileTemperatures(&current_temps);
+ for (auto e : current_temps) {
+ if (e.second == Temperature::kWarm) {
+ test_fs->OverrideSstFileTemperature(e.first, Temperature::kCold);
+ }
+ }
+ // Metadata not yet updated
+ ASSERT_EQ(Get("a"), "val");
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // Update with Close and UpdateManifestForFilesState, but first save cf
+ // descriptors
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ ColumnFamilyDescriptor cfdescriptor;
+ // GetDescriptor is not implemented for ROCKSDB_LITE
+ handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError();
+ column_families.push_back(cfdescriptor);
+ }
+ Close();
+ experimental::UpdateManifestForFilesStateOptions update_opts;
+ update_opts.update_temperatures = true;
+
+ ASSERT_OK(experimental::UpdateManifestForFilesState(
+ options, dbname_, column_families, update_opts));
+
+ // Re-open and re-verify after update
+ ReopenWithColumnFamilies(cfs, options);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);
+
+ // Change kUnknown to kHot
+ test_fs->CopyCurrentSstFileTemperatures(&current_temps);
+ for (auto e : current_temps) {
+ if (e.second == Temperature::kUnknown) {
+ test_fs->OverrideSstFileTemperature(e.first, Temperature::kHot);
+ }
+ }
+
+ // Update with Close and UpdateManifestForFilesState
+ Close();
+ ASSERT_OK(experimental::UpdateManifestForFilesState(
+ options, dbname_, column_families, update_opts));
+
+ // Re-open and re-verify after update
+ ReopenWithColumnFamilies(cfs, options);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kHot), 0);
+
+ Close();
+}
+#endif // ROCKSDB_LITE
+
+// WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery.
+TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value0"));
+ Close();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ bool should_inject_error = false;
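+ // Only start injecting read errors once recovery begins reading the WAL,
+ // so that earlier file reads during open succeed.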
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::RecoverLogFiles:BeforeReadWal",
+ [&](void* /*arg*/) { should_inject_error = true; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "LogReader::ReadMore:AfterReadFile", [&](void* arg) {
+ if (should_inject_error) {
+ ASSERT_NE(nullptr, arg);
+ *reinterpret_cast<Status*>(arg) = Status::IOError("Injected IOError");
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ options.avoid_flush_during_recovery = true;
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ Status s = TryReopen(options);
+ ASSERT_TRUE(s.IsIOError());
+}
+
+TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallFlush:Start:1",
+ "PointInTimeRecoveryWithSyncFailureInCFCreation:1"},
+ {"PointInTimeRecoveryWithSyncFailureInCFCreation:2",
+ "DBImpl::BackgroundCallFlush:Start:2"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateColumnFamilies({"test1"}, Options());
+ ASSERT_OK(Put("foo", "bar"));
+
+ // Create a CF while a flush is in progress; the log is synced, but the
+ // closed log file is not synced and is corrupted.
+ port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); });
+ TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1");
+ CreateColumnFamilies({"test2"}, Options());
+ env_->corrupt_in_sync_ = true;
+ TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2");
+ flush_thread.join();
+ env_->corrupt_in_sync_ = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // Reopening the DB should not corrupt anything
+ Options options = CurrentOptions();
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ ReopenWithColumnFamilies({"default", "test1", "test2"}, options);
+}
+
+TEST_F(DBTest2, RenameDirectory) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "value0"));
+ Close();
+ auto old_dbname = dbname_;
+ auto new_dbname = dbname_ + "_2";
+ EXPECT_OK(env_->RenameFile(dbname_, new_dbname));
+ options.create_if_missing = false;
+ dbname_ = new_dbname;
+ ASSERT_OK(TryReopen(options));
+ ASSERT_EQ("value0", Get("foo"));
+ Destroy(options);
+ dbname_ = old_dbname;
+}
+
+TEST_F(DBTest2, SstUniqueIdVerifyBackwardCompatible) {
+ const int kNumSst = 3;
+ const int kLevel0Trigger = 4;
+ auto options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ options.statistics = CreateDBStatistics();
+ // Skip for now
+ options.verify_sst_unique_id_in_manifest = false;
+ Reopen(options);
+
+ std::atomic_int skipped = 0;
+ std::atomic_int passed = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::SkippedVerifyUniqueId",
+ [&](void* /*arg*/) { skipped++; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::PassedVerifyUniqueId",
+ [&](void* /*arg*/) { passed++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // generate a few SSTs
+ for (int i = 0; i < kNumSst; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Verification has been skipped on files so far
+ EXPECT_EQ(skipped, kNumSst);
+ EXPECT_EQ(passed, 0);
+
+ // Reopen with verification
+ options.verify_sst_unique_id_in_manifest = true;
+ skipped = 0;
+ passed = 0;
+ Reopen(options);
+ EXPECT_EQ(skipped, 0);
+ EXPECT_EQ(passed, kNumSst);
+
+ // Now simulate there being no unique ID in the manifest for the next file.
+ // NOTE: this only affects loading the manifest from disk, not the
+ // in-memory manifest, so we need to re-open below.
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:UniqueId", [&](void* arg) {
+ auto unique_id = static_cast<UniqueId64x2*>(arg);
+ // remove id before writing it to manifest
+ (*unique_id)[0] = 0;
+ (*unique_id)[1] = 0;
+ });
+
+ // test compaction generated Sst
+ for (int i = kNumSst; i < kLevel0Trigger; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+
+ // Reopen (with verification)
+ ASSERT_TRUE(options.verify_sst_unique_id_in_manifest);
+ skipped = 0;
+ passed = 0;
+ Reopen(options);
+ EXPECT_EQ(skipped, 1);
+ EXPECT_EQ(passed, 0);
+}
+
+TEST_F(DBTest2, SstUniqueIdVerify) {
+ const int kNumSst = 3;
+ const int kLevel0Trigger = 4;
+ auto options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ // Allow mismatch for now
+ options.verify_sst_unique_id_in_manifest = false;
+ Reopen(options);
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+ auto props = static_cast<TableProperties*>(props_vs);
+ // update table property session_id to a different one, which
+ // changes unique ID
+ props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ // generate a few SSTs
+ for (int i = 0; i < kNumSst; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Reopen with verification should report corruption
+ options.verify_sst_unique_id_in_manifest = true;
+ auto s = TryReopen(options);
+ ASSERT_TRUE(s.IsCorruption());
+
+ // Reopen without verification should be fine
+ options.verify_sst_unique_id_in_manifest = false;
+ Reopen(options);
+
+  // Test an SST generated by compaction
+ for (int i = kNumSst; i < kLevel0Trigger; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+#ifndef ROCKSDB_LITE
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+#endif // ROCKSDB_LITE
+
+ // Reopen with verification should fail
+ options.verify_sst_unique_id_in_manifest = true;
+ s = TryReopen(options);
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBTest2, SstUniqueIdVerifyMultiCFs) {
+ const int kNumSst = 3;
+ const int kLevel0Trigger = 4;
+ auto options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = kLevel0Trigger;
+ // Allow mismatch for now
+ options.verify_sst_unique_id_in_manifest = false;
+
+ CreateAndReopenWithCF({"one", "two"}, options);
+
+ // generate good SSTs
+ for (int cf_num : {0, 2}) {
+ for (int i = 0; i < kNumSst; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(cf_num, Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush(cf_num));
+ }
+ }
+
+ // generate SSTs with bad unique id
+ SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+ auto props = static_cast<TableProperties*>(props_vs);
+ // update table property session_id to a different one
+ props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ for (int i = 0; i < kNumSst; i++) {
+ for (int j = 0; j < 100; j++) {
+ ASSERT_OK(Put(1, Key(i * 10 + j), "value"));
+ }
+ ASSERT_OK(Flush(1));
+ }
+
+ // Reopen with verification should report corruption
+ options.verify_sst_unique_id_in_manifest = true;
+ auto s = TryReopenWithColumnFamilies({"default", "one", "two"}, options);
+ ASSERT_TRUE(s.IsCorruption());
+}
+
+TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) {
+ const auto tamper_with_uniq_id = [&](void* arg) {
+ auto props = static_cast<TableProperties*>(arg);
+ assert(props);
+ // update table property session_id to a different one
+ props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
+ };
+
+ const auto assert_db = [&](size_t expected_count,
+ const std::string& expected_v) {
+ std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
+ size_t cnt = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next(), ++cnt) {
+ ASSERT_EQ(std::to_string(cnt), it->key());
+ ASSERT_EQ(expected_v, it->value());
+ }
+ ASSERT_EQ(expected_count, cnt);
+ };
+
+ const int num_l0_compaction_trigger = 8;
+ const int num_l0 = num_l0_compaction_trigger - 1;
+ Options options = CurrentOptions();
+ options.level0_file_num_compaction_trigger = num_l0_compaction_trigger;
+
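+  // For each k, give the k-th L0 file (and, since the callback stays
+  // installed, every later file) a tampered unique ID, then verify that
+  // best-efforts recovery rolls the DB back to the last file with a good ID.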
+ for (int k = 0; k < num_l0; ++k) {
+ // Allow mismatch for now
+ options.verify_sst_unique_id_in_manifest = false;
+
+ DestroyAndReopen(options);
+
+ constexpr size_t num_keys_per_file = 10;
+ for (int i = 0; i < num_l0; ++i) {
+ for (size_t j = 0; j < num_keys_per_file; ++j) {
+ ASSERT_OK(Put(std::to_string(j), "v" + std::to_string(i)));
+ }
+ if (i == k) {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start",
+ tamper_with_uniq_id);
+ SyncPoint::GetInstance()->EnableProcessing();
+ }
+ ASSERT_OK(Flush());
+ }
+
+ options.verify_sst_unique_id_in_manifest = true;
+ Status s = TryReopen(options);
+ ASSERT_TRUE(s.IsCorruption());
+
+ options.best_efforts_recovery = true;
+ Reopen(options);
+ assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
+
+ // Reopen with regular recovery
+ options.best_efforts_recovery = false;
+ Reopen(options);
+ assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ for (size_t i = 0; i < num_keys_per_file; ++i) {
+ ASSERT_OK(Put(std::to_string(i), "v"));
+ }
+ ASSERT_OK(Flush());
+ Reopen(options);
+ {
+ for (size_t i = 0; i < num_keys_per_file; ++i) {
+ ASSERT_EQ("v", Get(std::to_string(i)));
+ }
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBTest2, GetLatestSeqAndTsForKey) {
+ Destroy(last_options_);
+
+ Options options = CurrentOptions();
+ options.max_write_buffer_size_to_maintain = 64 << 10;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ options.statistics = CreateDBStatistics();
+
+ Reopen(options);
+
+ constexpr uint64_t kTsU64Value = 12;
+
+ for (uint64_t key = 0; key < 100; ++key) {
+ std::string ts;
+ PutFixed64(&ts, kTsU64Value);
+
+ std::string key_str;
+ PutFixed64(&key_str, key);
+ std::reverse(key_str.begin(), key_str.end());
+ ASSERT_OK(db_->Put(WriteOptions(), key_str, ts, "value"));
+ }
+
+ ASSERT_OK(Flush());
+
+ constexpr bool cache_only = true;
+ constexpr SequenceNumber lower_bound_seq = 0;
+ auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(
+ dbfull()->DefaultColumnFamily());
+ assert(cfhi);
+ assert(cfhi->cfd());
+ SuperVersion* sv = cfhi->cfd()->GetSuperVersion();
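+  // Thanks to max_write_buffer_size_to_maintain, the flushed memtable is
+  // kept in memory, so a cache-only lookup can still find each key and its
+  // timestamp without touching any SST file.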
+ for (uint64_t key = 0; key < 100; ++key) {
+ std::string key_str;
+ PutFixed64(&key_str, key);
+ std::reverse(key_str.begin(), key_str.end());
+ std::string ts;
+ SequenceNumber seq = kMaxSequenceNumber;
+ bool found_record_for_key = false;
+ bool is_blob_index = false;
+
+ const Status s = dbfull()->GetLatestSequenceForKey(
+ sv, key_str, cache_only, lower_bound_seq, &seq, &ts,
+ &found_record_for_key, &is_blob_index);
+ ASSERT_OK(s);
+ std::string expected_ts;
+ PutFixed64(&expected_ts, kTsU64Value);
+ ASSERT_EQ(expected_ts, ts);
+ ASSERT_TRUE(found_record_for_key);
+ ASSERT_FALSE(is_blob_index);
+ }
+
+  // Verify that there were no reads from SST files.
+ ASSERT_EQ(0, options.statistics->getTickerCount(GET_HIT_L0));
+}
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_test_util.cc b/src/rocksdb/db/db_test_util.cc
new file mode 100644
index 000000000..d53bca51a
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.cc
@@ -0,0 +1,1773 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+
+#include "cache/cache_reservation_manager.h"
+#include "db/forward_iterator.h"
+#include "env/mock_env.h"
+#include "port/lang.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/env_encryption.h"
+#include "rocksdb/unique_id.h"
+#include "rocksdb/utilities/object_registry.h"
+#include "table/format.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+int64_t MaybeCurrentTime(Env* env) {
+ int64_t time = 1337346000; // arbitrary fallback default
+ env->GetCurrentTime(&time).PermitUncheckedError();
+ return time;
+}
+} // anonymous namespace
+
+// Special Env used to delay background operations
+
+SpecialEnv::SpecialEnv(Env* base, bool time_elapse_only_sleep)
+ : EnvWrapper(base),
+ maybe_starting_time_(MaybeCurrentTime(base)),
+ rnd_(301),
+ sleep_counter_(this),
+ time_elapse_only_sleep_(time_elapse_only_sleep),
+ no_slowdown_(time_elapse_only_sleep) {
+ delay_sstable_sync_.store(false, std::memory_order_release);
+ drop_writes_.store(false, std::memory_order_release);
+ no_space_.store(false, std::memory_order_release);
+ non_writable_.store(false, std::memory_order_release);
+ count_random_reads_ = false;
+ count_sequential_reads_ = false;
+ manifest_sync_error_.store(false, std::memory_order_release);
+ manifest_write_error_.store(false, std::memory_order_release);
+ log_write_error_.store(false, std::memory_order_release);
+ no_file_overwrite_.store(false, std::memory_order_release);
+ random_file_open_counter_.store(0, std::memory_order_relaxed);
+ delete_count_.store(0, std::memory_order_relaxed);
+ num_open_wal_file_.store(0);
+ log_write_slowdown_ = 0;
+ bytes_written_ = 0;
+ sync_counter_ = 0;
+ non_writeable_rate_ = 0;
+ new_writable_count_ = 0;
+ non_writable_count_ = 0;
+ table_write_callback_ = nullptr;
+}
+
+DBTestBase::DBTestBase(const std::string path, bool env_do_fsync)
+ : mem_env_(nullptr), encrypted_env_(nullptr), option_config_(kDefault) {
+ Env* base_env = Env::Default();
+ ConfigOptions config_options;
+ EXPECT_OK(test::CreateEnvFromSystem(config_options, &base_env, &env_guard_));
+ EXPECT_NE(nullptr, base_env);
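+  // The MEM_ENV and ENCRYPTED_ENV environment variables optionally wrap the
+  // base Env with a mock in-memory Env and/or an encrypted Env.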
+ if (getenv("MEM_ENV")) {
+ mem_env_ = MockEnv::Create(base_env, base_env->GetSystemClock());
+ }
+#ifndef ROCKSDB_LITE
+ if (getenv("ENCRYPTED_ENV")) {
+ std::shared_ptr<EncryptionProvider> provider;
+ std::string provider_id = getenv("ENCRYPTED_ENV");
+ if (provider_id.find("=") == std::string::npos &&
+ !EndsWith(provider_id, "://test")) {
+ provider_id = provider_id + "://test";
+ }
+ EXPECT_OK(EncryptionProvider::CreateFromString(ConfigOptions(), provider_id,
+ &provider));
+ encrypted_env_ = NewEncryptedEnv(mem_env_ ? mem_env_ : base_env, provider);
+ }
+#endif // !ROCKSDB_LITE
+ env_ = new SpecialEnv(encrypted_env_ ? encrypted_env_
+ : (mem_env_ ? mem_env_ : base_env));
+ env_->SetBackgroundThreads(1, Env::LOW);
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ env_->skip_fsync_ = !env_do_fsync;
+ dbname_ = test::PerThreadDBPath(env_, path);
+ alternative_wal_dir_ = dbname_ + "/wal";
+ alternative_db_log_dir_ = dbname_ + "/db_log_dir";
+ auto options = CurrentOptions();
+ options.env = env_;
+ auto delete_options = options;
+ delete_options.wal_dir = alternative_wal_dir_;
+ EXPECT_OK(DestroyDB(dbname_, delete_options));
+  // Destroy again in case the alternative WAL dir was not used.
+ EXPECT_OK(DestroyDB(dbname_, options));
+ db_ = nullptr;
+ Reopen(options);
+ Random::GetTLSInstance()->Reset(0xdeadbeef);
+}
+
+DBTestBase::~DBTestBase() {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ Close();
+ Options options;
+ options.db_paths.emplace_back(dbname_, 0);
+ options.db_paths.emplace_back(dbname_ + "_2", 0);
+ options.db_paths.emplace_back(dbname_ + "_3", 0);
+ options.db_paths.emplace_back(dbname_ + "_4", 0);
+ options.env = env_;
+
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s\n", dbname_.c_str());
+ } else {
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+ delete env_;
+}
+
+bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) {
+#ifdef ROCKSDB_LITE
+ // These options are not supported in ROCKSDB_LITE
+ if (option_config == kHashSkipList ||
+ option_config == kPlainTableFirstBytePrefix ||
+ option_config == kPlainTableCappedPrefix ||
+ option_config == kPlainTableCappedPrefixNonMmap ||
+ option_config == kPlainTableAllBytesPrefix ||
+ option_config == kVectorRep || option_config == kHashLinkList ||
+ option_config == kUniversalCompaction ||
+ option_config == kUniversalCompactionMultiLevel ||
+ option_config == kUniversalSubcompactions ||
+ option_config == kFIFOCompaction ||
+ option_config == kConcurrentSkipList) {
+ return true;
+ }
+#endif
+
+ if ((skip_mask & kSkipUniversalCompaction) &&
+ (option_config == kUniversalCompaction ||
+ option_config == kUniversalCompactionMultiLevel ||
+ option_config == kUniversalSubcompactions)) {
+ return true;
+ }
+ if ((skip_mask & kSkipMergePut) && option_config == kMergePut) {
+ return true;
+ }
+ if ((skip_mask & kSkipNoSeekToLast) &&
+ (option_config == kHashLinkList || option_config == kHashSkipList)) {
+ return true;
+ }
+ if ((skip_mask & kSkipPlainTable) &&
+ (option_config == kPlainTableAllBytesPrefix ||
+ option_config == kPlainTableFirstBytePrefix ||
+ option_config == kPlainTableCappedPrefix ||
+ option_config == kPlainTableCappedPrefixNonMmap)) {
+ return true;
+ }
+ if ((skip_mask & kSkipHashIndex) &&
+ (option_config == kBlockBasedTableWithPrefixHashIndex ||
+ option_config == kBlockBasedTableWithWholeKeyHashIndex)) {
+ return true;
+ }
+ if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) {
+ return true;
+ }
+ if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) {
+ return true;
+ }
+ return false;
+}
+
+// Switch to a fresh database with the next option configuration to
+// test. Return false if there are no more configurations to test.
+bool DBTestBase::ChangeOptions(int skip_mask) {
+ for (option_config_++; option_config_ < kEnd; option_config_++) {
+ if (ShouldSkipOptions(option_config_, skip_mask)) {
+ continue;
+ }
+ break;
+ }
+
+ if (option_config_ >= kEnd) {
+ Destroy(last_options_);
+ return false;
+ } else {
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ return true;
+ }
+}
+
+// Switch between different compaction styles.
+bool DBTestBase::ChangeCompactOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kUniversalCompaction;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ Reopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompaction) {
+ option_config_ = kUniversalCompactionMultiLevel;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ Reopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompactionMultiLevel) {
+ option_config_ = kLevelSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ Reopen(options);
+ return true;
+ } else if (option_config_ == kLevelSubcompactions) {
+ option_config_ = kUniversalSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ Reopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Switch between different WAL settings
+bool DBTestBase::ChangeWalOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kDBLogDir;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+ return true;
+ } else if (option_config_ == kDBLogDir) {
+ option_config_ = kWalDirAndMmapReads;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+ return true;
+ } else if (option_config_ == kWalDirAndMmapReads) {
+ option_config_ = kRecycleLogFiles;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ Destroy(options);
+ Reopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Switch between different filter policies.
+// Jumps from kDefault to kFilter, then to the full-filter and
+// partitioned-filter configurations.
+bool DBTestBase::ChangeFilterOptions() {
+ if (option_config_ == kDefault) {
+ option_config_ = kFilter;
+ } else if (option_config_ == kFilter) {
+ option_config_ = kFullFilterWithNewTableReaderForCompactions;
+ } else if (option_config_ == kFullFilterWithNewTableReaderForCompactions) {
+ option_config_ = kPartitionedFilterWithNewTableReaderForCompactions;
+ } else {
+ return false;
+ }
+ Destroy(last_options_);
+
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+}
+
+// Switch between different DB options for file ingestion tests.
+bool DBTestBase::ChangeOptionsForFileIngestionTest() {
+ if (option_config_ == kDefault) {
+ option_config_ = kUniversalCompaction;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompaction) {
+ option_config_ = kUniversalCompactionMultiLevel;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ options.create_if_missing = true;
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalCompactionMultiLevel) {
+ option_config_ = kLevelSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kLevelSubcompactions) {
+ option_config_ = kUniversalSubcompactions;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ assert(options.max_subcompactions > 1);
+ TryReopen(options);
+ return true;
+ } else if (option_config_ == kUniversalSubcompactions) {
+ option_config_ = kDirectIO;
+ Destroy(last_options_);
+ auto options = CurrentOptions();
+ TryReopen(options);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+// Return the current option configuration.
+Options DBTestBase::CurrentOptions(
+ const anon::OptionsOverride& options_override) const {
+ return GetOptions(option_config_, GetDefaultOptions(), options_override);
+}
+
+Options DBTestBase::CurrentOptions(
+ const Options& default_options,
+ const anon::OptionsOverride& options_override) const {
+ return GetOptions(option_config_, default_options, options_override);
+}
+
+Options DBTestBase::GetDefaultOptions() const {
+ Options options;
+ options.write_buffer_size = 4090 * 4096;
+ options.target_file_size_base = 2 * 1024 * 1024;
+ options.max_bytes_for_level_base = 10 * 1024 * 1024;
+ options.max_open_files = 5000;
+ options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+ options.compaction_pri = CompactionPri::kByCompensatedSize;
+ options.env = env_;
+ if (!env_->skip_fsync_) {
+ options.track_and_verify_wals_in_manifest = true;
+ }
+ return options;
+}
+
+Options DBTestBase::GetOptions(
+ int option_config, const Options& default_options,
+ const anon::OptionsOverride& options_override) const {
+  // This redundant copy minimizes code change without triggering a lint error.
+ Options options = default_options;
+ BlockBasedTableOptions table_options;
+ bool set_block_based_table_factory = true;
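+  // Clear any direct I/O mocking sync points left over from a previous
+  // kDirectIO option config.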
+#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \
+ !defined(OS_AIX)
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "NewRandomAccessFile:O_DIRECT");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
+ "NewWritableFile:O_DIRECT");
+#endif
+ // kMustFreeHeapAllocations -> indicates ASAN build
+ if (kMustFreeHeapAllocations && !options_override.full_block_cache) {
+ // Detecting block cache use-after-free is normally difficult in unit
+ // tests, because as a cache, it tends to keep unreferenced entries in
+ // memory, and we normally want unit tests to take advantage of block
+ // cache for speed. However, we also want a strong chance of detecting
+ // block cache use-after-free in unit tests in ASAN builds, so for ASAN
+ // builds we use a trivially small block cache to which entries can be
+ // added but are immediately freed on no more references.
+ table_options.block_cache = NewLRUCache(/* too small */ 1);
+ }
+
+ bool can_allow_mmap = IsMemoryMappedAccessSupported();
+ switch (option_config) {
+#ifndef ROCKSDB_LITE
+ case kHashSkipList:
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.memtable_factory.reset(NewHashSkipListRepFactory(16));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kPlainTableFirstBytePrefix:
+ options.table_factory.reset(NewPlainTableFactory());
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableCappedPrefix:
+ options.table_factory.reset(NewPlainTableFactory());
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableCappedPrefixNonMmap:
+ options.table_factory.reset(NewPlainTableFactory());
+ options.prefix_extractor.reset(NewCappedPrefixTransform(8));
+ options.allow_mmap_reads = false;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kPlainTableAllBytesPrefix:
+ options.table_factory.reset(NewPlainTableFactory());
+ options.prefix_extractor.reset(NewNoopTransform());
+ options.allow_mmap_reads = can_allow_mmap;
+ options.max_sequential_skip_in_iterations = 999999;
+ set_block_based_table_factory = false;
+ break;
+ case kVectorRep:
+ options.memtable_factory.reset(new VectorRepFactory(100));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kHashLinkList:
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(4, 0, 3, true, 4));
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ case kDirectIO: {
+ options.use_direct_reads = true;
+ options.use_direct_io_for_flush_and_compaction = true;
+ options.compaction_readahead_size = 2 * 1024 * 1024;
+ SetupSyncPointsToMockDirectIO();
+ break;
+ }
+#endif // ROCKSDB_LITE
+ case kMergePut:
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ break;
+ case kFilter:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, true));
+ break;
+ case kFullFilterWithNewTableReaderForCompactions:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ options.compaction_readahead_size = 10 * 1024 * 1024;
+ break;
+ case kPartitionedFilterWithNewTableReaderForCompactions:
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ table_options.partition_filters = true;
+ table_options.index_type =
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+ options.compaction_readahead_size = 10 * 1024 * 1024;
+ break;
+ case kUncompressed:
+ options.compression = kNoCompression;
+ break;
+ case kNumLevel_3:
+ options.num_levels = 3;
+ break;
+ case kDBLogDir:
+ options.db_log_dir = alternative_db_log_dir_;
+ break;
+ case kWalDirAndMmapReads:
+ options.wal_dir = alternative_wal_dir_;
+      // mmap reads should be orthogonal to the WAL dir setting, so we
+      // piggyback on this option config to test mmap reads as well.
+ options.allow_mmap_reads = can_allow_mmap;
+ break;
+ case kManifestFileSize:
+ options.max_manifest_file_size = 50; // 50 bytes
+ break;
+ case kPerfOptions:
+ options.delayed_write_rate = 8 * 1024 * 1024;
+ options.report_bg_io_stats = true;
+ // TODO(3.13) -- test more options
+ break;
+ case kUniversalCompaction:
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ break;
+ case kUniversalCompactionMultiLevel:
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 8;
+ break;
+ case kCompressedBlockCache:
+ options.allow_mmap_writes = can_allow_mmap;
+ table_options.block_cache_compressed = NewLRUCache(8 * 1024 * 1024);
+ break;
+ case kInfiniteMaxOpenFiles:
+ options.max_open_files = -1;
+ break;
+ case kCRC32cChecksum: {
+ // Old default was CRC32c, but XXH3 (new default) is faster on common
+ // hardware
+ table_options.checksum = kCRC32c;
+ // Thrown in here for basic coverage:
+ options.DisableExtraChecks();
+ break;
+ }
+ case kFIFOCompaction: {
+ options.compaction_style = kCompactionStyleFIFO;
+ options.max_open_files = -1;
+ break;
+ }
+ case kBlockBasedTableWithPrefixHashIndex: {
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ break;
+ }
+ case kBlockBasedTableWithWholeKeyHashIndex: {
+ table_options.index_type = BlockBasedTableOptions::kHashSearch;
+ options.prefix_extractor.reset(NewNoopTransform());
+ break;
+ }
+ case kBlockBasedTableWithPartitionedIndex: {
+ table_options.format_version = 3;
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+ options.prefix_extractor.reset(NewNoopTransform());
+ break;
+ }
+ case kBlockBasedTableWithPartitionedIndexFormat4: {
+ table_options.format_version = 4;
+ // Format 4 changes the binary index format. Since partitioned index is a
+ // super-set of simple indexes, we are also using kTwoLevelIndexSearch to
+ // test this format.
+ table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch;
+      // The top-level index in partitioned filters is also affected by format 4.
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ table_options.partition_filters = true;
+ table_options.index_block_restart_interval = 8;
+ break;
+ }
+ case kBlockBasedTableWithIndexRestartInterval: {
+ table_options.index_block_restart_interval = 8;
+ break;
+ }
+ case kBlockBasedTableWithLatestFormat: {
+      // Use the latest format version in case it differs from the default.
+ table_options.format_version = kLatestFormatVersion;
+ break;
+ }
+ case kOptimizeFiltersForHits: {
+ options.optimize_filters_for_hits = true;
+ set_block_based_table_factory = true;
+ break;
+ }
+ case kRowCache: {
+ options.row_cache = NewLRUCache(1024 * 1024);
+ break;
+ }
+ case kRecycleLogFiles: {
+ options.recycle_log_file_num = 2;
+ break;
+ }
+ case kLevelSubcompactions: {
+ options.max_subcompactions = 4;
+ break;
+ }
+ case kUniversalSubcompactions: {
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 8;
+ options.max_subcompactions = 4;
+ break;
+ }
+ case kConcurrentSkipList: {
+ options.allow_concurrent_memtable_write = true;
+ options.enable_write_thread_adaptive_yield = true;
+ break;
+ }
+ case kPipelinedWrite: {
+ options.enable_pipelined_write = true;
+ break;
+ }
+ case kConcurrentWALWrites: {
+      // These options optimize the 2PC commit path.
+ options.two_write_queues = true;
+ options.manual_wal_flush = true;
+ break;
+ }
+ case kUnorderedWrite: {
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ if (options_override.filter_policy) {
+ table_options.filter_policy = options_override.filter_policy;
+ table_options.partition_filters = options_override.partition_filters;
+ table_options.metadata_block_size = options_override.metadata_block_size;
+ }
+ if (set_block_based_table_factory) {
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ }
+ options.env = env_;
+ options.create_if_missing = true;
+ options.fail_if_options_file_error = true;
+ return options;
+}
+
+void DBTestBase::CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ColumnFamilyOptions cf_opts(options);
+ size_t cfi = handles_.size();
+ handles_.resize(cfi + cfs.size());
+ for (auto cf : cfs) {
+ Status s = db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++]);
+ ASSERT_OK(s);
+ }
+}
+
+void DBTestBase::CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options) {
+ CreateColumnFamilies(cfs, options);
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+ ReopenWithColumnFamilies(cfs_plus_default, options);
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options) {
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs, options));
+}
+
+void DBTestBase::SetTimeElapseOnlySleepOnReopen(DBOptions* options) {
+ time_elapse_only_sleep_on_reopen_ = true;
+
+ // Need to disable stats dumping and persisting which also use
+ // RepeatableThread, which uses InstrumentedCondVar::TimedWaitInternal.
+ // With time_elapse_only_sleep_, this can hang on some platforms (MacOS)
+ // because (a) on some platforms, pthread_cond_timedwait does not appear
+ // to release the lock for other threads to operate if the deadline time
+ // is already passed, and (b) TimedWait calls are currently a bad abstraction
+ // because the deadline parameter is usually computed from Env time,
+ // but is interpreted in real clock time.
+ options->stats_dump_period_sec = 0;
+ options->stats_persist_period_sec = 0;
+}
+
+void DBTestBase::MaybeInstallTimeElapseOnlySleep(const DBOptions& options) {
+ if (time_elapse_only_sleep_on_reopen_) {
+ assert(options.env == env_ ||
+ static_cast_with_check<CompositeEnvWrapper>(options.env)
+ ->env_target() == env_);
+ assert(options.stats_dump_period_sec == 0);
+ assert(options.stats_persist_period_sec == 0);
+ // We cannot set these before destroying the last DB because they might
+ // cause a deadlock or similar without the appropriate options set in
+ // the DB.
+ env_->time_elapse_only_sleep_ = true;
+ env_->no_slowdown_ = true;
+ } else {
+ // Going back in same test run is not yet supported, so no
+ // reset in this case.
+ }
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+ const std::vector<std::string>& cfs, const std::vector<Options>& options) {
+ Close();
+ EXPECT_EQ(cfs.size(), options.size());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (size_t i = 0; i < cfs.size(); ++i) {
+ column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i]));
+ }
+ DBOptions db_opts = DBOptions(options[0]);
+ last_options_ = options[0];
+ MaybeInstallTimeElapseOnlySleep(db_opts);
+ return DB::Open(db_opts, dbname_, column_families, &handles_, &db_);
+}
+
+Status DBTestBase::TryReopenWithColumnFamilies(
+ const std::vector<std::string>& cfs, const Options& options) {
+ Close();
+ std::vector<Options> v_opts(cfs.size(), options);
+ return TryReopenWithColumnFamilies(cfs, v_opts);
+}
+
+void DBTestBase::Reopen(const Options& options) {
+ ASSERT_OK(TryReopen(options));
+}
+
+void DBTestBase::Close() {
+ for (auto h : handles_) {
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(h));
+ }
+ handles_.clear();
+ delete db_;
+ db_ = nullptr;
+}
+
+void DBTestBase::DestroyAndReopen(const Options& options) {
+ // Destroy using last options
+ Destroy(last_options_);
+ Reopen(options);
+}
+
+void DBTestBase::Destroy(const Options& options, bool delete_cf_paths) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ if (delete_cf_paths) {
+ for (size_t i = 0; i < handles_.size(); ++i) {
+ ColumnFamilyDescriptor cfdescriptor;
+ // GetDescriptor is not implemented for ROCKSDB_LITE
+ handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError();
+ column_families.push_back(cfdescriptor);
+ }
+ }
+ Close();
+ ASSERT_OK(DestroyDB(dbname_, options, column_families));
+}
+
+Status DBTestBase::ReadOnlyReopen(const Options& options) {
+ MaybeInstallTimeElapseOnlySleep(options);
+ return DB::OpenForReadOnly(options, dbname_, &db_);
+}
+
+Status DBTestBase::TryReopen(const Options& options) {
+ Close();
+ last_options_.table_factory.reset();
+  // Note: operator= is an unsafe approach here since it destroys the
+  // std::shared_ptr members in the same order as their creation, in contrast
+  // to destructors, which destroy them in the opposite order of creation. One
+  // particular problem is that the cache destructor might invoke callback
+  // functions that use Option members such as statistics. To work around this
+  // problem, we manually reset table_factory, which eventually clears the
+  // block cache.
+ last_options_ = options;
+ MaybeInstallTimeElapseOnlySleep(options);
+ return DB::Open(options, dbname_, &db_);
+}
+
+bool DBTestBase::IsDirectIOSupported() {
+ return test::IsDirectIOSupported(env_, dbname_);
+}
+
+bool DBTestBase::IsMemoryMappedAccessSupported() const {
+ return (!encrypted_env_);
+}
+
+Status DBTestBase::Flush(int cf) {
+ if (cf == 0) {
+ return db_->Flush(FlushOptions());
+ } else {
+ return db_->Flush(FlushOptions(), handles_[cf]);
+ }
+}
+
+Status DBTestBase::Flush(const std::vector<int>& cf_ids) {
+ std::vector<ColumnFamilyHandle*> cfhs;
+ std::for_each(cf_ids.begin(), cf_ids.end(),
+ [&cfhs, this](int id) { cfhs.emplace_back(handles_[id]); });
+ return db_->Flush(FlushOptions(), cfhs);
+}
+
+Status DBTestBase::Put(const Slice& k, const Slice& v, WriteOptions wo) {
+ if (kMergePut == option_config_) {
+ return db_->Merge(wo, k, v);
+ } else {
+ return db_->Put(wo, k, v);
+ }
+}
+
+Status DBTestBase::Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo) {
+ if (kMergePut == option_config_) {
+ return db_->Merge(wo, handles_[cf], k, v);
+ } else {
+ return db_->Put(wo, handles_[cf], k, v);
+ }
+}
+
+Status DBTestBase::Merge(const Slice& k, const Slice& v, WriteOptions wo) {
+ return db_->Merge(wo, k, v);
+}
+
+Status DBTestBase::Merge(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo) {
+ return db_->Merge(wo, handles_[cf], k, v);
+}
+
+Status DBTestBase::Delete(const std::string& k) {
+ return db_->Delete(WriteOptions(), k);
+}
+
+Status DBTestBase::Delete(int cf, const std::string& k) {
+ return db_->Delete(WriteOptions(), handles_[cf], k);
+}
+
+Status DBTestBase::SingleDelete(const std::string& k) {
+ return db_->SingleDelete(WriteOptions(), k);
+}
+
+Status DBTestBase::SingleDelete(int cf, const std::string& k) {
+ return db_->SingleDelete(WriteOptions(), handles_[cf], k);
+}
+
+std::string DBTestBase::Get(const std::string& k, const Snapshot* snapshot) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
+std::string DBTestBase::Get(int cf, const std::string& k,
+ const Snapshot* snapshot) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, handles_[cf], k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
+std::vector<std::string> DBTestBase::MultiGet(std::vector<int> cfs,
+ const std::vector<std::string>& k,
+ const Snapshot* snapshot,
+ const bool batched,
+ const bool async) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ options.async_io = async;
+ std::vector<ColumnFamilyHandle*> handles;
+ std::vector<Slice> keys;
+ std::vector<std::string> result;
+
+ for (unsigned int i = 0; i < cfs.size(); ++i) {
+ handles.push_back(handles_[cfs[i]]);
+ keys.push_back(k[i]);
+ }
+ std::vector<Status> s;
+ if (!batched) {
+ s = db_->MultiGet(options, handles, keys, &result);
+ for (size_t i = 0; i < s.size(); ++i) {
+ if (s[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ } else if (!s[i].ok()) {
+ result[i] = s[i].ToString();
+ }
+ }
+ } else {
+ std::vector<PinnableSlice> pin_values(cfs.size());
+ result.resize(cfs.size());
+ s.resize(cfs.size());
+ db_->MultiGet(options, cfs.size(), handles.data(), keys.data(),
+ pin_values.data(), s.data());
+ for (size_t i = 0; i < s.size(); ++i) {
+ if (s[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ } else if (!s[i].ok()) {
+ result[i] = s[i].ToString();
+ } else {
+ result[i].assign(pin_values[i].data(), pin_values[i].size());
+ // Increase likelihood of detecting potential use-after-free bugs with
+ // PinnableSlices tracking the same resource
+ pin_values[i].Reset();
+ }
+ }
+ }
+ return result;
+}
+
+std::vector<std::string> DBTestBase::MultiGet(const std::vector<std::string>& k,
+ const Snapshot* snapshot,
+ const bool async) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ options.snapshot = snapshot;
+ options.async_io = async;
+ std::vector<Slice> keys;
+ std::vector<std::string> result(k.size());
+ std::vector<Status> statuses(k.size());
+ std::vector<PinnableSlice> pin_values(k.size());
+
+ for (size_t i = 0; i < k.size(); ++i) {
+ keys.push_back(k[i]);
+ }
+ db_->MultiGet(options, dbfull()->DefaultColumnFamily(), keys.size(),
+ keys.data(), pin_values.data(), statuses.data());
+ for (size_t i = 0; i < statuses.size(); ++i) {
+ if (statuses[i].IsNotFound()) {
+ result[i] = "NOT_FOUND";
+ } else if (!statuses[i].ok()) {
+ result[i] = statuses[i].ToString();
+ } else {
+ result[i].assign(pin_values[i].data(), pin_values[i].size());
+ // Increase likelihood of detecting potential use-after-free bugs with
+ // PinnableSlices tracking the same resource
+ pin_values[i].Reset();
+ }
+ }
+ return result;
+}
+
+Status DBTestBase::Get(const std::string& k, PinnableSlice* v) {
+ ReadOptions options;
+ options.verify_checksums = true;
+ Status s = dbfull()->Get(options, dbfull()->DefaultColumnFamily(), k, v);
+ return s;
+}
+
+uint64_t DBTestBase::GetNumSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num));
+ return int_num;
+}
+
+uint64_t DBTestBase::GetTimeOldestSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num));
+ return int_num;
+}
+
+uint64_t DBTestBase::GetSequenceOldestSnapshots() {
+ uint64_t int_num;
+ EXPECT_TRUE(
+ dbfull()->GetIntProperty("rocksdb.oldest-snapshot-sequence", &int_num));
+ return int_num;
+}
+
+// Return a string that contains all key,value pairs in order,
+// formatted like "(k1->v1)(k2->v2)".
+std::string DBTestBase::Contents(int cf) {
+ std::vector<std::string> forward;
+ std::string result;
+ Iterator* iter = (cf == 0) ? db_->NewIterator(ReadOptions())
+ : db_->NewIterator(ReadOptions(), handles_[cf]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ std::string s = IterStatus(iter);
+ result.push_back('(');
+ result.append(s);
+ result.push_back(')');
+ forward.push_back(s);
+ }
+
+ // Check reverse iteration results are the reverse of forward results
+ unsigned int matched = 0;
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev()) {
+ EXPECT_LT(matched, forward.size());
+ EXPECT_EQ(IterStatus(iter), forward[forward.size() - matched - 1]);
+ matched++;
+ }
+ EXPECT_EQ(matched, forward.size());
+
+ delete iter;
+ return result;
+}
+
+void DBTestBase::CheckAllEntriesWithFifoReopen(
+ const std::string& expected_value, const Slice& user_key, int cf,
+ const std::vector<std::string>& cfs, const Options& options) {
+ ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+
+ std::vector<std::string> cfs_plus_default = cfs;
+ cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName);
+
+ Options fifo_options(options);
+ fifo_options.compaction_style = kCompactionStyleFIFO;
+ fifo_options.max_open_files = -1;
+ fifo_options.disable_auto_compactions = true;
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs_plus_default, fifo_options));
+ ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+
+ ASSERT_OK(TryReopenWithColumnFamilies(cfs_plus_default, options));
+ ASSERT_EQ(AllEntriesFor(user_key, cf), expected_value);
+}
+
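+// Return a string listing every entry for user_key (values, merge operands,
+// DEL/SDEL markers), formatted like "[ v1, DEL, v2 ]".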
+std::string DBTestBase::AllEntriesFor(const Slice& user_key, int cf) {
+ Arena arena;
+ auto options = CurrentOptions();
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter;
+ if (cf == 0) {
+ iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+ kMaxSequenceNumber));
+ } else {
+ iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+ kMaxSequenceNumber, handles_[cf]));
+ }
+ InternalKey target(user_key, kMaxSequenceNumber, kTypeValue);
+ iter->Seek(target.Encode());
+ std::string result;
+ if (!iter->status().ok()) {
+ result = iter->status().ToString();
+ } else {
+ result = "[ ";
+ bool first = true;
+ while (iter->Valid()) {
+ ParsedInternalKey ikey(Slice(), 0, kTypeValue);
+ if (ParseInternalKey(iter->key(), &ikey, true /* log_err_key */) !=
+ Status::OK()) {
+ result += "CORRUPTED";
+ } else {
+ if (!last_options_.comparator->Equal(ikey.user_key, user_key)) {
+ break;
+ }
+ if (!first) {
+ result += ", ";
+ }
+ first = false;
+ switch (ikey.type) {
+ case kTypeValue:
+ result += iter->value().ToString();
+ break;
+ case kTypeMerge:
+ // keep it the same as kTypeValue for testing kMergePut
+ result += iter->value().ToString();
+ break;
+ case kTypeDeletion:
+ result += "DEL";
+ break;
+ case kTypeSingleDeletion:
+ result += "SDEL";
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ }
+ iter->Next();
+ }
+ if (!first) {
+ result += " ";
+ }
+ result += "]";
+ }
+ return result;
+}
+
+#ifndef ROCKSDB_LITE
+int DBTestBase::NumSortedRuns(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ int num_sr = static_cast<int>(cf_meta.levels[0].files.size());
+ for (size_t i = 1U; i < cf_meta.levels.size(); i++) {
+ if (cf_meta.levels[i].files.size() > 0) {
+ num_sr++;
+ }
+ }
+ return num_sr;
+}
+
+uint64_t DBTestBase::TotalSize(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ return cf_meta.size;
+}
+
+uint64_t DBTestBase::SizeAtLevel(int level) {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ uint64_t sum = 0;
+ for (const auto& m : metadata) {
+ if (m.level == level) {
+ sum += m.size;
+ }
+ }
+ return sum;
+}
+
+size_t DBTestBase::TotalLiveFiles(int cf) {
+ ColumnFamilyMetaData cf_meta;
+ if (cf == 0) {
+ db_->GetColumnFamilyMetaData(&cf_meta);
+ } else {
+ db_->GetColumnFamilyMetaData(handles_[cf], &cf_meta);
+ }
+ size_t num_files = 0;
+ for (auto& level : cf_meta.levels) {
+ num_files += level.files.size();
+ }
+ return num_files;
+}
+
+size_t DBTestBase::CountLiveFiles() {
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ return metadata.size();
+}
+
+int DBTestBase::NumTableFilesAtLevel(int level, int cf) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(level), &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf], "rocksdb.num-files-at-level" + std::to_string(level),
+ &property));
+ }
+ return atoi(property.c_str());
+}
+
+double DBTestBase::CompressionRatioAtLevel(int level, int cf) {
+ std::string property;
+ if (cf == 0) {
+ // default cfd
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.compression-ratio-at-level" + std::to_string(level),
+ &property));
+ } else {
+ EXPECT_TRUE(db_->GetProperty(
+ handles_[cf],
+ "rocksdb.compression-ratio-at-level" + std::to_string(level),
+ &property));
+ }
+ return std::stod(property);
+}
+
+int DBTestBase::TotalTableFiles(int cf, int levels) {
+ if (levels == -1) {
+ levels = (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ }
+ int result = 0;
+ for (int level = 0; level < levels; level++) {
+ result += NumTableFilesAtLevel(level, cf);
+ }
+ return result;
+}
+
+// Return spread of files per level
+std::string DBTestBase::FilesPerLevel(int cf) {
+ int num_levels =
+ (cf == 0) ? db_->NumberLevels() : db_->NumberLevels(handles_[1]);
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < num_levels; level++) {
+ int f = NumTableFilesAtLevel(level, cf);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+}
+
+#endif // !ROCKSDB_LITE
+
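+// Return the file numbers of all live blob files in the default column
+// family's current version.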
+std::vector<uint64_t> DBTestBase::GetBlobFileNumbers() {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const current = cfd->current();
+ assert(current);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ assert(storage_info);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+
+ std::vector<uint64_t> result;
+ result.reserve(blob_files.size());
+
+ for (const auto& blob_file : blob_files) {
+ assert(blob_file);
+ result.emplace_back(blob_file->GetBlobFileNumber());
+ }
+
+ return result;
+}
+
+size_t DBTestBase::CountFiles() {
+ size_t count = 0;
+ std::vector<std::string> files;
+ if (env_->GetChildren(dbname_, &files).ok()) {
+ count += files.size();
+ }
+
+ if (dbname_ != last_options_.wal_dir) {
+ if (env_->GetChildren(last_options_.wal_dir, &files).ok()) {
+ count += files.size();
+ }
+ }
+
+ return count;
+}
+
+Status DBTestBase::CountFiles(size_t* count) {
+ std::vector<std::string> files;
+ Status s = env_->GetChildren(dbname_, &files);
+ if (!s.ok()) {
+ return s;
+ }
+ size_t files_count = files.size();
+
+ if (dbname_ != last_options_.wal_dir) {
+ s = env_->GetChildren(last_options_.wal_dir, &files);
+ if (!s.ok()) {
+ return s;
+ }
+ *count = files_count + files.size();
+  } else {
+    *count = files_count;
+  }
+
+ return Status::OK();
+}
+
+Status DBTestBase::Size(const Slice& start, const Slice& limit, int cf,
+ uint64_t* size) {
+ Range r(start, limit);
+ if (cf == 0) {
+ return db_->GetApproximateSizes(&r, 1, size);
+ } else {
+ return db_->GetApproximateSizes(handles_[1], &r, 1, size);
+ }
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id) {
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = target_path_id;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(int cf, const Slice& start, const Slice& limit) {
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[cf], &start, &limit));
+}
+
+void DBTestBase::Compact(const Slice& start, const Slice& limit) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &limit));
+}
+
+// Do n memtable compactions, each of which produces an sstable
+// covering the range [small,large].
+void DBTestBase::MakeTables(int n, const std::string& small,
+ const std::string& large, int cf) {
+ for (int i = 0; i < n; i++) {
+ ASSERT_OK(Put(cf, small, "begin"));
+ ASSERT_OK(Put(cf, large, "end"));
+ ASSERT_OK(Flush(cf));
+ MoveFilesToLevel(n - i - 1, cf);
+ }
+}
+
+// Prevent pushing of new sstables into deeper levels by adding
+// tables that cover a specified range to all levels.
+void DBTestBase::FillLevels(const std::string& smallest,
+ const std::string& largest, int cf) {
+ MakeTables(db_->NumberLevels(handles_[cf]), smallest, largest, cf);
+}
+
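+// Compact each level from 0 up to `level` - 1 so that existing files end up
+// at `level`.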
+void DBTestBase::MoveFilesToLevel(int level, int cf) {
+ for (int l = 0; l < level; ++l) {
+ if (cf > 0) {
+ EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr, handles_[cf]));
+ } else {
+ EXPECT_OK(dbfull()->TEST_CompactRange(l, nullptr, nullptr));
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+void DBTestBase::DumpFileCounts(const char* label) {
+ fprintf(stderr, "---\n%s:\n", label);
+ fprintf(stderr, "maxoverlap: %" PRIu64 "\n",
+ dbfull()->TEST_MaxNextLevelOverlappingBytes());
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int num = NumTableFilesAtLevel(level);
+ if (num > 0) {
+ fprintf(stderr, " level %3d : %d files\n", level, num);
+ }
+ }
+}
+#endif // !ROCKSDB_LITE
+
+std::string DBTestBase::DumpSSTableList() {
+ std::string property;
+ db_->GetProperty("rocksdb.sstables", &property);
+ return property;
+}
+
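+// List the children of `path`, keeping only SST (table) file names.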
+void DBTestBase::GetSstFiles(Env* env, std::string path,
+ std::vector<std::string>* files) {
+ EXPECT_OK(env->GetChildren(path, files));
+
+ files->erase(std::remove_if(files->begin(), files->end(),
+ [](std::string name) {
+ uint64_t number;
+ FileType type;
+ return !(ParseFileName(name, &number, &type) &&
+ type == kTableFile);
+ }),
+ files->end());
+}
+
+int DBTestBase::GetSstFileCount(std::string path) {
+ std::vector<std::string> files;
+ DBTestBase::GetSstFiles(env_, path, &files);
+ return static_cast<int>(files.size());
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(int cf, Random* rnd, int* key_idx,
+ bool nowait) {
+ for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+ ASSERT_OK(Put(cf, Key(*key_idx), rnd->RandomString((i == 99) ? 1 : 990)));
+ (*key_idx)++;
+ }
+ if (!nowait) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+}
+
+// this will generate non-overlapping files since it keeps increasing key_idx
+void DBTestBase::GenerateNewFile(Random* rnd, int* key_idx, bool nowait) {
+ for (int i = 0; i < KNumKeysByGenerateNewFile; i++) {
+ ASSERT_OK(Put(Key(*key_idx), rnd->RandomString((i == 99) ? 1 : 990)));
+ (*key_idx)++;
+ }
+ if (!nowait) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+}
+
+const int DBTestBase::kNumKeysByGenerateNewRandomFile = 51;
+
+void DBTestBase::GenerateNewRandomFile(Random* rnd, bool nowait) {
+ for (int i = 0; i < kNumKeysByGenerateNewRandomFile; i++) {
+ ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(2000)));
+ }
+ ASSERT_OK(Put("key" + rnd->RandomString(7), rnd->RandomString(200)));
+ if (!nowait) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+}
+
+std::string DBTestBase::IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+}
+
+Options DBTestBase::OptionsForLogIterTest() {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.WAL_ttl_seconds = 1000;
+ return options;
+}
+
+std::string DBTestBase::DummyString(size_t len, char c) {
+ return std::string(len, c);
+}
+
+void DBTestBase::VerifyIterLast(std::string expected_key, int cf) {
+ Iterator* iter;
+ ReadOptions ro;
+ if (cf == 0) {
+ iter = db_->NewIterator(ro);
+ } else {
+ iter = db_->NewIterator(ro, handles_[cf]);
+ }
+ iter->SeekToLast();
+ ASSERT_EQ(IterStatus(iter), expected_key);
+ delete iter;
+}
+
+// Used to test InplaceUpdate
+
+// If there is no previous value, sets newValue to a string of 'c's of
+// delta's length and returns UPDATED.
+// Otherwise, shrinks the previous value in place to one byte fewer,
+// filled with 'b's, and returns UPDATED_INPLACE.
+UpdateStatus DBTestBase::updateInPlaceSmallerSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue) {
+ if (prevValue == nullptr) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+ } else {
+ *prevSize = *prevSize - 1;
+ std::string str_b = std::string(*prevSize, 'b');
+ memcpy(prevValue, str_b.c_str(), str_b.size());
+ return UpdateStatus::UPDATED_INPLACE;
+ }
+}
+
+UpdateStatus DBTestBase::updateInPlaceSmallerVarintSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue) {
+ if (prevValue == nullptr) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+ } else {
+ *prevSize = 1;
+ std::string str_b = std::string(*prevSize, 'b');
+ memcpy(prevValue, str_b.c_str(), str_b.size());
+ return UpdateStatus::UPDATED_INPLACE;
+ }
+}
+
+UpdateStatus DBTestBase::updateInPlaceLargerSize(char* /*prevValue*/,
+ uint32_t* /*prevSize*/,
+ Slice delta,
+ std::string* newValue) {
+ *newValue = std::string(delta.size(), 'c');
+ return UpdateStatus::UPDATED;
+}
+
+UpdateStatus DBTestBase::updateInPlaceNoAction(char* /*prevValue*/,
+ uint32_t* /*prevSize*/,
+ Slice /*delta*/,
+ std::string* /*newValue*/) {
+ return UpdateStatus::UPDATE_FAILED;
+}
+
+// Utility method to test InplaceUpdate
+void DBTestBase::validateNumberOfEntries(int numValues, int cf) {
+ Arena arena;
+ auto options = CurrentOptions();
+ InternalKeyComparator icmp(options.comparator);
+ ReadOptions read_options;
+ ScopedArenaIterator iter;
+ if (cf != 0) {
+ iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+ kMaxSequenceNumber, handles_[cf]));
+ } else {
+ iter.set(dbfull()->NewInternalIterator(read_options, &arena,
+ kMaxSequenceNumber));
+ }
+ iter->SeekToFirst();
+ ASSERT_OK(iter->status());
+ int seq = numValues;
+ while (iter->Valid()) {
+ ParsedInternalKey ikey;
+ ikey.clear();
+ ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+
+ // checks sequence number for updates
+ ASSERT_EQ(ikey.sequence, (unsigned)seq--);
+ iter->Next();
+ }
+ ASSERT_EQ(0, seq);
+}
+
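+// Copy `size` bytes of `source` to `destination`; a size of 0 copies the
+// whole file.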
+void DBTestBase::CopyFile(const std::string& source,
+ const std::string& destination, uint64_t size) {
+ const EnvOptions soptions;
+ std::unique_ptr<SequentialFile> srcfile;
+ ASSERT_OK(env_->NewSequentialFile(source, &srcfile, soptions));
+ std::unique_ptr<WritableFile> destfile;
+ ASSERT_OK(env_->NewWritableFile(destination, &destfile, soptions));
+
+ if (size == 0) {
+ // default argument means copy everything
+ ASSERT_OK(env_->GetFileSize(source, &size));
+ }
+
+ char buffer[4096];
+ Slice slice;
+ while (size > 0) {
+ uint64_t one = std::min(uint64_t(sizeof(buffer)), size);
+ ASSERT_OK(srcfile->Read(one, &slice, buffer));
+ ASSERT_OK(destfile->Append(slice));
+ size -= slice.size();
+ }
+ ASSERT_OK(destfile->Close());
+}
+
+Status DBTestBase::GetAllDataFiles(
+ const FileType file_type, std::unordered_map<std::string, uint64_t>* files,
+ uint64_t* total_size /* = nullptr */) {
+ if (total_size) {
+ *total_size = 0;
+ }
+ std::vector<std::string> children;
+ Status s = env_->GetChildren(dbname_, &children);
+ if (s.ok()) {
+ for (auto& file_name : children) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file_name, &number, &type) && type == file_type) {
+ std::string file_path = dbname_ + "/" + file_name;
+ uint64_t file_size = 0;
+ s = env_->GetFileSize(file_path, &file_size);
+ if (!s.ok()) {
+ break;
+ }
+ (*files)[file_path] = file_size;
+ if (total_size) {
+ *total_size += file_size;
+ }
+ }
+ }
+ }
+ return s;
+}
+
+std::vector<std::uint64_t> DBTestBase::ListTableFiles(Env* env,
+ const std::string& path) {
+ std::vector<std::string> files;
+ std::vector<uint64_t> file_numbers;
+ EXPECT_OK(env->GetChildren(path, &files));
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < files.size(); ++i) {
+ if (ParseFileName(files[i], &number, &type)) {
+ if (type == kTableFile) {
+ file_numbers.push_back(number);
+ }
+ }
+ }
+ return file_numbers;
+}
+
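+// Verify the DB against `true_data` via point lookups, forward and reverse
+// iteration, and Seek(); optionally also with a tailing iterator. Expected
+// per-key statuses can be passed in via `status`.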
+void DBTestBase::VerifyDBFromMap(std::map<std::string, std::string> true_data,
+ size_t* total_reads_res, bool tailing_iter,
+ std::map<std::string, Status> status) {
+ size_t total_reads = 0;
+
+ for (auto& kv : true_data) {
+ Status s = status[kv.first];
+ if (s.ok()) {
+ ASSERT_EQ(Get(kv.first), kv.second);
+ } else {
+ std::string value;
+ ASSERT_EQ(s, db_->Get(ReadOptions(), kv.first, &value));
+ }
+ total_reads++;
+ }
+
+ // Normal Iterator
+ {
+ int iter_cnt = 0;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Iterator* iter = db_->NewIterator(ro);
+ // Verify Iterator::Next()
+ iter_cnt = 0;
+ auto data_iter = true_data.begin();
+ Status s;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ Status current_status = status[data_iter->first];
+ if (!current_status.ok()) {
+ s = current_status;
+ }
+ ASSERT_EQ(iter->status(), s);
+ if (current_status.ok()) {
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ }
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_iter, true_data.end())
+ << iter_cnt << " / " << true_data.size();
+ delete iter;
+
+ // Verify Iterator::Prev()
+ // Use a new iterator to make sure its status is clean.
+ iter = db_->NewIterator(ro);
+ iter_cnt = 0;
+ s = Status::OK();
+ auto data_rev = true_data.rbegin();
+ for (iter->SeekToLast(); iter->Valid(); iter->Prev(), data_rev++) {
+ ASSERT_EQ(iter->key().ToString(), data_rev->first);
+ Status current_status = status[data_rev->first];
+ if (!current_status.ok()) {
+ s = current_status;
+ }
+ ASSERT_EQ(iter->status(), s);
+ if (current_status.ok()) {
+ ASSERT_EQ(iter->value().ToString(), data_rev->second);
+ }
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_rev, true_data.rend())
+ << iter_cnt << " / " << true_data.size();
+
+ // Verify Iterator::Seek()
+ for (auto kv : true_data) {
+ iter->Seek(kv.first);
+ ASSERT_EQ(kv.first, iter->key().ToString());
+ ASSERT_EQ(kv.second, iter->value().ToString());
+ total_reads++;
+ }
+ delete iter;
+ }
+
+ if (tailing_iter) {
+#ifndef ROCKSDB_LITE
+ // Tailing iterator
+ int iter_cnt = 0;
+ ReadOptions ro;
+ ro.tailing = true;
+ ro.total_order_seek = true;
+ Iterator* iter = db_->NewIterator(ro);
+
+ // Verify ForwardIterator::Next()
+ iter_cnt = 0;
+ auto data_iter = true_data.begin();
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), data_iter++) {
+ ASSERT_EQ(iter->key().ToString(), data_iter->first);
+ ASSERT_EQ(iter->value().ToString(), data_iter->second);
+ iter_cnt++;
+ total_reads++;
+ }
+ ASSERT_EQ(data_iter, true_data.end())
+ << iter_cnt << " / " << true_data.size();
+
+ // Verify ForwardIterator::Seek()
+ for (auto kv : true_data) {
+ iter->Seek(kv.first);
+ ASSERT_EQ(kv.first, iter->key().ToString());
+ ASSERT_EQ(kv.second, iter->value().ToString());
+ total_reads++;
+ }
+
+ delete iter;
+#endif // ROCKSDB_LITE
+ }
+
+ if (total_reads_res) {
+ *total_reads_res = total_reads;
+ }
+}
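
VerifyDBFromMap() cross-checks the DB against an expected key/value map three ways: point lookups, forward/backward/seek scans with a total-order iterator, and optionally a tailing iterator, honoring a per-key expected Status if one is supplied. A minimal sketch of the usual call pattern; the fixture name is an assumption.

class VerifyFromMapSketchTest : public DBTestBase {
 public:
  VerifyFromMapSketchTest()
      : DBTestBase("/verify_from_map_sketch", /*env_do_fsync=*/false) {}
};

TEST_F(VerifyFromMapSketchTest, RoundTrip) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);

  std::map<std::string, std::string> true_data;
  Random rnd(301);
  for (int i = 0; i < 100; ++i) {
    std::string key = Key(i);
    std::string value = rnd.RandomString(64);
    ASSERT_OK(Put(key, value));
    true_data[key] = value;
  }
  ASSERT_OK(Flush());

  size_t total_reads = 0;
  // Point gets plus iterator scans; pass tailing_iter=true to also exercise
  // the ForwardIterator path (non-LITE builds only).
  VerifyDBFromMap(true_data, &total_reads, /*tailing_iter=*/false);
  ASSERT_GT(total_reads, true_data.size());
}
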
+
+void DBTestBase::VerifyDBInternal(
+ std::vector<std::pair<std::string, std::string>> true_data) {
+ Arena arena;
+ InternalKeyComparator icmp(last_options_.comparator);
+ ReadOptions read_options;
+ auto iter =
+ dbfull()->NewInternalIterator(read_options, &arena, kMaxSequenceNumber);
+ iter->SeekToFirst();
+ for (auto p : true_data) {
+ ASSERT_TRUE(iter->Valid());
+ ParsedInternalKey ikey;
+ ASSERT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+ ASSERT_EQ(p.first, ikey.user_key);
+ ASSERT_EQ(p.second, iter->value());
+ iter->Next();
+  }
+ ASSERT_FALSE(iter->Valid());
+ iter->~InternalIterator();
+}
+
+#ifndef ROCKSDB_LITE
+
+uint64_t DBTestBase::GetNumberOfSstFilesForColumnFamily(
+ DB* db, std::string column_family_name) {
+ std::vector<LiveFileMetaData> metadata;
+ db->GetLiveFilesMetaData(&metadata);
+ uint64_t result = 0;
+ for (auto& fileMetadata : metadata) {
+ result += (fileMetadata.column_family_name == column_family_name);
+ }
+ return result;
+}
+
+uint64_t DBTestBase::GetSstSizeHelper(Temperature temperature) {
+ std::string prop;
+ EXPECT_TRUE(dbfull()->GetProperty(
+ DB::Properties::kLiveSstFilesSizeAtTemperature +
+ std::to_string(static_cast<uint8_t>(temperature)),
+ &prop));
+ return static_cast<uint64_t>(std::atoi(prop.c_str()));
+}
+#endif // ROCKSDB_LITE
+
+void VerifySstUniqueIds(const TablePropertiesCollection& props) {
+ ASSERT_FALSE(props.empty()); // suspicious test if empty
+ std::unordered_set<std::string> seen;
+ for (auto& pair : props) {
+ std::string id;
+ ASSERT_OK(GetUniqueIdFromTableProperties(*pair.second, &id));
+ ASSERT_TRUE(seen.insert(id).second);
+ }
+}
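
VerifySstUniqueIds() asserts that GetUniqueIdFromTableProperties() produces a distinct id for every table in a TablePropertiesCollection. A minimal sketch of feeding it from DB::GetPropertiesOfAllTables() after a few flushes; the fixture name is an assumption.

class UniqueIdSketchTest : public DBTestBase {
 public:
  UniqueIdSketchTest()
      : DBTestBase("/unique_id_sketch", /*env_do_fsync=*/false) {}
};

TEST_F(UniqueIdSketchTest, AllSstsHaveUniqueIds) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  for (int i = 0; i < 3; ++i) {
    ASSERT_OK(Put(Key(i), "value"));
    ASSERT_OK(Flush());  // one SST per iteration
  }
  TablePropertiesCollection props;
  ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
  VerifySstUniqueIds(props);
}
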
+
+template <CacheEntryRole R>
+TargetCacheChargeTrackingCache<R>::TargetCacheChargeTrackingCache(
+ std::shared_ptr<Cache> target)
+ : CacheWrapper(std::move(target)),
+ cur_cache_charge_(0),
+ cache_charge_peak_(0),
+ cache_charge_increment_(0),
+ last_peak_tracked_(false),
+ cache_charge_increments_sum_(0) {}
+
+template <CacheEntryRole R>
+Status TargetCacheChargeTrackingCache<R>::Insert(
+ const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value), Handle** handle,
+ Priority priority) {
+ Status s = target_->Insert(key, value, charge, deleter, handle, priority);
+ if (deleter == kNoopDeleter) {
+ if (last_peak_tracked_) {
+ cache_charge_peak_ = 0;
+ cache_charge_increment_ = 0;
+ last_peak_tracked_ = false;
+ }
+ if (s.ok()) {
+ cur_cache_charge_ += charge;
+ }
+ cache_charge_peak_ = std::max(cache_charge_peak_, cur_cache_charge_);
+ cache_charge_increment_ += charge;
+ }
+
+ return s;
+}
+
+template <CacheEntryRole R>
+bool TargetCacheChargeTrackingCache<R>::Release(Handle* handle,
+ bool erase_if_last_ref) {
+ auto deleter = GetDeleter(handle);
+ if (deleter == kNoopDeleter) {
+ if (!last_peak_tracked_) {
+ cache_charge_peaks_.push_back(cache_charge_peak_);
+ cache_charge_increments_sum_ += cache_charge_increment_;
+ last_peak_tracked_ = true;
+ }
+ cur_cache_charge_ -= GetCharge(handle);
+ }
+ bool is_successful = target_->Release(handle, erase_if_last_ref);
+ return is_successful;
+}
+
+template <CacheEntryRole R>
+const Cache::DeleterFn TargetCacheChargeTrackingCache<R>::kNoopDeleter =
+ CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole();
+
+template class TargetCacheChargeTrackingCache<
+ CacheEntryRole::kFilterConstruction>;
+template class TargetCacheChargeTrackingCache<
+ CacheEntryRole::kBlockBasedTableReader>;
+template class TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>;
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_test_util.h b/src/rocksdb/db/db_test_util.h
new file mode 100644
index 000000000..29d5cd9d7
--- /dev/null
+++ b/src/rocksdb/db/db_test_util.h
@@ -0,0 +1,1402 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <fcntl.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "file/filename.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/sst_file_writer.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/table.h"
+#include "rocksdb/utilities/checkpoint.h"
+#include "table/mock_table.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/compression.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+class MockEnv;
+
+namespace anon {
+class AtomicCounter {
+ public:
+ explicit AtomicCounter(Env* env = NULL)
+ : env_(env), cond_count_(&mu_), count_(0) {}
+
+ void Increment() {
+ MutexLock l(&mu_);
+ count_++;
+ cond_count_.SignalAll();
+ }
+
+ int Read() {
+ MutexLock l(&mu_);
+ return count_;
+ }
+
+ bool WaitFor(int count) {
+ MutexLock l(&mu_);
+
+ uint64_t start = env_->NowMicros();
+ while (count_ < count) {
+ uint64_t now = env_->NowMicros();
+ cond_count_.TimedWait(now + /*1s*/ 1 * 1000 * 1000);
+ if (env_->NowMicros() - start > /*10s*/ 10 * 1000 * 1000) {
+ return false;
+ }
+ if (count_ < count) {
+ GTEST_LOG_(WARNING) << "WaitFor is taking more time than usual";
+ }
+ }
+
+ return true;
+ }
+
+ void Reset() {
+ MutexLock l(&mu_);
+ count_ = 0;
+ cond_count_.SignalAll();
+ }
+
+ private:
+ Env* env_;
+ port::Mutex mu_;
+ port::CondVar cond_count_;
+ int count_;
+};
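
anon::AtomicCounter is a small condition-variable-backed counter: Increment() wakes waiters, Read() takes a snapshot, and WaitFor() blocks for up to roughly ten seconds of Env time. A minimal sketch of counting an instrumented code path via a sync point callback; "MyComponent::DidWork" is a made-up marker, not a real RocksDB sync point.

// Sketch only, inside a test body (debug builds, where SyncPoint is active).
anon::AtomicCounter work_counter(Env::Default());
SyncPoint::GetInstance()->SetCallBack(
    "MyComponent::DidWork", [&](void* /*arg*/) { work_counter.Increment(); });
SyncPoint::GetInstance()->EnableProcessing();

// ... kick off the background work under test ...

ASSERT_TRUE(work_counter.WaitFor(1));  // blocks for up to ~10s of Env time
ASSERT_GE(work_counter.Read(), 1);
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
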
+
+struct OptionsOverride {
+ std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
+ // These will be used only if filter_policy is set
+ bool partition_filters = false;
+  // Force using a default block cache. (Setting to false allows ASAN builds
+  // to use a trivially small block cache for better UAF error detection.)
+ bool full_block_cache = false;
+ uint64_t metadata_block_size = 1024;
+
+  // Bit mask of SkipPolicy values selecting which XF test points to skip
+ int skip_policy = 0;
+};
+
+} // namespace anon
+
+enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 };
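
anon::OptionsOverride tweaks the table and filter setup that CurrentOptions() would otherwise produce, and skip_policy carries a bit mask of the SkipPolicy values above. A minimal sketch of requesting partitioned Bloom filters from inside a DBTestBase-derived test body; the 10-bits-per-key and 4 KB metadata block size are illustrative choices.

// Sketch only, inside a DBTestBase-derived test body.
anon::OptionsOverride options_override;
options_override.filter_policy.reset(NewBloomFilterPolicy(10));
options_override.partition_filters = true;       // only honored with a filter
options_override.metadata_block_size = 4096;     // filter partition size
options_override.skip_policy = kSkipNoSnapshot;  // XF test points to skip
Options options = CurrentOptions(options_override);
DestroyAndReopen(options);
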
+
+// Special Env used to delay background operations
+class SpecialEnv : public EnvWrapper {
+ public:
+ explicit SpecialEnv(Env* base, bool time_elapse_only_sleep = false);
+
+ static const char* kClassName() { return "SpecialEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ Status NewWritableFile(const std::string& f, std::unique_ptr<WritableFile>* r,
+ const EnvOptions& soptions) override {
+ class SSTableFile : public WritableFile {
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+
+ public:
+ SSTableFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& base)
+ : env_(env), base_(std::move(base)) {}
+ Status Append(const Slice& data) override {
+ if (env_->table_write_callback_) {
+ (*env_->table_write_callback_)();
+ }
+ if (env_->drop_writes_.load(std::memory_order_acquire)) {
+ // Drop writes on the floor
+ return Status::OK();
+ } else if (env_->no_space_.load(std::memory_order_acquire)) {
+ return Status::NoSpace("No space left on device");
+ } else {
+ env_->bytes_written_ += data.size();
+ return base_->Append(data);
+ }
+ }
+ Status Append(
+ const Slice& data,
+ const DataVerificationInfo& /* verification_info */) override {
+ return Append(data);
+ }
+ Status PositionedAppend(const Slice& data, uint64_t offset) override {
+ if (env_->table_write_callback_) {
+ (*env_->table_write_callback_)();
+ }
+ if (env_->drop_writes_.load(std::memory_order_acquire)) {
+ // Drop writes on the floor
+ return Status::OK();
+ } else if (env_->no_space_.load(std::memory_order_acquire)) {
+ return Status::NoSpace("No space left on device");
+ } else {
+ env_->bytes_written_ += data.size();
+ return base_->PositionedAppend(data, offset);
+ }
+ }
+ Status PositionedAppend(
+ const Slice& data, uint64_t offset,
+ const DataVerificationInfo& /* verification_info */) override {
+ return PositionedAppend(data, offset);
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status RangeSync(uint64_t offset, uint64_t nbytes) override {
+ Status s = base_->RangeSync(offset, nbytes);
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::RangeSync", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ // Check preallocation size
+ // preallocation size is never passed to base file.
+ size_t preallocation_size = preallocation_block_size();
+ TEST_SYNC_POINT_CALLBACK("DBTestWritableFile.GetPreallocationStatus",
+ &preallocation_size);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ Status s = base_->Close();
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Close", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) {
+ env_->SleepForMicroseconds(100000);
+ }
+ Status s;
+ if (!env_->skip_fsync_) {
+ s = base_->Sync();
+ }
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT_CALLBACK("SpecialEnv::SStableFile::Sync", &s);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+ return s;
+ }
+ void SetIOPriority(Env::IOPriority pri) override {
+ base_->SetIOPriority(pri);
+ }
+ Env::IOPriority GetIOPriority() override {
+ return base_->GetIOPriority();
+ }
+ bool use_direct_io() const override { return base_->use_direct_io(); }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+ size_t GetUniqueId(char* id, size_t max_size) const override {
+ return base_->GetUniqueId(id, max_size);
+ }
+ };
+ class ManifestFile : public WritableFile {
+ public:
+ ManifestFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+ : env_(env), base_(std::move(b)) {}
+ Status Append(const Slice& data) override {
+ if (env_->manifest_write_error_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated writer error");
+ } else {
+ return base_->Append(data);
+ }
+ }
+ Status Append(
+ const Slice& data,
+ const DataVerificationInfo& /*verification_info*/) override {
+ return Append(data);
+ }
+
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status Close() override { return base_->Close(); }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ if (env_->manifest_sync_error_.load(std::memory_order_acquire)) {
+ return Status::IOError("simulated sync error");
+ } else {
+ if (env_->skip_fsync_) {
+ return Status::OK();
+ } else {
+ return base_->Sync();
+ }
+ }
+ }
+ uint64_t GetFileSize() override { return base_->GetFileSize(); }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+ };
+ class WalFile : public WritableFile {
+ public:
+ WalFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+ : env_(env), base_(std::move(b)) {
+ env_->num_open_wal_file_.fetch_add(1);
+ }
+ virtual ~WalFile() { env_->num_open_wal_file_.fetch_add(-1); }
+ Status Append(const Slice& data) override {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT("SpecialEnv::WalFile::Append:1");
+#endif
+ Status s;
+ if (env_->log_write_error_.load(std::memory_order_acquire)) {
+ s = Status::IOError("simulated writer error");
+ } else {
+ int slowdown =
+ env_->log_write_slowdown_.load(std::memory_order_acquire);
+ if (slowdown > 0) {
+ env_->SleepForMicroseconds(slowdown);
+ }
+ s = base_->Append(data);
+ }
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ TEST_SYNC_POINT("SpecialEnv::WalFile::Append:2");
+#endif
+ return s;
+ }
+ Status Append(
+ const Slice& data,
+ const DataVerificationInfo& /* verification_info */) override {
+ return Append(data);
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ void PrepareWrite(size_t offset, size_t len) override {
+ base_->PrepareWrite(offset, len);
+ }
+ void SetPreallocationBlockSize(size_t size) override {
+ base_->SetPreallocationBlockSize(size);
+ }
+ Status Close() override {
+// SyncPoint is not supported in Released Windows Mode.
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ // Check preallocation size
+ size_t block_size, last_allocated_block;
+ base_->GetPreallocationStatus(&block_size, &last_allocated_block);
+ TEST_SYNC_POINT_CALLBACK("DBTestWalFile.GetPreallocationStatus",
+ &block_size);
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+
+ return base_->Close();
+ }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ ++env_->sync_counter_;
+ if (env_->corrupt_in_sync_) {
+ EXPECT_OK(Append(std::string(33000, ' ')));
+ return Status::IOError("Ingested Sync Failure");
+ }
+ if (env_->skip_fsync_) {
+ return Status::OK();
+ } else {
+ return base_->Sync();
+ }
+ }
+ bool IsSyncThreadSafe() const override {
+ return env_->is_wal_sync_thread_safe_.load();
+ }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+ };
+ class OtherFile : public WritableFile {
+ public:
+ OtherFile(SpecialEnv* env, std::unique_ptr<WritableFile>&& b)
+ : env_(env), base_(std::move(b)) {}
+ Status Append(const Slice& data) override { return base_->Append(data); }
+ Status Append(
+ const Slice& data,
+ const DataVerificationInfo& /*verification_info*/) override {
+ return Append(data);
+ }
+ Status Truncate(uint64_t size) override { return base_->Truncate(size); }
+ Status Close() override { return base_->Close(); }
+ Status Flush() override { return base_->Flush(); }
+ Status Sync() override {
+ if (env_->skip_fsync_) {
+ return Status::OK();
+ } else {
+ return base_->Sync();
+ }
+ }
+ uint64_t GetFileSize() override { return base_->GetFileSize(); }
+ Status Allocate(uint64_t offset, uint64_t len) override {
+ return base_->Allocate(offset, len);
+ }
+
+ private:
+ SpecialEnv* env_;
+ std::unique_ptr<WritableFile> base_;
+ };
+
+ if (no_file_overwrite_.load(std::memory_order_acquire) &&
+ target()->FileExists(f).ok()) {
+ return Status::NotSupported("SpecialEnv::no_file_overwrite_ is true.");
+ }
+
+ if (non_writeable_rate_.load(std::memory_order_acquire) > 0) {
+ uint32_t random_number;
+ {
+ MutexLock l(&rnd_mutex_);
+ random_number = rnd_.Uniform(100);
+ }
+ if (random_number < non_writeable_rate_.load()) {
+ return Status::IOError("simulated random write error");
+ }
+ }
+
+ new_writable_count_++;
+
+ if (non_writable_count_.load() > 0) {
+ non_writable_count_--;
+ return Status::IOError("simulated write error");
+ }
+
+ EnvOptions optimized = soptions;
+ if (strstr(f.c_str(), "MANIFEST") != nullptr ||
+ strstr(f.c_str(), "log") != nullptr) {
+ optimized.use_mmap_writes = false;
+ optimized.use_direct_writes = false;
+ }
+
+ Status s = target()->NewWritableFile(f, r, optimized);
+ if (s.ok()) {
+ if (strstr(f.c_str(), ".sst") != nullptr) {
+ r->reset(new SSTableFile(this, std::move(*r)));
+ } else if (strstr(f.c_str(), "MANIFEST") != nullptr) {
+ r->reset(new ManifestFile(this, std::move(*r)));
+ } else if (strstr(f.c_str(), "log") != nullptr) {
+ r->reset(new WalFile(this, std::move(*r)));
+ } else {
+ r->reset(new OtherFile(this, std::move(*r)));
+ }
+ }
+ return s;
+ }
+
+ Status NewRandomAccessFile(const std::string& f,
+ std::unique_ptr<RandomAccessFile>* r,
+ const EnvOptions& soptions) override {
+ class CountingFile : public RandomAccessFile {
+ public:
+ CountingFile(std::unique_ptr<RandomAccessFile>&& target,
+ anon::AtomicCounter* counter,
+ std::atomic<size_t>* bytes_read)
+ : target_(std::move(target)),
+ counter_(counter),
+ bytes_read_(bytes_read) {}
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ counter_->Increment();
+ Status s = target_->Read(offset, n, result, scratch);
+ *bytes_read_ += result->size();
+ return s;
+ }
+
+ virtual Status Prefetch(uint64_t offset, size_t n) override {
+ Status s = target_->Prefetch(offset, n);
+ *bytes_read_ += n;
+ return s;
+ }
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ anon::AtomicCounter* counter_;
+ std::atomic<size_t>* bytes_read_;
+ };
+
+ class RandomFailureFile : public RandomAccessFile {
+ public:
+ RandomFailureFile(std::unique_ptr<RandomAccessFile>&& target,
+ std::atomic<uint64_t>* failure_cnt, uint32_t fail_odd)
+ : target_(std::move(target)),
+ fail_cnt_(failure_cnt),
+ fail_odd_(fail_odd) {}
+ virtual Status Read(uint64_t offset, size_t n, Slice* result,
+ char* scratch) const override {
+ if (Random::GetTLSInstance()->OneIn(fail_odd_)) {
+ fail_cnt_->fetch_add(1);
+ return Status::IOError("random error");
+ }
+ return target_->Read(offset, n, result, scratch);
+ }
+
+ virtual Status Prefetch(uint64_t offset, size_t n) override {
+ return target_->Prefetch(offset, n);
+ }
+
+ private:
+ std::unique_ptr<RandomAccessFile> target_;
+ std::atomic<uint64_t>* fail_cnt_;
+ uint32_t fail_odd_;
+ };
+
+ Status s = target()->NewRandomAccessFile(f, r, soptions);
+ random_file_open_counter_++;
+ if (s.ok()) {
+ if (count_random_reads_) {
+ r->reset(new CountingFile(std::move(*r), &random_read_counter_,
+ &random_read_bytes_counter_));
+ } else if (rand_reads_fail_odd_ > 0) {
+ r->reset(new RandomFailureFile(std::move(*r), &num_reads_fails_,
+ rand_reads_fail_odd_));
+ }
+ }
+
+ if (s.ok() && soptions.compaction_readahead_size > 0) {
+ compaction_readahead_size_ = soptions.compaction_readahead_size;
+ }
+ return s;
+ }
+
+ virtual Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& soptions) override {
+ class CountingFile : public SequentialFile {
+ public:
+ CountingFile(std::unique_ptr<SequentialFile>&& target,
+ anon::AtomicCounter* counter)
+ : target_(std::move(target)), counter_(counter) {}
+ virtual Status Read(size_t n, Slice* result, char* scratch) override {
+ counter_->Increment();
+ return target_->Read(n, result, scratch);
+ }
+ virtual Status Skip(uint64_t n) override { return target_->Skip(n); }
+
+ private:
+ std::unique_ptr<SequentialFile> target_;
+ anon::AtomicCounter* counter_;
+ };
+
+ Status s = target()->NewSequentialFile(f, r, soptions);
+ if (s.ok() && count_sequential_reads_) {
+ r->reset(new CountingFile(std::move(*r), &sequential_read_counter_));
+ }
+ return s;
+ }
+
+ virtual void SleepForMicroseconds(int micros) override {
+ sleep_counter_.Increment();
+ if (no_slowdown_ || time_elapse_only_sleep_) {
+ addon_microseconds_.fetch_add(micros);
+ }
+ if (!no_slowdown_) {
+ target()->SleepForMicroseconds(micros);
+ }
+ }
+
+ void MockSleepForMicroseconds(int64_t micros) {
+ sleep_counter_.Increment();
+ assert(no_slowdown_);
+ addon_microseconds_.fetch_add(micros);
+ }
+
+ void MockSleepForSeconds(int64_t seconds) {
+ sleep_counter_.Increment();
+ assert(no_slowdown_);
+ addon_microseconds_.fetch_add(seconds * 1000000);
+ }
+
+ virtual Status GetCurrentTime(int64_t* unix_time) override {
+ Status s;
+ if (time_elapse_only_sleep_) {
+ *unix_time = maybe_starting_time_;
+ } else {
+ s = target()->GetCurrentTime(unix_time);
+ }
+ if (s.ok()) {
+      // add the mocked elapsed microseconds, converted to seconds
+ *unix_time += addon_microseconds_.load() / 1000000;
+ }
+ return s;
+ }
+
+ virtual uint64_t NowCPUNanos() override {
+ now_cpu_count_.fetch_add(1);
+ return target()->NowCPUNanos();
+ }
+
+ virtual uint64_t NowNanos() override {
+ return (time_elapse_only_sleep_ ? 0 : target()->NowNanos()) +
+ addon_microseconds_.load() * 1000;
+ }
+
+ virtual uint64_t NowMicros() override {
+ return (time_elapse_only_sleep_ ? 0 : target()->NowMicros()) +
+ addon_microseconds_.load();
+ }
+
+ virtual Status DeleteFile(const std::string& fname) override {
+ delete_count_.fetch_add(1);
+ return target()->DeleteFile(fname);
+ }
+
+ void SetMockSleep(bool enabled = true) { no_slowdown_ = enabled; }
+
+ Status NewDirectory(const std::string& name,
+ std::unique_ptr<Directory>* result) override {
+ if (!skip_fsync_) {
+ return target()->NewDirectory(name, result);
+ } else {
+ class NoopDirectory : public Directory {
+ public:
+ NoopDirectory() {}
+ ~NoopDirectory() {}
+
+ Status Fsync() override { return Status::OK(); }
+ Status Close() override { return Status::OK(); }
+ };
+
+ result->reset(new NoopDirectory());
+ return Status::OK();
+ }
+ }
+
+ Status RenameFile(const std::string& src, const std::string& dest) override {
+ rename_count_.fetch_add(1);
+ if (rename_error_.load(std::memory_order_acquire)) {
+ return Status::NotSupported("Simulated `RenameFile()` error.");
+ }
+ return target()->RenameFile(src, dest);
+ }
+
+ // Something to return when mocking current time
+ const int64_t maybe_starting_time_;
+
+ Random rnd_;
+  port::Mutex rnd_mutex_;  // Lock to protect rnd_
+
+  // sstable Sync() calls are blocked while this flag is true.
+  std::atomic<bool> delay_sstable_sync_;
+
+  // Drop writes on the floor while this flag is true.
+  std::atomic<bool> drop_writes_;
+
+  // Simulate no-space errors while this flag is true.
+  std::atomic<bool> no_space_;
+
+  // Simulate a non-writable file system while this flag is true.
+  std::atomic<bool> non_writable_;
+
+  // Force sync of manifest files to fail while this flag is true.
+  std::atomic<bool> manifest_sync_error_;
+
+  // Force writes to manifest files to fail while this flag is true.
+  std::atomic<bool> manifest_write_error_;
+
+  // Force writes to log files to fail while this flag is true.
+  std::atomic<bool> log_write_error_;
+
+  // Force `RenameFile()` to fail while this flag is true.
+  std::atomic<bool> rename_error_{false};
+
+  // Slow down every log write by this many microseconds.
+  std::atomic<int> log_write_slowdown_;
+
+ // If true, returns Status::NotSupported for file overwrite.
+ std::atomic<bool> no_file_overwrite_;
+
+ // Number of WAL files that are still open for write.
+ std::atomic<int> num_open_wal_file_;
+
+ bool count_random_reads_;
+ uint32_t rand_reads_fail_odd_ = 0;
+ std::atomic<uint64_t> num_reads_fails_;
+ anon::AtomicCounter random_read_counter_;
+ std::atomic<size_t> random_read_bytes_counter_;
+ std::atomic<int> random_file_open_counter_;
+
+ bool count_sequential_reads_;
+ anon::AtomicCounter sequential_read_counter_;
+
+ anon::AtomicCounter sleep_counter_;
+
+ std::atomic<int64_t> bytes_written_;
+
+ std::atomic<int> sync_counter_;
+
+  // If true, all fsyncs to files and directories are skipped.
+  bool skip_fsync_ = false;
+
+  // If true, inject corruption into the file during Sync().
+  bool corrupt_in_sync_ = false;
+
+ std::atomic<uint32_t> non_writeable_rate_;
+
+ std::atomic<uint32_t> new_writable_count_;
+
+ std::atomic<uint32_t> non_writable_count_;
+
+ std::function<void()>* table_write_callback_;
+
+ std::atomic<int> now_cpu_count_;
+
+ std::atomic<int> delete_count_;
+
+ std::atomic<int> rename_count_{0};
+
+ std::atomic<bool> is_wal_sync_thread_safe_{true};
+
+ std::atomic<size_t> compaction_readahead_size_{};
+
+ private: // accessing these directly is prone to error
+ friend class DBTestBase;
+
+ std::atomic<int64_t> addon_microseconds_{0};
+
+  // Do not modify while this Env is in use by a running DB (could cause
+  // deadlock)
+ std::atomic<bool> time_elapse_only_sleep_;
+
+ bool no_slowdown_;
+};
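
SpecialEnv wraps every file the DB opens so tests can inject faults, count I/O, and fake the clock; the atomics above are the knobs, and DBTestBase exposes the instance as env_ and wires it into CurrentOptions(). A minimal sketch of counting the RandomAccessFile reads behind a point lookup; exact counts depend on format and cache options, so only a lower bound is asserted.

// Sketch only, inside a DBTestBase-derived test body.
Options options = CurrentOptions();
DestroyAndReopen(options);
ASSERT_OK(Put("key", "value"));
ASSERT_OK(Flush());  // push the value into an SST so Get() must hit a file

env_->count_random_reads_ = true;
env_->random_read_counter_.Reset();
Reopen(options);  // reopen so table files are wrapped by the counting reader
ASSERT_EQ("value", Get("key"));
ASSERT_GE(env_->random_read_counter_.Read(), 1);  // at least one file read
env_->count_random_reads_ = false;
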
+
+#ifndef ROCKSDB_LITE
+class FileTemperatureTestFS : public FileSystemWrapper {
+ public:
+ explicit FileTemperatureTestFS(const std::shared_ptr<FileSystem>& fs)
+ : FileSystemWrapper(fs) {}
+
+ static const char* kClassName() { return "FileTemperatureTestFS"; }
+ const char* Name() const override { return kClassName(); }
+
+ IOStatus NewSequentialFile(const std::string& fname, const FileOptions& opts,
+ std::unique_ptr<FSSequentialFile>* result,
+ IODebugContext* dbg) override {
+ IOStatus s = target()->NewSequentialFile(fname, opts, result, dbg);
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(GetFileName(fname), &number, &type) &&
+ type == kTableFile) {
+ MutexLock lock(&mu_);
+ requested_sst_file_temperatures_.emplace_back(number, opts.temperature);
+ if (s.ok()) {
+ if (opts.temperature != Temperature::kUnknown) {
+ // Be extra picky and don't open if a wrong non-unknown temperature is
+ // provided
+ auto e = current_sst_file_temperatures_.find(number);
+ if (e != current_sst_file_temperatures_.end() &&
+ e->second != opts.temperature) {
+ result->reset();
+ return IOStatus::PathNotFound("Temperature mismatch on " + fname);
+ }
+ }
+ *result = WrapWithTemperature<FSSequentialFileOwnerWrapper>(
+ number, std::move(*result));
+ }
+ }
+ return s;
+ }
+
+ IOStatus NewRandomAccessFile(const std::string& fname,
+ const FileOptions& opts,
+ std::unique_ptr<FSRandomAccessFile>* result,
+ IODebugContext* dbg) override {
+ IOStatus s = target()->NewRandomAccessFile(fname, opts, result, dbg);
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(GetFileName(fname), &number, &type) &&
+ type == kTableFile) {
+ MutexLock lock(&mu_);
+ requested_sst_file_temperatures_.emplace_back(number, opts.temperature);
+ if (s.ok()) {
+ if (opts.temperature != Temperature::kUnknown) {
+ // Be extra picky and don't open if a wrong non-unknown temperature is
+ // provided
+ auto e = current_sst_file_temperatures_.find(number);
+ if (e != current_sst_file_temperatures_.end() &&
+ e->second != opts.temperature) {
+ result->reset();
+ return IOStatus::PathNotFound("Temperature mismatch on " + fname);
+ }
+ }
+ *result = WrapWithTemperature<FSRandomAccessFileOwnerWrapper>(
+ number, std::move(*result));
+ }
+ }
+ return s;
+ }
+
+ void PopRequestedSstFileTemperatures(
+ std::vector<std::pair<uint64_t, Temperature>>* out = nullptr) {
+ MutexLock lock(&mu_);
+ if (out) {
+ *out = std::move(requested_sst_file_temperatures_);
+ assert(requested_sst_file_temperatures_.empty());
+ } else {
+ requested_sst_file_temperatures_.clear();
+ }
+ }
+
+ IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts,
+ std::unique_ptr<FSWritableFile>* result,
+ IODebugContext* dbg) override {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(GetFileName(fname), &number, &type) &&
+ type == kTableFile) {
+ MutexLock lock(&mu_);
+ current_sst_file_temperatures_[number] = opts.temperature;
+ }
+ return target()->NewWritableFile(fname, opts, result, dbg);
+ }
+
+ void CopyCurrentSstFileTemperatures(std::map<uint64_t, Temperature>* out) {
+ MutexLock lock(&mu_);
+ *out = current_sst_file_temperatures_;
+ }
+
+ void OverrideSstFileTemperature(uint64_t number, Temperature temp) {
+ MutexLock lock(&mu_);
+ current_sst_file_temperatures_[number] = temp;
+ }
+
+ protected:
+ port::Mutex mu_;
+ std::vector<std::pair<uint64_t, Temperature>>
+ requested_sst_file_temperatures_;
+ std::map<uint64_t, Temperature> current_sst_file_temperatures_;
+
+ std::string GetFileName(const std::string& fname) {
+ auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1);
+    // Workaround for Windows, where the path may contain both the Windows
+    // FilePathSeparator and '/'.
+ filename = filename.substr(filename.find_last_of('/') + 1);
+ return filename;
+ }
+
+ template <class FileOwnerWrapperT, /*inferred*/ class FileT>
+ std::unique_ptr<FileT> WrapWithTemperature(uint64_t number,
+ std::unique_ptr<FileT>&& t) {
+ class FileWithTemp : public FileOwnerWrapperT {
+ public:
+ FileWithTemp(FileTemperatureTestFS* fs, uint64_t number,
+ std::unique_ptr<FileT>&& t)
+ : FileOwnerWrapperT(std::move(t)), fs_(fs), number_(number) {}
+
+ Temperature GetTemperature() const override {
+ MutexLock lock(&fs_->mu_);
+ return fs_->current_sst_file_temperatures_[number_];
+ }
+
+ private:
+ FileTemperatureTestFS* fs_;
+ uint64_t number_;
+ };
+ return std::make_unique<FileWithTemp>(this, number, std::move(t));
+ }
+};
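
FileTemperatureTestFS remembers the temperature each new SST was created with, lets a test override it, and records the temperature every read request asked for. A minimal sketch of plugging it in underneath a DBTestBase test; CompositeEnvWrapper is assumed here as the usual glue for turning a FileSystem wrapper back into an Env.

// Sketch only, inside a DBTestBase-derived test body (non-LITE builds).
auto test_fs =
    std::make_shared<FileTemperatureTestFS>(env_->GetFileSystem());
std::unique_ptr<Env> temp_env(new CompositeEnvWrapper(env_, test_fs));

Options options = CurrentOptions();
options.env = temp_env.get();
DestroyAndReopen(options);

ASSERT_OK(Put("key", "value"));
ASSERT_OK(Flush());

std::map<uint64_t, Temperature> temps;
test_fs->CopyCurrentSstFileTemperatures(&temps);
ASSERT_EQ(1u, temps.size());

// Pretend the file migrated to cold storage, then look at what readers
// requested when the table was reopened.
test_fs->OverrideSstFileTemperature(temps.begin()->first, Temperature::kCold);
Reopen(options);
ASSERT_EQ("value", Get("key"));

std::vector<std::pair<uint64_t, Temperature>> requested;
test_fs->PopRequestedSstFileTemperatures(&requested);
ASSERT_FALSE(requested.empty());

Close();  // close the DB before temp_env goes out of scope
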
+
+class OnFileDeletionListener : public EventListener {
+ public:
+ OnFileDeletionListener() : matched_count_(0), expected_file_name_("") {}
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "OnFileDeletionListener"; }
+
+ void SetExpectedFileName(const std::string file_name) {
+ expected_file_name_ = file_name;
+ }
+
+ void VerifyMatchedCount(size_t expected_value) {
+ ASSERT_EQ(matched_count_, expected_value);
+ }
+
+ void OnTableFileDeleted(const TableFileDeletionInfo& info) override {
+ if (expected_file_name_ != "") {
+ ASSERT_EQ(expected_file_name_, info.file_path);
+ expected_file_name_ = "";
+ matched_count_++;
+ }
+ }
+
+ private:
+ size_t matched_count_;
+ std::string expected_file_name_;
+};
+
+class FlushCounterListener : public EventListener {
+ public:
+ const char* Name() const override { return kClassName(); }
+ static const char* kClassName() { return "FlushCounterListener"; }
+ std::atomic<int> count{0};
+ std::atomic<FlushReason> expected_flush_reason{FlushReason::kOthers};
+
+ void OnFlushBegin(DB* /*db*/, const FlushJobInfo& flush_job_info) override {
+ count++;
+ ASSERT_EQ(expected_flush_reason.load(), flush_job_info.flush_reason);
+ }
+};
+#endif
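
FlushCounterListener counts flushes and asserts the FlushReason each one reports, while OnFileDeletionListener does the analogous check for table-file deletions. A minimal sketch of wiring the flush listener into a test; an explicit Flush() call is expected to report FlushReason::kManualFlush.

// Sketch only, inside a DBTestBase-derived test body (non-LITE builds).
auto listener = std::make_shared<FlushCounterListener>();
listener->expected_flush_reason = FlushReason::kManualFlush;

Options options = CurrentOptions();
options.listeners.push_back(listener);
DestroyAndReopen(options);

ASSERT_OK(Put("key", "value"));
ASSERT_OK(Flush());  // triggers OnFlushBegin with kManualFlush
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
ASSERT_EQ(1, listener->count.load());
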
+
+// A test merge operator that mimics Put but fails if any merge operand (or
+// the existing value) is "corrupted".
+class TestPutOperator : public MergeOperator {
+ public:
+ virtual bool FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const override {
+ if (merge_in.existing_value != nullptr &&
+ *(merge_in.existing_value) == "corrupted") {
+ return false;
+ }
+ for (auto value : merge_in.operand_list) {
+ if (value == "corrupted") {
+ return false;
+ }
+ }
+ merge_out->existing_operand = merge_in.operand_list.back();
+ return true;
+ }
+
+ virtual const char* Name() const override { return "TestPutOperator"; }
+};
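
TestPutOperator resolves a merge by keeping only the newest operand (so reads behave as if Put had been used), unless the base value or any operand is the literal string "corrupted", in which case the merge fails. A minimal sketch of the resulting read behavior; the failure is asserted only as a non-OK status.

// Sketch only, inside a DBTestBase-derived test body.
Options options = CurrentOptions();
options.merge_operator = std::make_shared<TestPutOperator>();
DestroyAndReopen(options);

ASSERT_OK(Merge("k", "v1"));
ASSERT_OK(Merge("k", "v2"));
ASSERT_EQ("v2", Get("k"));  // newest operand wins, as with Put

ASSERT_OK(Merge("bad", "corrupted"));
std::string value;
ASSERT_NOK(db_->Get(ReadOptions(), "bad", &value));  // merge failure surfaces
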
+
+// A wrapper around Cache that can easily be extended with instrumentation,
+// etc.
+class CacheWrapper : public Cache {
+ public:
+ explicit CacheWrapper(std::shared_ptr<Cache> target)
+ : target_(std::move(target)) {}
+
+ const char* Name() const override { return target_->Name(); }
+
+ using Cache::Insert;
+ Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value),
+ Handle** handle = nullptr,
+ Priority priority = Priority::LOW) override {
+ return target_->Insert(key, value, charge, deleter, handle, priority);
+ }
+
+ using Cache::Lookup;
+ Handle* Lookup(const Slice& key, Statistics* stats = nullptr) override {
+ return target_->Lookup(key, stats);
+ }
+
+ bool Ref(Handle* handle) override { return target_->Ref(handle); }
+
+ using Cache::Release;
+ bool Release(Handle* handle, bool erase_if_last_ref = false) override {
+ return target_->Release(handle, erase_if_last_ref);
+ }
+
+ void* Value(Handle* handle) override { return target_->Value(handle); }
+
+ void Erase(const Slice& key) override { target_->Erase(key); }
+ uint64_t NewId() override { return target_->NewId(); }
+
+ void SetCapacity(size_t capacity) override { target_->SetCapacity(capacity); }
+
+ void SetStrictCapacityLimit(bool strict_capacity_limit) override {
+ target_->SetStrictCapacityLimit(strict_capacity_limit);
+ }
+
+ bool HasStrictCapacityLimit() const override {
+ return target_->HasStrictCapacityLimit();
+ }
+
+ size_t GetCapacity() const override { return target_->GetCapacity(); }
+
+ size_t GetUsage() const override { return target_->GetUsage(); }
+
+ size_t GetUsage(Handle* handle) const override {
+ return target_->GetUsage(handle);
+ }
+
+ size_t GetPinnedUsage() const override { return target_->GetPinnedUsage(); }
+
+ size_t GetCharge(Handle* handle) const override {
+ return target_->GetCharge(handle);
+ }
+
+ DeleterFn GetDeleter(Handle* handle) const override {
+ return target_->GetDeleter(handle);
+ }
+
+ void ApplyToAllCacheEntries(void (*callback)(void*, size_t),
+ bool thread_safe) override {
+ target_->ApplyToAllCacheEntries(callback, thread_safe);
+ }
+
+ void ApplyToAllEntries(
+ const std::function<void(const Slice& key, void* value, size_t charge,
+ DeleterFn deleter)>& callback,
+ const ApplyToAllEntriesOptions& opts) override {
+ target_->ApplyToAllEntries(callback, opts);
+ }
+
+ void EraseUnRefEntries() override { target_->EraseUnRefEntries(); }
+
+ protected:
+ std::shared_ptr<Cache> target_;
+};
+
+/*
+ * A cache wrapper that tracks certain CacheEntryRole's cache charge, its
+ * peaks and increments
+ *
+ * p0
+ * / \ p1
+ * / \ /\
+ * / \/ \
+ * a / b \
+ * peaks = {p0, p1}
+ * increments = {p0-a, p1-b}
+ */
+template <CacheEntryRole R>
+class TargetCacheChargeTrackingCache : public CacheWrapper {
+ public:
+ explicit TargetCacheChargeTrackingCache(std::shared_ptr<Cache> target);
+
+ using Cache::Insert;
+ Status Insert(const Slice& key, void* value, size_t charge,
+ void (*deleter)(const Slice& key, void* value),
+ Handle** handle = nullptr,
+ Priority priority = Priority::LOW) override;
+
+ using Cache::Release;
+ bool Release(Handle* handle, bool erase_if_last_ref = false) override;
+
+ std::size_t GetCacheCharge() { return cur_cache_charge_; }
+
+ std::deque<std::size_t> GetChargedCachePeaks() { return cache_charge_peaks_; }
+
+ std::size_t GetChargedCacheIncrementSum() {
+ return cache_charge_increments_sum_;
+ }
+
+ private:
+ static const Cache::DeleterFn kNoopDeleter;
+
+ std::size_t cur_cache_charge_;
+ std::size_t cache_charge_peak_;
+ std::size_t cache_charge_increment_;
+ bool last_peak_tracked_;
+ std::deque<std::size_t> cache_charge_peaks_;
+ std::size_t cache_charge_increments_sum_;
+};
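
TargetCacheChargeTrackingCache only counts insertions that carry the reservation manager's no-op deleter for role R, so it observes exactly the "cache charge" reservations and can report the peaks and increment sums sketched in the diagram above. A minimal sketch of wrapping a block cache with it; the options that make RocksDB actually charge kFilterConstruction against the block cache are elided and left as an assumption.

// Sketch only: wrap the block cache so a test can observe charge bookkeeping.
std::shared_ptr<Cache> base_cache = NewLRUCache(1 << 20 /* 1 MB */);
auto tracking_cache = std::make_shared<
    TargetCacheChargeTrackingCache<CacheEntryRole::kFilterConstruction>>(
    base_cache);

BlockBasedTableOptions table_options;
table_options.block_cache = tracking_cache;
// ... enable charging of filter construction in the table options (omitted),
// build the DB options from table_options, run the workload under test ...

std::size_t current = tracking_cache->GetCacheCharge();
std::deque<std::size_t> peaks = tracking_cache->GetChargedCachePeaks();
std::size_t increments = tracking_cache->GetChargedCacheIncrementSum();
(void)current;
(void)peaks;
(void)increments;
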
+
+class DBTestBase : public testing::Test {
+ public:
+ // Sequence of option configurations to try
+ enum OptionConfig : int {
+ kDefault = 0,
+ kBlockBasedTableWithPrefixHashIndex = 1,
+ kBlockBasedTableWithWholeKeyHashIndex = 2,
+ kPlainTableFirstBytePrefix = 3,
+ kPlainTableCappedPrefix = 4,
+ kPlainTableCappedPrefixNonMmap = 5,
+ kPlainTableAllBytesPrefix = 6,
+ kVectorRep = 7,
+ kHashLinkList = 8,
+ kMergePut = 9,
+ kFilter = 10,
+ kFullFilterWithNewTableReaderForCompactions = 11,
+ kUncompressed = 12,
+ kNumLevel_3 = 13,
+ kDBLogDir = 14,
+ kWalDirAndMmapReads = 15,
+ kManifestFileSize = 16,
+ kPerfOptions = 17,
+ kHashSkipList = 18,
+ kUniversalCompaction = 19,
+ kUniversalCompactionMultiLevel = 20,
+ kCompressedBlockCache = 21,
+ kInfiniteMaxOpenFiles = 22,
+ kCRC32cChecksum = 23,
+ kFIFOCompaction = 24,
+ kOptimizeFiltersForHits = 25,
+ kRowCache = 26,
+ kRecycleLogFiles = 27,
+ kConcurrentSkipList = 28,
+ kPipelinedWrite = 29,
+ kConcurrentWALWrites = 30,
+ kDirectIO,
+ kLevelSubcompactions,
+ kBlockBasedTableWithIndexRestartInterval,
+ kBlockBasedTableWithPartitionedIndex,
+ kBlockBasedTableWithPartitionedIndexFormat4,
+ kBlockBasedTableWithLatestFormat,
+ kPartitionedFilterWithNewTableReaderForCompactions,
+ kUniversalSubcompactions,
+ kUnorderedWrite,
+ // This must be the last line
+ kEnd,
+ };
+
+ public:
+ std::string dbname_;
+ std::string alternative_wal_dir_;
+ std::string alternative_db_log_dir_;
+ MockEnv* mem_env_;
+ Env* encrypted_env_;
+ SpecialEnv* env_;
+ std::shared_ptr<Env> env_guard_;
+ DB* db_;
+ std::vector<ColumnFamilyHandle*> handles_;
+
+ int option_config_;
+ Options last_options_;
+
+ // Skip some options, as they may not be applicable to a specific test.
+ // To add more skip constants, use values 4, 8, 16, etc.
+ enum OptionSkip {
+ kNoSkip = 0,
+ kSkipDeletesFilterFirst = 1,
+ kSkipUniversalCompaction = 2,
+ kSkipMergePut = 4,
+ kSkipPlainTable = 8,
+ kSkipHashIndex = 16,
+ kSkipNoSeekToLast = 32,
+ kSkipFIFOCompaction = 128,
+ kSkipMmapReads = 256,
+ };
+
+ const int kRangeDelSkipConfigs =
+ // Plain tables do not support range deletions.
+ kSkipPlainTable |
+ // MmapReads disables the iterator pinning that RangeDelAggregator
+ // requires.
+ kSkipMmapReads;
+
+ // `env_do_fsync` decides whether the special Env would do real
+ // fsync for files and directories. Skipping fsync can speed up
+ // tests, but won't cover the exact fsync logic.
+ DBTestBase(const std::string path, bool env_do_fsync);
+
+ ~DBTestBase();
+
+ static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key%06d", i);
+ return std::string(buf);
+ }
+
+ static bool ShouldSkipOptions(int option_config, int skip_mask = kNoSkip);
+
+ // Switch to a fresh database with the next option configuration to
+ // test. Return false if there are no more configurations to test.
+ bool ChangeOptions(int skip_mask = kNoSkip);
+
+ // Switch between different compaction styles.
+ bool ChangeCompactOptions();
+
+  // Switch between different WAL-related options.
+ bool ChangeWalOptions();
+
+  // Switch between different filter policies.
+  // Jump from kDefault to kFilter to kFullFilter.
+ bool ChangeFilterOptions();
+
+ // Switch between different DB options for file ingestion tests.
+ bool ChangeOptionsForFileIngestionTest();
+
+ // Return the current option configuration.
+ Options CurrentOptions(const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ Options CurrentOptions(const Options& default_options,
+ const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ Options GetDefaultOptions() const;
+
+ Options GetOptions(int option_config) const {
+ return GetOptions(option_config, GetDefaultOptions());
+ }
+
+ Options GetOptions(int option_config, const Options& default_options,
+ const anon::OptionsOverride& options_override =
+ anon::OptionsOverride()) const;
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+ void CreateColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void CreateAndReopenWithCF(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options);
+
+ void ReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const std::vector<Options>& options);
+
+ Status TryReopenWithColumnFamilies(const std::vector<std::string>& cfs,
+ const Options& options);
+
+ void Reopen(const Options& options);
+
+ void Close();
+
+ void DestroyAndReopen(const Options& options);
+
+ void Destroy(const Options& options, bool delete_cf_paths = false);
+
+ Status ReadOnlyReopen(const Options& options);
+
+ Status TryReopen(const Options& options);
+
+ bool IsDirectIOSupported();
+
+ bool IsMemoryMappedAccessSupported() const;
+
+ Status Flush(int cf = 0);
+
+ Status Flush(const std::vector<int>& cf_ids);
+
+ Status Put(const Slice& k, const Slice& v, WriteOptions wo = WriteOptions());
+
+ Status Put(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Merge(const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Merge(int cf, const Slice& k, const Slice& v,
+ WriteOptions wo = WriteOptions());
+
+ Status Delete(const std::string& k);
+
+ Status Delete(int cf, const std::string& k);
+
+ Status SingleDelete(const std::string& k);
+
+ Status SingleDelete(int cf, const std::string& k);
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr);
+
+ std::string Get(int cf, const std::string& k,
+ const Snapshot* snapshot = nullptr);
+
+ Status Get(const std::string& k, PinnableSlice* v);
+
+ std::vector<std::string> MultiGet(std::vector<int> cfs,
+ const std::vector<std::string>& k,
+ const Snapshot* snapshot,
+ const bool batched,
+ const bool async = false);
+
+ std::vector<std::string> MultiGet(const std::vector<std::string>& k,
+ const Snapshot* snapshot = nullptr,
+ const bool async = false);
+
+ uint64_t GetNumSnapshots();
+
+ uint64_t GetTimeOldestSnapshots();
+
+ uint64_t GetSequenceOldestSnapshots();
+
+ // Return a string that contains all key,value pairs in order,
+ // formatted like "(k1->v1)(k2->v2)".
+ std::string Contents(int cf = 0);
+
+ std::string AllEntriesFor(const Slice& user_key, int cf = 0);
+
+ // Similar to AllEntriesFor but this function also covers reopen with fifo.
+ // Note that test cases with snapshots or entries in memtable should simply
+  // use AllEntriesFor instead, as snapshots and entries in memtable will not
+  // survive a db reopen.
+ void CheckAllEntriesWithFifoReopen(const std::string& expected_value,
+ const Slice& user_key, int cf,
+ const std::vector<std::string>& cfs,
+ const Options& options);
+
+#ifndef ROCKSDB_LITE
+ int NumSortedRuns(int cf = 0);
+
+ uint64_t TotalSize(int cf = 0);
+
+ uint64_t SizeAtLevel(int level);
+
+ size_t TotalLiveFiles(int cf = 0);
+
+ size_t CountLiveFiles();
+
+ int NumTableFilesAtLevel(int level, int cf = 0);
+
+ double CompressionRatioAtLevel(int level, int cf = 0);
+
+ int TotalTableFiles(int cf = 0, int levels = -1);
+#endif // ROCKSDB_LITE
+
+ std::vector<uint64_t> GetBlobFileNumbers();
+
+ // Return spread of files per level
+ std::string FilesPerLevel(int cf = 0);
+
+ size_t CountFiles();
+
+ Status CountFiles(size_t* count);
+
+ Status Size(const Slice& start, const Slice& limit, uint64_t* size) {
+ return Size(start, limit, 0, size);
+ }
+
+ Status Size(const Slice& start, const Slice& limit, int cf, uint64_t* size);
+
+ void Compact(int cf, const Slice& start, const Slice& limit,
+ uint32_t target_path_id);
+
+ void Compact(int cf, const Slice& start, const Slice& limit);
+
+ void Compact(const Slice& start, const Slice& limit);
+
+ // Do n memtable compactions, each of which produces an sstable
+ // covering the range [small,large].
+ void MakeTables(int n, const std::string& small, const std::string& large,
+ int cf = 0);
+
+ // Prevent pushing of new sstables into deeper levels by adding
+ // tables that cover a specified range to all levels.
+ void FillLevels(const std::string& smallest, const std::string& largest,
+ int cf);
+
+ void MoveFilesToLevel(int level, int cf = 0);
+
+#ifndef ROCKSDB_LITE
+ void DumpFileCounts(const char* label);
+#endif // ROCKSDB_LITE
+
+ std::string DumpSSTableList();
+
+ static void GetSstFiles(Env* env, std::string path,
+ std::vector<std::string>* files);
+
+ int GetSstFileCount(std::string path);
+
+ // this will generate non-overlapping files since it keeps increasing key_idx
+ void GenerateNewFile(Random* rnd, int* key_idx, bool nowait = false);
+
+ void GenerateNewFile(int fd, Random* rnd, int* key_idx, bool nowait = false);
+
+ static const int kNumKeysByGenerateNewRandomFile;
+ static const int KNumKeysByGenerateNewFile = 100;
+
+ void GenerateNewRandomFile(Random* rnd, bool nowait = false);
+
+ std::string IterStatus(Iterator* iter);
+
+ Options OptionsForLogIterTest();
+
+ std::string DummyString(size_t len, char c = 'a');
+
+ void VerifyIterLast(std::string expected_key, int cf = 0);
+
+ // Used to test InplaceUpdate
+
+  // If the previous value is nullptr or delta is larger than the previous
+  //   value, sets newValue to delta.
+  // If the previous value is not empty, updates it in place with a string of
+  //   'b's of length (previous value size - 1).
+ static UpdateStatus updateInPlaceSmallerSize(char* prevValue,
+ uint32_t* prevSize, Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceSmallerVarintSize(char* prevValue,
+ uint32_t* prevSize,
+ Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceLargerSize(char* prevValue,
+ uint32_t* prevSize, Slice delta,
+ std::string* newValue);
+
+ static UpdateStatus updateInPlaceNoAction(char* prevValue, uint32_t* prevSize,
+ Slice delta, std::string* newValue);
+
+ // Utility method to test InplaceUpdate
+ void validateNumberOfEntries(int numValues, int cf = 0);
+
+ void CopyFile(const std::string& source, const std::string& destination,
+ uint64_t size = 0);
+
+ Status GetAllDataFiles(const FileType file_type,
+ std::unordered_map<std::string, uint64_t>* sst_files,
+ uint64_t* total_size = nullptr);
+
+ std::vector<std::uint64_t> ListTableFiles(Env* env, const std::string& path);
+
+ void VerifyDBFromMap(
+ std::map<std::string, std::string> true_data,
+ size_t* total_reads_res = nullptr, bool tailing_iter = false,
+ std::map<std::string, Status> status = std::map<std::string, Status>());
+
+ void VerifyDBInternal(
+ std::vector<std::pair<std::string, std::string>> true_data);
+
+#ifndef ROCKSDB_LITE
+ uint64_t GetNumberOfSstFilesForColumnFamily(DB* db,
+ std::string column_family_name);
+
+ uint64_t GetSstSizeHelper(Temperature temperature);
+#endif // ROCKSDB_LITE
+
+ uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) {
+ return options.statistics->getTickerCount(ticker_type);
+ }
+
+ uint64_t TestGetAndResetTickerCount(const Options& options,
+ Tickers ticker_type) {
+ return options.statistics->getAndResetTickerCount(ticker_type);
+ }
+
+ // Note: reverting this setting within the same test run is not yet
+ // supported
+ void SetTimeElapseOnlySleepOnReopen(DBOptions* options);
+
+ private: // Prone to error on direct use
+ void MaybeInstallTimeElapseOnlySleep(const DBOptions& options);
+
+ bool time_elapse_only_sleep_on_reopen_ = false;
+};
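
Everything above funnels into DBTestBase: a test suite derives from it, names a per-suite directory, and then drives the DB through the helpers (CurrentOptions, DestroyAndReopen, Put/Get/Flush, ChangeOptions, ...) rather than the raw DB API. A minimal end-to-end sketch; the fixture and test names are invented for illustration.

class ExampleDBTest : public DBTestBase {
 public:
  ExampleDBTest() : DBTestBase("/example_db_test", /*env_do_fsync=*/false) {}
};

TEST_F(ExampleDBTest, PutFlushGet) {
  // Run the same body under several option configurations.
  do {
    Options options = CurrentOptions();
    DestroyAndReopen(options);

    ASSERT_OK(Put("foo", "v1"));
    ASSERT_OK(Flush());
    ASSERT_OK(Put("foo", "v2"));  // newer value lives in the memtable

    ASSERT_EQ("v2", Get("foo"));
    Reopen(options);  // the value also survives a reopen
    ASSERT_EQ("v2", Get("foo"));
  } while (ChangeOptions());
}
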
+
+// For verifying that all files generated by the current version have SST
+// unique ids.
+void VerifySstUniqueIds(const TablePropertiesCollection& props);
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_universal_compaction_test.cc b/src/rocksdb/db/db_universal_compaction_test.cc
new file mode 100644
index 000000000..f53c36f22
--- /dev/null
+++ b/src/rocksdb/db/db_universal_compaction_test.cc
@@ -0,0 +1,2235 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#if !defined(ROCKSDB_LITE)
+#include "rocksdb/utilities/table_properties_collectors.h"
+#include "test_util/sync_point.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string CompressibleString(Random* rnd, int len) {
+ std::string r;
+ test::CompressibleString(rnd, 0.8, len, &r);
+ return r;
+}
+
+class DBTestUniversalCompactionBase
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<int, bool>> {
+ public:
+ explicit DBTestUniversalCompactionBase(const std::string& path)
+ : DBTestBase(path, /*env_do_fsync=*/false) {}
+ void SetUp() override {
+ num_levels_ = std::get<0>(GetParam());
+ exclusive_manual_compaction_ = std::get<1>(GetParam());
+ }
+ int num_levels_;
+ bool exclusive_manual_compaction_;
+};
+
+class DBTestUniversalCompaction : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompaction()
+ : DBTestUniversalCompactionBase("/db_universal_compaction_test") {}
+};
+
+class DBTestUniversalCompaction2 : public DBTestBase {
+ public:
+ DBTestUniversalCompaction2()
+ : DBTestBase("db_universal_compaction_test2", /*env_do_fsync=*/false) {}
+};
+
+namespace {
+void VerifyCompactionResult(
+ const ColumnFamilyMetaData& cf_meta,
+ const std::set<std::string>& overlapping_file_numbers) {
+#ifndef NDEBUG
+ for (auto& level : cf_meta.levels) {
+ for (auto& file : level.files) {
+ assert(overlapping_file_numbers.find(file.name) ==
+ overlapping_file_numbers.end());
+ }
+ }
+#endif
+}
+
+class KeepFilter : public CompactionFilter {
+ public:
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& /*value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return false;
+ }
+
+ const char* Name() const override { return "KeepFilter"; }
+};
+
+class KeepFilterFactory : public CompactionFilterFactory {
+ public:
+ explicit KeepFilterFactory(bool check_context = false)
+ : check_context_(check_context) {}
+
+ std::unique_ptr<CompactionFilter> CreateCompactionFilter(
+ const CompactionFilter::Context& context) override {
+ if (check_context_) {
+ EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
+ EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
+ }
+ return std::unique_ptr<CompactionFilter>(new KeepFilter());
+ }
+
+ const char* Name() const override { return "KeepFilterFactory"; }
+ bool check_context_;
+ std::atomic_bool expect_full_compaction_;
+ std::atomic_bool expect_manual_compaction_;
+};
+} // anonymous namespace
+
+// Make sure we don't hit a problem if the trigger condition is given as 0,
+// which is invalid.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSingleSortedRun) {
+ Options options = CurrentOptions();
+
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+  // Configure universal compaction to always compact down to one single
+  // sorted run.
+ options.level0_file_num_compaction_trigger = 0;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_options_universal.min_merge_width = 2;
+ options.compaction_options_universal.max_size_amplification_percent = 0;
+
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ filter->expect_manual_compaction_.store(false);
+ options.compaction_filter_factory.reset(filter);
+
+ DestroyAndReopen(options);
+ ASSERT_EQ(1, db_->GetOptions().level0_file_num_compaction_trigger);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ filter->expect_full_compaction_.store(true);
+
+ for (int num = 0; num < 16; num++) {
+    // Write a 100KB file. It should immediately be compacted to one file.
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumSortedRuns(0), 1);
+ }
+ ASSERT_OK(Put(Key(key_idx), ""));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumSortedRuns(0), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, OptimizeFiltersForHits) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
+ BlockBasedTableOptions bbto;
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.optimize_filters_for_hits = true;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(3));
+
+ DestroyAndReopen(options);
+
+ // block compaction from happening
+ env_->SetBackgroundThreads(1, Env::LOW);
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::LOW);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ ASSERT_OK(Put(Key(num * 10), "val"));
+ if (num) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(Put(Key(30 + num * 10), "val"));
+ ASSERT_OK(Put(Key(60 + num * 10), "val"));
+ }
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+
+  // Query a set of non-existent keys.
+ for (int i = 5; i < 90; i += 10) {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+
+ // Make sure bloom filter is used at least once.
+ ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0);
+ auto prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
+ // Make sure bloom filter is used for all but the last L0 file when looking
+ // up a non-existent key that's in the range of all L0 files.
+ ASSERT_EQ(Get(Key(35)), "NOT_FOUND");
+ ASSERT_EQ(prev_counter + NumTableFilesAtLevel(0) - 1,
+ TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+ prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL);
+
+  // Unblock compaction and wait for it to happen.
+ sleeping_task_low.WakeUp();
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // The same queries will not trigger bloom filter
+ for (int i = 5; i < 90; i += 10) {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ ASSERT_EQ(prev_counter, TestGetTickerCount(options, BLOOM_FILTER_USEFUL));
+}
+
+// TODO(kailiu) The tests on UniversalCompaction have some issues:
+// 1. A lot of magic numbers ("11" or "12").
+// 2. Assumptions about the memtable flush conditions, which may change from
+//    time to time.
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrigger) {
+ Options options;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10;
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ filter->expect_manual_compaction_.store(false);
+ options.compaction_filter_factory.reset(filter);
+
+ options = CurrentOptions(options);
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWritableFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ if (num_levels_ > 3) {
+ ASSERT_LE(preallocation_size, options.target_file_size_base * 1.1);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ filter->expect_full_compaction_.store(true);
+ // Stage 1:
+ // Generate a set of files at level 0, but don't trigger level-0
+ // compaction.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 100KB
+ GenerateNewFile(1, &rnd, &key_idx);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+  // Suppose each file flushed from the memtable has size 1. Now we compact
+  // level0_file_num_compaction_trigger = 4 files and should end up with one
+  // big file of size 4.
+ ASSERT_EQ(NumSortedRuns(1), 1);
+
+ // Stage 2:
+ // Now we have one file at level 0, with size 4. We also have some data in
+ // mem table. Let's continue generating new files at level 0, but don't
+ // trigger level-0 compaction.
+ // First, clean up memtable before inserting new data. This will generate
+ // a level-0 file, with size around 0.4 (according to previously written
+ // data amount).
+ filter->expect_full_compaction_.store(false);
+ ASSERT_OK(Flush(1));
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_EQ(NumSortedRuns(1), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+ // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+ // After compaction, we should have 2 files, with size 4, 2.4.
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Stage 3:
+ // Now we have 2 files at level 0, with size 4 and 2.4. Continue
+ // generating new files at level 0.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_EQ(NumSortedRuns(1), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ GenerateNewFile(1, &rnd, &key_idx);
+ // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1.
+ // After compaction, we should have 3 files, with size 4, 2.4, 2.
+ ASSERT_EQ(NumSortedRuns(1), 3);
+
+ // Stage 4:
+ // Now we have 3 files at level 0, with size 4, 2.4, 2. Let's generate a
+ // new file of size 1.
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Level-0 compaction is triggered, but no file will be picked up.
+ ASSERT_EQ(NumSortedRuns(1), 4);
+
+ // Stage 5:
+ // Now we have 4 files at level 0, with size 4, 2.4, 2, 1. Let's generate
+ // a new file of size 1.
+ filter->expect_full_compaction_.store(true);
+ GenerateNewFile(1, &rnd, &key_idx);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // All files at level 0 will be compacted into a single one.
+ ASSERT_EQ(NumSortedRuns(1), 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSizeAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate two files in Level 0. Both files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Flush whatever is remaining in memtable. This is typically
+ // small, which should not trigger size ratio based compaction
+ // but will instead trigger size amplification.
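+ // (Illustrative estimate: size amplification is approximated as the total
+ // size of all sorted runs except the oldest one, divided by the size of the
+ // oldest run. Here that is roughly (1 + small flush) / 1, which exceeds the
+ // 110% threshold, so a full compaction into one sorted run is expected.)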
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Verify that size amplification did occur
+ ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionSizeAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ // Initial setup of compaction_options_universal will prevent universal
+ // compaction from happening
+ options.compaction_options_universal.size_ratio = 100;
+ options.compaction_options_universal.min_merge_width = 100;
+ DestroyAndReopen(options);
+
+ int total_picked_compactions = 0;
+ int total_size_amp_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ Compaction* c = static_cast<Compaction*>(arg);
+ if (c->compaction_reason() ==
+ CompactionReason::kUniversalSizeAmplification) {
+ total_size_amp_compactions++;
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate two files in Level 0. Both files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), 2);
+
+ // Flush whatever is remaining in memtable. This is typically
+ // small, which should not trigger size ratio based compaction
+ // but could instead trigger size amplification if it's set
+ // to 110.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Verify compaction did not happen
+ ASSERT_EQ(NumSortedRuns(1), 3);
+
+ // Trigger compaction if size amplification exceeds 110% without reopening DB
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_size_amplification_percent,
+ 200U);
+ ASSERT_OK(dbfull()->SetOptions(handles_[1],
+ {{"compaction_options_universal",
+ "{max_size_amplification_percent=110;}"}}));
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_size_amplification_percent,
+ 110u);
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(110u, mutable_cf_options.compaction_options_universal
+ .max_size_amplification_percent);
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Verify that size amplification did happen
+ ASSERT_EQ(NumSortedRuns(1), 1);
+ ASSERT_EQ(total_picked_compactions, 1);
+ ASSERT_EQ(total_size_amp_compactions, 1);
+}
+
+TEST_P(DBTestUniversalCompaction, DynamicUniversalCompactionReadAmplification) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 3;
+ // Initial setup of compaction_options_universal will prevent universal
+ // compaction from happening
+ options.compaction_options_universal.max_size_amplification_percent = 2000;
+ options.compaction_options_universal.size_ratio = 0;
+ options.compaction_options_universal.min_merge_width = 100;
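+ // (min_merge_width=100 means a size-ratio compaction would need at least 100
+ // sorted runs, and the 2000% size-amplification limit is far above anything
+ // this test produces, so no automatic compaction is expected until these
+ // options are lowered below.)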
+ DestroyAndReopen(options);
+
+ int total_picked_compactions = 0;
+ int total_size_ratio_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ Compaction* c = static_cast<Compaction*>(arg);
+ if (c->compaction_reason() == CompactionReason::kUniversalSizeRatio) {
+ total_size_ratio_compactions++;
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ MutableCFOptions mutable_cf_options;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Generate three files in Level 0. All files are approx the same size.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger);
+
+ // Flush whatever is remaining in memtable. This is typically small, about
+ // 30KB.
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Verify compaction did not happen
+ ASSERT_EQ(NumSortedRuns(1), options.level0_file_num_compaction_trigger + 1);
+ ASSERT_EQ(total_picked_compactions, 0);
+
+ ASSERT_OK(dbfull()->SetOptions(
+ handles_[1],
+ {{"compaction_options_universal",
+ "{min_merge_width=2;max_merge_width=2;size_ratio=100;}"}}));
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(dbfull()
+ ->GetOptions(handles_[1])
+ .compaction_options_universal.max_merge_width,
+ 2u);
+ ASSERT_EQ(
+ dbfull()->GetOptions(handles_[1]).compaction_options_universal.size_ratio,
+ 100u);
+
+ ASSERT_OK(dbfull()->TEST_GetLatestMutableCFOptions(handles_[1],
+ &mutable_cf_options));
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.size_ratio, 100u);
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.min_merge_width,
+ 2u);
+ ASSERT_EQ(mutable_cf_options.compaction_options_universal.max_merge_width,
+ 2u);
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Files in L0 are approx: 0.3 (30KB), 1, 1, 1.
+ // On compaction: the files are below the size-amplification threshold, so we
+ // fall through to checking the read-amplification conditions. The configured
+ // size ratio is not big enough to take 0.3 into consideration. So the next
+ // two files of size 1 are compacted together first, as they satisfy the size
+ // ratio condition and the (min_merge_width, max_merge_width) condition,
+ // producing a file of size 2. Next, the newly generated 2 and the last file
+ // of size 1 are compacted together. In the end: #sorted_runs = 2,
+ // #picked_compactions = 2, and all picked compactions are size-ratio based.
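+ // (Illustrative check against the documented size-ratio rule: a run of size
+ // s joins the candidate set while accumulated * (100 + size_ratio) / 100 >= s.
+ // Starting at 0.3 with size_ratio=100: 0.3 * 2 = 0.6 < 1, so that candidate
+ // set stays below min_merge_width=2 and is skipped. Starting at the next run:
+ // 1 * 2 >= 1, and max_merge_width=2 stops the pick there, giving the first
+ // (1, 1) compaction described above.)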
+ ASSERT_EQ(NumSortedRuns(1), 2);
+ // If max_merge_width had not been changed dynamically above, and if it had
+ // kept its default value of UINT_MAX, total_picked_compactions would have
+ // been 1.
+ ASSERT_EQ(total_picked_compactions, 2);
+ ASSERT_EQ(total_size_ratio_compactions, 2);
+}
+
+TEST_P(DBTestUniversalCompaction, CompactFilesOnUniversalCompaction) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 10;
+
+ ChangeCompactOptions();
+ Options options;
+ options.create_if_missing = true;
+ options.compaction_style = kCompactionStyleLevel;
+ options.num_levels = 1;
+ options.target_file_size_base = options.write_buffer_size;
+ options.compression = kNoCompression;
+ options = CurrentOptions(options);
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
+ Random rnd(301);
+ for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) {
+ ASSERT_OK(Put(1, std::to_string(key), rnd.RandomString(kTestValueSize)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ColumnFamilyMetaData cf_meta;
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ std::vector<std::string> compaction_input_file_names;
+ for (auto file : cf_meta.levels[0].files) {
+ if (rnd.OneIn(2)) {
+ compaction_input_file_names.push_back(file.name);
+ }
+ }
+
+ if (compaction_input_file_names.size() == 0) {
+ compaction_input_file_names.push_back(cf_meta.levels[0].files[0].name);
+ }
+
+ // Expect failure since universal compaction only allows L0 output
+ ASSERT_FALSE(dbfull()
+ ->CompactFiles(CompactionOptions(), handles_[1],
+ compaction_input_file_names, 1)
+ .ok());
+
+ // expect ok and verify the compacted files no longer exist.
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
+ compaction_input_file_names, 0));
+
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ VerifyCompactionResult(
+ cf_meta, std::set<std::string>(compaction_input_file_names.begin(),
+ compaction_input_file_names.end()));
+
+ compaction_input_file_names.clear();
+
+ // Pick the first and the last file, expect everything is
+ // compacted into one single file.
+ compaction_input_file_names.push_back(cf_meta.levels[0].files[0].name);
+ compaction_input_file_names.push_back(
+ cf_meta.levels[0].files[cf_meta.levels[0].files.size() - 1].name);
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), handles_[1],
+ compaction_input_file_names, 0));
+
+ dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ ASSERT_EQ(cf_meta.levels[0].files.size(), 1U);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTargetLevel) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.num_levels = 7;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ // Generate 3 overlapping files
+ Random rnd(301);
+ for (int i = 0; i < 210; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 200; i < 300; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 250; i < 260; i++) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ("3", FilesPerLevel(0));
+ // Compact all files into 1 file and put it in L4
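+ // (With change_level=true, the files produced by the range compaction are
+ // moved to target_level afterwards, which is how the single output file ends
+ // up in L4 below.)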
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 4;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0));
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class DBTestUniversalCompactionMultiLevels
+ : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompactionMultiLevels()
+ : DBTestUniversalCompactionBase(
+ "/db_universal_compaction_multi_levels_test") {}
+};
+
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionMultiLevels) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 8;
+ options.max_background_compactions = 3;
+ options.target_file_size_base = 32 * 1024;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 100000;
+ for (int i = 0; i < num_keys * 2; i++) {
+ ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+}
+
+// Tests universal compaction with trivial move enabled
+TEST_P(DBTestUniversalCompactionMultiLevels, UniversalCompactionTrivialMove) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ non_trivial_move++;
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
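+ // (allow_trivial_move lets universal compaction move whole input files to
+ // the output level without rewriting them, roughly when the inputs do not
+ // overlap; the TrivialMove sync point above counts such moves.)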
+ options.num_levels = 3;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 32 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 150000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(trivial_move, 0);
+ ASSERT_GT(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+INSTANTIATE_TEST_CASE_P(MultiLevels, DBTestUniversalCompactionMultiLevels,
+ ::testing::Combine(::testing::Values(3, 20),
+ ::testing::Bool()));
+
+class DBTestUniversalCompactionParallel : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalCompactionParallel()
+ : DBTestUniversalCompactionBase("/db_universal_compaction_prallel_test") {
+ }
+};
+
+TEST_P(DBTestUniversalCompactionParallel, UniversalCompactionParallel) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.env = env_;
+ options.write_buffer_size = 1 << 10; // 1KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 3;
+ options.max_background_flushes = 3;
+ options.target_file_size_base = 1 * 1024;
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Delay every compaction so multiple compactions will happen.
+ std::atomic<int> num_compactions_running(0);
+ std::atomic<bool> has_parallel(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ if (num_compactions_running.fetch_add(1) > 0) {
+ has_parallel.store(true);
+ return;
+ }
+ for (int nwait = 0; nwait < 20000; nwait++) {
+ if (has_parallel.load() || num_compactions_running.load() > 1) {
+ has_parallel.store(true);
+ break;
+ }
+ env_->SleepForMicroseconds(1000);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():End",
+ [&](void* /*arg*/) { num_compactions_running.fetch_add(-1); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 30000;
+ for (int i = 0; i < num_keys * 2; i++) {
+ ASSERT_OK(Put(1, Key(i % num_keys), Key(i)));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(num_compactions_running.load(), 0);
+ ASSERT_TRUE(has_parallel.load());
+
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+
+ // Reopen and check.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ for (int i = num_keys; i < num_keys * 2; i++) {
+ ASSERT_EQ(Get(1, Key(i % num_keys)), Key(i));
+ }
+}
+
+TEST_P(DBTestUniversalCompactionParallel, PickByFileNumberBug) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 1 * 1024; // 1KB
+ options.level0_file_num_compaction_trigger = 7;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 1024 * 1024; // 1MB
+
+ // Disable size amplification compaction
+ options.compaction_options_universal.max_size_amplification_percent =
+ UINT_MAX;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBTestUniversalCompactionParallel::PickByFileNumberBug:0",
+ "BackgroundCallCompaction:0"},
+ {"UniversalCompactionBuilder::PickCompaction:Return",
+ "DBTestUniversalCompactionParallel::PickByFileNumberBug:1"},
+ {"DBTestUniversalCompactionParallel::PickByFileNumberBug:2",
+ "CompactionJob::Run():Start"}});
+
+ int total_picked_compactions = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionBuilder::PickCompaction:Return", [&](void* arg) {
+ if (arg) {
+ total_picked_compactions++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Write 7 files to trigger compaction
+ int key_idx = 1;
+ for (int i = 1; i <= 70; i++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ if (i % 10 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ // Wait for the 1st background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Write 3 files while 1st compaction is held
+ // These 3 files have different sizes to avoid compacting based on size_ratio
+ int num_keys = 1000;
+ for (int i = 0; i < 3; i++) {
+ for (int j = 1; j <= num_keys; j++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ }
+ ASSERT_OK(Flush());
+ num_keys -= 100;
+ }
+
+ // Hold the 1st compaction from finishing
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // There should only be one picked compaction as the score drops below one
+ // after the first one is picked.
+ EXPECT_EQ(total_picked_compactions, 1);
+ EXPECT_EQ(TotalTableFiles(), 4);
+
+ // Stop SyncPoint and destroy the DB and reopen it again
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ key_idx = 1;
+ total_picked_compactions = 0;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Write 7 files to trigger compaction
+ for (int i = 1; i <= 70; i++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ if (i % 10 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ // Wait for the 1st background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace();
+
+ // Write 8 files while 1st compaction is held
+ // These 8 files have different sizes to avoid compacting based on size_ratio
+ num_keys = 1000;
+ for (int i = 0; i < 8; i++) {
+ for (int j = 1; j <= num_keys; j++) {
+ std::string k = Key(key_idx++);
+ ASSERT_OK(Put(k, k));
+ }
+ ASSERT_OK(Flush());
+ num_keys -= 100;
+ }
+
+ // Wait for the 2nd background compaction process to start
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:0");
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:1");
+
+ // Hold the 1st and 2nd compaction from finishing
+ TEST_SYNC_POINT("DBTestUniversalCompactionParallel::PickByFileNumberBug:2");
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // This time we will trigger one compaction because of the size ratio and
+ // another compaction because the number of files that have not been
+ // compacted is greater than 7.
+ EXPECT_GE(total_picked_compactions, 2);
+}
+
+INSTANTIATE_TEST_CASE_P(Parallel, DBTestUniversalCompactionParallel,
+ ::testing::Combine(::testing::Values(1, 10),
+ ::testing::Values(false)));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionOptions) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10; // 4KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = -1;
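+ // (compression_size_percent = -1 disables the size-based compression split,
+ // so all compaction output simply follows the column family's compression
+ // setting.)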
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) {
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(1, Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+ if (num < options.level0_file_num_compaction_trigger - 1) {
+ ASSERT_EQ(NumSortedRuns(1), num + 1);
+ }
+ }
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(NumSortedRuns(1), 1);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionStopStyleSimilarSize) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 105 << 10; // 105KB
+ options.arena_block_size = 4 << 10; // 4KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ // trigger compaction if there are >= 4 files
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_options_universal.size_ratio = 10;
+ options.compaction_options_universal.stop_style =
+ kCompactionStopStyleSimilarSize;
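+ // (kCompactionStopStyleSimilarSize stops adding sorted runs to a candidate
+ // compaction once the next run is no longer of similar size, instead of the
+ // default total-size based stop style.)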
+ options.num_levels = num_levels_;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // Stage 1:
+ // Generate a set of files at level 0, but don't trigger level-0
+ // compaction.
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(NumSortedRuns(), num + 1);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Suppose each file flushed from the memtable has size 1. Now we compact
+ // level0_file_num_compaction_trigger (= 4) files and should end up with one
+ // big file of size 4.
+ ASSERT_EQ(NumSortedRuns(), 1);
+
+ // Stage 2:
+ // Now we have one file at level 0, with size 4. We also have some data in
+ // mem table. Let's continue generating new files at level 0, but don't
+ // trigger level-0 compaction.
+ // First, clean up memtable before inserting new data. This will generate
+ // a level-0 file, with size around 0.4 (according to previously written
+ // data amount).
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 3;
+ num++) {
+ // Write 100KB (100 values, each 1K)
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(NumSortedRuns(), num + 3);
+ }
+
+ // Generate one more file at level-0, which should trigger level-0
+ // compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1.
+ // After compaction, we should have 3 files, with size 4, 0.4, 2.
+ ASSERT_EQ(NumSortedRuns(), 3);
+ // Stage 3:
+ // Now we have 3 files at level 0, with size 4, 0.4, 2. Generate one
+ // more file at level-0, which should trigger level-0 compaction.
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(Put(Key(key_idx), rnd.RandomString(990)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Level-0 compaction is triggered, but no file will be picked up.
+ ASSERT_EQ(NumSortedRuns(), 4);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio1) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = 70;
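+ // (With compression_size_percent = 70, roughly the oldest 70% of the data is
+ // compressed and the newest 30% is left uncompressed; the size assertions
+ // below rely on that split.)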
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // The first compaction (2) is compressed.
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_LT(TotalSize(), 110000U * 2 * 0.9);
+
+ // The second compaction (4) is compressed
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_LT(TotalSize(), 110000 * 4 * 0.9);
+
+ // The third compaction (2 4) is compressed since this time it is
+ // (1 1 3.2) and 3.2/5.2 doesn't reach ratio.
+ for (int num = 0; num < 2; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_LT(TotalSize(), 110000 * 6 * 0.9);
+
+ // When compaction builds up to (2 4 8), the newest data is not compressed,
+ // as it falls into the uncompressed newest portion of the data.
+ for (int num = 0; num < 8; num++) {
+ // Write 110KB (11 values, each 10K)
+ for (int i = 0; i < 11; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_GT(TotalSize(), 110000 * 11 * 0.8 + 110000 * 2);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCompressRatio2) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = num_levels_;
+ options.compaction_options_universal.compression_size_percent = 95;
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // When compaction builds up to (2 4 8), even the newest data is compressed,
+ // given the 95% compression_size_percent.
+ for (int num = 0; num < 14; num++) {
+ // Write 120KB (12 values, each 10K)
+ for (int i = 0; i < 12; i++) {
+ ASSERT_OK(Put(Key(key_idx), CompressibleString(&rnd, 10000)));
+ key_idx++;
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_LT(TotalSize(), 120000U * 12 * 0.82 + 120000 * 2);
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest1) {
+ int32_t trivial_move = 0;
+ int32_t non_trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ non_trivial_move++;
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
+ options.num_levels = 2;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_background_compactions = 1;
+ options.target_file_size_base = 32 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 250000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(trivial_move, 0);
+ ASSERT_GT(non_trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+// Test that checks trivial move in universal compaction
+TEST_P(DBTestUniversalCompaction, UniversalCompactionTrivialMoveTest2) {
+ int32_t trivial_move = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:TrivialMove",
+ [&](void* /*arg*/) { trivial_move++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:NonTrivial", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ int output_level = *(static_cast<int*>(arg));
+ ASSERT_EQ(output_level, 0);
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.allow_trivial_move = true;
+ options.num_levels = 15;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.level0_file_num_compaction_trigger = 8;
+ options.max_background_compactions = 2;
+ options.target_file_size_base = 64 * 1024;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ Random rnd(301);
+ int num_keys = 500000;
+ for (int i = 0; i < num_keys; i++) {
+ ASSERT_OK(Put(1, Key(i), Key(i)));
+ }
+ std::vector<std::string> values;
+
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(trivial_move, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionFourPaths) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.write_buffer_size = 111 << 10; // 111KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
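+ // (With multiple db_paths, the compaction output path is chosen based on the
+ // estimated output size relative to the per-path target sizes, so the
+ // progressively larger compacted files below land in the later, larger paths;
+ // the GetSstFileCount assertions track that movement.)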
+
+ std::vector<std::string> filenames;
+ if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) {
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ ASSERT_OK(
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]));
+ }
+ ASSERT_OK(env_->DeleteDir(options.db_paths[1].path));
+ }
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB file triggers a compaction to a 400K file in the second path
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1,1,4) -> (2, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 2, 4) -> (3, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 4) -> (8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+ // (1, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 1, 8) -> (2, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 2, 8) -> (3, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 8) -> (4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+
+ // (1, 4, 8) -> (5, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[3].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionCFPathUse) {
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024);
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 10;
+ options.write_buffer_size = 111 << 10; // 111KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
+
+ std::vector<Options> option_vector;
+ option_vector.emplace_back(options);
+ ColumnFamilyOptions cf_opt1(options), cf_opt2(options);
+ // Configure CF1 specific paths.
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1", 300 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_2", 300 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_3", 500 * 1024);
+ cf_opt1.cf_paths.emplace_back(dbname_ + "cf1_4", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt1);
+ CreateColumnFamilies({"one"}, option_vector[1]);
+
+ // Configure CF2 specific paths.
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2", 300 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_2", 300 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_3", 500 * 1024);
+ cf_opt2.cf_paths.emplace_back(dbname_ + "cf2_4", 1024 * 1024 * 1024);
+ option_vector.emplace_back(DBOptions(options), cf_opt2);
+ CreateColumnFamilies({"two"}, option_vector[2]);
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ Random rnd(301);
+ int key_idx = 0;
+ int key_idx1 = 0;
+ int key_idx2 = 0;
+
+ auto generate_file = [&]() {
+ GenerateNewFile(0, &rnd, &key_idx);
+ GenerateNewFile(1, &rnd, &key_idx1);
+ GenerateNewFile(2, &rnd, &key_idx2);
+ };
+
+ auto check_sstfilecount = [&](int path_id, int expected) {
+ ASSERT_EQ(expected, GetSstFileCount(options.db_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt1.cf_paths[path_id].path));
+ ASSERT_EQ(expected, GetSstFileCount(cf_opt2.cf_paths[path_id].path));
+ };
+
+ auto check_getvalues = [&]() {
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(0, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx1; i++) {
+ auto v = Get(1, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ for (int i = 0; i < key_idx2; i++) {
+ auto v = Get(2, Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+ };
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ generate_file();
+ }
+
+ // Another 110KB file triggers a compaction to a 400K file in the second path
+ generate_file();
+ check_sstfilecount(2, 1);
+
+ // (1, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(0, 1);
+
+ // (1,1,4) -> (2, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 2, 4) -> (3, 4)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 3, 4) -> (8)
+ generate_file();
+ check_sstfilecount(3, 1);
+
+ // (1, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(0, 1);
+
+ // (1, 1, 8) -> (2, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(1, 1);
+
+ // (1, 2, 8) -> (3, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(1, 1);
+ check_sstfilecount(0, 0);
+
+ // (1, 3, 8) -> (4, 8)
+ generate_file();
+ check_sstfilecount(2, 1);
+ check_sstfilecount(3, 1);
+
+ // (1, 4, 8) -> (5, 8)
+ generate_file();
+ check_sstfilecount(3, 1);
+ check_sstfilecount(2, 1);
+ check_sstfilecount(0, 0);
+
+ check_getvalues();
+
+ ReopenWithColumnFamilies({"default", "one", "two"}, option_vector);
+
+ check_getvalues();
+
+ Destroy(options, true);
+}
+
+TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) {
+ std::function<void(int)> verify_func = [&](int num_keys_in_db) {
+ std::string keys_in_db;
+ Iterator* iter = dbfull()->NewIterator(ReadOptions(), handles_[1]);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ keys_in_db.append(iter->key().ToString());
+ keys_in_db.push_back(',');
+ }
+ delete iter;
+
+ std::string expected_keys;
+ for (int i = 0; i <= num_keys_in_db; i++) {
+ expected_keys.append(Key(i));
+ expected_keys.push_back(',');
+ }
+
+ ASSERT_EQ(keys_in_db, expected_keys);
+ };
+
+ Random rnd(301);
+ int max_key1 = 200;
+ int max_key2 = 600;
+ int max_key3 = 800;
+ const int KNumKeysPerFile = 10;
+
+ // Stage 1: open a DB with universal compaction, num_levels=1
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options.write_buffer_size = 200 << 10; // 200KB
+ options.level0_file_num_compaction_trigger = 3;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysPerFile));
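+ // (test::NewSpecialSkipListFactory(N) produces memtables that report
+ // themselves full after roughly N entries, so each file here holds a
+ // predictable number of keys.)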
+ options = CurrentOptions(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ for (int i = 0; i <= max_key1; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // Stage 2: reopen with universal compaction, num_levels=4
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 4;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ verify_func(max_key1);
+
+ // Insert more keys
+ for (int i = max_key1 + 1; i <= max_key2; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ verify_func(max_key2);
+ // Compaction to non-L0 has happened.
+ ASSERT_GT(NumTableFilesAtLevel(options.num_levels - 1, 1), 0);
+
+ // Stage 3: Compact everything back into level 0, then revert to num_levels=1.
+ options.num_levels = 4;
+ options.target_file_size_base = INT_MAX;
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ // Compact all to level 0
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 0;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(
+ dbfull()->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ // Need to restart it once to remove higher level records in manifest.
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ // Final reopen
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 1;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+
+ // Insert more keys
+ for (int i = max_key2 + 1; i <= max_key3; i++) {
+ // each value is 10K
+ ASSERT_OK(Put(1, Key(i), rnd.RandomString(10000)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ verify_func(max_key3);
+}
+
+TEST_P(DBTestUniversalCompaction, UniversalCompactionSecondPathRatio) {
+ if (!Snappy_Supported()) {
+ return;
+ }
+ Options options = CurrentOptions();
+ options.db_paths.emplace_back(dbname_, 500 * 1024);
+ options.db_paths.emplace_back(dbname_ + "_2", 1024 * 1024 * 1024);
+ options.compaction_style = kCompactionStyleUniversal;
+ options.compaction_options_universal.size_ratio = 5;
+ options.write_buffer_size = 111 << 10; // 111KB
+ options.arena_block_size = 4 << 10;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 1;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+
+ std::vector<std::string> filenames;
+ if (env_->GetChildren(options.db_paths[1].path, &filenames).ok()) {
+ // Delete archival files.
+ for (size_t i = 0; i < filenames.size(); ++i) {
+ ASSERT_OK(
+ env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]));
+ }
+ ASSERT_OK(env_->DeleteDir(options.db_paths[1].path));
+ }
+ Reopen(options);
+
+ Random rnd(301);
+ int key_idx = 0;
+
+ // First three 110KB files are not going to second path.
+ // After that, (100K, 200K)
+ for (int num = 0; num < 3; num++) {
+ GenerateNewFile(&rnd, &key_idx);
+ }
+
+ // Another 110KB file triggers a compaction to a 400K file in the second path
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // (1, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1,1,4) -> (2, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 2, 4) -> (3, 4)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 4) -> (8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 1, 8) -> (2, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(1, GetSstFileCount(dbname_));
+
+ // (1, 2, 8) -> (3, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 3, 8) -> (4, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ // (1, 4, 8) -> (5, 8)
+ GenerateNewFile(&rnd, &key_idx);
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path));
+ ASSERT_EQ(0, GetSstFileCount(dbname_));
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Reopen(options);
+
+ for (int i = 0; i < key_idx; i++) {
+ auto v = Get(Key(i));
+ ASSERT_NE(v, "NOT_FOUND");
+ ASSERT_TRUE(v.size() == 1 || v.size() == 990);
+ }
+
+ Destroy(options);
+}
+
+TEST_P(DBTestUniversalCompaction, ConcurrentBottomPriLowPriCompactions) {
+ if (num_levels_ == 1) {
+ // for single-level universal, everything's bottom level so nothing should
+ // be executed in bottom-pri thread pool.
+ return;
+ }
+ const int kNumFilesTrigger = 3;
+ Env::Default()->SetBackgroundThreads(1, Env::Priority::BOTTOM);
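+ // The bottom-priority pool now has one thread, so compactions that reach the
+ // bottommost level can be forwarded to it (see the ForwardToBottomPriPool
+ // sync point below) and run concurrently with low-priority compactions.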
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.max_background_compactions = 2;
+ options.num_levels = num_levels_;
+ options.write_buffer_size = 100 << 10; // 100KB
+ options.target_file_size_base = 32 << 10; // 32KB
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ // Trigger compaction if size amplification exceeds 110%
+ options.compaction_options_universal.max_size_amplification_percent = 110;
+ DestroyAndReopen(options);
+
+ // Need to get a token to enable compaction parallelism up to
+ // `max_background_compactions` jobs.
+ auto pressure_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {// wait for the full compaction to be picked before adding files intended
+ // for the second one.
+ {"DBImpl::BackgroundCompaction:ForwardToBottomPriPool",
+ "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0"},
+ // the full (bottom-pri) compaction waits until a partial (low-pri)
+ // compaction has started to verify they can run in parallel.
+ {"DBImpl::BackgroundCompaction:NonTrivial",
+ "DBImpl::BGWorkBottomCompaction"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int i = 0; i < 2; ++i) {
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx, true /* no_wait */);
+ // use no_wait above because that one waits for flush and compaction. We
+ // don't want to wait for compaction because the full compaction is
+ // intentionally blocked while more files are flushed.
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ if (i == 0) {
+ TEST_SYNC_POINT(
+ "DBTestUniversalCompaction:ConcurrentBottomPriLowPriCompactions:0");
+ }
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ // First compaction should output to bottom level. Second should output to L0
+ // since older L0 files pending compaction prevent it from being placed lower.
+ ASSERT_EQ(NumSortedRuns(), 2);
+ ASSERT_GT(NumTableFilesAtLevel(0), 0);
+ ASSERT_GT(NumTableFilesAtLevel(num_levels_ - 1), 0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Env::Default()->SetBackgroundThreads(0, Env::Priority::BOTTOM);
+}
+
+TEST_P(DBTestUniversalCompaction, RecalculateScoreAfterPicking) {
+ // Regression test for extra compactions scheduled. Once enough compactions
+ // have been scheduled to bring the score below one, we should stop
+ // scheduling more; otherwise, other CFs/DBs may be delayed unnecessarily.
+ const int kNumFilesTrigger = 8;
+ Options options = CurrentOptions();
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(KNumKeysByGenerateNewFile - 1));
+ options.compaction_options_universal.max_merge_width = kNumFilesTrigger / 2;
+ options.compaction_options_universal.max_size_amplification_percent =
+ static_cast<unsigned int>(-1);
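+ // (static_cast<unsigned int>(-1) is UINT_MAX, which effectively disables
+ // size-amplification compactions; combined with max_merge_width above, each
+ // picked compaction merges at most kNumFilesTrigger / 2 = 4 runs.)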
+ options.compaction_style = kCompactionStyleUniversal;
+ options.level0_file_num_compaction_trigger = kNumFilesTrigger;
+ options.num_levels = num_levels_;
+ Reopen(options);
+
+ std::atomic<int> num_compactions_attempted(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void* /*arg*/) { ++num_compactions_attempted; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ for (int num = 0; num < kNumFilesTrigger; num++) {
+ ASSERT_EQ(NumSortedRuns(), num);
+ int key_idx = 0;
+ GenerateNewFile(&rnd, &key_idx);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Compacting the first four files was enough to bring the score below one so
+ // there's no need to schedule any more compactions.
+ ASSERT_EQ(1, num_compactions_attempted);
+ ASSERT_EQ(NumSortedRuns(), 5);
+}
+
+TEST_P(DBTestUniversalCompaction, FinalSortedRunCompactFilesConflict) {
+ // Regression test for conflict between:
+ // (1) Running CompactFiles including file in the final sorted run; and
+ // (2) Picking universal size-amp-triggered compaction, which always includes
+ // the final sorted run.
+ if (exclusive_manual_compaction_) {
+ return;
+ }
+
+ Options opts = CurrentOptions();
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.compaction_options_universal.max_size_amplification_percent = 50;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compression = kNoCompression;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.max_background_compactions = 2;
+ opts.num_levels = num_levels_;
+ Reopen(opts);
+
+ // make sure compaction jobs can be parallelized
+ auto stop_token =
+ dbfull()->TEST_write_controler().GetCompactionPressureToken();
+
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(NumTableFilesAtLevel(num_levels_ - 1), 1);
+ ColumnFamilyMetaData cf_meta;
+ ColumnFamilyHandle* default_cfh = db_->DefaultColumnFamily();
+ dbfull()->GetColumnFamilyMetaData(default_cfh, &cf_meta);
+ ASSERT_EQ(1, cf_meta.levels[num_levels_ - 1].files.size());
+ std::string first_sst_filename =
+ cf_meta.levels[num_levels_ - 1].files[0].name;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"CompactFilesImpl:0",
+ "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0"},
+ {"DBImpl::BackgroundCompaction():AfterPickCompaction",
+ "CompactFilesImpl:1"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread compact_files_thread([&]() {
+ ASSERT_OK(dbfull()->CompactFiles(CompactionOptions(), default_cfh,
+ {first_sst_filename}, num_levels_ - 1));
+ });
+
+ TEST_SYNC_POINT(
+ "DBTestUniversalCompaction:FinalSortedRunCompactFilesConflict:0");
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ compact_files_thread.join();
+}
+
+INSTANTIATE_TEST_CASE_P(NumLevels, DBTestUniversalCompaction,
+ ::testing::Combine(::testing::Values(1, 3, 5),
+ ::testing::Bool()));
+
+class DBTestUniversalManualCompactionOutputPathId
+ : public DBTestUniversalCompactionBase {
+ public:
+ DBTestUniversalManualCompactionOutputPathId()
+ : DBTestUniversalCompactionBase(
+ "/db_universal_compaction_manual_pid_test") {}
+};
+
+TEST_P(DBTestUniversalManualCompactionOutputPathId,
+ ManualCompactionOutputPathId) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.db_paths.emplace_back(dbname_, 1000000000);
+ options.db_paths.emplace_back(dbname_ + "_2", 1000000000);
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = num_levels_;
+ options.target_file_size_base = 1 << 30; // Big size
+ options.level0_file_num_compaction_trigger = 10;
+ Destroy(options);
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ MakeTables(3, "p", "q", 1);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(2, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+ // Full compaction to DB path 1
+ CompactRangeOptions compact_options;
+ compact_options.target_path_id = 1;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ MakeTables(1, "p", "q", 1);
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options);
+ ASSERT_EQ(2, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path));
+
+ // Full compaction to DB path 0
+ compact_options.target_path_id = 0;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_OK(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr));
+ ASSERT_EQ(1, TotalLiveFiles(1));
+ ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path));
+ ASSERT_EQ(0, GetSstFileCount(options.db_paths[1].path));
+
+ // Fail when compacting to an invalid path ID
+ compact_options.target_path_id = 2;
+ compact_options.exclusive_manual_compaction = exclusive_manual_compaction_;
+ ASSERT_TRUE(db_->CompactRange(compact_options, handles_[1], nullptr, nullptr)
+ .IsInvalidArgument());
+}
+
+INSTANTIATE_TEST_CASE_P(OutputPathId,
+ DBTestUniversalManualCompactionOutputPathId,
+ ::testing::Combine(::testing::Values(1, 8),
+ ::testing::Bool()));
+
+TEST_F(DBTestUniversalCompaction2, BasicL0toL1) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ // MoveFilesToLevel(6);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+#if defined(ENABLE_SINGLE_LEVEL_DTC)
+TEST_F(DBTestUniversalCompaction2, SingleLevel) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.num_levels = 1;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+  ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+#endif // ENABLE_SINGLE_LEVEL_DTC
+
+TEST_F(DBTestUniversalCompaction2, MultipleLevels) {
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 4;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 500; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 500; i < 1000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 1000; i < 1500; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 1500; i < 2000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+
+ for (i = 1999; i < 2333; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 2333; i < 2666; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 2666; i < 2999; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+ ASSERT_GT(NumTableFilesAtLevel(5), 0);
+
+ for (i = 1900; i < 2100; ++i) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(1));
+ ASSERT_EQ(0, NumTableFilesAtLevel(2));
+ ASSERT_EQ(0, NumTableFilesAtLevel(3));
+ ASSERT_EQ(0, NumTableFilesAtLevel(4));
+ ASSERT_EQ(0, NumTableFilesAtLevel(5));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, OverlappingL0) {
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 5;
+ opts.compression = kNoCompression;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 2000; i < 3000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 3500; i < 4000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ for (i = 2900; i < 3100; ++i) {
+ ASSERT_OK(Delete(Key(i)));
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, IngestBehind) {
+ const int kNumKeys = 3000;
+ const int kWindowSize = 100;
+ const int kNumDelsTrigger = 90;
+
+ Options opts = CurrentOptions();
+ opts.table_properties_collector_factories.emplace_back(
+ NewCompactOnDeletionCollectorFactory(kWindowSize, kNumDelsTrigger));
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 2;
+ opts.compression = kNoCompression;
+ opts.allow_ingest_behind = true;
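+  // With allow_ingest_behind, the bottommost level is reserved for ingested
+  // files, so compaction output is expected to stop at the second-to-last
+  // level (verified below).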
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ Reopen(opts);
+
+ // add an L1 file to prevent tombstones from dropping due to obsolescence
+ // during flush
+ int i;
+ for (i = 0; i < 2000; ++i) {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ ASSERT_OK(Flush());
+ // MoveFilesToLevel(6);
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ for (i = 1999; i < kNumKeys; ++i) {
+ if (i >= kNumKeys - kWindowSize &&
+ i < kNumKeys - kWindowSize + kNumDelsTrigger) {
+ ASSERT_OK(Delete(Key(i)));
+ } else {
+ ASSERT_OK(Put(Key(i), "val"));
+ }
+ }
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(6));
+ ASSERT_GT(NumTableFilesAtLevel(5), 0);
+}
+
+TEST_F(DBTestUniversalCompaction2, PeriodicCompactionDefault) {
+ Options options;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.env = env_;
+ KeepFilterFactory* filter = new KeepFilterFactory(true);
+ options.compaction_filter_factory.reset(filter);
+ Reopen(options);
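+  // With a compaction filter factory configured, periodic compaction is
+  // expected to default to 30 days.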
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
+ KeepFilter df;
+ options.compaction_filter_factory.reset();
+ options.compaction_filter = &df;
+ Reopen(options);
+ ASSERT_EQ(30 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+
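+  // With a TTL configured and no compaction filter, the default is expected
+  // to follow the TTL instead.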
+ options.ttl = 60 * 24 * 60 * 60;
+ options.compaction_filter = nullptr;
+ Reopen(options);
+ ASSERT_EQ(60 * 24 * 60 * 60,
+ dbfull()->GetOptions().periodic_compaction_seconds);
+}
+
+TEST_F(DBTestUniversalCompaction2, PeriodicCompaction) {
+ Options opts = CurrentOptions();
+ opts.env = env_;
+ opts.compaction_style = kCompactionStyleUniversal;
+ opts.level0_file_num_compaction_trigger = 10;
+ opts.max_open_files = -1;
+ opts.compaction_options_universal.size_ratio = 10;
+ opts.compaction_options_universal.min_merge_width = 2;
+ opts.compaction_options_universal.max_size_amplification_percent = 200;
+ opts.periodic_compaction_seconds = 48 * 60 * 60; // 2 days
+ opts.num_levels = 5;
+ env_->SetMockSleep();
+ Reopen(opts);
+
+ // NOTE: Presumed unnecessary and removed: resetting mock time in env
+
+ int periodic_compactions = 0;
+ int start_level = -1;
+ int output_level = -1;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "UniversalCompactionPicker::PickPeriodicCompaction:Return",
+ [&](void* arg) {
+ Compaction* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_TRUE(arg != nullptr);
+ ASSERT_TRUE(compaction->compaction_reason() ==
+ CompactionReason::kPeriodicCompaction);
+ start_level = compaction->start_level();
+ output_level = compaction->output_level();
+ periodic_compactions++;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Case 1: Oldest flushed file exceeds periodic compaction threshold.
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ(0, periodic_compactions);
+ // Move clock forward so that the flushed file would qualify periodic
+ // compaction.
+ env_->MockSleepForSeconds(48 * 60 * 60 + 100);
+
+  // Another flush will trigger a compaction of the oldest file.
+ ASSERT_OK(Put("foo", "bar2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(1, periodic_compactions);
+ ASSERT_EQ(0, start_level);
+ ASSERT_EQ(4, output_level);
+
+  // Case 2: Oldest compacted file exceeds periodic compaction threshold
+ periodic_compactions = 0;
+  // A flush doesn't trigger a periodic compaction before the threshold is hit
+ ASSERT_OK(Put("foo", "bar2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(0, periodic_compactions);
+
+  // After the periodic compaction threshold is hit, a flush will trigger
+  // a compaction
+ ASSERT_OK(Put("foo", "bar2"));
+ env_->MockSleepForSeconds(48 * 60 * 60 + 100);
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, periodic_compactions);
+ ASSERT_EQ(0, start_level);
+ ASSERT_EQ(4, output_level);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !defined(ROCKSDB_LITE)
+
+int main(int argc, char** argv) {
+#if !defined(ROCKSDB_LITE)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ (void)argc;
+ (void)argv;
+ return 0;
+#endif
+}
diff --git a/src/rocksdb/db/db_wal_test.cc b/src/rocksdb/db/db_wal_test.cc
new file mode 100644
index 000000000..5b5ec76af
--- /dev/null
+++ b/src/rocksdb/db/db_wal_test.cc
@@ -0,0 +1,2314 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/file_system.h"
+#include "test_util/sync_point.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBWALTestBase : public DBTestBase {
+ protected:
+ explicit DBWALTestBase(const std::string& dir_name)
+ : DBTestBase(dir_name, /*env_do_fsync=*/true) {}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+ public:
+#if defined(ROCKSDB_FALLOCATE_PRESENT)
+ bool IsFallocateSupported() {
+    // Test fallocate support of the running file system.
+    // Skip this test if fallocate is not supported.
+ std::string fname_test_fallocate = dbname_ + "/preallocate_testfile";
+ int fd = -1;
+ do {
+ fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644);
+ } while (fd < 0 && errno == EINTR);
+ assert(fd > 0);
+ int alloc_status = fallocate(fd, 0, 0, 1);
+ int err_number = errno;
+ close(fd);
+ assert(env_->DeleteFile(fname_test_fallocate) == Status::OK());
+ if (err_number == ENOSYS || err_number == EOPNOTSUPP) {
+ fprintf(stderr, "Skipped preallocated space check: %s\n",
+ errnoStr(err_number).c_str());
+ return false;
+ }
+ assert(alloc_status == 0);
+ return true;
+ }
+#endif // ROCKSDB_FALLOCATE_PRESENT
+
+ uint64_t GetAllocatedFileSize(std::string file_name) {
+ struct stat sbuf;
+ int err = stat(file_name.c_str(), &sbuf);
+ assert(err == 0);
+ return sbuf.st_blocks * 512;
+ }
+#endif // ROCKSDB_PLATFORM_POSIX
+};
+
+class DBWALTest : public DBWALTestBase {
+ public:
+ DBWALTest() : DBWALTestBase("/db_wal_test") {}
+};
+
+// A SpecialEnv enriched to give more insight about deleted files
+class EnrichedSpecialEnv : public SpecialEnv {
+ public:
+ explicit EnrichedSpecialEnv(Env* base) : SpecialEnv(base) {}
+ Status NewSequentialFile(const std::string& f,
+ std::unique_ptr<SequentialFile>* r,
+ const EnvOptions& soptions) override {
+ InstrumentedMutexLock l(&env_mutex_);
+ if (f == skipped_wal) {
+ deleted_wal_reopened = true;
+ if (IsWAL(f) && largest_deleted_wal.size() != 0 &&
+ f.compare(largest_deleted_wal) <= 0) {
+ gap_in_wals = true;
+ }
+ }
+ return SpecialEnv::NewSequentialFile(f, r, soptions);
+ }
+ Status DeleteFile(const std::string& fname) override {
+ if (IsWAL(fname)) {
+ deleted_wal_cnt++;
+ InstrumentedMutexLock l(&env_mutex_);
+ // If this is the first WAL, remember its name and skip deleting it. We
+ // remember its name partly because the application might attempt to
+ // delete the file again.
+ if (skipped_wal.size() != 0 && skipped_wal != fname) {
+ if (largest_deleted_wal.size() == 0 ||
+ largest_deleted_wal.compare(fname) < 0) {
+ largest_deleted_wal = fname;
+ }
+ } else {
+ skipped_wal = fname;
+ return Status::OK();
+ }
+ }
+ return SpecialEnv::DeleteFile(fname);
+ }
+ bool IsWAL(const std::string& fname) {
+ // printf("iswal %s\n", fname.c_str());
+ return fname.compare(fname.size() - 3, 3, "log") == 0;
+ }
+
+ InstrumentedMutex env_mutex_;
+ // the wal whose actual delete was skipped by the env
+ std::string skipped_wal = "";
+ // the largest WAL that was requested to be deleted
+ std::string largest_deleted_wal = "";
+ // number of WALs that were successfully deleted
+ std::atomic<size_t> deleted_wal_cnt = {0};
+ // the WAL whose delete from fs was skipped is reopened during recovery
+ std::atomic<bool> deleted_wal_reopened = {false};
+ // whether a gap in the WALs was detected during recovery
+ std::atomic<bool> gap_in_wals = {false};
+};
+
+class DBWALTestWithEnrichedEnv : public DBTestBase {
+ public:
+ DBWALTestWithEnrichedEnv()
+ : DBTestBase("db_wal_test", /*env_do_fsync=*/true) {
+ enriched_env_ = new EnrichedSpecialEnv(env_->target());
+ auto options = CurrentOptions();
+ options.env = enriched_env_;
+ options.allow_2pc = true;
+ Reopen(options);
+ delete env_;
+ // to be deleted by the parent class
+ env_ = enriched_env_;
+ }
+
+ protected:
+ EnrichedSpecialEnv* enriched_env_;
+};
+
+// Test that recovery successfully avoids gaps between the logs. One known
+// scenario that could cause a gap is the application issuing WAL deletions
+// out of order. For the sake of simplicity in the test, here we create the
+// gap by manipulating the env to skip deletion of the first WAL but not the
+// ones after it.
+TEST_F(DBWALTestWithEnrichedEnv, SkipDeletedWALs) {
+ auto options = last_options_;
+ // To cause frequent WAL deletion
+ options.write_buffer_size = 128;
+ Reopen(options);
+
+ WriteOptions writeOpt = WriteOptions();
+ for (int i = 0; i < 128 * 5; i++) {
+ ASSERT_OK(dbfull()->Put(writeOpt, "foo", "v1"));
+ }
+ FlushOptions fo;
+ fo.wait = true;
+ ASSERT_OK(db_->Flush(fo));
+
+ // some wals are deleted
+ ASSERT_NE(0, enriched_env_->deleted_wal_cnt);
+ // but not the first one
+ ASSERT_NE(0, enriched_env_->skipped_wal.size());
+
+ // Test that the WAL that was not deleted will be skipped during recovery
+ options = last_options_;
+ Reopen(options);
+ ASSERT_FALSE(enriched_env_->deleted_wal_reopened);
+ ASSERT_FALSE(enriched_env_->gap_in_wals);
+}
+
+TEST_F(DBWALTest, WAL) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // Both values should be present.
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+    // Again, both values should be present.
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RollLog) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "baz", "v5"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ ASSERT_OK(Put(1, "foo", "v4"));
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, SyncWALNotBlockWrite) {
+ Options options = CurrentOptions();
+ options.max_write_buffer_number = 4;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo5", "bar5"));
+
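+  // The dependencies below hold SyncWAL inside SyncWithoutFlush while the
+  // main thread performs writes and a flush, verifying that SyncWAL does not
+  // block them.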
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"WritableFileWriter::SyncWithoutFlush:1",
+ "DBWALTest::SyncWALNotBlockWrite:1"},
+ {"DBWALTest::SyncWALNotBlockWrite:2",
+ "WritableFileWriter::SyncWithoutFlush:2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread([&]() { ASSERT_OK(db_->SyncWAL()); });
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:1");
+ ASSERT_OK(Put("foo2", "bar2"));
+ ASSERT_OK(Put("foo3", "bar3"));
+ FlushOptions fo;
+ fo.wait = false;
+ ASSERT_OK(db_->Flush(fo));
+ ASSERT_OK(Put("foo4", "bar4"));
+
+ TEST_SYNC_POINT("DBWALTest::SyncWALNotBlockWrite:2");
+
+ thread.join();
+
+ ASSERT_EQ(Get("foo1"), "bar1");
+ ASSERT_EQ(Get("foo2"), "bar2");
+ ASSERT_EQ(Get("foo3"), "bar3");
+ ASSERT_EQ(Get("foo4"), "bar4");
+ ASSERT_EQ(Get("foo5"), "bar5");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, SyncWALNotWaitWrite) {
+ ASSERT_OK(Put("foo1", "bar1"));
+ ASSERT_OK(Put("foo3", "bar3"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"SpecialEnv::WalFile::Append:1", "DBWALTest::SyncWALNotWaitWrite:1"},
+ {"DBWALTest::SyncWALNotWaitWrite:2", "SpecialEnv::WalFile::Append:2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ROCKSDB_NAMESPACE::port::Thread thread(
+ [&]() { ASSERT_OK(Put("foo2", "bar2")); });
+ // Moving this to SyncWAL before the actual fsync
+ // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:1");
+ ASSERT_OK(db_->SyncWAL());
+ // Moving this to SyncWAL after actual fsync
+ // TEST_SYNC_POINT("DBWALTest::SyncWALNotWaitWrite:2");
+
+ thread.join();
+
+ ASSERT_EQ(Get("foo1"), "bar1");
+ ASSERT_EQ(Get("foo2"), "bar2");
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, Recover) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "baz", "v5"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ ASSERT_OK(Put(1, "bar", "v2"));
+ ASSERT_OK(Put(1, "foo", "v3"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v3", Get(1, "foo"));
+ ASSERT_OK(Put(1, "foo", "v4"));
+ ASSERT_EQ("v4", Get(1, "foo"));
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v5", Get(1, "baz"));
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithTableHandle) {
+ do {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = false;
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "bar", "v2"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "foo", "v3"));
+ ASSERT_OK(Put(1, "bar", "v4"));
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(1, "big", std::string(100, 'a')));
+
+ options = CurrentOptions();
+ const int kSmallMaxOpenFiles = 13;
+ if (option_config_ == kDBLogDir) {
+      // Use this option to check that files are not preloaded: set
+      // max_open_files small enough that no preload will happen.
+      options.max_open_files = kSmallMaxOpenFiles;
+      // RocksDB sanitizes max_open_files to at least 20. Modify it back.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+ int* max_open_files = static_cast<int*>(arg);
+ *max_open_files = kSmallMaxOpenFiles;
+ });
+
+ } else if (option_config_ == kWalDirAndMmapReads) {
+      // Use this option to check that all files are always loaded.
+ options.max_open_files = 100;
+ } else {
+ options.max_open_files = -1;
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ std::vector<std::vector<FileMetaData>> files;
+ dbfull()->TEST_GetFilesMetaData(handles_[1], &files);
+ size_t total_files = 0;
+ for (const auto& level : files) {
+ total_files += level.size();
+ }
+ ASSERT_EQ(total_files, 3);
+ for (const auto& level : files) {
+ for (const auto& file : level) {
+ if (options.max_open_files == kSmallMaxOpenFiles) {
+ ASSERT_TRUE(file.table_reader_handle == nullptr);
+ } else {
+ ASSERT_TRUE(file.table_reader_handle != nullptr);
+ }
+ }
+ }
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithBlob) {
+ // Write a value that's below the prospective size limit for blobs and another
+ // one that's above. Note that blob files are not actually enabled at this
+ // point.
+ constexpr uint64_t min_blob_size = 10;
+
+ constexpr char short_value[] = "short";
+ static_assert(sizeof(short_value) - 1 < min_blob_size,
+ "short_value too long");
+
+ constexpr char long_value[] = "long_value";
+ static_assert(sizeof(long_value) - 1 >= min_blob_size,
+ "long_value too short");
+
+ ASSERT_OK(Put("key1", short_value));
+ ASSERT_OK(Put("key2", long_value));
+
+ // There should be no files just yet since we haven't flushed.
+ {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ ASSERT_EQ(storage_info->num_non_empty_levels(), 0);
+ ASSERT_TRUE(storage_info->GetBlobFiles().empty());
+ }
+
+ // Reopen the database with blob files enabled. A new table file/blob file
+ // pair should be written during recovery.
+ Options options;
+ options.enable_blob_files = true;
+ options.min_blob_size = min_blob_size;
+ options.avoid_flush_during_recovery = false;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ Reopen(options);
+
+ ASSERT_EQ(Get("key1"), short_value);
+ ASSERT_EQ(Get("key2"), long_value);
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const auto& l0_files = storage_info->LevelFiles(0);
+ ASSERT_EQ(l0_files.size(), 1);
+
+ const FileMetaData* const table_file = l0_files[0];
+ ASSERT_NE(table_file, nullptr);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 1);
+
+ const auto& blob_file = blob_files.front();
+ ASSERT_NE(blob_file, nullptr);
+
+ ASSERT_EQ(table_file->smallest.user_key(), "key1");
+ ASSERT_EQ(table_file->largest.user_key(), "key2");
+ ASSERT_EQ(table_file->fd.smallest_seqno, 1);
+ ASSERT_EQ(table_file->fd.largest_seqno, 2);
+ ASSERT_EQ(table_file->oldest_blob_file_number,
+ blob_file->GetBlobFileNumber());
+
+ ASSERT_EQ(blob_file->GetTotalBlobCount(), 1);
+
+#ifndef ROCKSDB_LITE
+ const InternalStats* const internal_stats = cfd->internal_stats();
+ ASSERT_NE(internal_stats, nullptr);
+
+ const auto& compaction_stats = internal_stats->TEST_GetCompactionStats();
+ ASSERT_FALSE(compaction_stats.empty());
+ ASSERT_EQ(compaction_stats[0].bytes_written, table_file->fd.GetFileSize());
+ ASSERT_EQ(compaction_stats[0].bytes_written_blob,
+ blob_file->GetTotalBlobBytes());
+ ASSERT_EQ(compaction_stats[0].num_output_files, 1);
+ ASSERT_EQ(compaction_stats[0].num_output_files_blob, 1);
+
+ const uint64_t* const cf_stats_value = internal_stats->TEST_GetCFStatsValue();
+ ASSERT_EQ(cf_stats_value[InternalStats::BYTES_FLUSHED],
+ compaction_stats[0].bytes_written +
+ compaction_stats[0].bytes_written_blob);
+#endif // ROCKSDB_LITE
+}
+
+TEST_F(DBWALTest, RecoverWithBlobMultiSST) {
+ // Write several large (4 KB) values without flushing. Note that blob files
+ // are not actually enabled at this point.
+ std::string large_value(1 << 12, 'a');
+
+ constexpr int num_keys = 64;
+
+ for (int i = 0; i < num_keys; ++i) {
+ ASSERT_OK(Put(Key(i), large_value));
+ }
+
+ // There should be no files just yet since we haven't flushed.
+ {
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ ASSERT_EQ(storage_info->num_non_empty_levels(), 0);
+ ASSERT_TRUE(storage_info->GetBlobFiles().empty());
+ }
+
+ // Reopen the database with blob files enabled and write buffer size set to a
+ // smaller value. Multiple table files+blob files should be written and added
+ // to the Version during recovery.
+ Options options;
+ options.write_buffer_size = 1 << 16; // 64 KB
+ options.enable_blob_files = true;
+ options.avoid_flush_during_recovery = false;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ Reopen(options);
+
+ for (int i = 0; i < num_keys; ++i) {
+ ASSERT_EQ(Get(Key(i)), large_value);
+ }
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ ASSERT_NE(versions, nullptr);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ ASSERT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ ASSERT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ ASSERT_NE(storage_info, nullptr);
+
+ const auto& l0_files = storage_info->LevelFiles(0);
+ ASSERT_GT(l0_files.size(), 1);
+
+ const auto& blob_files = storage_info->GetBlobFiles();
+ ASSERT_GT(blob_files.size(), 1);
+
+ ASSERT_EQ(l0_files.size(), blob_files.size());
+}
+
+TEST_F(DBWALTest, WALWithChecksumHandoff) {
+#ifndef ROCKSDB_ASSERT_STATUS_CHECKED
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem or non-encrypted environment");
+ return;
+ }
+ std::shared_ptr<FaultInjectionTestFS> fault_fs(
+ new FaultInjectionTestFS(FileSystem::Default()));
+ std::unique_ptr<Env> fault_fs_env(NewCompositeEnv(fault_fs));
+ do {
+ Options options = CurrentOptions();
+
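+    // Checksum handoff: the DB computes a checksum for each WAL write and
+    // hands it down to the file system, where FaultInjectionTestFS verifies
+    // it using the configured checksum type.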
+ options.checksum_handoff_file_types.Add(FileType::kWalFile);
+ options.env = fault_fs_env.get();
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ WriteOptions writeOpt = WriteOptions();
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1"));
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v1", Get(1, "foo"));
+ ASSERT_EQ("v1", Get(1, "bar"));
+
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2"));
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    // Both values should be present.
+ ASSERT_EQ("v2", Get(1, "bar"));
+ ASSERT_EQ("v2", Get(1, "foo"));
+
+ writeOpt.disableWAL = true;
+    // For this put, the data is persisted by a flush rather than by the WAL
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ writeOpt.disableWAL = false;
+ // Data is persisted in the WAL
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "zoo", "v3"));
+    // The checksum type does not match, so the write fails
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kxxHash);
+ writeOpt.disableWAL = false;
+ ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3"));
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+    // Due to the write failure, Get should not find the value
+ ASSERT_NE("v3", Get(1, "foo"));
+ ASSERT_EQ("v3", Get(1, "zoo"));
+ ASSERT_EQ("v3", Get(1, "bar"));
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kCRC32c);
+    // Each write will be simulated as corrupted.
+ fault_fs->IngestDataCorruptionBeforeWrite();
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v4"));
+ writeOpt.disableWAL = false;
+ ASSERT_NOK(dbfull()->Put(writeOpt, handles_[1], "foo", "v4"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_NE("v4", Get(1, "foo"));
+ ASSERT_NE("v4", Get(1, "bar"));
+ fault_fs->NoDataCorruptionBeforeWrite();
+
+ fault_fs->SetChecksumHandoffFuncType(ChecksumType::kNoChecksum);
+    // The file system does not provide a checksum method or verification.
+ writeOpt.disableWAL = true;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v5"));
+ writeOpt.disableWAL = false;
+ ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v5"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ("v5", Get(1, "foo"));
+ ASSERT_EQ("v5", Get(1, "bar"));
+
+ Destroy(options);
+ } while (ChangeWalOptions());
+#endif // ROCKSDB_ASSERT_STATUS_CHECKED
+}
+
+class DBRecoveryTestBlobError
+ : public DBWALTest,
+ public testing::WithParamInterface<std::string> {
+ public:
+ DBRecoveryTestBlobError() : sync_point_(GetParam()) {}
+
+ std::string sync_point_;
+};
+
+INSTANTIATE_TEST_CASE_P(DBRecoveryTestBlobError, DBRecoveryTestBlobError,
+ ::testing::ValuesIn(std::vector<std::string>{
+ "BlobFileBuilder::WriteBlobToFile:AddRecord",
+ "BlobFileBuilder::WriteBlobToFile:AppendFooter"}));
+
+TEST_P(DBRecoveryTestBlobError, RecoverWithBlobError) {
+ // Write a value. Note that blob files are not actually enabled at this point.
+ ASSERT_OK(Put("key", "blob"));
+
+ // Reopen with blob files enabled but make blob file writing fail during
+ // recovery.
+ SyncPoint::GetInstance()->SetCallBack(sync_point_, [this](void* arg) {
+ Status* const s = static_cast<Status*>(arg);
+ assert(s);
+
+ (*s) = Status::IOError(sync_point_);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options;
+ options.enable_blob_files = true;
+ options.avoid_flush_during_recovery = false;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+
+ ASSERT_NOK(TryReopen(options));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Make sure the files generated by the failed recovery have been deleted.
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& file : files) {
+ uint64_t number = 0;
+ FileType type = kTableFile;
+
+ if (!ParseFileName(file, &number, &type)) {
+ continue;
+ }
+
+ ASSERT_NE(type, kTableFile);
+ ASSERT_NE(type, kBlobFile);
+ }
+}
+
+TEST_F(DBWALTest, IgnoreRecoveredLog) {
+ std::string backup_logs = dbname_ + "/backup_logs";
+
+ do {
+ // delete old files in backup_logs directory
+ ASSERT_OK(env_->CreateDirIfMissing(backup_logs));
+ std::vector<std::string> old_files;
+ ASSERT_OK(env_->GetChildren(backup_logs, &old_files));
+ for (auto& file : old_files) {
+ ASSERT_OK(env_->DeleteFile(backup_logs + "/" + file));
+ }
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateUInt64AddOperator();
+ options.wal_dir = dbname_ + "/logs";
+ DestroyAndReopen(options);
+
+ // fill up the DB
+ std::string one, two;
+ PutFixed64(&one, 1);
+ PutFixed64(&two, 2);
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("foo"), Slice(one)));
+ ASSERT_OK(db_->Merge(WriteOptions(), Slice("bar"), Slice(one)));
+
+ // copy the logs to backup
+ std::vector<std::string> logs;
+ ASSERT_OK(env_->GetChildren(options.wal_dir, &logs));
+ for (auto& log : logs) {
+ CopyFile(options.wal_dir + "/" + log, backup_logs + "/" + log);
+ }
+
+ // recover the DB
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+ Close();
+
+ // copy the logs from backup back to wal dir
+ for (auto& log : logs) {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+ }
+ // this should ignore the log files, recovery should not happen again
+ // if the recovery happens, the same merge operator would be called twice,
+ // leading to incorrect results
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+ Close();
+ Destroy(options);
+ Reopen(options);
+ Close();
+
+ // copy the logs from backup back to wal dir
+ ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir));
+ for (auto& log : logs) {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+ }
+ // assert that we successfully recovered only from logs, even though we
+ // destroyed the DB
+ Reopen(options);
+ ASSERT_EQ(two, Get("foo"));
+ ASSERT_EQ(one, Get("bar"));
+
+ // Recovery will fail if DB directory doesn't exist.
+ Destroy(options);
+ // copy the logs from backup back to wal dir
+ ASSERT_OK(env_->CreateDirIfMissing(options.wal_dir));
+ for (auto& log : logs) {
+ CopyFile(backup_logs + "/" + log, options.wal_dir + "/" + log);
+      // we won't be needing this file anymore
+ ASSERT_OK(env_->DeleteFile(backup_logs + "/" + log));
+ }
+ Status s = TryReopen(options);
+ ASSERT_NOK(s);
+ Destroy(options);
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithEmptyLog) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v3"));
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ ASSERT_EQ("v3", Get(1, "foo"));
+ } while (ChangeWalOptions());
+}
+
+#if !(defined NDEBUG) || !defined(OS_WIN)
+TEST_F(DBWALTest, PreallocateBlock) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 10 * 1000 * 1000;
+ options.max_total_wal_size = 0;
+
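+  // The expected WAL preallocation size follows the smallest relevant write
+  // limit; with only write_buffer_size set, it is write_buffer_size plus 10%.
+  // The later cases switch to max_total_wal_size, db_write_buffer_size, and
+  // the write buffer manager size, respectively.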
+ size_t expected_preallocation_size = static_cast<size_t>(
+ options.write_buffer_size + options.write_buffer_size / 10);
+
+ DestroyAndReopen(options);
+
+ std::atomic<int> called(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("", ""));
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ options.max_total_wal_size = 1000 * 1000;
+ expected_preallocation_size = static_cast<size_t>(options.max_total_wal_size);
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("", ""));
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ options.db_write_buffer_size = 800 * 1000;
+ expected_preallocation_size =
+ static_cast<size_t>(options.db_write_buffer_size);
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("", ""));
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+
+ expected_preallocation_size = 700 * 1000;
+ std::shared_ptr<WriteBufferManager> write_buffer_manager =
+ std::make_shared<WriteBufferManager>(static_cast<uint64_t>(700 * 1000));
+ options.write_buffer_manager = write_buffer_manager;
+ Reopen(options);
+ called.store(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBTestWalFile.GetPreallocationStatus", [&](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ size_t preallocation_size = *(static_cast<size_t*>(arg));
+ ASSERT_EQ(expected_preallocation_size, preallocation_size);
+ called.fetch_add(1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(Put("", ""));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("", ""));
+ Close();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_EQ(2, called.load());
+}
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBWALTest, DISABLED_FullPurgePreservesRecycledLog) {
+ // TODO(ajkr): Disabled until WAL recycling is fixed for
+ // `kPointInTimeRecovery`.
+
+ // For github issue #1303
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.recycle_log_file_num = 2;
+ if (i != 0) {
+ options.wal_dir = alternative_wal_dir_;
+ }
+
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_GT(log_files.size(), 0);
+ ASSERT_OK(Flush());
+
+ // Now the original WAL is in log_files[0] and should be marked for
+ // recycling.
+ // Verify full purge cannot remove this file.
+ JobContext job_context(0);
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&job_context, true /* force */);
+ dbfull()->TEST_UnlockMutex();
+ dbfull()->PurgeObsoleteFiles(job_context);
+
+ if (i == 0) {
+ ASSERT_OK(
+ env_->FileExists(LogFileName(dbname_, log_files[0]->LogNumber())));
+ } else {
+ ASSERT_OK(env_->FileExists(
+ LogFileName(alternative_wal_dir_, log_files[0]->LogNumber())));
+ }
+ }
+}
+
+TEST_F(DBWALTest, DISABLED_FullPurgePreservesLogPendingReuse) {
+ // TODO(ajkr): Disabled until WAL recycling is fixed for
+ // `kPointInTimeRecovery`.
+
+ // Ensures full purge cannot delete a WAL while it's in the process of being
+ // recycled. In particular, we force the full purge after a file has been
+ // chosen for reuse, but before it has been renamed.
+ for (int i = 0; i < 2; ++i) {
+ Options options = CurrentOptions();
+ options.recycle_log_file_num = 1;
+ if (i != 0) {
+ options.wal_dir = alternative_wal_dir_;
+ }
+ DestroyAndReopen(options);
+
+ // The first flush creates a second log so writes can continue before the
+ // flush finishes.
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+
+    // The second flush can recycle the first log. Sync points enforce that
+    // the full purge happens after choosing the log to recycle and before it
+    // is renamed.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::CreateWAL:BeforeReuseWritableFile1",
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge"},
+ {"DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge",
+ "DBImpl::CreateWAL:BeforeReuseWritableFile2"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ROCKSDB_NAMESPACE::port::Thread thread([&]() {
+ TEST_SYNC_POINT(
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PreFullPurge");
+ ASSERT_OK(db_->EnableFileDeletions(true));
+ TEST_SYNC_POINT(
+ "DBWALTest::FullPurgePreservesLogPendingReuse:PostFullPurge");
+ });
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Flush());
+ thread.join();
+ }
+}
+
+TEST_F(DBWALTest, GetSortedWalFiles) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_EQ(0, log_files.size());
+
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ ASSERT_EQ(1, log_files.size());
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, GetCurrentWalFile) {
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+
+ std::unique_ptr<LogFile>* bad_log_file = nullptr;
+ ASSERT_NOK(dbfull()->GetCurrentWalFile(bad_log_file));
+
+ std::unique_ptr<LogFile> log_file;
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ // nothing has been written to the log yet
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_EQ(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+    // add some data and verify that the file size actually moves forward
+ ASSERT_OK(Put(0, "foo", "v1"));
+ ASSERT_OK(Put(0, "foo2", "v2"));
+ ASSERT_OK(Put(0, "foo3", "v3"));
+
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_GT(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+ // force log files to cycle and add some more data, then check if
+ // log number moves forward
+
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ for (int i = 0; i < 10; i++) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+
+ ASSERT_OK(Put(0, "foo4", "v4"));
+ ASSERT_OK(Put(0, "foo5", "v5"));
+ ASSERT_OK(Put(0, "foo6", "v6"));
+
+ ASSERT_OK(dbfull()->GetCurrentWalFile(&log_file));
+
+ ASSERT_EQ(log_file->StartSequence(), 0);
+ ASSERT_GT(log_file->SizeFileBytes(), 0);
+ ASSERT_EQ(log_file->Type(), kAliveLogFile);
+ ASSERT_GT(log_file->LogNumber(), 0);
+
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoveryWithLogDataForSomeCFs) {
+ // Test for regression of WAL cleanup missing files that don't contain data
+ // for every column family.
+ do {
+ CreateAndReopenWithCF({"pikachu"}, CurrentOptions());
+ ASSERT_OK(Put(1, "foo", "v1"));
+ ASSERT_OK(Put(1, "foo", "v2"));
+ uint64_t earliest_log_nums[2];
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions());
+ }
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ if (log_files.size() > 0) {
+ earliest_log_nums[i] = log_files[0]->LogNumber();
+ } else {
+ earliest_log_nums[i] = std::numeric_limits<uint64_t>::max();
+ }
+ }
+ // Check at least the first WAL was cleaned up during the recovery.
+ ASSERT_LT(earliest_log_nums[0], earliest_log_nums[1]);
+ } while (ChangeWalOptions());
+}
+
+TEST_F(DBWALTest, RecoverWithLargeLog) {
+ do {
+ {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ASSERT_OK(Put(1, "big1", std::string(200000, '1')));
+ ASSERT_OK(Put(1, "big2", std::string(200000, '2')));
+ ASSERT_OK(Put(1, "small3", std::string(10, '3')));
+ ASSERT_OK(Put(1, "small4", std::string(10, '4')));
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0);
+ }
+
+    // Make sure that if we re-open with a small write buffer size, we flush
+    // table files in the middle of a large log file.
+ Options options;
+ options.write_buffer_size = 100000;
+ options = CurrentOptions(options);
+ ReopenWithColumnFamilies({"default", "pikachu"}, options);
+ ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3);
+ ASSERT_EQ(std::string(200000, '1'), Get(1, "big1"));
+ ASSERT_EQ(std::string(200000, '2'), Get(1, "big2"));
+ ASSERT_EQ(std::string(10, '3'), Get(1, "small3"));
+ ASSERT_EQ(std::string(10, '4'), Get(1, "small4"));
+ ASSERT_GT(NumTableFilesAtLevel(0, 1), 1);
+ } while (ChangeWalOptions());
+}
+
+// In https://reviews.facebook.net/D20661 we changed the recovery behavior:
+// previously, for each log file, each column family's memtable was flushed
+// even if it was empty. Now we try to create the smallest number of table
+// files by merging updates from multiple logs.
+TEST_F(DBWALTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 5000000;
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+  // Since we will reopen the DB with a smaller write_buffer_size,
+  // each key will go to a new SST file
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+ ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
+
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+  // Make 'dobrynia' flush and a new WAL file be created
+ ASSERT_OK(Put(2, Key(10), DummyString(7500000)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2]));
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+    // Make sure 'dobrynia' was flushed: check the SST file count
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ }
+ // New WAL file
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+ ASSERT_OK(Put(3, Key(10), DummyString(1)));
+
+ options.write_buffer_size = 4096;
+ options.arena_block_size = 4096;
+ ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+ options);
+ {
+ // No inserts => default is empty
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(0));
+    // The first 4 keys go to separate SSTs + 1 more SST for the 2 smaller keys
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(5));
+ // 1 SST for big key + 1 SST for small one
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(2));
+ // 1 SST for all keys
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+}
+
+// In https://reviews.facebook.net/D20661 we changed the recovery behavior:
+// previously, for each log file, each column family's memtable was flushed
+// even if it wasn't empty. Now we try to create the smallest number of table
+// files by merging updates from multiple logs.
+TEST_F(DBWALTest, RecoverCheckFileAmount) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000;
+ options.arena_block_size = 4 * 1024;
+ options.avoid_flush_during_recovery = false;
+ CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options);
+
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+  // Make the 'nikitich' memtable flush
+ ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+ ASSERT_OK(Put(3, Key(1), DummyString(1)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+  // 4 memtables are not flushed, 1 SST file
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(1));
+ }
+  // The memtable for 'nikitich' has been flushed and a new WAL file has been
+  // opened; 4 memtables are still not flushed
+
+ // Write to new WAL file
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+ // Fill up 'nikitich' one more time
+ ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
+ // make it flush
+ ASSERT_OK(Put(3, Key(1), DummyString(1)));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3]));
+  // There are still 4 memtables not flushed, and 2 SST files
+ ASSERT_OK(Put(0, Key(1), DummyString(1)));
+ ASSERT_OK(Put(1, Key(1), DummyString(1)));
+ ASSERT_OK(Put(2, Key(1), DummyString(1)));
+
+ {
+ auto tables = ListTableFiles(env_, dbname_);
+ ASSERT_EQ(tables.size(), static_cast<size_t>(2));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(2));
+ }
+
+ ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
+ options);
+ {
+ std::vector<uint64_t> table_files = ListTableFiles(env_, dbname_);
+    // Check that records for 'default', 'dobrynia' and 'pikachu' from the
+    // first, second and third WALs went to the same SST.
+    // So there are 6 SSTs: three for 'nikitich', one for 'default', one for
+    // 'dobrynia', one for 'pikachu'
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"),
+ static_cast<uint64_t>(3));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"),
+ static_cast<uint64_t>(1));
+ ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"),
+ static_cast<uint64_t>(1));
+ }
+}
+
+TEST_F(DBWALTest, SyncMultipleLogs) {
+ const uint64_t kNumBatches = 2;
+ const int kBatchSize = 1000;
+
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.write_buffer_size = 4096;
+ Reopen(options);
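+  // The small write buffer forces memtable switches between batches, so more
+  // than one WAL can be alive when SyncWAL runs.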
+
+ WriteBatch batch;
+ WriteOptions wo;
+ wo.sync = true;
+
+ for (uint64_t b = 0; b < kNumBatches; b++) {
+ batch.Clear();
+ for (int i = 0; i < kBatchSize; i++) {
+ ASSERT_OK(batch.Put(Key(i), DummyString(128)));
+ }
+
+ ASSERT_OK(dbfull()->Write(wo, &batch));
+ }
+
+ ASSERT_OK(dbfull()->SyncWAL());
+}
+
+// Github issue 1339. Prior to the fix, we read the sequence id from the first
+// log into a local variable, then kept increasing the variable as we replayed
+// the logs, ignoring the actual sequence ids of the records. This is incorrect
+// if some writes come with the WAL disabled.
+TEST_F(DBWALTest, PartOfWritesWithWALDisabled) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_env.get();
+ options.disable_auto_compactions = true;
+ WriteOptions wal_on, wal_off;
+ wal_on.sync = true;
+ wal_on.disableWAL = false;
+ wal_off.disableWAL = true;
+ CreateAndReopenWithCF({"dummy"}, options);
+ ASSERT_OK(Put(1, "dummy", "d1", wal_on)); // seq id 1
+ ASSERT_OK(Put(1, "dummy", "d2", wal_off));
+ ASSERT_OK(Put(1, "dummy", "d3", wal_off));
+ ASSERT_OK(Put(0, "key", "v4", wal_on)); // seq id 4
+ ASSERT_OK(Flush(0));
+ ASSERT_OK(Put(0, "key", "v5", wal_on)); // seq id 5
+ ASSERT_EQ("v5", Get(0, "key"));
+ ASSERT_OK(dbfull()->FlushWAL(false));
+ // Simulate a crash.
+ fault_env->SetFilesystemActive(false);
+ Close();
+ fault_env->ResetState();
+ ReopenWithColumnFamilies({"default", "dummy"}, options);
+ // Prior to the fix, we may incorrectly recover "v5" with sequence id = 3.
+ ASSERT_EQ("v5", Get(0, "key"));
+ // Destroy DB before destruct fault_env.
+ Destroy(options);
+}
+
+//
+// Test WAL recovery for the various modes available
+//
+class RecoveryTestHelper {
+ public:
+ // Number of WAL files to generate
+ static constexpr int kWALFilesCount = 10;
+ // Starting number for the WAL file name like 00010.log
+ static constexpr int kWALFileOffset = 10;
+ // Keys to be written per WAL file
+ static constexpr int kKeysPerWALFile = 133;
+ // Size of the value
+ static constexpr int kValueSize = 96;
+
+ // Create WAL files with values filled in
+ static void FillData(DBWALTestBase* test, const Options& options,
+ const size_t wal_count, size_t* count) {
+ // Calling internal functions requires sanitized options.
+ Options sanitized_options = SanitizeOptions(test->dbname_, options);
+ const ImmutableDBOptions db_options(sanitized_options);
+
+ *count = 0;
+
+ std::shared_ptr<Cache> table_cache = NewLRUCache(50, 0);
+ FileOptions file_options;
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+
+ std::unique_ptr<VersionSet> versions;
+ std::unique_ptr<WalManager> wal_manager;
+ WriteController write_controller;
+
+ versions.reset(new VersionSet(
+ test->dbname_, &db_options, file_options, table_cache.get(),
+ &write_buffer_manager, &write_controller,
+ /*block_cache_tracer=*/nullptr,
+ /*io_tracer=*/nullptr, /*db_id*/ "", /*db_session_id*/ ""));
+
+ wal_manager.reset(
+ new WalManager(db_options, file_options, /*io_tracer=*/nullptr));
+
+ std::unique_ptr<log::Writer> current_log_writer;
+
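+    // Write each WAL directly with log::Writer (bypassing the DB), assigning
+    // consecutive sequence numbers, so tests can later corrupt specific WAL
+    // files at specific offsets.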
+ for (size_t j = kWALFileOffset; j < wal_count + kWALFileOffset; j++) {
+ uint64_t current_log_number = j;
+ std::string fname = LogFileName(test->dbname_, current_log_number);
+ std::unique_ptr<WritableFileWriter> file_writer;
+ ASSERT_OK(WritableFileWriter::Create(db_options.env->GetFileSystem(),
+ fname, file_options, &file_writer,
+ nullptr));
+ log::Writer* log_writer =
+ new log::Writer(std::move(file_writer), current_log_number,
+ db_options.recycle_log_file_num > 0, false,
+ db_options.wal_compression);
+ ASSERT_OK(log_writer->AddCompressionTypeRecord());
+ current_log_writer.reset(log_writer);
+
+ WriteBatch batch;
+ for (int i = 0; i < kKeysPerWALFile; i++) {
+ std::string key = "key" + std::to_string((*count)++);
+ std::string value = test->DummyString(kValueSize);
+ ASSERT_NE(current_log_writer.get(), nullptr);
+ uint64_t seq = versions->LastSequence() + 1;
+ batch.Clear();
+ ASSERT_OK(batch.Put(key, value));
+ WriteBatchInternal::SetSequence(&batch, seq);
+ ASSERT_OK(current_log_writer->AddRecord(
+ WriteBatchInternal::Contents(&batch)));
+ versions->SetLastAllocatedSequence(seq);
+ versions->SetLastPublishedSequence(seq);
+ versions->SetLastSequence(seq);
+ }
+ }
+ }
+
+ // Recreate and fill the store with some data
+ static size_t FillData(DBWALTestBase* test, Options* options) {
+ options->create_if_missing = true;
+ test->DestroyAndReopen(*options);
+ test->Close();
+
+ size_t count = 0;
+ FillData(test, *options, kWALFilesCount, &count);
+ return count;
+ }
+
+ // Read back all the keys we wrote and return the number of keys found
+ static size_t GetData(DBWALTestBase* test) {
+ size_t count = 0;
+ for (size_t i = 0; i < kWALFilesCount * kKeysPerWALFile; i++) {
+ if (test->Get("key" + std::to_string(i)) != "NOT_FOUND") {
+ ++count;
+ }
+ }
+ return count;
+ }
+
+ // Manually corrupt the specified WAL file
+ static void CorruptWAL(DBWALTestBase* test, const Options& options,
+ const double off, const double len,
+ const int wal_file_id, const bool trunc = false) {
+ Env* env = options.env;
+ std::string fname = LogFileName(test->dbname_, wal_file_id);
+ uint64_t size;
+ ASSERT_OK(env->GetFileSize(fname, &size));
+ ASSERT_GT(size, 0);
+#ifdef OS_WIN
+ // The Windows disk cache behaves differently: after truncation, the
+ // original content is still in the cache because the original handle is
+ // still open. On Windows, shared access to files is generally prohibited;
+ // the WAL does not need it, but we allow it here so that tests can induce
+ // corruption.
+ test->Close();
+#endif
+ if (trunc) {
+ ASSERT_OK(
+ test::TruncateFile(env, fname, static_cast<uint64_t>(size * off)));
+ } else {
+ ASSERT_OK(test::CorruptFile(env, fname, static_cast<int>(size * off + 8),
+ static_cast<int>(size * len), false));
+ }
+ }
+};
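+
+// A minimal usage sketch for RecoveryTestHelper (illustrative only; it
+// mirrors the parameterized tests below and assumes a DBWALTestBase fixture):
+//
+//   Options options = CurrentOptions();
+//   const size_t written = RecoveryTestHelper::FillData(this, &options);
+//   RecoveryTestHelper::CorruptWAL(this, options, /*off=*/0.3, /*len%=*/0.1,
+//                                  RecoveryTestHelper::kWALFileOffset,
+//                                  /*trunc=*/false);
+//   options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+//   ASSERT_OK(TryReopen(options));
+//   ASSERT_LE(RecoveryTestHelper::GetData(this), written);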
+
+class DBWALTestWithParams : public DBWALTestBase,
+ public ::testing::WithParamInterface<
+ std::tuple<bool, int, int, CompressionType>> {
+ public:
+ DBWALTestWithParams() : DBWALTestBase("/db_wal_test_with_params") {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+ Wal, DBWALTestWithParams,
+ ::testing::Combine(::testing::Bool(), ::testing::Range(0, 4, 1),
+ ::testing::Range(RecoveryTestHelper::kWALFileOffset,
+ RecoveryTestHelper::kWALFileOffset +
+ RecoveryTestHelper::kWALFilesCount,
+ 1),
+ ::testing::Values(CompressionType::kNoCompression,
+ CompressionType::kZSTD)));
+
+class DBWALTestWithParamsVaryingRecoveryMode
+ : public DBWALTestBase,
+ public ::testing::WithParamInterface<
+ std::tuple<bool, int, int, WALRecoveryMode, CompressionType>> {
+ public:
+ DBWALTestWithParamsVaryingRecoveryMode()
+ : DBWALTestBase("/db_wal_test_with_params_mode") {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+ Wal, DBWALTestWithParamsVaryingRecoveryMode,
+ ::testing::Combine(
+ ::testing::Bool(), ::testing::Range(0, 4, 1),
+ ::testing::Range(RecoveryTestHelper::kWALFileOffset,
+ RecoveryTestHelper::kWALFileOffset +
+ RecoveryTestHelper::kWALFilesCount,
+ 1),
+ ::testing::Values(WALRecoveryMode::kTolerateCorruptedTailRecords,
+ WALRecoveryMode::kAbsoluteConsistency,
+ WALRecoveryMode::kPointInTimeRecovery,
+ WALRecoveryMode::kSkipAnyCorruptedRecords),
+ ::testing::Values(CompressionType::kNoCompression,
+ CompressionType::kZSTD)));
+
+// Test scope:
+// - We expect to open the data store when there are incomplete trailing
+//   writes at the end of any of the logs
+// - We do not expect to open the data store when records are corrupted in
+//   place
+TEST_P(DBWALTestWithParams, kTolerateCorruptedTailRecords) {
+ bool trunc = std::get<0>(GetParam()); // Corruption style
+ // Corruption offset position
+ int corrupt_offset = std::get<1>(GetParam());
+ int wal_file_id = std::get<2>(GetParam()); // WAL file
+
+ // Fill data for testing
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+ // Induce a checksum failure or parsing error
+ RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+ /*len%=*/.1, wal_file_id, trunc);
+
+ options.wal_recovery_mode = WALRecoveryMode::kTolerateCorruptedTailRecords;
+ if (trunc) {
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ const size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_TRUE(corrupt_offset == 0 || recovered_row_count > 0);
+ ASSERT_LT(recovered_row_count, row_count);
+ } else {
+ ASSERT_NOK(TryReopen(options));
+ }
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any corruption
+// (leading, middle, or trailing -- whether from incomplete writes or
+// in-place corruption)
+TEST_P(DBWALTestWithParams, kAbsoluteConsistency) {
+ // Verify clean slate behavior
+ Options options = CurrentOptions();
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+ ASSERT_EQ(RecoveryTestHelper::GetData(this), row_count);
+
+ bool trunc = std::get<0>(GetParam()); // Corruption style
+ // Corruption offset position
+ int corrupt_offset = std::get<1>(GetParam());
+ int wal_file_id = std::get<2>(GetParam()); // WAL file
+ // WAL compression type
+ CompressionType compression_type = std::get<3>(GetParam());
+ options.wal_compression = compression_type;
+
+ if (trunc && corrupt_offset == 0) {
+ return;
+ }
+
+ // Fill with new data
+ RecoveryTestHelper::FillData(this, &options);
+ // Corrupt the WAL
+ RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .33,
+ /*len%=*/.1, wal_file_id, trunc);
+ // Verify
+ options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+ options.create_if_missing = false;
+ ASSERT_NOK(TryReopen(options));
+}
+
+// Test scope:
+// We don't expect the data store to be opened if there is any inconsistency
+// between WAL and SST files
+TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) {
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+
+ // Create DB with multiple column families.
+ CreateAndReopenWithCF({"one", "two"}, options);
+ ASSERT_OK(Put(1, "key1", "val1"));
+ ASSERT_OK(Put(2, "key2", "val2"));
+
+ // Record the offset at this point
+ Env* env = options.env;
+ uint64_t wal_file_id = dbfull()->TEST_LogfileNumber();
+ std::string fname = LogFileName(dbname_, wal_file_id);
+ uint64_t offset_to_corrupt;
+ ASSERT_OK(env->GetFileSize(fname, &offset_to_corrupt));
+ ASSERT_GT(offset_to_corrupt, 0);
+
+ ASSERT_OK(Put(1, "key3", "val3"));
+ // Corrupt WAL at location of key3
+ ASSERT_OK(test::CorruptFile(env, fname, static_cast<int>(offset_to_corrupt),
+ 4, false));
+ ASSERT_OK(Put(2, "key4", "val4"));
+ ASSERT_OK(Put(1, "key5", "val5"));
+ ASSERT_OK(Flush(2));
+
+ // PIT recovery & verify
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options));
+}
+
+TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.track_and_verify_wals_in_manifest = true;
+ // The following make sure there are two bg flush threads.
+ options.max_background_jobs = 8;
+
+ DestroyAndReopen(options);
+
+ const std::string cf1_name("cf1");
+ CreateAndReopenWithCF({cf1_name}, options);
+ assert(handles_.size() == 2);
+
+ {
+ dbfull()->TEST_LockMutex();
+ ASSERT_LE(2, dbfull()->GetBGJobLimits().max_flushes);
+ dbfull()->TEST_UnlockMutex();
+ }
+
+ ASSERT_OK(dbfull()->PauseBackgroundWork());
+
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", "value"));
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(
+ /*wait=*/false, /*allow_write_stall=*/true, handles_[1]));
+
+ ASSERT_OK(db_->Put(WriteOptions(), "foo", "value"));
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable(
+ /*wait=*/false, /*allow_write_stall=*/true, handles_[0]));
+
+ bool called = false;
+ std::atomic<int> bg_flush_threads{0};
+ std::atomic<bool> wal_synced{false};
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCallFlush:start", [&](void* /*arg*/) {
+ int cur = bg_flush_threads.load();
+ int desired = cur + 1;
+ if (cur > 0 ||
+ !bg_flush_threads.compare_exchange_strong(cur, desired)) {
+ while (!wal_synced.load()) {
+ // Wait until the other bg flush thread finishes committing WAL sync
+ // operation to the MANIFEST.
+ }
+ }
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushMemTableToOutputFile:CommitWal:1",
+ [&](void* /*arg*/) { wal_synced.store(true); });
+ // This callback will be called when the first bg flush thread reaches the
+ // point before entering the MANIFEST write queue after flushing the SST
+ // file.
+ // The purpose of the sync points here is to ensure both bg flush threads
+ // finish computing `min_wal_number_to_keep` before any of them updates the
+ // `log_number` for the column family that's being flushed.
+ SyncPoint::GetInstance()->SetCallBack(
+ "MemTableList::TryInstallMemtableFlushResults:AfterComputeMinWalToKeep",
+ [&](void* /*arg*/) {
+ dbfull()->mutex()->AssertHeld();
+ if (!called) {
+ // We are the first bg flush thread in the MANIFEST write queue.
+ // We set up the dependency between sync points for two threads that
+ // will be executing the same code.
+ // For the interleaving of events, see
+ // https://github.com/facebook/rocksdb/pull/9715.
+ // bg flush thread1 will release the db mutex while in the MANIFEST
+ // write queue. In the meantime, bg flush thread2 locks db mutex and
+ // computes the min_wal_number_to_keep (before thread1 writes to
+ // MANIFEST thus before cf1->log_number is updated). Bg thread2 joins
+ // the MANIFEST write queue afterwards and bg flush thread1 proceeds
+ // with writing to MANIFEST.
+ called = true;
+ SyncPoint::GetInstance()->LoadDependency({
+ {"VersionSet::LogAndApply:WriteManifestStart",
+ "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"},
+ {"DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2",
+ "VersionSet::LogAndApply:WriteManifest"},
+ });
+ } else {
+ // The other bg flush thread has already been in the MANIFEST write
+ // queue, and we are after.
+ TEST_SYNC_POINT(
+ "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2");
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(dbfull()->ContinueBackgroundWork());
+
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0]));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
+
+ ASSERT_TRUE(called);
+
+ Close();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ DB* db1 = nullptr;
+ Status s = DB::OpenForReadOnly(options, dbname_, &db1);
+ ASSERT_OK(s);
+ assert(db1);
+ delete db1;
+}
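+
+// General shape of the sync-point pattern used above (illustrative only;
+// "PointA"/"PointB" are placeholder names, not real sync points):
+//
+//   SyncPoint::GetInstance()->LoadDependency(
+//       {{"PointA", "PointB"}});  // "PointB" waits until "PointA" is reached
+//   SyncPoint::GetInstance()->EnableProcessing();
+//   // ... start the threads under test ...
+//   TEST_SYNC_POINT("PointB");  // blocks until some thread passes "PointA"
+//   SyncPoint::GetInstance()->DisableProcessing();
+//   SyncPoint::GetInstance()->ClearAllCallBacks();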
+
+// Test scope:
+// - We expect to open the data store under all circumstances
+// - We expect to recover only data up to the point where the first error was
+//   encountered
+TEST_P(DBWALTestWithParams, kPointInTimeRecovery) {
+ const int maxkeys =
+ RecoveryTestHelper::kWALFilesCount * RecoveryTestHelper::kKeysPerWALFile;
+
+ bool trunc = std::get<0>(GetParam()); // Corruption style
+ // Corruption offset position
+ int corrupt_offset = std::get<1>(GetParam());
+ int wal_file_id = std::get<2>(GetParam()); // WAL file
+ // WAL compression type
+ CompressionType compression_type = std::get<3>(GetParam());
+
+ // Fill data for testing
+ Options options = CurrentOptions();
+ options.wal_compression = compression_type;
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+ // Corrupt the WAL
+ // The offset was previously 0.3, which cut off right at the end of a valid
+ // fragment once the WAL ZSTD compression checksum was enabled, so the value
+ // was changed to 0.33.
+ RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .33,
+ /*len%=*/.1, wal_file_id, trunc);
+
+ // Verify
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+
+ // Probe data for invariants
+ size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_LT(recovered_row_count, row_count);
+
+ // Verify that a prefix of the keys was recovered, except in the case of
+ // full WAL truncation: we have no way to know a corruption occurred when
+ // truncation happened on record boundaries (preventing recovery holes in
+ // that case requires using `track_and_verify_wals_in_manifest`).
+ if (!trunc || corrupt_offset != 0) {
+ bool expect_data = true;
+ for (size_t k = 0; k < maxkeys; ++k) {
+ bool found = Get("key" + std::to_string(k)) != "NOT_FOUND";
+ if (expect_data && !found) {
+ expect_data = false;
+ }
+ ASSERT_EQ(found, expect_data);
+ }
+ }
+
+ const size_t min = RecoveryTestHelper::kKeysPerWALFile *
+ (wal_file_id - RecoveryTestHelper::kWALFileOffset);
+ ASSERT_GE(recovered_row_count, min);
+ if (!trunc && corrupt_offset != 0) {
+ const size_t max = RecoveryTestHelper::kKeysPerWALFile *
+ (wal_file_id - RecoveryTestHelper::kWALFileOffset + 1);
+ ASSERT_LE(recovered_row_count, max);
+ }
+}
+
+// Test scope:
+// - We expect to open the data store under all scenarios
+// - We expect to have recovered records past the corruption zone
+TEST_P(DBWALTestWithParams, kSkipAnyCorruptedRecords) {
+ bool trunc = std::get<0>(GetParam()); // Corruption style
+ // Corruption offset position
+ int corrupt_offset = std::get<1>(GetParam());
+ int wal_file_id = std::get<2>(GetParam()); // WAL file
+ // WAL compression type
+ CompressionType compression_type = std::get<3>(GetParam());
+
+ // Fill data for testing
+ Options options = CurrentOptions();
+ options.wal_compression = compression_type;
+ const size_t row_count = RecoveryTestHelper::FillData(this, &options);
+
+ // Corrupt the WAL
+ RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+ /*len%=*/.1, wal_file_id, trunc);
+
+ // Verify behavior
+ options.wal_recovery_mode = WALRecoveryMode::kSkipAnyCorruptedRecords;
+ options.create_if_missing = false;
+ ASSERT_OK(TryReopen(options));
+
+ // Probe data for invariants
+ size_t recovered_row_count = RecoveryTestHelper::GetData(this);
+ ASSERT_LT(recovered_row_count, row_count);
+
+ if (!trunc) {
+ ASSERT_TRUE(corrupt_offset != 0 || recovered_row_count > 0);
+ }
+}
+
+TEST_F(DBWALTest, AvoidFlushDuringRecovery) {
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = false;
+
+ // Test with flush after recovery.
+ Reopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ ASSERT_OK(Put("bar", "v2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v3"));
+ ASSERT_OK(Put("bar", "v4"));
+ ASSERT_EQ(1, TotalTableFiles());
+ // Reopen DB. Check that WAL logs were flushed.
+ Reopen(options);
+ ASSERT_EQ("v3", Get("foo"));
+ ASSERT_EQ("v4", Get("bar"));
+ ASSERT_EQ(2, TotalTableFiles());
+
+ // Test without flush after recovery.
+ options.avoid_flush_during_recovery = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v5"));
+ ASSERT_OK(Put("bar", "v6"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v7"));
+ ASSERT_OK(Put("bar", "v8"));
+ ASSERT_EQ(1, TotalTableFiles());
+ // Reopen DB. WAL logs should not be flushed this time.
+ Reopen(options);
+ ASSERT_EQ("v7", Get("foo"));
+ ASSERT_EQ("v8", Get("bar"));
+ ASSERT_EQ(1, TotalTableFiles());
+
+ // Force flush with allow_2pc.
+ options.avoid_flush_during_recovery = true;
+ options.allow_2pc = true;
+ ASSERT_OK(Put("foo", "v9"));
+ ASSERT_OK(Put("bar", "v10"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo", "v11"));
+ ASSERT_OK(Put("bar", "v12"));
+ Reopen(options);
+ ASSERT_EQ("v11", Get("foo"));
+ ASSERT_EQ("v12", Get("bar"));
+ ASSERT_EQ(3, TotalTableFiles());
+}
+
+TEST_F(DBWALTest, WalCleanupAfterAvoidFlushDuringRecovery) {
+ // Verifies that WAL files that were present during recovery, but not
+ // flushed due to avoid_flush_during_recovery, are considered for deletion
+ // at a later stage. We check that at least one such file is deleted during
+ // Flush().
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.avoid_flush_during_recovery = true;
+ Reopen(options);
+
+ ASSERT_OK(Put("foo", "v1"));
+ Reopen(options);
+ for (int i = 0; i < 2; ++i) {
+ if (i > 0) {
+ // Flush() triggers deletion of obsolete tracked files
+ ASSERT_OK(Flush());
+ }
+ VectorLogPtr log_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files));
+ if (i == 0) {
+ ASSERT_GT(log_files.size(), 0);
+ } else {
+ ASSERT_EQ(0, log_files.size());
+ }
+ }
+}
+
+TEST_F(DBWALTest, RecoverWithoutFlush) {
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 64 * 1024 * 1024;
+
+ size_t count = RecoveryTestHelper::FillData(this, &options);
+ auto validateData = [this, count]() {
+ for (size_t i = 0; i < count; i++) {
+ ASSERT_NE(Get("key" + std::to_string(i)), "NOT_FOUND");
+ }
+ };
+ Reopen(options);
+ validateData();
+ // Insert some data without flush
+ ASSERT_OK(Put("foo", "foo_v1"));
+ ASSERT_OK(Put("bar", "bar_v1"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v1");
+ ASSERT_EQ(Get("bar"), "bar_v1");
+ // Insert again and reopen
+ ASSERT_OK(Put("foo", "foo_v2"));
+ ASSERT_OK(Put("bar", "bar_v2"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v2");
+ ASSERT_EQ(Get("bar"), "bar_v2");
+ // manual flush and insert again
+ ASSERT_OK(Flush());
+ ASSERT_EQ(Get("foo"), "foo_v2");
+ ASSERT_EQ(Get("bar"), "bar_v2");
+ ASSERT_OK(Put("foo", "foo_v3"));
+ ASSERT_OK(Put("bar", "bar_v3"));
+ Reopen(options);
+ validateData();
+ ASSERT_EQ(Get("foo"), "foo_v3");
+ ASSERT_EQ(Get("bar"), "bar_v3");
+}
+
+TEST_F(DBWALTest, RecoverWithoutFlushMultipleCF) {
+ const std::string kSmallValue = "v";
+ const std::string kLargeValue = DummyString(1024);
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+
+ auto countWalFiles = [this]() {
+ VectorLogPtr log_files;
+ if (!dbfull()->GetSortedWalFiles(log_files).ok()) {
+ return size_t{0};
+ }
+ return log_files.size();
+ };
+
+ // Create DB with multiple column families and multiple log files.
+ CreateAndReopenWithCF({"one", "two"}, options);
+ ASSERT_OK(Put(0, "key1", kSmallValue));
+ ASSERT_OK(Put(1, "key2", kLargeValue));
+ ASSERT_OK(Flush(1));
+ ASSERT_EQ(1, countWalFiles());
+ ASSERT_OK(Put(0, "key3", kSmallValue));
+ ASSERT_OK(Put(2, "key4", kLargeValue));
+ ASSERT_OK(Flush(2));
+ ASSERT_EQ(2, countWalFiles());
+
+ // Reopen, insert and flush.
+ options.db_write_buffer_size = 64 * 1024 * 1024;
+ ReopenWithColumnFamilies({"default", "one", "two"}, options);
+ ASSERT_EQ(Get(0, "key1"), kSmallValue);
+ ASSERT_EQ(Get(1, "key2"), kLargeValue);
+ ASSERT_EQ(Get(0, "key3"), kSmallValue);
+ ASSERT_EQ(Get(2, "key4"), kLargeValue);
+ // Insert more data.
+ ASSERT_OK(Put(0, "key5", kLargeValue));
+ ASSERT_OK(Put(1, "key6", kLargeValue));
+ ASSERT_EQ(3, countWalFiles());
+ ASSERT_OK(Flush(1));
+ ASSERT_OK(Put(2, "key7", kLargeValue));
+ ASSERT_OK(dbfull()->FlushWAL(false));
+ ASSERT_EQ(4, countWalFiles());
+
+ // Reopen twice and validate.
+ for (int i = 0; i < 2; i++) {
+ ReopenWithColumnFamilies({"default", "one", "two"}, options);
+ ASSERT_EQ(Get(0, "key1"), kSmallValue);
+ ASSERT_EQ(Get(1, "key2"), kLargeValue);
+ ASSERT_EQ(Get(0, "key3"), kSmallValue);
+ ASSERT_EQ(Get(2, "key4"), kLargeValue);
+ ASSERT_EQ(Get(0, "key5"), kLargeValue);
+ ASSERT_EQ(Get(1, "key6"), kLargeValue);
+ ASSERT_EQ(Get(2, "key7"), kLargeValue);
+ ASSERT_EQ(4, countWalFiles());
+ }
+}
+
+// In this test we are trying to do the following:
+// 1. Create a DB with a corrupted WAL;
+// 2. Open it with avoid_flush_during_recovery = true;
+// 3. Append more data without flushing, which creates a new WAL;
+// 4. Open again and see if it correctly handles the previous corruption.
+TEST_P(DBWALTestWithParamsVaryingRecoveryMode,
+ RecoverFromCorruptedWALWithoutFlush) {
+ const int kAppendKeys = 100;
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.create_if_missing = false;
+ options.disable_auto_compactions = true;
+ options.write_buffer_size = 64 * 1024 * 1024;
+
+ auto getAll = [this]() {
+ std::vector<std::pair<std::string, std::string>> data;
+ ReadOptions ropt;
+ Iterator* iter = dbfull()->NewIterator(ropt);
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ data.push_back(
+ std::make_pair(iter->key().ToString(), iter->value().ToString()));
+ }
+ delete iter;
+ return data;
+ };
+
+ bool trunc = std::get<0>(GetParam()); // Corruption style
+ // Corruption offset position
+ int corrupt_offset = std::get<1>(GetParam());
+ int wal_file_id = std::get<2>(GetParam()); // WAL file
+ WALRecoveryMode recovery_mode = std::get<3>(GetParam());
+ // WAL compression type
+ CompressionType compression_type = std::get<4>(GetParam());
+
+ options.wal_recovery_mode = recovery_mode;
+ options.wal_compression = compression_type;
+ // Create corrupted WAL
+ RecoveryTestHelper::FillData(this, &options);
+ RecoveryTestHelper::CorruptWAL(this, options, corrupt_offset * .3,
+ /*len%=*/.1, wal_file_id, trunc);
+ // Skip the test if DB won't open.
+ if (!TryReopen(options).ok()) {
+ ASSERT_TRUE(options.wal_recovery_mode ==
+ WALRecoveryMode::kAbsoluteConsistency ||
+ (!trunc && options.wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords));
+ return;
+ }
+ ASSERT_OK(TryReopen(options));
+ // Append some more data.
+ for (int k = 0; k < kAppendKeys; k++) {
+ std::string key = "extra_key" + std::to_string(k);
+ std::string value = DummyString(RecoveryTestHelper::kValueSize);
+ ASSERT_OK(Put(key, value));
+ }
+ // Save data for comparison.
+ auto data = getAll();
+ // Reopen. Verify data.
+ ASSERT_OK(TryReopen(options));
+ auto actual_data = getAll();
+ ASSERT_EQ(data, actual_data);
+}
+
+// Tests that total log size is recovered if we set
+// avoid_flush_during_recovery=true.
+// Flush should trigger if max_total_wal_size is reached.
+TEST_F(DBWALTest, RestoreTotalLogSizeAfterRecoverWithoutFlush) {
+ auto test_listener = std::make_shared<FlushCounterListener>();
+ test_listener->expected_flush_reason = FlushReason::kWalFull;
+
+ constexpr size_t kKB = 1024;
+ constexpr size_t kMB = 1024 * 1024;
+ Options options = CurrentOptions();
+ options.avoid_flush_during_recovery = true;
+ options.max_total_wal_size = 1 * kMB;
+ options.listeners.push_back(test_listener);
+ // Have to open DB in multi-CF mode to trigger flush when
+ // max_total_wal_size is reached.
+ CreateAndReopenWithCF({"one"}, options);
+ // Write some keys so that we end up with one log file slightly smaller
+ // than 1MB.
+ std::string value_100k(100 * kKB, 'v');
+ std::string value_300k(300 * kKB, 'v');
+ ASSERT_OK(Put(0, "foo", "v1"));
+ for (int i = 0; i < 9; i++) {
+ ASSERT_OK(Put(1, "key" + std::to_string(i), value_100k));
+ }
+ // Get log files before reopen.
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ uint64_t log_size_before = log_files_before[0]->SizeFileBytes();
+ ASSERT_GT(log_size_before, 900 * kKB);
+ ASSERT_LT(log_size_before, 1 * kMB);
+ ReopenWithColumnFamilies({"default", "one"}, options);
+ // Write one more value to make log larger than 1MB.
+ ASSERT_OK(Put(1, "bar", value_300k));
+ // Get log files again. A new log file will be opened.
+ VectorLogPtr log_files_after_reopen;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after_reopen));
+ ASSERT_EQ(2, log_files_after_reopen.size());
+ ASSERT_EQ(log_files_before[0]->LogNumber(),
+ log_files_after_reopen[0]->LogNumber());
+ ASSERT_GT(log_files_after_reopen[0]->SizeFileBytes() +
+ log_files_after_reopen[1]->SizeFileBytes(),
+ 1 * kMB);
+ // Write one more key to trigger flush.
+ ASSERT_OK(Put(0, "foo", "v2"));
+ for (auto* h : handles_) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(h));
+ }
+ // Flushed two column families.
+ ASSERT_EQ(2, test_listener->count.load());
+}
+
+#if defined(ROCKSDB_PLATFORM_POSIX)
+#if defined(ROCKSDB_FALLOCATE_PRESENT)
+// Tests that we will truncate the preallocated space of the last log left
+// over from the previous DB session.
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithoutFlush) {
+ constexpr size_t kKB = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.avoid_flush_during_recovery = true;
+ if (mem_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+ return;
+ }
+ if (!IsFallocateSupported()) {
+ return;
+ }
+
+ DestroyAndReopen(options);
+ size_t preallocated_size =
+ dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ auto& file_before = log_files_before[0];
+ ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+ // The log file has preallocated space.
+ ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+ Reopen(options);
+ VectorLogPtr log_files_after;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after));
+ ASSERT_EQ(1, log_files_after.size());
+ ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB);
+ // The preallocated space should be truncated.
+ ASSERT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+}
+// Tests that we will truncate the preallocated space of the last log left
+// over from the previous DB session.
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWithFlush) {
+ constexpr size_t kKB = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.avoid_flush_during_recovery = false;
+ options.avoid_flush_during_shutdown = true;
+ if (mem_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+ return;
+ }
+ if (!IsFallocateSupported()) {
+ return;
+ }
+
+ DestroyAndReopen(options);
+ size_t preallocated_size =
+ dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ auto& file_before = log_files_before[0];
+ ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+ // The log file has preallocated space.
+ ASSERT_GE(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+ Close();
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::PurgeObsoleteFiles:Begin",
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"},
+ {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate",
+ "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread reopen_thread([&]() { Reopen(options); });
+
+ TEST_SYNC_POINT(
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover");
+ // After the flush during Open, the log file should get deleted. However,
+ // if the process is in a crash loop, the log file may not get
+ // deleted and the preallocated space will keep accumulating. So we need
+ // to ensure it gets truncated.
+ EXPECT_LT(GetAllocatedFileSize(dbname_ + file_before->PathName()),
+ preallocated_size);
+ TEST_SYNC_POINT(
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate");
+ reopen_thread.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(DBWALTest, TruncateLastLogAfterRecoverWALEmpty) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.avoid_flush_during_recovery = false;
+ if (mem_env_ || encrypted_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem/non-encrypted environment");
+ return;
+ }
+ if (!IsFallocateSupported()) {
+ return;
+ }
+
+ DestroyAndReopen(options);
+ size_t preallocated_size =
+ dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+ Close();
+ std::vector<std::string> filenames;
+ std::string last_log;
+ uint64_t last_log_num = 0;
+ ASSERT_OK(env_->GetChildren(dbname_, &filenames));
+ for (auto fname : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(fname, &number, &type, nullptr)) {
+ if (type == kWalFile && number > last_log_num) {
+ last_log = fname;
+ }
+ }
+ }
+ ASSERT_NE(last_log, "");
+ last_log = dbname_ + '/' + last_log;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::PurgeObsoleteFiles:Begin",
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover"},
+ {"DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate",
+ "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "PosixWritableFile::Close",
+ [](void* arg) { *(reinterpret_cast<size_t*>(arg)) = 0; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ // Preallocate space for the empty log file. This could happen if WAL data
+ // was buffered in memory and the process crashed.
+ std::unique_ptr<WritableFile> log_file;
+ ASSERT_OK(env_->ReopenWritableFile(last_log, &log_file, EnvOptions()));
+ log_file->SetPreallocationBlockSize(preallocated_size);
+ log_file->PrepareWrite(0, 4096);
+ log_file.reset();
+
+ ASSERT_GE(GetAllocatedFileSize(last_log), preallocated_size);
+
+ port::Thread reopen_thread([&]() { Reopen(options); });
+
+ TEST_SYNC_POINT(
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterRecover");
+ // The preallocated space should be truncated.
+ EXPECT_LT(GetAllocatedFileSize(last_log), preallocated_size);
+ TEST_SYNC_POINT(
+ "DBWALTest::TruncateLastLogAfterRecoverWithFlush:AfterTruncate");
+ reopen_thread.join();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(DBWALTest, ReadOnlyRecoveryNoTruncate) {
+ constexpr size_t kKB = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.avoid_flush_during_recovery = true;
+ if (mem_env_) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mem environment");
+ return;
+ }
+ if (!IsFallocateSupported()) {
+ return;
+ }
+
+ // Create the DB and close it with file truncation disabled.
+ std::atomic_bool enable_truncate{false};
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "PosixWritableFile::Close", [&](void* arg) {
+ if (!enable_truncate) {
+ *(reinterpret_cast<size_t*>(arg)) = 0;
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+ size_t preallocated_size =
+ dbfull()->TEST_GetWalPreallocateBlockSize(options.write_buffer_size);
+ ASSERT_OK(Put("foo", "v1"));
+ VectorLogPtr log_files_before;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_before));
+ ASSERT_EQ(1, log_files_before.size());
+ auto& file_before = log_files_before[0];
+ ASSERT_LT(file_before->SizeFileBytes(), 1 * kKB);
+ // The log file has preallocated space.
+ auto db_size = GetAllocatedFileSize(dbname_ + file_before->PathName());
+ ASSERT_GE(db_size, preallocated_size);
+ Close();
+
+ // Enable truncation and open the DB as read-only; the file should not be
+ // truncated and the DB size should not change.
+ enable_truncate = true;
+ ASSERT_OK(ReadOnlyReopen(options));
+ VectorLogPtr log_files_after;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(log_files_after));
+ ASSERT_EQ(1, log_files_after.size());
+ ASSERT_LT(log_files_after[0]->SizeFileBytes(), 1 * kKB);
+ ASSERT_EQ(log_files_after[0]->PathName(), file_before->PathName());
+ // The preallocated space should NOT be truncated;
+ // the DB size should be almost the same.
+ ASSERT_NEAR(GetAllocatedFileSize(dbname_ + file_before->PathName()), db_size,
+ db_size / 100);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+#endif // ROCKSDB_FALLOCATE_PRESENT
+#endif // ROCKSDB_PLATFORM_POSIX
+
+TEST_F(DBWALTest, WalInManifestButNotInSortedWals) {
+ Options options = CurrentOptions();
+ options.track_and_verify_wals_in_manifest = true;
+ options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency;
+
+ // Build a way to make WAL files selectively go missing.
+ bool wals_go_missing = false;
+ struct MissingWalFs : public FileSystemWrapper {
+ MissingWalFs(const std::shared_ptr<FileSystem>& t,
+ bool* _wals_go_missing_flag)
+ : FileSystemWrapper(t), wals_go_missing_flag(_wals_go_missing_flag) {}
+ bool* wals_go_missing_flag;
+ IOStatus GetChildren(const std::string& dir, const IOOptions& io_opts,
+ std::vector<std::string>* r,
+ IODebugContext* dbg) override {
+ IOStatus s = target_->GetChildren(dir, io_opts, r, dbg);
+ if (s.ok() && *wals_go_missing_flag) {
+ for (size_t i = 0; i < r->size();) {
+ if (EndsWith(r->at(i), ".log")) {
+ r->erase(r->begin() + i);
+ } else {
+ ++i;
+ }
+ }
+ }
+ return s;
+ }
+ const char* Name() const override { return "MissingWalFs"; }
+ };
+ auto my_fs =
+ std::make_shared<MissingWalFs>(env_->GetFileSystem(), &wals_go_missing);
+ std::unique_ptr<Env> my_env(NewCompositeEnv(my_fs));
+ options.env = my_env.get();
+
+ CreateAndReopenWithCF({"blah"}, options);
+
+ // Currently necessary to get a WAL tracked in manifest; see
+ // https://github.com/facebook/rocksdb/issues/10080
+ ASSERT_OK(Put(0, "x", "y"));
+ ASSERT_OK(db_->SyncWAL());
+ ASSERT_OK(Put(1, "x", "y"));
+ ASSERT_OK(db_->SyncWAL());
+ ASSERT_OK(Flush(1));
+
+ ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty());
+ std::vector<std::unique_ptr<LogFile>> wals;
+ ASSERT_OK(db_->GetSortedWalFiles(wals));
+ wals_go_missing = true;
+ ASSERT_NOK(db_->GetSortedWalFiles(wals));
+ wals_go_missing = false;
+ Close();
+}
+
+#endif // ROCKSDB_LITE
+
+TEST_F(DBWALTest, WalTermTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(1, "foo", "bar"));
+
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("foo", "bar"));
+ batch.MarkWalTerminationPoint();
+ ASSERT_OK(batch.Put("foo2", "bar2"));
+
+ ASSERT_OK(dbfull()->Write(wo, &batch));
+
+ // make sure we can re-open it.
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
+ ASSERT_EQ("bar", Get(1, "foo"));
+ ASSERT_EQ("NOT_FOUND", Get(1, "foo2"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBWALTest, GetCompressedWalsAfterSync) {
+ if (db_->GetOptions().wal_compression == kNoCompression) {
+ ROCKSDB_GTEST_BYPASS("stream compression not present");
+ return;
+ }
+ Options options = GetDefaultOptions();
+ options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.avoid_flush_during_recovery = true;
+ options.track_and_verify_wals_in_manifest = true;
+ // Enable WAL compression so that the newly-created WAL will be non-empty
+ // after DB open, even if point-in-time WAL recovery encounters no
+ // corruption.
+ options.wal_compression = kZSTD;
+ DestroyAndReopen(options);
+
+ // Write something to memtable and WAL so that log_empty_ will be false after
+ // next DB::Open().
+ ASSERT_OK(Put("a", "v"));
+
+ Reopen(options);
+
+ // New WAL is created, thanks to !log_empty_.
+ ASSERT_OK(dbfull()->TEST_SwitchWAL());
+
+ ASSERT_OK(Put("b", "v"));
+
+ ASSERT_OK(db_->SyncWAL());
+
+ VectorLogPtr wals;
+ Status s = dbfull()->GetSortedWalFiles(wals);
+ ASSERT_OK(s);
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_with_timestamp_basic_test.cc b/src/rocksdb/db/db_with_timestamp_basic_test.cc
new file mode 100644
index 000000000..6ea1aaf46
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_basic_test.cc
@@ -0,0 +1,3880 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_with_timestamp_test_util.h"
+#include "port/stack_trace.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/block_based/block_builder.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "test_util/testutil.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/merge_operators/string_append/stringappend2.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBBasicTestWithTimestamp : public DBBasicTestWithTimestampBase {
+ public:
+ DBBasicTestWithTimestamp()
+ : DBBasicTestWithTimestampBase("db_basic_test_with_timestamp") {}
+};
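+
+// A minimal sketch of the user-defined timestamp read/write pattern exercised
+// throughout this file (illustrative only; db_ and Timestamp() come from the
+// test fixture, and the column family must use a timestamp-aware comparator):
+//
+//   std::string write_ts = Timestamp(1, 0);
+//   ASSERT_OK(db_->Put(WriteOptions(), "key", write_ts, "value"));
+//
+//   std::string read_ts_str = Timestamp(2, 0);
+//   Slice read_ts = read_ts_str;
+//   ReadOptions read_opts;
+//   read_opts.timestamp = &read_ts;  // read the newest version with ts <= 2
+//   std::string value;
+//   ASSERT_OK(db_->Get(read_opts, "key", &value));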
+
+TEST_F(DBBasicTestWithTimestamp, SanityChecks) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.avoid_flush_during_shutdown = true;
+ options.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ DestroyAndReopen(options);
+
+ Options options1 = CurrentOptions();
+ options1.env = env_;
+ options1.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ options1.merge_operator = MergeOperators::CreateStringAppendTESTOperator();
+ assert(options1.comparator &&
+ options1.comparator->timestamp_size() == sizeof(uint64_t));
+ ColumnFamilyHandle* handle = nullptr;
+ Status s = db_->CreateColumnFamily(options1, "data", &handle);
+ ASSERT_OK(s);
+
+ std::string dummy_ts(sizeof(uint64_t), '\0');
+ // Perform timestamp operations on default cf.
+ ASSERT_TRUE(
+ db_->Put(WriteOptions(), "key", dummy_ts, "value").IsInvalidArgument());
+ ASSERT_TRUE(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), "key",
+ dummy_ts, "value")
+ .IsInvalidArgument());
+ ASSERT_TRUE(db_->Delete(WriteOptions(), "key", dummy_ts).IsInvalidArgument());
+ ASSERT_TRUE(
+ db_->SingleDelete(WriteOptions(), "key", dummy_ts).IsInvalidArgument());
+ ASSERT_TRUE(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ "begin_key", "end_key", dummy_ts)
+ .IsInvalidArgument());
+
+ // Perform non-timestamp operations on "data" cf.
+ ASSERT_TRUE(
+ db_->Put(WriteOptions(), handle, "key", "value").IsInvalidArgument());
+ ASSERT_TRUE(db_->Delete(WriteOptions(), handle, "key").IsInvalidArgument());
+ ASSERT_TRUE(
+ db_->SingleDelete(WriteOptions(), handle, "key").IsInvalidArgument());
+
+ ASSERT_TRUE(
+ db_->Merge(WriteOptions(), handle, "key", "value").IsInvalidArgument());
+ ASSERT_TRUE(db_->DeleteRange(WriteOptions(), handle, "begin_key", "end_key")
+ .IsInvalidArgument());
+
+ {
+ WriteBatch wb;
+ ASSERT_OK(wb.Put(handle, "key", "value"));
+ ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+ }
+ {
+ WriteBatch wb;
+ ASSERT_OK(wb.Delete(handle, "key"));
+ ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+ }
+ {
+ WriteBatch wb;
+ ASSERT_OK(wb.SingleDelete(handle, "key"));
+ ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+ }
+ {
+ WriteBatch wb;
+ ASSERT_OK(wb.DeleteRange(handle, "begin_key", "end_key"));
+ ASSERT_TRUE(db_->Write(WriteOptions(), &wb).IsInvalidArgument());
+ }
+
+ // Perform timestamp operations with timestamps of incorrect size.
+ const std::string wrong_ts(sizeof(uint32_t), '\0');
+ ASSERT_TRUE(db_->Put(WriteOptions(), handle, "key", wrong_ts, "value")
+ .IsInvalidArgument());
+ ASSERT_TRUE(db_->Merge(WriteOptions(), handle, "key", wrong_ts, "value")
+ .IsInvalidArgument());
+ ASSERT_TRUE(
+ db_->Delete(WriteOptions(), handle, "key", wrong_ts).IsInvalidArgument());
+ ASSERT_TRUE(db_->SingleDelete(WriteOptions(), handle, "key", wrong_ts)
+ .IsInvalidArgument());
+ ASSERT_TRUE(
+ db_->DeleteRange(WriteOptions(), handle, "begin_key", "end_key", wrong_ts)
+ .IsInvalidArgument());
+
+ delete handle;
+}
+
+TEST_F(DBBasicTestWithTimestamp, MixedCfs) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.avoid_flush_during_shutdown = true;
+ DestroyAndReopen(options);
+
+ Options options1 = CurrentOptions();
+ options1.env = env_;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options1.comparator = &test_cmp;
+ ColumnFamilyHandle* handle = nullptr;
+ Status s = db_->CreateColumnFamily(options1, "data", &handle);
+ ASSERT_OK(s);
+
+ WriteBatch wb;
+ ASSERT_OK(wb.Put("a", "value"));
+ ASSERT_OK(wb.Put(handle, "a", "value"));
+ {
+ std::string ts = Timestamp(1, 0);
+ const auto ts_sz_func = [kTimestampSize, handle](uint32_t cf_id) {
+ assert(handle);
+ if (cf_id == 0) {
+ return static_cast<size_t>(0);
+ } else if (cf_id == handle->GetID()) {
+ return kTimestampSize;
+ } else {
+ assert(false);
+ return std::numeric_limits<size_t>::max();
+ }
+ };
+ ASSERT_OK(wb.UpdateTimestamps(ts, ts_sz_func));
+ ASSERT_OK(db_->Write(WriteOptions(), &wb));
+ }
+
+ const auto verify_db = [this](ColumnFamilyHandle* h, const std::string& key,
+ const std::string& ts,
+ const std::string& expected_value) {
+ ASSERT_EQ(expected_value, Get(key));
+ Slice read_ts_slice(ts);
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts_slice;
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, h, key, &value));
+ ASSERT_EQ(expected_value, value);
+ };
+
+ verify_db(handle, "a", Timestamp(1, 0), "value");
+
+ delete handle;
+ Close();
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, options);
+ cf_descs.emplace_back("data", options1);
+ options.create_if_missing = false;
+ s = DB::Open(options, dbname_, cf_descs, &handles_, &db_);
+ ASSERT_OK(s);
+
+ verify_db(handles_[1], "a", Timestamp(1, 0), "value");
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, CompactRangeWithSpecifiedRange) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ std::string start_str = "foo";
+ std::string end_str = "foo2";
+ Slice start(start_str), end(end_str);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, GcPreserveLatestVersionBelowFullHistoryLow) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ std::string ts_str = Timestamp(1, 0);
+ WriteOptions wopts;
+ ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v1"));
+ ASSERT_OK(db_->Put(wopts, "k2", ts_str, "v2"));
+ ASSERT_OK(db_->Put(wopts, "k3", ts_str, "v3"));
+
+ ts_str = Timestamp(2, 0);
+ ASSERT_OK(db_->Delete(wopts, "k3", ts_str));
+
+ ts_str = Timestamp(4, 0);
+ ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v5"));
+
+ ts_str = Timestamp(5, 0);
+ ASSERT_OK(
+ db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k0", "k9", ts_str));
+
+ ts_str = Timestamp(3, 0);
+ Slice ts = ts_str;
+ CompactRangeOptions cro;
+ cro.full_history_ts_low = &ts;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_OK(Flush());
+
+ ReadOptions ropts;
+ ropts.timestamp = &ts;
+ std::string value;
+ Status s = db_->Get(ropts, "k1", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("v1", value);
+
+ std::string key_ts;
+ ASSERT_TRUE(db_->Get(ropts, "k3", &value, &key_ts).IsNotFound());
+ ASSERT_EQ(Timestamp(2, 0), key_ts);
+
+ ts_str = Timestamp(5, 0);
+ ts = ts_str;
+ ropts.timestamp = &ts;
+ ASSERT_TRUE(db_->Get(ropts, "k2", &value, &key_ts).IsNotFound());
+ ASSERT_EQ(Timestamp(5, 0), key_ts);
+ ASSERT_TRUE(db_->Get(ropts, "k2", &value).IsNotFound());
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLow) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ const std::string kKey = "test kKey";
+
+ // Test setting ts_low first and then Flush()
+ int current_ts_low = 5;
+ std::string ts_low_str = Timestamp(current_ts_low, 0);
+ Slice ts_low = ts_low_str;
+ CompactRangeOptions comp_opts;
+ comp_opts.full_history_ts_low = &ts_low;
+ comp_opts.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+
+ ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr));
+
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
+ ->cfd();
+ auto result_ts_low = cfd->GetFullHistoryTsLow();
+
+ ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0);
+
+ for (int i = 0; i < 10; i++) {
+ WriteOptions write_opts;
+ std::string ts = Timestamp(i, 0);
+ ASSERT_OK(db_->Put(write_opts, kKey, ts, Key(i)));
+ }
+ ASSERT_OK(Flush());
+
+ for (int i = 0; i < 10; i++) {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(i, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::string value;
+ Status status = db_->Get(read_opts, kKey, &value);
+ if (i < current_ts_low) {
+ ASSERT_TRUE(status.IsInvalidArgument());
+ } else {
+ ASSERT_OK(status);
+ ASSERT_TRUE(value.compare(Key(i)) == 0);
+ }
+ }
+
+ // Test setting ts_low and then triggering compaction
+ for (int i = 10; i < 20; i++) {
+ WriteOptions write_opts;
+ std::string ts = Timestamp(i, 0);
+ ASSERT_OK(db_->Put(write_opts, kKey, ts, Key(i)));
+ }
+
+ ASSERT_OK(Flush());
+
+ current_ts_low = 15;
+ ts_low_str = Timestamp(current_ts_low, 0);
+ ts_low = ts_low_str;
+ comp_opts.full_history_ts_low = &ts_low;
+ ASSERT_OK(db_->CompactRange(comp_opts, nullptr, nullptr));
+ result_ts_low = cfd->GetFullHistoryTsLow();
+ ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low, result_ts_low) == 0);
+
+ for (int i = current_ts_low; i < 20; i++) {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(i, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::string value;
+ Status status = db_->Get(read_opts, kKey, &value);
+ ASSERT_OK(status);
+ ASSERT_TRUE(value.compare(Key(i)) == 0);
+ }
+
+ // Test invalid compaction with range
+ Slice start(kKey), end(kKey);
+ Status s = db_->CompactRange(comp_opts, &start, &end);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ s = db_->CompactRange(comp_opts, &start, nullptr);
+ ASSERT_TRUE(s.IsInvalidArgument());
+ s = db_->CompactRange(comp_opts, nullptr, &end);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ // Test invalid compaction with a decreasing ts_low
+ ts_low_str = Timestamp(current_ts_low - 1, 0);
+ ts_low = ts_low_str;
+ comp_opts.full_history_ts_low = &ts_low;
+ s = db_->CompactRange(comp_opts, nullptr, nullptr);
+ ASSERT_TRUE(s.IsInvalidArgument());
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, UpdateFullHistoryTsLowWithPublicAPI) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ std::string ts_low_str = Timestamp(9, 0);
+ ASSERT_OK(
+ db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str));
+ std::string result_ts_low;
+ ASSERT_OK(db_->GetFullHistoryTsLow(nullptr, &result_ts_low));
+ ASSERT_TRUE(test_cmp.CompareTimestamp(ts_low_str, result_ts_low) == 0);
+ // Test moving full_history_ts_low backward
+ std::string ts_low_str_back = Timestamp(8, 0);
+ auto s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+ ts_low_str_back);
+ ASSERT_EQ(s, Status::InvalidArgument());
+ // Test IncreaseFullHistoryTsLow with a timestamp whose length is longer
+ // than the cf's timestamp size
+ std::string ts_low_str_long(Timestamp(0, 0).size() + 1, 'a');
+ s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+ ts_low_str_long);
+ ASSERT_EQ(s, Status::InvalidArgument());
+ // Test IncreaseFullHistoryTsLow with an empty timestamp
+ std::string ts_low_str_null = "";
+ s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(),
+ ts_low_str_null);
+ ASSERT_EQ(s, Status::InvalidArgument());
+ // Test IncreaseFullHistoryTsLow for a column family that does not enable
+ // user-defined timestamps
+ options.comparator = BytewiseComparator();
+ DestroyAndReopen(options);
+ ts_low_str = Timestamp(10, 0);
+ s = db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), ts_low_str);
+ ASSERT_EQ(s, Status::InvalidArgument());
+ // Test GetFullHistoryTsLow for a column family that does not enable
+ // user-defined timestamps
+ std::string current_ts_low;
+ s = db_->GetFullHistoryTsLow(db_->DefaultColumnFamily(), &current_ts_low);
+ ASSERT_EQ(s, Status::InvalidArgument());
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, GetApproximateSizes) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 100000000; // Large write buffer
+ options.compression = kNoCompression;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ auto default_cf = db_->DefaultColumnFamily();
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ const int N = 128;
+ Random rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_OK(db_->Put(write_opts, Key(i), ts, rnd.RandomString(1024)));
+ }
+
+ uint64_t size;
+ std::string start = Key(50);
+ std::string end = Key(60);
+ Range r(start, end);
+ SizeApproximationOptions size_approx_options;
+ size_approx_options.include_memtables = true;
+ size_approx_options.include_files = true;
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_GT(size, 6000);
+ ASSERT_LT(size, 204800);
+
+ // test multiple ranges
+ std::vector<Range> ranges;
+ std::string start_tmp = Key(10);
+ std::string end_tmp = Key(20);
+ ranges.emplace_back(Range(start_tmp, end_tmp));
+ ranges.emplace_back(Range(start, end));
+ uint64_t range_sizes[2];
+ ASSERT_OK(db_->GetApproximateSizes(size_approx_options, default_cf,
+ ranges.data(), 2, range_sizes));
+
+ ASSERT_EQ(range_sizes[1], size);
+
+ // Zero if not including mem table
+ ASSERT_OK(db_->GetApproximateSizes(&r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ start = Key(500);
+ end = Key(600);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ // Test range boundaries
+ ASSERT_OK(db_->Put(write_opts, Key(1000), ts, rnd.RandomString(1024)));
+ // Should include start key
+ start = Key(1000);
+ end = Key(1100);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_GT(size, 0);
+
+ // Should exclude end key
+ start = Key(900);
+ end = Key(1000);
+ r = Range(start, end);
+ ASSERT_OK(
+ db_->GetApproximateSizes(size_approx_options, default_cf, &r, 1, &size));
+ ASSERT_EQ(size, 0);
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, SimpleIterate) {
+ const int kNumKeysPerFile = 128;
+ const uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<uint64_t> start_keys = {1, 0};
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = start_keys[i]; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ int count = 0;
+ uint64_t key = 0;
+ // Forward iterate.
+ for (it->Seek(Key1(0)), key = start_keys[i]; it->Valid();
+ it->Next(), ++count, ++key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ }
+ size_t expected_count = kMaxKey - start_keys[i] + 1;
+ ASSERT_EQ(expected_count, count);
+
+ // Backward iterate.
+ count = 0;
+ for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+ it->Prev(), ++count, --key) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ }
+ ASSERT_EQ(static_cast<size_t>(kMaxKey) - start_keys[i] + 1, count);
+
+ // SeekToFirst()/SeekToLast() with lower/upper bounds.
+ // Then iterate with lower and upper bounds.
+ uint64_t l = 0;
+ uint64_t r = kMaxKey + 1;
+ while (l < r) {
+ std::string lb_str = Key1(l);
+ Slice lb = lb_str;
+ std::string ub_str = Key1(r);
+ Slice ub = ub_str;
+ read_opts.iterate_lower_bound = &lb;
+ read_opts.iterate_upper_bound = &ub;
+ it.reset(db_->NewIterator(read_opts));
+ for (it->SeekToFirst(), key = std::max(l, start_keys[i]), count = 0;
+ it->Valid(); it->Next(), ++key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ }
+ ASSERT_EQ(r - std::max(l, start_keys[i]), count);
+
+ for (it->SeekToLast(), key = std::min(r, kMaxKey + 1), count = 0;
+ it->Valid(); it->Prev(), --key, ++count) {
+ CheckIterUserEntry(it.get(), Key1(key - 1), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ }
+ l += (kMaxKey / 100);
+ r -= (kMaxKey / 100);
+ }
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, TrimHistoryTest) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ auto check_value_by_ts = [](DB* db, Slice key, std::string readTs,
+ Status status, std::string checkValue,
+ std::string expected_ts) {
+ ReadOptions ropts;
+ Slice ts = readTs;
+ ropts.timestamp = &ts;
+ std::string value;
+ std::string key_ts;
+ Status s = db->Get(ropts, key, &value, &key_ts);
+ ASSERT_TRUE(s == status);
+ if (s.ok()) {
+ ASSERT_EQ(checkValue, value);
+ }
+ if (s.ok() || s.IsNotFound()) {
+ ASSERT_EQ(expected_ts, key_ts);
+ }
+ };
+ // Construct data of different versions with different ts
+ ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(2, 0), "v1"));
+ ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(4, 0), "v2"));
+ ASSERT_OK(db_->Delete(WriteOptions(), "k1", Timestamp(5, 0)));
+ ASSERT_OK(db_->Put(WriteOptions(), "k1", Timestamp(6, 0), "v3"));
+ check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v3",
+ Timestamp(6, 0));
+ ASSERT_OK(Flush());
+ Close();
+
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ DBOptions db_options(options);
+
+ // Trim data whose version > Timestamp(5, 0), read(k1, ts(7)) <- NOT_FOUND.
+ ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+ &handles_, &db_, Timestamp(5, 0)));
+ check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::NotFound(), "",
+ Timestamp(5, 0));
+ Close();
+
+ // Trim data whose timestamp > Timestamp(4, 0), read(k1, ts(7)) <- v2
+ ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+ &handles_, &db_, Timestamp(4, 0)));
+ check_value_by_ts(db_, "k1", Timestamp(7, 0), Status::OK(), "v2",
+ Timestamp(4, 0));
+ Close();
+
+ Reopen(options);
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "k1",
+ "k3", Timestamp(7, 0)));
+ check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::NotFound(), "",
+ Timestamp(7, 0));
+ Close();
+ // Trim data whose timestamp > Timestamp(6, 0), read(k1, ts(8)) <- v2
+ ASSERT_OK(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+ &handles_, &db_, Timestamp(6, 0)));
+ check_value_by_ts(db_, "k1", Timestamp(8, 0), Status::OK(), "v2",
+ Timestamp(4, 0));
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, OpenAndTrimHistoryInvalidOptionTest) {
+ Destroy(last_options_);
+
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ DBOptions db_options(options);
+
+ // OpenAndTrimHistory should not work with avoid_flush_during_recovery
+ db_options.avoid_flush_during_recovery = true;
+ ASSERT_TRUE(DB::OpenAndTrimHistory(db_options, dbname_, column_families,
+ &handles_, &db_, Timestamp(0, 0))
+ .IsInvalidArgument());
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(DBBasicTestWithTimestamp, GetTimestampTableProperties) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ // Create 2 tables
+ for (int table = 0; table < 2; ++table) {
+ for (int i = 0; i < 10; i++) {
+ std::string ts = Timestamp(i, 0);
+ ASSERT_OK(db_->Put(WriteOptions(), "key", ts, Key(i)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(2U, props.size());
+ for (const auto& item : props) {
+ auto& user_collected = item.second->user_collected_properties;
+ ASSERT_TRUE(user_collected.find("rocksdb.timestamp_min") !=
+ user_collected.end());
+ ASSERT_TRUE(user_collected.find("rocksdb.timestamp_max") !=
+ user_collected.end());
+ ASSERT_EQ(user_collected.at("rocksdb.timestamp_min"), Timestamp(0, 0));
+ ASSERT_EQ(user_collected.at("rocksdb.timestamp_max"), Timestamp(9, 0));
+ }
+ Close();
+}
+#endif // !ROCKSDB_LITE
+
+class DBBasicTestWithTimestampTableOptions
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<BlockBasedTableOptions::IndexType> {
+ public:
+ explicit DBBasicTestWithTimestampTableOptions()
+ : DBBasicTestWithTimestampBase(
+ "db_basic_test_with_timestamp_table_options") {}
+};
+
+INSTANTIATE_TEST_CASE_P(
+ Timestamp, DBBasicTestWithTimestampTableOptions,
+ testing::Values(
+ BlockBasedTableOptions::IndexType::kBinarySearch,
+ BlockBasedTableOptions::IndexType::kHashSearch,
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey));
+
+TEST_P(DBBasicTestWithTimestampTableOptions, GetAndMultiGet) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ options.compression = kNoCompression;
+ BlockBasedTableOptions bbto;
+ bbto.index_type = GetParam();
+ bbto.block_size = 100;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator cmp(kTimestampSize);
+ options.comparator = &cmp;
+ DestroyAndReopen(options);
+ constexpr uint64_t kNumKeys = 1024;
+ for (uint64_t k = 0; k < kNumKeys; ++k) {
+ WriteOptions write_opts;
+ ASSERT_OK(db_->Put(write_opts, Key1(k), Timestamp(1, 0),
+ "value" + std::to_string(k)));
+ }
+ ASSERT_OK(Flush());
+ {
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ std::string ts_str = Timestamp(2, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ // verify Get()
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ std::string value_from_get;
+ std::string key_str(it->key().data(), it->key().size());
+ std::string timestamp;
+ ASSERT_OK(db_->Get(read_opts, key_str, &value_from_get, &timestamp));
+ ASSERT_EQ(it->value(), value_from_get);
+ ASSERT_EQ(Timestamp(1, 0), timestamp);
+ }
+
+ // verify MultiGet()
+ constexpr uint64_t step = 2;
+ static_assert(0 == (kNumKeys % step),
+ "kNumKeys must be a multiple of step");
+ for (uint64_t k = 0; k < kNumKeys; k += step) {
+ std::vector<std::string> key_strs;
+ std::vector<Slice> keys;
+ for (size_t i = 0; i < step; ++i) {
+ key_strs.push_back(Key1(k + i));
+ }
+ for (size_t i = 0; i < step; ++i) {
+ keys.emplace_back(key_strs[i]);
+ }
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+ std::vector<Status> statuses =
+ db_->MultiGet(read_opts, keys, &values, &timestamps);
+ ASSERT_EQ(step, statuses.size());
+ ASSERT_EQ(step, values.size());
+ ASSERT_EQ(step, timestamps.size());
+ for (uint64_t i = 0; i < step; ++i) {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ("value" + std::to_string(k + i), values[i]);
+ ASSERT_EQ(Timestamp(1, 0), timestamps[i]);
+ }
+ }
+ }
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithPrefixLessThanKey) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ options.memtable_whole_key_filtering = true;
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = true;
+ bbto.index_type = GetParam();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ // Move sst file to next level
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_OK(db_->Put(write_opts, "foo3", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ ReadOptions read_opts;
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
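+ // The seek target "foo" equals the 3-byte prefix and sorts before every stored key,
+ // so the iterator should land on "foo1" and still be valid after one Next().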
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+
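+ // No stored key has prefix "bbb", so this prefix-mode seek finds nothing.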
+ iter->Seek("bbb");
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithCappedPrefix) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ // All of the keys in this test must be longer than 3 characters
+ constexpr int kMinKeyLen = 3;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(kMinKeyLen));
+ options.memtable_whole_key_filtering = true;
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = true;
+ bbto.index_type = GetParam();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ // Move sst file to next level
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_OK(db_->Put(write_opts, "foo3", ts, "bar"));
+ ASSERT_OK(Flush());
+
+ ReadOptions read_opts;
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ // Make sure the prefix extractor doesn't include the timestamp; otherwise it
+ // may return an invalid result.
+ iter->Seek("foo");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, SeekWithBound) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(2));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = true;
+ bbto.index_type = GetParam();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo1", ts, "bar1"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->Put(write_opts, "foo2", ts, "bar2"));
+ ASSERT_OK(Flush());
+
+ // Move sst file to next level
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ for (int i = 3; i < 9; ++i) {
+ ASSERT_OK(db_->Put(write_opts, "foo" + std::to_string(i), ts,
+ "bar" + std::to_string(i)));
+ }
+ ASSERT_OK(Flush());
+
+ ReadOptions read_opts;
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ std::string up_bound = "foo5"; // exclusive
+ Slice up_bound_slice = up_bound;
+ std::string lo_bound = "foo2"; // inclusive
+ Slice lo_bound_slice = lo_bound;
+ read_opts.iterate_upper_bound = &up_bound_slice;
+ read_opts.iterate_lower_bound = &lo_bound_slice;
+ read_opts.auto_prefix_mode = true;
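+ // With bounds ["foo2", "foo5"), forward positioning clamps to "foo2" and reverse
+ // positioning stops at "foo4".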
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ // Make sure the prefix extractor doesn't include the timestamp; otherwise it
+ // may return an invalid result.
+ iter->Seek("foo");
+ CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2",
+ Timestamp(1, 0));
+ iter->SeekToFirst();
+ CheckIterUserEntry(iter.get(), lo_bound, kTypeValue, "bar2",
+ Timestamp(1, 0));
+ iter->SeekForPrev("g");
+ CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0));
+ iter->SeekToLast();
+ CheckIterUserEntry(iter.get(), "foo4", kTypeValue, "bar4", Timestamp(1, 0));
+ }
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ChangeIterationDirection) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.prefix_extractor.reset(NewFixedPrefixTransform(1));
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ const std::vector<std::string> timestamps = {Timestamp(1, 1), Timestamp(0, 2),
+ Timestamp(4, 3)};
+ const std::vector<std::tuple<std::string, std::string>> kvs = {
+ std::make_tuple("aa", "value1"), std::make_tuple("ab", "value2")};
+ for (const auto& ts : timestamps) {
+ WriteBatch wb(0, 0, 0, kTimestampSize);
+ for (const auto& kv : kvs) {
+ const std::string& key = std::get<0>(kv);
+ const std::string& value = std::get<1>(kv);
+ ASSERT_OK(wb.Put(key, value));
+ }
+
+ ASSERT_OK(wb.UpdateTimestamps(
+ ts, [kTimestampSize](uint32_t) { return kTimestampSize; }));
+ ASSERT_OK(db_->Write(WriteOptions(), &wb));
+ }
+ std::string read_ts_str = Timestamp(5, 3);
+ Slice read_ts = read_ts_str;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
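+ // The direction changes below are expected to trigger internal iterator reseeks,
+ // which the test observes through the NUMBER_OF_RESEEKS_IN_ITERATION ticker.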
+
+ it->SeekToFirst();
+ ASSERT_TRUE(it->Valid());
+ it->Prev();
+ ASSERT_FALSE(it->Valid());
+
+ it->SeekToLast();
+ ASSERT_TRUE(it->Valid());
+ uint64_t prev_reseek_count =
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+ ASSERT_EQ(0, prev_reseek_count);
+ it->Next();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_EQ(1 + prev_reseek_count,
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ it->Seek(std::get<0>(kvs[0]));
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
+ std::get<1>(kvs[0]), Timestamp(4, 3));
+ it->Next();
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+ std::get<1>(kvs[1]), Timestamp(4, 3));
+ it->Prev();
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
+ std::get<1>(kvs[0]), Timestamp(4, 3));
+
+ prev_reseek_count =
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+ ASSERT_EQ(1, prev_reseek_count);
+ it->Next();
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+ std::get<1>(kvs[1]), Timestamp(4, 3));
+ ASSERT_EQ(1 + prev_reseek_count,
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ it->SeekForPrev(std::get<0>(kvs[1]));
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+ std::get<1>(kvs[1]), Timestamp(4, 3));
+ it->Prev();
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[0]), kTypeValue,
+ std::get<1>(kvs[0]), Timestamp(4, 3));
+
+ prev_reseek_count =
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION);
+ it->Next();
+ CheckIterUserEntry(it.get(), std::get<0>(kvs[1]), kTypeValue,
+ std::get<1>(kvs[1]), Timestamp(4, 3));
+ ASSERT_EQ(1 + prev_reseek_count,
+ options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ it.reset();
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, SimpleForwardIterateLowerTsBound) {
+ constexpr int kNumKeysPerFile = 128;
+ constexpr uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ const std::vector<std::string> read_timestamps_lb = {Timestamp(1, 0),
+ Timestamp(1, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ Slice read_ts_lb = read_timestamps_lb[i];
+ read_opts.timestamp = &read_ts;
+ read_opts.iter_start_ts = &read_ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
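+ // With iter_start_ts set, the iterator returns all versions in
+ // [iter_start_ts, read_ts] for each key, newest first; when both writes are visible
+ // the extra Next() below steps over the older one.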
+ int count = 0;
+ uint64_t key = 0;
+ for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) {
+ CheckIterEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i), write_timestamps[i]);
+ if (i > 0) {
+ it->Next();
+ CheckIterEntry(it.get(), Key1(key), kTypeValue,
+ "value" + std::to_string(i - 1),
+ write_timestamps[i - 1]);
+ }
+ }
+ size_t expected_count = kMaxKey + 1;
+ ASSERT_EQ(expected_count, count);
+ }
+ // Delete all keys@ts=5 and check iteration result with start ts set
+ {
+ std::string write_timestamp = Timestamp(5, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key < kMaxKey + 1; ++key) {
+ Status s = db_->Delete(write_opts, Key1(key), write_timestamp);
+ ASSERT_OK(s);
+ }
+
+ std::string read_timestamp = Timestamp(6, 0);
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ std::string read_timestamp_lb = Timestamp(2, 0);
+ Slice read_ts_lb = read_timestamp_lb;
+ read_opts.iter_start_ts = &read_ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ int count = 0;
+ uint64_t key = 0;
+ for (it->Seek(Key1(0)), key = 0; it->Valid(); it->Next(), ++count, ++key) {
+ CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(),
+ write_timestamp);
+ // Skip this key's older version at ts=3 so the loop's Next() lands on the next
+ // key's tombstone at ts=5.
+ it->Next();
+ }
+ ASSERT_EQ(kMaxKey + 1, count);
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound) {
+ constexpr int kNumKeysPerFile = 128;
+ constexpr uint64_t kMaxKey = 1024;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ const std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+ Timestamp(3, 0)};
+ const std::vector<std::string> read_timestamps = {Timestamp(2, 0),
+ Timestamp(4, 0)};
+ const std::vector<std::string> read_timestamps_lb = {Timestamp(1, 0),
+ Timestamp(1, 0)};
+ for (size_t i = 0; i < write_timestamps.size(); ++i) {
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key <= kMaxKey; ++key) {
+ Status s = db_->Put(write_opts, Key1(key), write_timestamps[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ for (size_t i = 0; i < read_timestamps.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamps[i];
+ Slice read_ts_lb = read_timestamps_lb[i];
+ read_opts.timestamp = &read_ts;
+ read_opts.iter_start_ts = &read_ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
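+ // In reverse iteration with iter_start_ts, the versions of a key surface oldest
+ // first, so the extra Prev() steps over the newer write when both are visible.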
+ int count = 0;
+ uint64_t key = 0;
+ for (it->SeekForPrev(Key1(kMaxKey)), key = kMaxKey; it->Valid();
+ it->Prev(), ++count, --key) {
+ CheckIterEntry(it.get(), Key1(key), kTypeValue, "value0",
+ write_timestamps[0]);
+ if (i > 0) {
+ it->Prev();
+ CheckIterEntry(it.get(), Key1(key), kTypeValue, "value1",
+ write_timestamps[1]);
+ }
+ }
+ size_t expected_count = kMaxKey + 1;
+ ASSERT_EQ(expected_count, count);
+ }
+ // Delete all keys@ts=5 and check iteration result with start ts set
+ {
+ std::string write_timestamp = Timestamp(5, 0);
+ WriteOptions write_opts;
+ for (uint64_t key = 0; key < kMaxKey + 1; ++key) {
+ Status s = db_->Delete(write_opts, Key1(key), write_timestamp);
+ ASSERT_OK(s);
+ }
+
+ std::string read_timestamp = Timestamp(6, 0);
+ ReadOptions read_opts;
+ Slice read_ts = read_timestamp;
+ read_opts.timestamp = &read_ts;
+ std::string read_timestamp_lb = Timestamp(2, 0);
+ Slice read_ts_lb = read_timestamp_lb;
+ read_opts.iter_start_ts = &read_ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ int count = 0;
+ uint64_t key = kMaxKey;
+ for (it->SeekForPrev(Key1(key)), key = kMaxKey; it->Valid();
+ it->Prev(), ++count, --key) {
+ CheckIterEntry(it.get(), Key1(key), kTypeValue, "value1",
+ Timestamp(3, 0));
+ it->Prev();
+ CheckIterEntry(it.get(), Key1(key), kTypeDeletionWithTimestamp, Slice(),
+ write_timestamp);
+ }
+ ASSERT_EQ(kMaxKey + 1, count);
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, SimpleBackwardIterateLowerTsBound) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ std::string ts_ub_buf = Timestamp(5, 0);
+ Slice ts_ub = ts_ub_buf;
+ std::string ts_lb_buf = Timestamp(1, 0);
+ Slice ts_lb = ts_lb_buf;
+
+ {
+ ReadOptions read_opts;
+ read_opts.timestamp = &ts_ub;
+ read_opts.iter_start_ts = &ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
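+ // Nothing has been written yet, so both reverse seeks should find nothing.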
+ it->SeekToLast();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+
+ it->SeekForPrev("foo");
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ }
+
+ // Test iterate_upper_bound
+ ASSERT_OK(db_->Put(WriteOptions(), "a", Timestamp(0, 0), "v0"));
+ ASSERT_OK(db_->SingleDelete(WriteOptions(), "a", Timestamp(1, 0)));
+
+ for (int i = 0; i < 5; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), "b", Timestamp(i, 0),
+ "v" + std::to_string(i)));
+ }
+
+ {
+ ReadOptions read_opts;
+ read_opts.timestamp = &ts_ub;
+ read_opts.iter_start_ts = &ts_lb;
+ std::string key_ub_str = "b"; // exclusive
+ Slice key_ub = key_ub_str;
+ read_opts.iterate_upper_bound = &key_ub;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
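+ // With upper bound "b" (exclusive), SeekToLast lands on "a", whose only entry in
+ // [iter_start_ts, read_ts] is the SingleDelete tombstone at ts=1.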
+ it->SeekToLast();
+ CheckIterEntry(it.get(), "a", kTypeSingleDeletion, Slice(),
+ Timestamp(1, 0));
+
+ key_ub_str = "a"; // exclusive
+ key_ub = key_ub_str;
+ read_opts.iterate_upper_bound = &key_ub;
+ it.reset(db_->NewIterator(read_opts));
+ it->SeekToLast();
+ ASSERT_FALSE(it->Valid());
+ ASSERT_OK(it->status());
+ }
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, BackwardIterateLowerTsBound_Reseek) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.max_sequential_skip_in_iterations = 2;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), "a", Timestamp(i, 0),
+ "v" + std::to_string(i)));
+ }
+
+ for (int i = 0; i < 10; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), "b", Timestamp(i, 0),
+ "v" + std::to_string(i)));
+ }
+
+ {
+ std::string ts_ub_buf = Timestamp(6, 0);
+ Slice ts_ub = ts_ub_buf;
+ std::string ts_lb_buf = Timestamp(4, 0);
+ Slice ts_lb = ts_lb_buf;
+
+ ReadOptions read_opts;
+ read_opts.timestamp = &ts_ub;
+ read_opts.iter_start_ts = &ts_lb;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
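+ // The small max_sequential_skip_in_iterations (2) is meant to force internal
+ // reseeks while stepping through the visible versions (ts=4..6) of each key.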
+ it->SeekToLast();
+ for (int i = 0; i < 3 && it->Valid(); it->Prev(), ++i) {
+ CheckIterEntry(it.get(), "b", kTypeValue, "v" + std::to_string(4 + i),
+ Timestamp(4 + i, 0));
+ }
+ for (int i = 0; i < 3 && it->Valid(); it->Prev(), ++i) {
+ CheckIterEntry(it.get(), "a", kTypeValue, "v" + std::to_string(4 + i),
+ Timestamp(4 + i, 0));
+ }
+ }
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ReseekToTargetTimestamp) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ constexpr size_t kNumKeys = 16;
+ options.max_sequential_skip_in_iterations = kNumKeys / 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ // Insert kNumKeys versions of "foo"
+ WriteOptions write_opts;
+ Status s;
+ for (size_t i = 0; i != kNumKeys; ++i) {
+ std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+ s = db_->Put(write_opts, "foo", ts, "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(1, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
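+ // Reading at ts=1 must skip the 15 newer versions of "foo", which exceeds
+ // max_sequential_skip_in_iterations and triggers the reseek counted below.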
+ iter->SeekToFirst();
+ CheckIterUserEntry(iter.get(), "foo", kTypeValue, "value0", ts_str);
+ ASSERT_EQ(
+ 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+
+ ts_str = Timestamp(kNumKeys, 0);
+ ts = ts_str;
+ read_opts.timestamp = &ts;
+ iter.reset(db_->NewIterator(read_opts));
+ iter->SeekToLast();
+ CheckIterUserEntry(iter.get(), "foo", kTypeValue,
+ "value" + std::to_string(kNumKeys - 1), ts_str);
+ ASSERT_EQ(
+ 2, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ReseekToNextUserKey) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ constexpr size_t kNumKeys = 16;
+ options.max_sequential_skip_in_iterations = kNumKeys / 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ // Write kNumKeys versions of "a", then one more version of "a" and a new key "b"
+ // in a single batch.
+ WriteOptions write_opts;
+ Status s;
+ for (size_t i = 0; i != kNumKeys; ++i) {
+ std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+ s = db_->Put(write_opts, "a", ts, "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ {
+ std::string ts_str = Timestamp(static_cast<uint64_t>(kNumKeys + 1), 0);
+ WriteBatch batch(0, 0, 0, kTimestampSize);
+ { ASSERT_OK(batch.Put("a", "new_value")); }
+ { ASSERT_OK(batch.Put("b", "new_value")); }
+ s = batch.UpdateTimestamps(
+ ts_str, [kTimestampSize](uint32_t) { return kTimestampSize; });
+ ASSERT_OK(s);
+ s = db_->Write(write_opts, &batch);
+ ASSERT_OK(s);
+ }
+ {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(static_cast<uint64_t>(kNumKeys + 1), 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
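+ // Next() from "a" must step over its 16 older versions before reaching "b",
+ // exceeding max_sequential_skip_in_iterations and triggering the reseek counted below.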
+ iter->Seek("a");
+ iter->Next();
+ CheckIterUserEntry(iter.get(), "b", kTypeValue, "new_value", ts_str);
+ ASSERT_EQ(
+ 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, ReseekToUserKeyBeforeSavedKey) {
+ Options options = GetDefaultOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ constexpr size_t kNumKeys = 16;
+ options.max_sequential_skip_in_iterations = kNumKeys / 2;
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+ WriteOptions write_opts;
+ Status s = db_->Put(write_opts, "b", ts, "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ {
+ std::string ts = Timestamp(1, 0);
+ WriteOptions write_opts;
+ ASSERT_OK(db_->Put(write_opts, "a", ts, "value"));
+ }
+ {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(1, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
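+ // At read ts=1, Prev() from "b" must step back over its 15 newer (invisible)
+ // versions before reaching "a", exceeding max_sequential_skip_in_iterations and
+ // triggering a reseek.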
+ iter->SeekToLast();
+ iter->Prev();
+ CheckIterUserEntry(iter.get(), "a", kTypeValue, "value", ts_str);
+ ASSERT_EQ(
+ 1, options.statistics->getTickerCount(NUMBER_OF_RESEEKS_IN_ITERATION));
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MultiGetWithFastLocalBloom) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ // Write any value
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+ ASSERT_OK(Flush());
+
+ // Read with MultiGet
+ ReadOptions read_opts;
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ size_t batch_size = 1;
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ keys[0] = "foo";
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ timestamps.data(), statuses.data(), true);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(Timestamp(1, 0), timestamps[0]);
+ for (auto& elem : values) {
+ elem.Reset();
+ }
+
+ ASSERT_OK(db_->SingleDelete(WriteOptions(), "foo", Timestamp(2, 0)));
+ ts = Timestamp(3, 0);
+ read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ timestamps.data(), statuses.data(), true);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_EQ(Timestamp(2, 0), timestamps[0]);
+
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithPrefix) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(5));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = false;
+ bbto.index_type = GetParam();
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ // Write any value
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+ ASSERT_OK(Flush());
+
+ // Read with MultiGet
+ ReadOptions read_opts;
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ size_t batch_size = 1;
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ keys[0] = "foo";
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ timestamps.data(), statuses.data(), true);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(Timestamp(1, 0), timestamps[0]);
+ for (auto& elem : values) {
+ elem.Reset();
+ }
+
+ ASSERT_OK(db_->SingleDelete(WriteOptions(), "foo", Timestamp(2, 0)));
+ // TODO: also Flush() for kHashSearch once its bug is fixed.
+ if (GetParam() != BlockBasedTableOptions::IndexType::kHashSearch) {
+ ASSERT_OK(Flush());
+ }
+
+ ts = Timestamp(3, 0);
+ read_ts = ts;
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ timestamps.data(), statuses.data(), true);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_EQ(Timestamp(2, 0), timestamps[0]);
+
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetWithMemBloomFilter) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(5));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = false;
+ bbto.index_type = GetParam();
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ // Write any value
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+ // Read with MultiGet
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ size_t batch_size = 1;
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ keys[0] = "foo";
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ statuses.data());
+
+ ASSERT_OK(statuses[0]);
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MultiGetRangeFiltering) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = false;
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ // Write any value
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ // random data
+ for (int i = 0; i < 3; i++) {
+ auto key = std::to_string(i * 10);
+ auto value = std::to_string(i * 10);
+ Slice key_slice = key;
+ Slice value_slice = value;
+ ASSERT_OK(db_->Put(write_opts, key_slice, ts, value_slice));
+ ASSERT_OK(Flush());
+ }
+
+ // Compact the files to the next level so key-range filtering of SST files kicks in.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+ ASSERT_OK(Flush());
+
+ // Read with MultiGet
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ size_t batch_size = 1;
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ keys[0] = "foo";
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ statuses.data());
+
+ ASSERT_OK(statuses[0]);
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, MultiGetPrefixFilter) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(NewCappedPrefixTransform(3));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = false;
+ bbto.index_type = GetParam();
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "bar"));
+
+ ASSERT_OK(Flush());
+ // Read with MultiGet
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ size_t batch_size = 1;
+ std::vector<Slice> keys(batch_size);
+ std::vector<std::string> values(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ keys[0] = "foo";
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ std::vector<ColumnFamilyHandle*> cfhs(keys.size(), cfh);
+ std::vector<Status> statuses =
+ db_->MultiGet(read_opts, cfhs, keys, &values, &timestamps);
+
+ ASSERT_OK(statuses[0]);
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringNext) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ constexpr size_t max_skippable_internal_keys = 2;
+ const size_t kNumKeys = max_skippable_internal_keys + 2;
+ WriteOptions write_opts;
+ Status s;
+ {
+ std::string ts = Timestamp(1, 0);
+ ASSERT_OK(db_->Put(write_opts, "a", ts, "value"));
+ }
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+ s = db_->Put(write_opts, "b", ts, "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ {
+ ReadOptions read_opts;
+ read_opts.max_skippable_internal_keys = max_skippable_internal_keys;
+ std::string ts_str = Timestamp(1, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
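+ // "b" has three versions newer than the read timestamp, exceeding
+ // max_skippable_internal_keys, so Next() must report Status::Incomplete().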
+ iter->SeekToFirst();
+ iter->Next();
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MaxKeysSkippedDuringPrev) {
+ Options options = GetDefaultOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ constexpr size_t max_skippable_internal_keys = 2;
+ const size_t kNumKeys = max_skippable_internal_keys + 2;
+ WriteOptions write_opts;
+ Status s;
+ {
+ std::string ts = Timestamp(1, 0);
+ ASSERT_OK(db_->Put(write_opts, "b", ts, "value"));
+ }
+ for (size_t i = 0; i < kNumKeys; ++i) {
+ std::string ts = Timestamp(static_cast<uint64_t>(i + 1), 0);
+ s = db_->Put(write_opts, "a", ts, "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ {
+ ReadOptions read_opts;
+ read_opts.max_skippable_internal_keys = max_skippable_internal_keys;
+ std::string ts_str = Timestamp(1, 0);
+ Slice ts = ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ iter->SeekToLast();
+ iter->Prev();
+ ASSERT_TRUE(iter->status().IsIncomplete());
+ }
+ Close();
+}
+
+ // Create two L0 files and compact them into a new L1 file. In this test, L1 is the
+ // bottommost level.
+ // The two L0 files:
+ // f1: <a, 1, kTypeValue>
+ // f2: <a, 3, kTypeDeletionWithTimestamp> ... <b, 2, kTypeValue>
+ // Since f2.smallest < f1.largest < f2.largest, f1 and f2 become the inputs of a real
+ // compaction instead of a trivial move.
+TEST_F(DBBasicTestWithTimestamp, CompactDeletionWithTimestampMarkerToBottom) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.num_levels = 2;
+ options.level0_file_num_compaction_trigger = 2;
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+ ASSERT_OK(db_->Put(write_opts, "a", ts, "value0"));
+ ASSERT_OK(Flush());
+
+ ts = Timestamp(2, 0);
+ ASSERT_OK(db_->Put(write_opts, "b", ts, "value0"));
+ ts = Timestamp(3, 0);
+ ASSERT_OK(db_->Delete(write_opts, "a", ts));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ReadOptions read_opts;
+ ts = Timestamp(1, 0);
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ Status s = db_->Get(read_opts, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value0", value);
+
+ ts = Timestamp(3, 0);
+ read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ std::string key_ts;
+ s = db_->Get(read_opts, "a", &value, &key_ts);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(Timestamp(3, 0), key_ts);
+
+ // Time-travel to the past before deletion
+ ts = Timestamp(2, 0);
+ read_ts = ts;
+ read_opts.timestamp = &read_ts;
+ s = db_->Get(read_opts, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value0", value);
+ Close();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class DBBasicTestWithTimestampFilterPrefixSettings
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<
+ std::tuple<std::shared_ptr<const FilterPolicy>, bool, bool,
+ std::shared_ptr<const SliceTransform>, bool, double,
+ BlockBasedTableOptions::IndexType>> {
+ public:
+ DBBasicTestWithTimestampFilterPrefixSettings()
+ : DBBasicTestWithTimestampBase(
+ "db_basic_test_with_timestamp_filter_prefix") {}
+};
+
+TEST_P(DBBasicTestWithTimestampFilterPrefixSettings, GetAndMultiGet) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<0>(GetParam());
+ bbto.whole_key_filtering = std::get<1>(GetParam());
+ bbto.cache_index_and_filter_blocks = std::get<2>(GetParam());
+ bbto.index_type = std::get<6>(GetParam());
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.prefix_extractor = std::get<3>(GetParam());
+ options.memtable_whole_key_filtering = std::get<4>(GetParam());
+ options.memtable_prefix_bloom_size_ratio = std::get<5>(GetParam());
+
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ const int kMaxKey = 1000;
+
+ // Write any value
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+
+ int idx = 0;
+ for (; idx < kMaxKey / 4; idx++) {
+ ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+ ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+ }
+
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ for (; idx < kMaxKey / 2; idx++) {
+ ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+ ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+ }
+
+ ASSERT_OK(Flush());
+
+ for (; idx < kMaxKey; idx++) {
+ ASSERT_OK(db_->Put(write_opts, Key1(idx), ts, "bar"));
+ ASSERT_OK(db_->Put(write_opts, KeyWithPrefix("foo", idx), ts, "bar"));
+ }
+
+ // Read with MultiGet
+ ReadOptions read_opts;
+ Slice read_ts = ts;
+ read_opts.timestamp = &read_ts;
+
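+ // The first two keys in each batch below were written above; the two offset by
+ // kMaxKey were not, so they must be NotFound regardless of the filter and prefix
+ // settings.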
+ for (idx = 0; idx < kMaxKey; idx++) {
+ size_t batch_size = 4;
+ std::vector<std::string> keys_str(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+
+ keys_str[0] = Key1(idx);
+ keys_str[1] = KeyWithPrefix("foo", idx);
+ keys_str[2] = Key1(kMaxKey + idx);
+ keys_str[3] = KeyWithPrefix("foo", kMaxKey + idx);
+
+ auto keys = ConvertStrToSlice(keys_str);
+
+ db_->MultiGet(read_opts, cfh, batch_size, keys.data(), values.data(),
+ statuses.data());
+
+ for (int i = 0; i < 2; i++) {
+ ASSERT_OK(statuses[i]);
+ }
+ for (int i = 2; i < 4; i++) {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ }
+
+ for (int i = 0; i < 2; i++) {
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, keys[i], &value));
+ std::unique_ptr<Iterator> it1(db_->NewIterator(read_opts));
+ ASSERT_NE(nullptr, it1);
+ ASSERT_OK(it1->status());
+ it1->Seek(keys[i]);
+ ASSERT_TRUE(it1->Valid());
+ }
+
+ for (int i = 2; i < 4; i++) {
+ std::string value;
+ Status s = db_->Get(read_opts, keys[i], &value);
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ }
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ Timestamp, DBBasicTestWithTimestampFilterPrefixSettings,
+ ::testing::Combine(
+ ::testing::Values(
+ std::shared_ptr<const FilterPolicy>(nullptr),
+ std::shared_ptr<const FilterPolicy>(NewBloomFilterPolicy(10, true)),
+ std::shared_ptr<const FilterPolicy>(NewBloomFilterPolicy(10,
+ false))),
+ ::testing::Bool(), ::testing::Bool(),
+ ::testing::Values(
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(1)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(4)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+ ::testing::Bool(), ::testing::Values(0, 0.1),
+ ::testing::Values(
+ BlockBasedTableOptions::IndexType::kBinarySearch,
+ BlockBasedTableOptions::IndexType::kHashSearch,
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class DataVisibilityTest : public DBBasicTestWithTimestampBase {
+ public:
+ DataVisibilityTest() : DBBasicTestWithTimestampBase("data_visibility_test") {
+ // Initialize test data
+ for (int i = 0; i < kTestDataSize; i++) {
+ test_data_[i].key = "key" + std::to_string(i);
+ test_data_[i].value = "value" + std::to_string(i);
+ test_data_[i].timestamp = Timestamp(i, 0);
+ test_data_[i].ts = i;
+ test_data_[i].seq_num = kMaxSequenceNumber;
+ }
+ }
+
+ protected:
+ struct TestData {
+ std::string key;
+ std::string value;
+ int ts;
+ std::string timestamp;
+ SequenceNumber seq_num;
+ };
+
+ constexpr static int kTestDataSize = 3;
+ TestData test_data_[kTestDataSize];
+
+ void PutTestData(int index, ColumnFamilyHandle* cfh = nullptr) {
+ ASSERT_LT(index, kTestDataSize);
+ WriteOptions write_opts;
+
+ if (cfh == nullptr) {
+ ASSERT_OK(db_->Put(write_opts, test_data_[index].key,
+ test_data_[index].timestamp, test_data_[index].value));
+ const Snapshot* snap = db_->GetSnapshot();
+ test_data_[index].seq_num = snap->GetSequenceNumber();
+ if (index > 0) {
+ ASSERT_GT(test_data_[index].seq_num, test_data_[index - 1].seq_num);
+ }
+ db_->ReleaseSnapshot(snap);
+ } else {
+ ASSERT_OK(db_->Put(write_opts, cfh, test_data_[index].key,
+ test_data_[index].timestamp, test_data_[index].value));
+ }
+ }
+
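+ // A key is expected to be visible only if both its sequence number and its
+ // timestamp are covered by the read's snapshot and read timestamp.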
+ void AssertVisibility(int ts, SequenceNumber seq,
+ std::vector<Status> statuses) {
+ ASSERT_EQ(kTestDataSize, statuses.size());
+ for (int i = 0; i < kTestDataSize; i++) {
+ if (test_data_[i].seq_num <= seq && test_data_[i].ts <= ts) {
+ ASSERT_OK(statuses[i]);
+ } else {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ }
+ }
+ }
+
+ std::vector<Slice> GetKeys() {
+ std::vector<Slice> ret(kTestDataSize);
+ for (int i = 0; i < kTestDataSize; i++) {
+ ret[i] = test_data_[i].key;
+ }
+ return ret;
+ }
+
+ void VerifyDefaultCF(int ts, const Snapshot* snap = nullptr) {
+ ReadOptions read_opts;
+ std::string read_ts = Timestamp(ts, 0);
+ Slice read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ read_opts.snapshot = snap;
+
+ ColumnFamilyHandle* cfh = db_->DefaultColumnFamily();
+ std::vector<ColumnFamilyHandle*> cfs(kTestDataSize, cfh);
+ SequenceNumber seq =
+ snap ? snap->GetSequenceNumber() : kMaxSequenceNumber - 1;
+
+ // There are several MultiGet interfaces whose implementations are not exactly the
+ // same; query the data with all of them.
+ auto keys = GetKeys();
+ std::vector<std::string> values;
+ auto s1 = db_->MultiGet(read_opts, cfs, keys, &values);
+ AssertVisibility(ts, seq, s1);
+
+ auto s2 = db_->MultiGet(read_opts, keys, &values);
+ AssertVisibility(ts, seq, s2);
+
+ std::vector<std::string> timestamps;
+ auto s3 = db_->MultiGet(read_opts, cfs, keys, &values, &timestamps);
+ AssertVisibility(ts, seq, s3);
+
+ auto s4 = db_->MultiGet(read_opts, keys, &values, &timestamps);
+ AssertVisibility(ts, seq, s4);
+
+ std::vector<PinnableSlice> values_ps5(kTestDataSize);
+ std::vector<Status> s5(kTestDataSize);
+ db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps5.data(),
+ s5.data());
+ AssertVisibility(ts, seq, s5);
+
+ std::vector<PinnableSlice> values_ps6(kTestDataSize);
+ std::vector<Status> s6(kTestDataSize);
+ std::vector<std::string> timestamps_array(kTestDataSize);
+ db_->MultiGet(read_opts, cfh, kTestDataSize, keys.data(), values_ps6.data(),
+ timestamps_array.data(), s6.data());
+ AssertVisibility(ts, seq, s6);
+
+ std::vector<PinnableSlice> values_ps7(kTestDataSize);
+ std::vector<Status> s7(kTestDataSize);
+ db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(),
+ values_ps7.data(), s7.data());
+ AssertVisibility(ts, seq, s7);
+
+ std::vector<PinnableSlice> values_ps8(kTestDataSize);
+ std::vector<Status> s8(kTestDataSize);
+ db_->MultiGet(read_opts, kTestDataSize, cfs.data(), keys.data(),
+ values_ps8.data(), timestamps_array.data(), s8.data());
+ AssertVisibility(ts, seq, s8);
+ }
+
+ void VerifyDefaultCF(const Snapshot* snap = nullptr) {
+ for (int i = 0; i <= kTestDataSize; i++) {
+ VerifyDefaultCF(i, snap);
+ }
+ }
+};
+constexpr int DataVisibilityTest::kTestDataSize;
+
+// Application specifies timestamp but not snapshot.
+// reader writer
+// ts'=90
+// ts=100
+// seq=10
+// seq'=11
+// write finishes
+// GetImpl(ts,seq)
+// It is OK to return <k, t1, s1> if ts>=t1 AND seq>=s1. If ts>=t1 but seq<s1,
+// the key should not be returned.
+TEST_F(DataVisibilityTest, PointLookupWithoutSnapshot1) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::GetImpl:3",
+ "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut"},
+ {"DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut",
+ "DBImpl::GetImpl:4"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
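+ // The sync point dependencies pause GetImpl so the Put completes only after the
+ // read has already chosen its sequence number; the key must therefore stay
+ // invisible to this read even though its timestamp qualifies.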
+ port::Thread writer_thread([this]() {
+ std::string write_ts = Timestamp(1, 0);
+ WriteOptions write_opts;
+ TEST_SYNC_POINT(
+ "DataVisibilityTest::PointLookupWithoutSnapshot1:BeforePut");
+ Status s = db_->Put(write_opts, "foo", write_ts, "value");
+ ASSERT_OK(s);
+ TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot1:AfterPut");
+ });
+ ReadOptions read_opts;
+ std::string read_ts_str = Timestamp(3, 0);
+ Slice read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ Status s = db_->Get(read_opts, "foo", &value);
+
+ writer_thread.join();
+ ASSERT_TRUE(s.IsNotFound());
+ Close();
+}
+
+// Application specifies timestamp but not snapshot.
+// reader writer
+// ts'=90
+// ts=100
+// seq=10
+// seq'=11
+// write finishes
+// Flush
+// GetImpl(ts,seq)
+// It is OK to return <k, t1, s1> if ts>=t1 AND seq>=s1. If ts>=t1 but seq<s1,
+// the key should not be returned.
+TEST_F(DataVisibilityTest, PointLookupWithoutSnapshot2) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::GetImpl:3",
+ "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut"},
+ {"DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut",
+ "DBImpl::GetImpl:4"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ std::string write_ts = Timestamp(1, 0);
+ WriteOptions write_opts;
+ TEST_SYNC_POINT(
+ "DataVisibilityTest::PointLookupWithoutSnapshot2:BeforePut");
+ Status s = db_->Put(write_opts, "foo", write_ts, "value");
+ ASSERT_OK(s);
+ ASSERT_OK(Flush());
+
+ write_ts = Timestamp(2, 0);
+ s = db_->Put(write_opts, "bar", write_ts, "value");
+ ASSERT_OK(s);
+ TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithoutSnapshot2:AfterPut");
+ });
+ ReadOptions read_opts;
+ std::string read_ts_str = Timestamp(3, 0);
+ Slice read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ Status s = db_->Get(read_opts, "foo", &value);
+ writer_thread.join();
+ ASSERT_TRUE(s.IsNotFound());
+ Close();
+}
+
+// Application specifies both timestamp and snapshot.
+// reader writer
+// seq=10
+// ts'=90
+// ts=100
+// seq'=11
+// write finishes
+// GetImpl(ts,seq)
+ // Since the application specifies both timestamp and snapshot, it expects to see
+ // data that is visible at BOTH the timestamp and the sequence number. Therefore,
+// <k, t1, s1> can be returned only if t1<=ts AND s1<=seq.
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot1) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap",
+ "DataVisibilityTest::PointLookupWithSnapshot1:BeforePut"},
+ {"DataVisibilityTest::PointLookupWithSnapshot1:AfterPut",
+ "DBImpl::GetImpl:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ std::string write_ts = Timestamp(1, 0);
+ WriteOptions write_opts;
+ TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:BeforePut");
+ Status s = db_->Put(write_opts, "foo", write_ts, "value");
+ TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot1:AfterPut");
+ ASSERT_OK(s);
+ });
+ ReadOptions read_opts;
+ const Snapshot* snap = db_->GetSnapshot();
+ TEST_SYNC_POINT(
+ "DataVisibilityTest::PointLookupWithSnapshot1:AfterTakingSnap");
+ read_opts.snapshot = snap;
+ std::string read_ts_str = Timestamp(3, 0);
+ Slice read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ Status s = db_->Get(read_opts, "foo", &value);
+ writer_thread.join();
+
+ ASSERT_TRUE(s.IsNotFound());
+
+ db_->ReleaseSnapshot(snap);
+ Close();
+}
+
+// Application specifies both timestamp and snapshot.
+// reader writer
+// seq=10
+// ts'=90
+// ts=100
+// seq'=11
+// write finishes
+// Flush
+// GetImpl(ts,seq)
+ // Since the application specifies both timestamp and snapshot, it expects to see
+ // data that is visible at BOTH the timestamp and the sequence number. Therefore,
+// <k, t1, s1> can be returned only if t1<=ts AND s1<=seq.
+TEST_F(DataVisibilityTest, PointLookupWithSnapshot2) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap",
+ "DataVisibilityTest::PointLookupWithSnapshot2:BeforePut"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ std::string write_ts = Timestamp(1, 0);
+ WriteOptions write_opts;
+ TEST_SYNC_POINT("DataVisibilityTest::PointLookupWithSnapshot2:BeforePut");
+ Status s = db_->Put(write_opts, "foo", write_ts, "value1");
+ ASSERT_OK(s);
+ ASSERT_OK(Flush());
+
+ write_ts = Timestamp(2, 0);
+ s = db_->Put(write_opts, "bar", write_ts, "value2");
+ ASSERT_OK(s);
+ });
+ const Snapshot* snap = db_->GetSnapshot();
+ TEST_SYNC_POINT(
+ "DataVisibilityTest::PointLookupWithSnapshot2:AfterTakingSnap");
+ writer_thread.join();
+ std::string read_ts_str = Timestamp(3, 0);
+ Slice read_ts = read_ts_str;
+ ReadOptions read_opts;
+ read_opts.snapshot = snap;
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ Status s = db_->Get(read_opts, "foo", &value);
+ ASSERT_TRUE(s.IsNotFound());
+ db_->ReleaseSnapshot(snap);
+ Close();
+}
+
+// Application specifies timestamp but not snapshot.
+// reader writer
+// ts'=90
+// ts=100
+// seq=10
+// seq'=11
+// write finishes
+// scan(ts,seq)
+// <k, t1, s1> can be seen in scan as long as ts>=t1 AND seq>=s1. If ts>=t1 but
+// seq<s1, then the key should not be returned.
+TEST_F(DataVisibilityTest, RangeScanWithoutSnapshot) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::NewIterator:3",
+ "DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ WriteOptions write_opts;
+ TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithoutSnapshot:BeforePut");
+ for (int i = 0; i < 3; ++i) {
+ std::string write_ts = Timestamp(i + 1, 0);
+ Status s = db_->Put(write_opts, "key" + std::to_string(i), write_ts,
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ });
+ std::string read_ts_str = Timestamp(10, 0);
+ Slice read_ts = read_ts_str;
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ read_opts.timestamp = &read_ts;
+ Iterator* it = db_->NewIterator(read_opts);
+ ASSERT_NE(nullptr, it);
+ writer_thread.join();
+ it->SeekToFirst();
+ ASSERT_FALSE(it->Valid());
+ delete it;
+ Close();
+}
+
+// Application specifies both timestamp and snapshot.
+// reader writer
+// seq=10
+// ts'=90
+// ts=100 seq'=11
+// write finishes
+// scan(ts,seq)
+// <k, t1, s1> can be seen by the scan only if t1<=ts AND s1<=seq. If t1<=ts
+// but s1>seq, then the key should not be returned.
+TEST_F(DataVisibilityTest, RangeScanWithSnapshot) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot",
+ "DataVisibilityTest::RangeScanWithSnapshot:BeforePut"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ WriteOptions write_opts;
+ TEST_SYNC_POINT("DataVisibilityTest::RangeScanWithSnapshot:BeforePut");
+ for (int i = 0; i < 3; ++i) {
+ std::string write_ts = Timestamp(i + 1, 0);
+ Status s = db_->Put(write_opts, "key" + std::to_string(i), write_ts,
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ });
+ const Snapshot* snap = db_->GetSnapshot();
+ TEST_SYNC_POINT(
+ "DataVisibilityTest::RangeScanWithSnapshot:AfterTakingSnapshot");
+
+ writer_thread.join();
+
+ std::string read_ts_str = Timestamp(10, 0);
+ Slice read_ts = read_ts_str;
+ ReadOptions read_opts;
+ read_opts.snapshot = snap;
+ read_opts.total_order_seek = true;
+ read_opts.timestamp = &read_ts;
+ Iterator* it = db_->NewIterator(read_opts);
+ ASSERT_NE(nullptr, it);
+ it->Seek("key0");
+ ASSERT_FALSE(it->Valid());
+
+ delete it;
+ db_->ReleaseSnapshot(snap);
+ Close();
+}
+
+// Application specifies both timestamp and snapshot.
+ // Query each combination and make sure MultiGet only returns a key <k, t1, s1> when
+ // ts >= t1 AND seq >= s1.
+TEST_F(DataVisibilityTest, MultiGetWithTimestamp) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ const Snapshot* snap0 = db_->GetSnapshot();
+ PutTestData(0);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+
+ const Snapshot* snap1 = db_->GetSnapshot();
+ PutTestData(1);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+ VerifyDefaultCF(snap1);
+
+ ASSERT_OK(Flush());
+
+ const Snapshot* snap2 = db_->GetSnapshot();
+ PutTestData(2);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+ VerifyDefaultCF(snap1);
+ VerifyDefaultCF(snap2);
+
+ db_->ReleaseSnapshot(snap0);
+ db_->ReleaseSnapshot(snap1);
+ db_->ReleaseSnapshot(snap2);
+
+ Close();
+}
+
+// Application specifies timestamp but not snapshot.
+// reader writer
+// ts'=0, 1
+// ts=3
+// seq=10
+// seq'=11, 12
+// write finishes
+// MultiGet(ts,seq)
+// MultiGet should only return keys <k, t1, s1> that satisfy t1<=ts AND s1<=seq.
+TEST_F(DataVisibilityTest, MultiGetWithoutSnapshot) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::MultiGet:AfterGetSeqNum1",
+ "DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut"},
+ {"DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut",
+ "DBImpl::MultiGet:AfterGetSeqNum2"},
+ });
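+ // The first dependency delays the puts until MultiGet has latched its
+ // sequence number; the second keeps MultiGet from proceeding until the puts
+ // finish. The puts therefore carry sequence numbers larger than MultiGet's,
+ // so they remain invisible even though their timestamps satisfy the read
+ // timestamp bound.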
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer_thread([this]() {
+ TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:BeforePut");
+ PutTestData(0);
+ PutTestData(1);
+ TEST_SYNC_POINT("DataVisibilityTest::MultiGetWithoutSnapshot:AfterPut");
+ });
+
+ ReadOptions read_opts;
+ std::string read_ts = Timestamp(kTestDataSize, 0);
+ Slice read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ auto keys = GetKeys();
+ std::vector<std::string> values;
+ auto ss = db_->MultiGet(read_opts, keys, &values);
+
+ writer_thread.join();
+ for (auto s : ss) {
+ ASSERT_TRUE(s.IsNotFound());
+ }
+ VerifyDefaultCF();
+ Close();
+}
+
+TEST_F(DataVisibilityTest, MultiGetCrossCF) {
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ CreateAndReopenWithCF({"second"}, options);
+ ColumnFamilyHandle* second_cf = handles_[1];
+
+ const Snapshot* snap0 = db_->GetSnapshot();
+ PutTestData(0);
+ PutTestData(0, second_cf);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+
+ const Snapshot* snap1 = db_->GetSnapshot();
+ PutTestData(1);
+ PutTestData(1, second_cf);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+ VerifyDefaultCF(snap1);
+
+ ASSERT_OK(Flush());
+
+ const Snapshot* snap2 = db_->GetSnapshot();
+ PutTestData(2);
+ PutTestData(2, second_cf);
+ VerifyDefaultCF();
+ VerifyDefaultCF(snap0);
+ VerifyDefaultCF(snap1);
+ VerifyDefaultCF(snap2);
+
+ ReadOptions read_opts;
+ std::string read_ts = Timestamp(kTestDataSize, 0);
+ Slice read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ read_opts.snapshot = snap1;
+ auto keys = GetKeys();
+ auto keys2 = GetKeys();
+ keys.insert(keys.end(), keys2.begin(), keys2.end());
+ std::vector<ColumnFamilyHandle*> cfs(kTestDataSize,
+ db_->DefaultColumnFamily());
+ std::vector<ColumnFamilyHandle*> cfs2(kTestDataSize, second_cf);
+ cfs.insert(cfs.end(), cfs2.begin(), cfs2.end());
+
+ std::vector<std::string> values;
+ auto ss = db_->MultiGet(read_opts, cfs, keys, &values);
+ for (int i = 0; i < 2 * kTestDataSize; i++) {
+ if (i % 3 == 0) {
+ // only the first key for each column family should be returned
+ ASSERT_OK(ss[i]);
+ } else {
+ ASSERT_TRUE(ss[i].IsNotFound());
+ }
+ }
+
+ db_->ReleaseSnapshot(snap0);
+ db_->ReleaseSnapshot(snap1);
+ db_->ReleaseSnapshot(snap2);
+ Close();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class DBBasicTestWithTimestampCompressionSettings
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<
+ std::tuple<std::shared_ptr<const FilterPolicy>, CompressionType,
+ uint32_t, uint32_t>> {
+ public:
+ DBBasicTestWithTimestampCompressionSettings()
+ : DBBasicTestWithTimestampBase(
+ "db_basic_test_with_timestamp_compression") {}
+};
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGet) {
+ const int kNumKeysPerFile = 1024;
+ const size_t kNumTimestamps = 4;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ size_t ts_sz = Timestamp(0, 0).size();
+ TestComparator test_cmp(ts_sz);
+ options.comparator = &test_cmp;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<0>(GetParam());
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400 // i.e. before LZ4 r124
+ if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+ return;
+ }
+#endif // LZ4_VERSION_NUMBER < 10400
+ if (!ZSTD_Supported() && comp_type == kZSTD) {
+ return;
+ }
+ if (!Zlib_Supported() && comp_type == kZlibCompression) {
+ return;
+ }
+
+ options.compression = comp_type;
+ options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+ if (comp_type == kZSTD) {
+ options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+ }
+ options.compression_opts.parallel_threads = std::get<3>(GetParam());
+ options.target_file_size_base = 1 << 26; // 64MB
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ std::vector<std::string> write_ts_list;
+ std::vector<std::string> read_ts_list;
+
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ write_ts_list.push_back(Timestamp(i * 2, 0));
+ read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+ const Slice write_ts = write_ts_list.back();
+ WriteOptions wopts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+ ASSERT_OK(
+ db_->Put(wopts, handles_[cf], Key1(j), write_ts,
+ "value_" + std::to_string(j) + "_" + std::to_string(i)));
+ }
+ }
+ }
+ const auto& verify_db_func = [&]() {
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ ReadOptions ropts;
+ const Slice read_ts = read_ts_list[i];
+ ropts.timestamp = &read_ts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ ColumnFamilyHandle* cfh = handles_[cf];
+ for (size_t j = 0; j != (kNumKeysPerFile - 1) / kNumTimestamps; ++j) {
+ std::string value;
+ ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value));
+ ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+ value);
+ }
+ }
+ }
+ };
+ verify_db_func();
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutDeleteGet) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ const int kNumKeysPerFile = 1024;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<0>(GetParam());
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400 // i.e. before LZ4 r124
+ if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+ return;
+ }
+#endif // LZ4_VERSION_NUMBER < 10400
+ if (!ZSTD_Supported() && comp_type == kZSTD) {
+ return;
+ }
+ if (!Zlib_Supported() && comp_type == kZlibCompression) {
+ return;
+ }
+
+ options.compression = comp_type;
+ options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+ if (comp_type == kZSTD) {
+ options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+ }
+ options.compression_opts.parallel_threads = std::get<3>(GetParam());
+ options.target_file_size_base = 1 << 26; // 64MB
+
+ DestroyAndReopen(options);
+
+ const size_t kNumL0Files =
+ static_cast<size_t>(Options().level0_file_num_compaction_trigger);
+ {
+ // Half of the keys will go through Deletion and the remaining half through
+ // SingleDeletion. Generate enough L0 files with ts=1 to trigger compaction
+ // to L1.
+ std::string ts = Timestamp(1, 0);
+ WriteOptions wopts;
+ for (size_t i = 0; i < kNumL0Files; ++i) {
+ for (int j = 0; j < kNumKeysPerFile; ++j) {
+ ASSERT_OK(db_->Put(wopts, Key1(j), ts, "value" + std::to_string(i)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ // Generate another L0 at ts=3
+ ts = Timestamp(3, 0);
+ for (int i = 0; i < kNumKeysPerFile; ++i) {
+ std::string key_str = Key1(i);
+ Slice key(key_str);
+ if ((i % 3) == 0) {
+ if (i < kNumKeysPerFile / 2) {
+ ASSERT_OK(db_->Delete(wopts, key, ts));
+ } else {
+ ASSERT_OK(db_->SingleDelete(wopts, key, ts));
+ }
+ } else {
+ ASSERT_OK(db_->Put(wopts, key, ts, "new_value"));
+ }
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ // Populate memtable at ts=5
+ ts = Timestamp(5, 0);
+ for (int i = 0; i != kNumKeysPerFile; ++i) {
+ std::string key_str = Key1(i);
+ Slice key(key_str);
+ if ((i % 3) == 1) {
+ if (i < kNumKeysPerFile / 2) {
+ ASSERT_OK(db_->Delete(wopts, key, ts));
+ } else {
+ ASSERT_OK(db_->SingleDelete(wopts, key, ts));
+ }
+ } else if ((i % 3) == 2) {
+ ASSERT_OK(db_->Put(wopts, key, ts, "new_value_2"));
+ }
+ }
+ }
+ {
+ std::string ts_str = Timestamp(6, 0);
+ Slice ts = ts_str;
+ ReadOptions ropts;
+ ropts.timestamp = &ts;
+ for (uint64_t i = 0; i != static_cast<uint64_t>(kNumKeysPerFile); ++i) {
+ std::string value;
+ std::string key_ts;
+ Status s = db_->Get(ropts, Key1(i), &value, &key_ts);
+ if ((i % 3) == 2) {
+ ASSERT_OK(s);
+ ASSERT_EQ("new_value_2", value);
+ ASSERT_EQ(Timestamp(5, 0), key_ts);
+ } else if ((i % 3) == 1) {
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(Timestamp(5, 0), key_ts);
+ } else {
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(Timestamp(3, 0), key_ts);
+ }
+ }
+ }
+}
+
+#ifndef ROCKSDB_LITE
+// A class which remembers the name of each flushed file.
+class FlushedFileCollector : public EventListener {
+ public:
+ FlushedFileCollector() {}
+ ~FlushedFileCollector() override {}
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ InstrumentedMutexLock lock(&mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::vector<std::string> result;
+ {
+ InstrumentedMutexLock lock(&mutex_);
+ result = flushed_files_;
+ }
+ return result;
+ }
+
+ void ClearFlushedFiles() {
+ InstrumentedMutexLock lock(&mutex_);
+ flushed_files_.clear();
+ }
+
+ private:
+ std::vector<std::string> flushed_files_;
+ InstrumentedMutex mutex_;
+};
+
+TEST_P(DBBasicTestWithTimestampCompressionSettings, PutAndGetWithCompaction) {
+ const int kNumKeysPerFile = 1024;
+ const size_t kNumTimestamps = 2;
+ const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
+ const size_t kSplitPosBase = kNumKeysPerTimestamp / 2;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+
+ FlushedFileCollector* collector = new FlushedFileCollector();
+ options.listeners.emplace_back(collector);
+
+ size_t ts_sz = Timestamp(0, 0).size();
+ TestComparator test_cmp(ts_sz);
+ options.comparator = &test_cmp;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<0>(GetParam());
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+
+ const CompressionType comp_type = std::get<1>(GetParam());
+#if LZ4_VERSION_NUMBER < 10400 // i.e. before LZ4 r124
+ if (comp_type == kLZ4Compression || comp_type == kLZ4HCCompression) {
+ return;
+ }
+#endif // LZ4_VERSION_NUMBER < 10400
+ if (!ZSTD_Supported() && comp_type == kZSTD) {
+ return;
+ }
+ if (!Zlib_Supported() && comp_type == kZlibCompression) {
+ return;
+ }
+
+ options.compression = comp_type;
+ options.compression_opts.max_dict_bytes = std::get<2>(GetParam());
+ if (comp_type == kZSTD) {
+ options.compression_opts.zstd_max_train_bytes = std::get<2>(GetParam());
+ }
+ options.compression_opts.parallel_threads = std::get<3>(GetParam());
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ std::vector<std::string> write_ts_list;
+ std::vector<std::string> read_ts_list;
+
+ const auto& verify_records_func = [&](size_t i, size_t begin, size_t end,
+ ColumnFamilyHandle* cfh) {
+ std::string value;
+ std::string timestamp;
+
+ ReadOptions ropts;
+ const Slice read_ts = read_ts_list[i];
+ ropts.timestamp = &read_ts;
+ std::string expected_timestamp =
+ std::string(write_ts_list[i].data(), write_ts_list[i].size());
+
+ for (size_t j = begin; j <= end; ++j) {
+ ASSERT_OK(db_->Get(ropts, cfh, Key1(j), &value, &timestamp));
+ ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i), value);
+ ASSERT_EQ(expected_timestamp, timestamp);
+ }
+ };
+
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ write_ts_list.push_back(Timestamp(i * 2, 0));
+ read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+ const Slice write_ts = write_ts_list.back();
+ WriteOptions wopts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ size_t memtable_get_start = 0;
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ ASSERT_OK(
+ db_->Put(wopts, handles_[cf], Key1(j), write_ts,
+ "value_" + std::to_string(j) + "_" + std::to_string(i)));
+ if (j == kSplitPosBase + i || j == kNumKeysPerTimestamp - 1) {
+ verify_records_func(i, memtable_get_start, j, handles_[cf]);
+ memtable_get_start = j + 1;
+
+ // flush all keys with the same timestamp to two sst files, split at
+ // incremental positions such that lowerlevel[1].smallest.userkey ==
+ // higherlevel[0].largest.userkey
+ ASSERT_OK(Flush(cf));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact()); // wait for flush (which
+ // is also a compaction)
+
+ // Compact files (2 at each level) to a lower level such that all
+ // keys with the same timestamp are at one level, with newer versions
+ // at higher levels.
+ CompactionOptions compact_opt;
+ compact_opt.compression = kNoCompression;
+ ASSERT_OK(db_->CompactFiles(compact_opt, handles_[cf],
+ collector->GetFlushedFiles(),
+ static_cast<int>(kNumTimestamps - i)));
+ collector->ClearFlushedFiles();
+ }
+ }
+ }
+ }
+ const auto& verify_db_func = [&]() {
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ ReadOptions ropts;
+ const Slice read_ts = read_ts_list[i];
+ ropts.timestamp = &read_ts;
+ std::string expected_timestamp(write_ts_list[i].data(),
+ write_ts_list[i].size());
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ ColumnFamilyHandle* cfh = handles_[cf];
+ verify_records_func(i, 0, kNumKeysPerTimestamp - 1, cfh);
+ }
+ }
+ };
+ verify_db_func();
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, BatchWriteAndMultiGet) {
+ const int kNumKeysPerFile = 8192;
+ const size_t kNumTimestamps = 2;
+ const size_t kNumKeysPerTimestamp = (kNumKeysPerFile - 1) / kNumTimestamps;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.env = env_;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ options.memtable_prefix_bloom_size_ratio = 0.1;
+ options.memtable_whole_key_filtering = true;
+
+ size_t ts_sz = Timestamp(0, 0).size();
+ TestComparator test_cmp(ts_sz);
+ options.comparator = &test_cmp;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(
+ 10 /*bits_per_key*/, false /*use_block_based_builder*/));
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+ CreateAndReopenWithCF({"pikachu"}, options);
+ size_t num_cfs = handles_.size();
+ ASSERT_EQ(2, num_cfs);
+ std::vector<std::string> write_ts_list;
+ std::vector<std::string> read_ts_list;
+
+ const auto& verify_records_func = [&](size_t i, ColumnFamilyHandle* cfh) {
+ std::vector<Slice> keys;
+ std::vector<std::string> key_vals;
+ std::vector<std::string> values;
+ std::vector<std::string> timestamps;
+
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ key_vals.push_back(Key1(j));
+ }
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ keys.push_back(key_vals[j]);
+ }
+
+ ReadOptions ropts;
+ const Slice read_ts = read_ts_list[i];
+ ropts.timestamp = &read_ts;
+ std::string expected_timestamp(write_ts_list[i].data(),
+ write_ts_list[i].size());
+
+ std::vector<ColumnFamilyHandle*> cfhs(keys.size(), cfh);
+ std::vector<Status> statuses =
+ db_->MultiGet(ropts, cfhs, keys, &values, &timestamps);
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ ASSERT_OK(statuses[j]);
+ ASSERT_EQ("value_" + std::to_string(j) + "_" + std::to_string(i),
+ values[j]);
+ ASSERT_EQ(expected_timestamp, timestamps[j]);
+ }
+ };
+
+ const std::string dummy_ts(ts_sz, '\0');
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ write_ts_list.push_back(Timestamp(i * 2, 0));
+ read_ts_list.push_back(Timestamp(1 + i * 2, 0));
+ const Slice& write_ts = write_ts_list.back();
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ WriteOptions wopts;
+ WriteBatch batch(0, 0, 0, ts_sz);
+ for (size_t j = 0; j != kNumKeysPerTimestamp; ++j) {
+ const std::string key = Key1(j);
+ const std::string value =
+ "value_" + std::to_string(j) + "_" + std::to_string(i);
+ ASSERT_OK(batch.Put(handles_[cf], key, value));
+ }
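+ // UpdateTimestamps stamps every key already in the batch with write_ts;
+ // the callback maps a column family ID to its timestamp size (always
+ // ts_sz in this test).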
+ ASSERT_OK(batch.UpdateTimestamps(write_ts,
+ [ts_sz](uint32_t) { return ts_sz; }));
+ ASSERT_OK(db_->Write(wopts, &batch));
+
+ verify_records_func(i, handles_[cf]);
+
+ ASSERT_OK(Flush(cf));
+ }
+ }
+
+ const auto& verify_db_func = [&]() {
+ for (size_t i = 0; i != kNumTimestamps; ++i) {
+ ReadOptions ropts;
+ const Slice read_ts = read_ts_list[i];
+ ropts.timestamp = &read_ts;
+ for (int cf = 0; cf != static_cast<int>(num_cfs); ++cf) {
+ ColumnFamilyHandle* cfh = handles_[cf];
+ verify_records_func(i, cfh);
+ }
+ }
+ };
+ verify_db_func();
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MultiGetNoReturnTs) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ std::string ts = Timestamp(1, 0);
+ ASSERT_OK(db_->Put(write_opts, "foo", ts, "value"));
+ ASSERT_OK(db_->Put(write_opts, "bar", ts, "value"));
+ ASSERT_OK(db_->Put(write_opts, "fooxxxxxxxxxxxxxxxx", ts, "value"));
+ ASSERT_OK(db_->Put(write_opts, "barxxxxxxxxxxxxxxxx", ts, "value"));
+ ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily();
+ ts = Timestamp(2, 0);
+ Slice read_ts = ts;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ {
+ ColumnFamilyHandle* column_families[] = {cfh, cfh};
+ Slice keys[] = {"foo", "bar"};
+ PinnableSlice values[] = {PinnableSlice(), PinnableSlice()};
+ Status statuses[] = {Status::OK(), Status::OK()};
+ dbfull()->MultiGet(read_opts, /*num_keys=*/2, &column_families[0], &keys[0],
+ &values[0], &statuses[0], /*sorted_input=*/false);
+ for (const auto& s : statuses) {
+ ASSERT_OK(s);
+ }
+ }
+ {
+ ColumnFamilyHandle* column_families[] = {cfh, cfh, cfh, cfh};
+ // Make user keys longer than configured timestamp size (16 bytes) to
+ // verify RocksDB does not use the trailing bytes 'x' as timestamp.
+ Slice keys[] = {"fooxxxxxxxxxxxxxxxx", "barxxxxxxxxxxxxxxxx", "foo", "bar"};
+ PinnableSlice values[] = {PinnableSlice(), PinnableSlice(), PinnableSlice(),
+ PinnableSlice()};
+ Status statuses[] = {Status::OK(), Status::OK(), Status::OK(),
+ Status::OK()};
+ dbfull()->MultiGet(read_opts, /*num_keys=*/4, &column_families[0], &keys[0],
+ &values[0], &statuses[0], /*sorted_input=*/false);
+ for (const auto& s : statuses) {
+ ASSERT_OK(s);
+ }
+ }
+ Close();
+}
+
+#endif // !ROCKSDB_LITE
+
+INSTANTIATE_TEST_CASE_P(
+ Timestamp, DBBasicTestWithTimestampCompressionSettings,
+ ::testing::Combine(
+ ::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
+ std::shared_ptr<const FilterPolicy>(
+ NewBloomFilterPolicy(10, false))),
+ ::testing::Values(kNoCompression, kZlibCompression, kLZ4Compression,
+ kLZ4HCCompression, kZSTD),
+ ::testing::Values(0, 1 << 14), ::testing::Values(1, 4)));
+
+class DBBasicTestWithTimestampPrefixSeek
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<
+ std::tuple<std::shared_ptr<const SliceTransform>,
+ std::shared_ptr<const FilterPolicy>, bool,
+ BlockBasedTableOptions::IndexType>> {
+ public:
+ DBBasicTestWithTimestampPrefixSeek()
+ : DBBasicTestWithTimestampBase(
+ "/db_basic_test_with_timestamp_prefix_seek") {}
+};
+
+TEST_P(DBBasicTestWithTimestampPrefixSeek, IterateWithPrefix) {
+ const size_t kNumKeysPerFile = 128;
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.prefix_extractor = std::get<0>(GetParam());
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<1>(GetParam());
+ bbto.index_type = std::get<3>(GetParam());
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ DestroyAndReopen(options);
+
+ const uint64_t kMaxKey = 0xffffffffffffffff;
+ const uint64_t kMinKey = 0xfffffffffffff000;
+ const std::vector<std::string> write_ts_list = {Timestamp(3, 0xffffffff),
+ Timestamp(6, 0xffffffff)};
+ WriteOptions write_opts;
+ {
+ for (size_t i = 0; i != write_ts_list.size(); ++i) {
+ for (uint64_t key = kMaxKey; key >= kMinKey; --key) {
+ Status s = db_->Put(write_opts, Key1(key), write_ts_list[i],
+ "value" + std::to_string(i));
+ ASSERT_OK(s);
+ }
+ }
+ }
+ const std::vector<std::string> read_ts_list = {Timestamp(5, 0xffffffff),
+ Timestamp(9, 0xffffffff)};
+ {
+ ReadOptions read_opts;
+ read_opts.total_order_seek = false;
+ read_opts.prefix_same_as_start = std::get<2>(GetParam());
+ fprintf(stdout, "%s %s %d\n", options.prefix_extractor->Name(),
+ bbto.filter_policy ? bbto.filter_policy->Name() : "null",
+ static_cast<int>(read_opts.prefix_same_as_start));
+ for (size_t i = 0; i != read_ts_list.size(); ++i) {
+ Slice read_ts = read_ts_list[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+
+ // Seek to kMaxKey
+ iter->Seek(Key1(kMaxKey));
+ CheckIterUserEntry(iter.get(), Key1(kMaxKey), kTypeValue,
+ "value" + std::to_string(i), write_ts_list[i]);
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+
+ // Seek to kMinKey
+ iter->Seek(Key1(kMinKey));
+ CheckIterUserEntry(iter.get(), Key1(kMinKey), kTypeValue,
+ "value" + std::to_string(i), write_ts_list[i]);
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ }
+ const std::vector<uint64_t> targets = {kMinKey, kMinKey + 0x10,
+ kMinKey + 0x100, kMaxKey};
+ const SliceTransform* const pe = options.prefix_extractor.get();
+ ASSERT_NE(nullptr, pe);
+ const size_t kPrefixShift =
+ 8 * (Key1(0).size() - pe->Transform(Key1(0)).size());
+ const uint64_t kPrefixMask =
+ ~((static_cast<uint64_t>(1) << kPrefixShift) - 1);
+ const uint64_t kNumKeysWithinPrefix =
+ (static_cast<uint64_t>(1) << kPrefixShift);
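+ // Worked example (illustrative): Key1() encodes a uint64_t into an 8-byte
+ // key, so with e.g. the 7-byte fixed prefix extractor kPrefixShift =
+ // 8 * (8 - 7) = 8, kPrefixMask = 0xffffffffffffff00, and
+ // kNumKeysWithinPrefix = 256, i.e. 256 consecutive keys share one prefix.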
+ for (size_t i = 0; i != read_ts_list.size(); ++i) {
+ Slice read_ts = read_ts_list[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ // Forward and backward iterate.
+ for (size_t j = 0; j != targets.size(); ++j) {
+ std::string start_key = Key1(targets[j]);
+ uint64_t expected_ub =
+ (targets[j] & kPrefixMask) - 1 + kNumKeysWithinPrefix;
+ uint64_t expected_key = targets[j];
+ size_t count = 0;
+ it->Seek(Key1(targets[j]));
+ while (it->Valid()) {
+ std::string saved_prev_key;
+ saved_prev_key.assign(it->key().data(), it->key().size());
+
+ // Out of prefix
+ if (!read_opts.prefix_same_as_start &&
+ pe->Transform(saved_prev_key) != pe->Transform(start_key)) {
+ break;
+ }
+ CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue,
+ "value" + std::to_string(i), write_ts_list[i]);
+ ++count;
+ ++expected_key;
+ it->Next();
+ }
+ ASSERT_EQ(expected_ub - targets[j] + 1, count);
+
+ count = 0;
+ expected_key = targets[j];
+ it->SeekForPrev(start_key);
+ uint64_t expected_lb = (targets[j] & kPrefixMask);
+ while (it->Valid()) {
+ // Out of prefix
+ if (!read_opts.prefix_same_as_start &&
+ pe->Transform(it->key()) != pe->Transform(start_key)) {
+ break;
+ }
+ CheckIterUserEntry(it.get(), Key1(expected_key), kTypeValue,
+ "value" + std::to_string(i), write_ts_list[i]);
+ ++count;
+ --expected_key;
+ it->Prev();
+ }
+ ASSERT_EQ(targets[j] - std::max(expected_lb, kMinKey) + 1, count);
+ }
+ }
+ }
+ Close();
+}
+
+// TODO(yanqin): consider handling non-fixed-length prefix extractors, e.g.
+// NoopTransform.
+INSTANTIATE_TEST_CASE_P(
+ Timestamp, DBBasicTestWithTimestampPrefixSeek,
+ ::testing::Combine(
+ ::testing::Values(
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(1)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(4)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+ ::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
+ std::shared_ptr<const FilterPolicy>(
+ NewBloomFilterPolicy(10 /*bits_per_key*/, false)),
+ std::shared_ptr<const FilterPolicy>(
+ NewBloomFilterPolicy(20 /*bits_per_key*/,
+ false))),
+ ::testing::Bool(),
+ ::testing::Values(
+ BlockBasedTableOptions::IndexType::kBinarySearch,
+ BlockBasedTableOptions::IndexType::kHashSearch,
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+
+class DBBasicTestWithTsIterTombstones
+ : public DBBasicTestWithTimestampBase,
+ public testing::WithParamInterface<
+ std::tuple<std::shared_ptr<const SliceTransform>,
+ std::shared_ptr<const FilterPolicy>, int,
+ BlockBasedTableOptions::IndexType>> {
+ public:
+ DBBasicTestWithTsIterTombstones()
+ : DBBasicTestWithTimestampBase("/db_basic_ts_iter_tombstones") {}
+};
+
+TEST_P(DBBasicTestWithTsIterTombstones, IterWithDelete) {
+ constexpr size_t kNumKeysPerFile = 128;
+ Options options = CurrentOptions();
+ options.env = env_;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.prefix_extractor = std::get<0>(GetParam());
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy = std::get<1>(GetParam());
+ bbto.index_type = std::get<3>(GetParam());
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.num_levels = std::get<2>(GetParam());
+ DestroyAndReopen(options);
+ std::vector<std::string> write_ts_strs = {Timestamp(2, 0), Timestamp(4, 0)};
+ constexpr uint64_t kMaxKey = 0xffffffffffffffff;
+ constexpr uint64_t kMinKey = 0xfffffffffffff000;
+ // Insert kMinKey...kMaxKey
+ uint64_t key = kMinKey;
+ WriteOptions write_opts;
+ Slice ts = write_ts_strs[0];
+ do {
+ Status s = db_->Put(write_opts, Key1(key), write_ts_strs[0],
+ "value" + std::to_string(key));
+ ASSERT_OK(s);
+ if (kMaxKey == key) {
+ break;
+ }
+ ++key;
+ } while (true);
+
+ for (key = kMaxKey; key >= kMinKey; --key) {
+ Status s;
+ if (0 != (key % 2)) {
+ s = db_->Put(write_opts, Key1(key), write_ts_strs[1],
+ "value1" + std::to_string(key));
+ } else {
+ s = db_->Delete(write_opts, Key1(key), write_ts_strs[1]);
+ }
+ ASSERT_OK(s);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ {
+ std::string read_ts = Timestamp(4, 0);
+ ts = read_ts;
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ size_t count = 0;
+ key = kMinKey + 1;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next(), ++count, key += 2) {
+ ASSERT_EQ(Key1(key), iter->key());
+ ASSERT_EQ("value1" + std::to_string(key), iter->value());
+ }
+ ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count);
+
+ for (iter->SeekToLast(), count = 0, key = kMaxKey; iter->Valid();
+ key -= 2, ++count, iter->Prev()) {
+ ASSERT_EQ(Key1(key), iter->key());
+ ASSERT_EQ("value1" + std::to_string(key), iter->value());
+ }
+ ASSERT_EQ((kMaxKey - kMinKey + 1) / 2, count);
+ }
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ Timestamp, DBBasicTestWithTsIterTombstones,
+ ::testing::Combine(
+ ::testing::Values(
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(7)),
+ std::shared_ptr<const SliceTransform>(NewFixedPrefixTransform(8))),
+ ::testing::Values(std::shared_ptr<const FilterPolicy>(nullptr),
+ std::shared_ptr<const FilterPolicy>(
+ NewBloomFilterPolicy(10, false)),
+ std::shared_ptr<const FilterPolicy>(
+ NewBloomFilterPolicy(20, false))),
+ ::testing::Values(2, 6),
+ ::testing::Values(
+ BlockBasedTableOptions::IndexType::kBinarySearch,
+ BlockBasedTableOptions::IndexType::kHashSearch,
+ BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch,
+ BlockBasedTableOptions::IndexType::kBinarySearchWithFirstKey)));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+class UpdateFullHistoryTsLowTest : public DBBasicTestWithTimestampBase {
+ public:
+ UpdateFullHistoryTsLowTest()
+ : DBBasicTestWithTimestampBase("/update_full_history_ts_low_test") {}
+};
+
+TEST_F(UpdateFullHistoryTsLowTest, ConcurrentUpdate) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ std::string lower_ts_low = Timestamp(10, 0);
+ std::string higher_ts_low = Timestamp(25, 0);
+ const size_t kTimestampSize = lower_ts_low.size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ // This workaround replaces `lower_ts_low`, originally passed in by the
+ // caller, with `higher_ts_low` after the caller's writer is queued, so
+ // that the caller always gets a TryAgain error.
+ // It mimics two threads updating full_history_ts_low concurrently, one
+ // writing a higher ts_low and the other writing a lower ts_low.
+ VersionEdit* version_edit;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit",
+ [&](void* arg) { version_edit = reinterpret_cast<VersionEdit*>(arg); });
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:BeforeWriterWaiting",
+ [&](void* /*arg*/) { version_edit->SetFullHistoryTsLow(higher_ts_low); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_TRUE(
+ db_->IncreaseFullHistoryTsLow(db_->DefaultColumnFamily(), lower_ts_low)
+ .IsTryAgain());
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp,
+ GCPreserveRangeTombstoneWhenNoOrSmallFullHistoryLow) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+
+ std::string ts_str = Timestamp(1, 0);
+ WriteOptions wopts;
+ ASSERT_OK(db_->Put(wopts, "k1", ts_str, "v1"));
+ ASSERT_OK(db_->Put(wopts, "k2", ts_str, "v2"));
+ ASSERT_OK(db_->Put(wopts, "k3", ts_str, "v3"));
+ ts_str = Timestamp(2, 0);
+ ASSERT_OK(
+ db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k1", "k3", ts_str));
+
+ ts_str = Timestamp(3, 0);
+ Slice ts = ts_str;
+ ReadOptions ropts;
+ ropts.timestamp = &ts;
+ CompactRangeOptions cro;
+ cro.full_history_ts_low = nullptr;
+ std::string value, key_ts;
+ Status s;
+ auto verify = [&] {
+ s = db_->Get(ropts, "k1", &value);
+ ASSERT_TRUE(s.IsNotFound());
+
+ s = db_->Get(ropts, "k2", &value, &key_ts);
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(key_ts, Timestamp(2, 0));
+
+ ASSERT_OK(db_->Get(ropts, "k3", &value, &key_ts));
+ ASSERT_EQ(value, "v3");
+ ASSERT_EQ(Timestamp(1, 0), key_ts);
+
+ size_t batch_size = 3;
+ std::vector<std::string> key_strs = {"k1", "k2", "k3"};
+ std::vector<Slice> keys{key_strs.begin(), key_strs.end()};
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+ values.data(), statuses.data(), true /* sorted_input */);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], "v3");
+ };
+ verify();
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ verify();
+ std::string lb = Timestamp(0, 0);
+ Slice lb_slice = lb;
+ cro.full_history_ts_low = &lb_slice;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ verify();
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp,
+ GCRangeTombstonesAndCoveredKeysRespectingTslow) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.create_if_missing = true;
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.cache_index_and_filter_blocks = true;
+ bbto.whole_key_filtering = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ WriteOptions wopts;
+ ASSERT_OK(db_->Put(wopts, "k1", Timestamp(1, 0), "v1"));
+ ASSERT_OK(db_->Delete(wopts, "k2", Timestamp(2, 0)));
+ ASSERT_OK(db_->DeleteRange(wopts, db_->DefaultColumnFamily(), "k1", "k3",
+ Timestamp(3, 0)));
+ ASSERT_OK(db_->Put(wopts, "k3", Timestamp(4, 0), "v3"));
+
+ ReadOptions ropts;
+ std::string read_ts = Timestamp(5, 0);
+ Slice read_ts_slice = read_ts;
+ ropts.timestamp = &read_ts_slice;
+ size_t batch_size = 3;
+ std::vector<std::string> key_strs = {"k1", "k2", "k3"};
+ std::vector<Slice> keys = {key_strs.begin(), key_strs.end()};
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+ values.data(), timestamps.data(), statuses.data(),
+ true /* sorted_input */);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_EQ(timestamps[0], Timestamp(3, 0));
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ // DeleteRange has a higher timestamp than Delete for "k2"
+ ASSERT_EQ(timestamps[1], Timestamp(3, 0));
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], "v3");
+ ASSERT_EQ(timestamps[2], Timestamp(4, 0));
+
+ CompactRangeOptions cro;
+ // The range tombstone has a timestamp >= full_history_ts_low, so the keys
+ // it covers are not dropped.
+ std::string compaction_ts_str = Timestamp(2, 0);
+ Slice compaction_ts = compaction_ts_str;
+ cro.full_history_ts_low = &compaction_ts;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ropts.timestamp = &compaction_ts;
+ std::string value, ts;
+ ASSERT_OK(db_->Get(ropts, "k1", &value, &ts));
+ ASSERT_EQ(value, "v1");
+ // k1's timestamp is below full_history_ts_low, so it is zeroed out as the
+ // key moves into the bottommost level.
+ ASSERT_EQ(ts, Timestamp(0, 0));
+ ASSERT_TRUE(db_->Get(ropts, "k2", &value, &ts).IsNotFound());
+ ASSERT_EQ(ts, Timestamp(2, 0));
+
+ compaction_ts_str = Timestamp(4, 0);
+ compaction_ts = compaction_ts_str;
+ cro.full_history_ts_low = &compaction_ts;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ropts.timestamp = &read_ts_slice;
+ // k1, k2 and the range tombstone should be dropped
+ // k3 should still exist
+ db_->MultiGet(ropts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+ values.data(), timestamps.data(), statuses.data(),
+ true /* sorted_input */);
+ ASSERT_TRUE(statuses[0].IsNotFound());
+ ASSERT_TRUE(timestamps[0].empty());
+ ASSERT_TRUE(statuses[1].IsNotFound());
+ ASSERT_TRUE(timestamps[1].empty());
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], "v3");
+ ASSERT_EQ(timestamps[2], Timestamp(4, 0));
+
+ Close();
+}
+
+TEST_P(DBBasicTestWithTimestampTableOptions, DeleteRangeBasicReadAndIterate) {
+ const int kNum = 200, kRangeBegin = 50, kRangeEnd = 150, kNumPerFile = 25;
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+ options.compression = kNoCompression;
+ BlockBasedTableOptions bbto;
+ bbto.index_type = GetParam();
+ bbto.block_size = 100;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.env = env_;
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ DestroyAndReopen(options);
+
+ // Write half of the keys before the tombstone and half after the tombstone.
+ // Only covered keys (i.e., within the range and older than the tombstone)
+ // should be deleted.
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kNum / 2) {
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key1(kRangeBegin), Key1(kRangeEnd),
+ Timestamp(i, 0)));
+ }
+ ASSERT_OK(db_->Put(WriteOptions(), Key1(i), Timestamp(i, 0),
+ "val" + std::to_string(i)));
+ if (i == kNum - kNumPerFile) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ std::string read_ts = Timestamp(kNum, 0);
+ Slice read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ ASSERT_OK(iter->status());
+
+ int expected = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(Key1(expected), iter->key());
+ if (expected == kRangeBegin - 1) {
+ expected = kNum / 2;
+ } else {
+ ++expected;
+ }
+ }
+ ASSERT_EQ(kNum, expected);
+
+ expected = kNum / 2;
+ for (iter->Seek(Key1(kNum / 2)); iter->Valid(); iter->Next()) {
+ ASSERT_EQ(Key1(expected), iter->key());
+ ++expected;
+ }
+ ASSERT_EQ(kNum, expected);
+
+ expected = kRangeBegin - 1;
+ for (iter->SeekForPrev(Key1(kNum / 2 - 1)); iter->Valid(); iter->Prev()) {
+ ASSERT_EQ(Key1(expected), iter->key());
+ --expected;
+ }
+ ASSERT_EQ(-1, expected);
+
+ read_ts = Timestamp(0, 0);
+ read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ iter.reset(db_->NewIterator(read_opts));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->key(), Key1(0));
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+
+ read_ts = Timestamp(kNum, 0);
+ read_ts_slice = read_ts;
+ read_opts.timestamp = &read_ts_slice;
+ std::string value, timestamp;
+ Status s;
+ for (int i = 0; i < kNum; ++i) {
+ s = db_->Get(read_opts, Key1(i), &value, &timestamp);
+ if (i >= kRangeBegin && i < kNum / 2) {
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(timestamp, Timestamp(kNum / 2, 0));
+ } else {
+ ASSERT_OK(s);
+ ASSERT_EQ(value, "val" + std::to_string(i));
+ ASSERT_EQ(timestamp, Timestamp(i, 0));
+ }
+ }
+
+ size_t batch_size = kNum;
+ std::vector<std::string> key_strs(batch_size);
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ for (int i = 0; i < kNum; ++i) {
+ key_strs[i] = Key1(i);
+ keys[i] = key_strs[i];
+ }
+ db_->MultiGet(read_opts, db_->DefaultColumnFamily(), batch_size, keys.data(),
+ values.data(), timestamps.data(), statuses.data(),
+ true /* sorted_input */);
+ for (int i = 0; i < kNum; ++i) {
+ if (i >= kRangeBegin && i < kNum / 2) {
+ ASSERT_TRUE(statuses[i].IsNotFound());
+ ASSERT_EQ(timestamps[i], Timestamp(kNum / 2, 0));
+ } else {
+ ASSERT_OK(statuses[i]);
+ ASSERT_EQ(values[i], "val" + std::to_string(i));
+ ASSERT_EQ(timestamps[i], Timestamp(i, 0));
+ }
+ }
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, DeleteRangeGetIteratorWithSnapshot) {
+ // 4 keys 0, 1, 2, 3 at timestamps 0, 1, 2, 3 respectively.
+ // A range tombstone over [1, 3) is written at timestamp 1, with a sequence
+ // number between those of key 1 and key 2.
+ Options options = CurrentOptions();
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ DestroyAndReopen(options);
+ WriteOptions write_opts;
+ std::string put_ts = Timestamp(0, 0);
+ const int kNum = 4, kNumPerFile = 1, kRangeBegin = 1, kRangeEnd = 3;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(kNumPerFile));
+ const Snapshot* before_tombstone = nullptr;
+ const Snapshot* after_tombstone = nullptr;
+ for (int i = 0; i < kNum; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), Key1(i), Timestamp(i, 0),
+ "val" + std::to_string(i)));
+ if (i == kRangeBegin) {
+ before_tombstone = db_->GetSnapshot();
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key1(kRangeBegin), Key1(kRangeEnd),
+ Timestamp(kRangeBegin, 0)));
+ }
+ if (i == kNum / 2) {
+ ASSERT_OK(Flush());
+ }
+ }
+ assert(before_tombstone);
+ after_tombstone = db_->GetSnapshot();
+ // snapshot and ts before tombstone
+ std::string read_ts_str = Timestamp(kRangeBegin - 1, 0); // (0, 0)
+ Slice read_ts = read_ts_str;
+ ReadOptions read_opts;
+ read_opts.timestamp = &read_ts;
+ read_opts.snapshot = before_tombstone;
+ std::vector<Status> expected_status = {
+ Status::OK(), Status::NotFound(), Status::NotFound(), Status::NotFound()};
+ std::vector<std::string> expected_values(kNum);
+ expected_values[0] = "val" + std::to_string(0);
+ std::vector<std::string> expected_timestamps(kNum);
+ expected_timestamps[0] = Timestamp(0, 0);
+
+ size_t batch_size = kNum;
+ std::vector<std::string> key_strs(batch_size);
+ std::vector<Slice> keys(batch_size);
+ std::vector<PinnableSlice> values(batch_size);
+ std::vector<Status> statuses(batch_size);
+ std::vector<std::string> timestamps(batch_size);
+ for (int i = 0; i < kNum; ++i) {
+ key_strs[i] = Key1(i);
+ keys[i] = key_strs[i];
+ }
+
+ auto verify = [&] {
+ db_->MultiGet(read_opts, db_->DefaultColumnFamily(), batch_size,
+ keys.data(), values.data(), timestamps.data(),
+ statuses.data(), true /* sorted_input */);
+ std::string value, timestamp;
+ Status s;
+ for (int i = 0; i < kNum; ++i) {
+ s = db_->Get(read_opts, Key1(i), &value, &timestamp);
+ ASSERT_EQ(s, expected_status[i]);
+ ASSERT_EQ(statuses[i], expected_status[i]);
+ if (s.ok()) {
+ ASSERT_EQ(value, expected_values[i]);
+ ASSERT_EQ(values[i], expected_values[i]);
+ }
+ if (!timestamp.empty()) {
+ ASSERT_EQ(timestamp, expected_timestamps[i]);
+ ASSERT_EQ(timestamps[i], expected_timestamps[i]);
+ } else {
+ ASSERT_TRUE(timestamps[i].empty());
+ }
+ }
+ std::unique_ptr<Iterator> iter(db_->NewIterator(read_opts));
+ std::unique_ptr<Iterator> iter_for_seek(db_->NewIterator(read_opts));
+ iter->SeekToFirst();
+ for (int i = 0; i < kNum; ++i) {
+ if (expected_status[i].ok()) {
+ auto verify_iter = [&](Iterator* iter_ptr) {
+ ASSERT_TRUE(iter_ptr->Valid());
+ ASSERT_EQ(iter_ptr->key(), keys[i]);
+ ASSERT_EQ(iter_ptr->value(), expected_values[i]);
+ ASSERT_EQ(iter_ptr->timestamp(), expected_timestamps[i]);
+ };
+ verify_iter(iter.get());
+ iter->Next();
+
+ iter_for_seek->Seek(keys[i]);
+ verify_iter(iter_for_seek.get());
+
+ iter_for_seek->SeekForPrev(keys[i]);
+ verify_iter(iter_for_seek.get());
+ }
+ }
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ };
+
+ verify();
+
+ // snapshot before tombstone and ts after tombstone
+ read_ts_str = Timestamp(kNum, 0); // (4, 0)
+ read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ read_opts.snapshot = before_tombstone;
+ expected_status[1] = Status::OK();
+ expected_timestamps[1] = Timestamp(1, 0);
+ expected_values[1] = "val" + std::to_string(1);
+ verify();
+
+ // snapshot after tombstone and ts before tombstone
+ read_ts_str = Timestamp(kRangeBegin - 1, 0); // (0, 0)
+ read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ read_opts.snapshot = after_tombstone;
+ expected_status[1] = Status::NotFound();
+ expected_timestamps[1].clear();
+ expected_values[1].clear();
+ verify();
+
+ // snapshot and ts after tombstone
+ read_ts_str = Timestamp(kNum, 0); // (4, 0)
+ read_ts = read_ts_str;
+ read_opts.timestamp = &read_ts;
+ read_opts.snapshot = after_tombstone;
+ for (int i = 0; i < kNum; ++i) {
+ if (i == kRangeBegin) {
+ expected_status[i] = Status::NotFound();
+ expected_values[i].clear();
+ } else {
+ expected_status[i] = Status::OK();
+ expected_values[i] = "val" + std::to_string(i);
+ }
+ expected_timestamps[i] = Timestamp(i, 0);
+ }
+ verify();
+
+ db_->ReleaseSnapshot(before_tombstone);
+ db_->ReleaseSnapshot(after_tombstone);
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MergeBasic) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.merge_operator = std::make_shared<StringAppendTESTOperator>('.');
+ DestroyAndReopen(options);
+
+ const std::array<std::string, 3> write_ts_strs = {
+ Timestamp(100, 0), Timestamp(200, 0), Timestamp(300, 0)};
+ constexpr size_t kNumOfUniqKeys = 100;
+ ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+
+ for (size_t i = 0; i < write_ts_strs.size(); ++i) {
+ for (size_t j = 0; j < kNumOfUniqKeys; ++j) {
+ Status s;
+ if (i == 0) {
+ const std::string val = "v" + std::to_string(j) + "_0";
+ s = db_->Put(WriteOptions(), Key1(j), write_ts_strs[i], val);
+ } else {
+ const std::string merge_op = std::to_string(i);
+ s = db_->Merge(WriteOptions(), default_cf, Key1(j), write_ts_strs[i],
+ merge_op);
+ }
+ ASSERT_OK(s);
+ }
+ }
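+ // With the '.' delimiter, key 0 (for example) carries "v0_0" as of ts=100,
+ // "v0_0.1" as of ts=200 and "v0_0.1.2" as of ts=300, which is what the
+ // verification helpers below check at read timestamps 150, 250 and 350.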
+
+ std::array<std::string, 3> read_ts_strs = {
+ Timestamp(150, 0), Timestamp(250, 0), Timestamp(350, 0)};
+
+ const auto verify_db_with_get = [&]() {
+ for (size_t i = 0; i < kNumOfUniqKeys; ++i) {
+ const std::string base_val = "v" + std::to_string(i) + "_0";
+ const std::array<std::string, 3> expected_values = {
+ base_val, base_val + ".1", base_val + ".1.2"};
+ const std::array<std::string, 3>& expected_ts = write_ts_strs;
+ ReadOptions read_opts;
+ for (size_t j = 0; j < read_ts_strs.size(); ++j) {
+ Slice read_ts = read_ts_strs[j];
+ read_opts.timestamp = &read_ts;
+ std::string value;
+ std::string ts;
+ const Status s = db_->Get(read_opts, Key1(i), &value, &ts);
+ ASSERT_OK(s);
+ ASSERT_EQ(expected_values[j], value);
+ ASSERT_EQ(expected_ts[j], ts);
+
+ // Do Seek/SeekForPrev
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ it->Seek(Key1(i));
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ(expected_values[j], it->value());
+ ASSERT_EQ(expected_ts[j], it->timestamp());
+
+ it->SeekForPrev(Key1(i));
+ ASSERT_TRUE(it->Valid());
+ ASSERT_EQ(expected_values[j], it->value());
+ ASSERT_EQ(expected_ts[j], it->timestamp());
+ }
+ }
+ };
+
+ const auto verify_db_with_iterator = [&]() {
+ std::string value_suffix;
+ for (size_t i = 0; i < read_ts_strs.size(); ++i) {
+ ReadOptions read_opts;
+ Slice read_ts = read_ts_strs[i];
+ read_opts.timestamp = &read_ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ size_t key_int_val = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next(), ++key_int_val) {
+ const std::string key = Key1(key_int_val);
+ const std::string value =
+ "v" + std::to_string(key_int_val) + "_0" + value_suffix;
+ ASSERT_EQ(key, it->key());
+ ASSERT_EQ(value, it->value());
+ ASSERT_EQ(write_ts_strs[i], it->timestamp());
+ }
+ ASSERT_EQ(kNumOfUniqKeys, key_int_val);
+
+ key_int_val = kNumOfUniqKeys - 1;
+ for (it->SeekToLast(); it->Valid(); it->Prev(), --key_int_val) {
+ const std::string key = Key1(key_int_val);
+ const std::string value =
+ "v" + std::to_string(key_int_val) + "_0" + value_suffix;
+ ASSERT_EQ(key, it->key());
+ ASSERT_EQ(value, it->value());
+ ASSERT_EQ(write_ts_strs[i], it->timestamp());
+ }
+ ASSERT_EQ(std::numeric_limits<size_t>::max(), key_int_val);
+
+ value_suffix = value_suffix + "." + std::to_string(i + 1);
+ }
+ };
+
+ verify_db_with_get();
+ verify_db_with_iterator();
+
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ verify_db_with_get();
+ verify_db_with_iterator();
+
+ Close();
+}
+
+TEST_F(DBBasicTestWithTimestamp, MergeAfterDeletion) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ const size_t kTimestampSize = Timestamp(0, 0).size();
+ TestComparator test_cmp(kTimestampSize);
+ options.comparator = &test_cmp;
+ options.merge_operator = std::make_shared<StringAppendTESTOperator>('.');
+ DestroyAndReopen(options);
+
+ ColumnFamilyHandle* const column_family = db_->DefaultColumnFamily();
+
+ const size_t num_keys_per_file = 10;
+ const size_t num_merges_per_key = 2;
+ for (size_t i = 0; i < num_keys_per_file; ++i) {
+ std::string ts = Timestamp(i + 10000, 0);
+ Status s = db_->Delete(WriteOptions(), Key1(i), ts);
+ ASSERT_OK(s);
+ for (size_t j = 1; j <= num_merges_per_key; ++j) {
+ ts = Timestamp(i + 10000 + j, 0);
+ s = db_->Merge(WriteOptions(), column_family, Key1(i), ts,
+ std::to_string(j));
+ ASSERT_OK(s);
+ }
+ }
+
+ const auto verify_db = [&]() {
+ ReadOptions read_opts;
+ std::string read_ts_str = Timestamp(20000, 0);
+ Slice ts = read_ts_str;
+ read_opts.timestamp = &ts;
+ std::unique_ptr<Iterator> it(db_->NewIterator(read_opts));
+ size_t count = 0;
+ for (it->SeekToFirst(); it->Valid(); it->Next(), ++count) {
+ std::string key = Key1(count);
+ ASSERT_EQ(key, it->key());
+ std::string value;
+ for (size_t j = 1; j <= num_merges_per_key; ++j) {
+ value.append(std::to_string(j));
+ if (j < num_merges_per_key) {
+ value.push_back('.');
+ }
+ }
+ ASSERT_EQ(value, it->value());
+ std::string ts1 = Timestamp(count + 10000 + num_merges_per_key, 0);
+ ASSERT_EQ(ts1, it->timestamp());
+ }
+ ASSERT_OK(it->status());
+ ASSERT_EQ(num_keys_per_file, count);
+ for (it->SeekToLast(); it->Valid(); it->Prev(), --count) {
+ std::string key = Key1(count - 1);
+ ASSERT_EQ(key, it->key());
+ std::string value;
+ for (size_t j = 1; j <= num_merges_per_key; ++j) {
+ value.append(std::to_string(j));
+ if (j < num_merges_per_key) {
+ value.push_back('.');
+ }
+ }
+ ASSERT_EQ(value, it->value());
+ std::string ts1 = Timestamp(count - 1 + 10000 + num_merges_per_key, 0);
+ ASSERT_EQ(ts1, it->timestamp());
+ }
+ ASSERT_OK(it->status());
+ ASSERT_EQ(0, count);
+ };
+
+ verify_db();
+
+ Close();
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_with_timestamp_compaction_test.cc b/src/rocksdb/db/db_with_timestamp_compaction_test.cc
new file mode 100644
index 000000000..d28f67e05
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_compaction_test.cc
@@ -0,0 +1,334 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/compaction/compaction.h"
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
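+// Key1() reverses the little-endian PutFixed64() encoding into a big-endian
+// byte string so that keys compare in numeric order under the bytewise user
+// comparator.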
+std::string Key1(uint64_t key) {
+ std::string ret;
+ PutFixed64(&ret, key);
+ std::reverse(ret.begin(), ret.end());
+ return ret;
+}
+
+std::string Timestamp(uint64_t ts) {
+ std::string ret;
+ PutFixed64(&ret, ts);
+ return ret;
+}
+} // anonymous namespace
+
+class TimestampCompatibleCompactionTest : public DBTestBase {
+ public:
+ TimestampCompatibleCompactionTest()
+ : DBTestBase("ts_compatible_compaction_test", /*env_do_fsync=*/true) {}
+
+ std::string Get(const std::string& key, uint64_t ts) {
+ ReadOptions read_opts;
+ std::string ts_str = Timestamp(ts);
+ Slice ts_slice = ts_str;
+ read_opts.timestamp = &ts_slice;
+ std::string value;
+ Status s = db_->Get(read_opts, key, &value);
+ if (s.IsNotFound()) {
+ value.assign("NOT_FOUND");
+ } else if (!s.ok()) {
+ value.assign(s.ToString());
+ }
+ return value;
+ }
+};
+
+TEST_F(TimestampCompatibleCompactionTest, UserKeyCrossFileBoundary) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.compaction_style = kCompactionStyleLevel;
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ options.level0_file_num_compaction_trigger = 3;
+ constexpr size_t kNumKeysPerFile = 101;
+ options.memtable_factory.reset(
+ test::NewSpecialSkipListFactory(kNumKeysPerFile));
+ DestroyAndReopen(options);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "LevelCompactionPicker::PickCompaction:Return", [&](void* arg) {
+ const auto* compaction = reinterpret_cast<Compaction*>(arg);
+ ASSERT_NE(nullptr, compaction);
+ ASSERT_EQ(0, compaction->start_level());
+ ASSERT_EQ(1, compaction->num_input_levels());
+ // Check that all 3 L0 ssts are picked for level compaction.
+ ASSERT_EQ(3, compaction->num_input_files(0));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ // Write an L0 file with keys 0, 1, ..., 99 and ts from 100 to 199.
+ uint64_t ts = 100;
+ uint64_t key = 0;
+ WriteOptions write_opts;
+ for (; key < kNumKeysPerFile - 1; ++key, ++ts) {
+ std::string ts_str = Timestamp(ts);
+ ASSERT_OK(
+ db_->Put(write_opts, Key1(key), ts_str, "foo_" + std::to_string(key)));
+ }
+ // Write another L0 file with key 99 at newer timestamps.
+ ASSERT_OK(Flush());
+ uint64_t saved_read_ts1 = ts++;
+ key = 99;
+ for (int i = 0; i < 4; ++i, ++ts) {
+ std::string ts_str = Timestamp(ts);
+ ASSERT_OK(
+ db_->Put(write_opts, Key1(key), ts_str, "bar_" + std::to_string(key)));
+ }
+ ASSERT_OK(Flush());
+ uint64_t saved_read_ts2 = ts++;
+ // Write another L0 with keys 99, 100, 101, ..., 150
+ for (; key <= 150; ++key, ++ts) {
+ std::string ts_str = Timestamp(ts);
+ ASSERT_OK(
+ db_->Put(write_opts, Key1(key), ts_str, "foo1_" + std::to_string(key)));
+ }
+ ASSERT_OK(Flush());
+ // Wait for compaction to finish
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ uint64_t read_ts = ts;
+ ASSERT_EQ("foo_99", Get(Key1(99), saved_read_ts1));
+ ASSERT_EQ("bar_99", Get(Key1(99), saved_read_ts2));
+ ASSERT_EQ("foo1_99", Get(Key1(99), read_ts));
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(TimestampCompatibleCompactionTest, MultipleSubCompactions) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.compaction_style = kCompactionStyleUniversal;
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ options.level0_file_num_compaction_trigger = 3;
+ options.max_subcompactions = 3;
+ options.target_file_size_base = 1024;
+ options.statistics = CreateDBStatistics();
+ DestroyAndReopen(options);
+
+ uint64_t ts = 100;
+ uint64_t key = 0;
+ WriteOptions write_opts;
+
+ // Write keys 0, 1, ..., 499 with ts from 100 to 599.
+ {
+ for (; key <= 499; ++key, ++ts) {
+ std::string ts_str = Timestamp(ts);
+ ASSERT_OK(db_->Put(write_opts, Key1(key), ts_str,
+ "foo_" + std::to_string(key)));
+ }
+ }
+
+ // Write keys 500, ..., 999 with ts from 600 to 1099.
+ {
+ for (; key <= 999; ++key, ++ts) {
+ std::string ts_str = Timestamp(ts);
+ ASSERT_OK(db_->Put(write_opts, Key1(key), ts_str,
+ "foo_" + std::to_string(key)));
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Run a manual L0->L1 compaction and wait for it to finish.
+ {
+ ASSERT_OK(dbfull()->RunManualCompaction(
+ static_cast_with_check<ColumnFamilyHandleImpl>(
+ db_->DefaultColumnFamily())
+ ->cfd(),
+ 0 /* input_level */, 1 /* output_level */, CompactRangeOptions(),
+ nullptr /* begin */, nullptr /* end */, true /* exclusive */,
+ true /* disallow_trivial_move */,
+ std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
+ "" /*trim_ts*/));
+ }
+
+ // Check stats to make sure multiple subcompactions were scheduled, which
+ // can only happen when the subcompaction boundaries are not nullptr.
+ {
+ HistogramData num_sub_compactions;
+ options.statistics->histogramData(NUM_SUBCOMPACTIONS_SCHEDULED,
+ &num_sub_compactions);
+ ASSERT_GT(num_sub_compactions.sum, 1);
+ }
+
+ for (key = 0; key <= 999; ++key) {
+ ASSERT_EQ("foo_" + std::to_string(key), Get(Key1(key), ts));
+ }
+}
+
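+// A partitioner that requests a new output file for every key, so each
+// compaction output file ends up holding a single (user key, timestamp)
+// entry, and trivial moves are disabled.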
+class TestFilePartitioner : public SstPartitioner {
+ public:
+ explicit TestFilePartitioner() {}
+ ~TestFilePartitioner() override {}
+
+ const char* Name() const override { return "TestFilePartitioner"; }
+ PartitionerResult ShouldPartition(
+ const PartitionerRequest& /*request*/) override {
+ return PartitionerResult::kRequired;
+ }
+ bool CanDoTrivialMove(const Slice& /*smallest_user_key*/,
+ const Slice& /*largest_user_key*/) override {
+ return false;
+ }
+};
+
+class TestFilePartitionerFactory : public SstPartitionerFactory {
+ public:
+ explicit TestFilePartitionerFactory() {}
+ std::unique_ptr<SstPartitioner> CreatePartitioner(
+ const SstPartitioner::Context& /*context*/) const override {
+ std::unique_ptr<SstPartitioner> ret =
+ std::make_unique<TestFilePartitioner>();
+ return ret;
+ }
+ const char* Name() const override { return "TestFilePartitionerFactory"; }
+};
+
+#ifndef ROCKSDB_LITE
+TEST_F(TimestampCompatibleCompactionTest, CompactFilesRangeCheckL0) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.sst_partitioner_factory =
+ std::make_shared<TestFilePartitionerFactory>();
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ constexpr int kNumFiles = 10;
+ constexpr int kKeysPerFile = 2;
+ const std::string user_key = "foo";
+ constexpr uint64_t start_ts = 10000;
+
+ uint64_t cur_ts = start_ts;
+ for (int k = 0; k < kNumFiles; ++k) {
+ for (int i = 0; i < kKeysPerFile; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts),
+ "v" + std::to_string(i)));
+ ++cur_ts;
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+
+ std::vector<std::string> input_files{};
+ {
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& f : files) {
+ uint64_t file_num = 0;
+ FileType file_type = FileType::kWalFile;
+ if (!ParseFileName(f, &file_num, &file_type) ||
+ file_type != FileType::kTableFile) {
+ continue;
+ }
+ input_files.emplace_back(f);
+ }
+ // Sorting by file name here also happens to sort the files by generation
+ // order.
+ std::sort(input_files.begin(), input_files.end());
+ assert(kNumFiles == input_files.size());
+ std::vector<std::string> tmp;
+ tmp.emplace_back(input_files[input_files.size() / 2]);
+ input_files.swap(tmp);
+ }
+
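+ // Pass only the middle L0 file to CompactFiles; the expectation is that all
+ // older L0 files get pulled into the compaction as well.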
+ {
+ std::vector<std::string> output_file_names;
+ CompactionJobInfo compaction_job_info;
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), input_files,
+ /*output_level=*/1, /*output_path_id=*/-1,
+ &output_file_names, &compaction_job_info));
+ // We expect that all L0 files older than the provided input file were
+ // included in the compaction.
+ ASSERT_EQ(static_cast<size_t>(kNumFiles / 2 + 1),
+ compaction_job_info.input_files.size());
+ }
+}
+
+TEST_F(TimestampCompatibleCompactionTest, CompactFilesRangeCheckL1) {
+ Options options = CurrentOptions();
+ options.env = env_;
+ options.sst_partitioner_factory =
+ std::make_shared<TestFilePartitionerFactory>();
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+
+ constexpr int kNumFiles = 4;
+ options.level0_file_num_compaction_trigger = kNumFiles;
+
+ DestroyAndReopen(options);
+
+ constexpr int kKeysPerFile = 2;
+ const std::string user_key = "foo";
+ constexpr uint64_t start_ts = 10000;
+
+ uint64_t cur_ts = start_ts;
+ // Generate some initial L0 files; auto compaction then moves them to L1.
+ for (int k = 0; k < kNumFiles; ++k) {
+ for (int i = 0; i < kKeysPerFile; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts),
+ "v" + std::to_string(i)));
+ ++cur_ts;
+ }
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(0, NumTableFilesAtLevel(/*level=*/0, /*cf=*/0));
+ ASSERT_EQ(kNumFiles * kKeysPerFile,
+ NumTableFilesAtLevel(/*level=*/1, /*cf=*/0));
+
+ constexpr int additional_l0s = 2;
+ for (int i = 0; i < additional_l0s; ++i, ++cur_ts) {
+ ASSERT_OK(db_->Put(WriteOptions(), user_key, Timestamp(cur_ts), "v"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+ }
+ ASSERT_EQ(additional_l0s, NumTableFilesAtLevel(/*level=*/0, /*cf=*/0));
+
+ std::vector<std::string> inputs;
+ {
+ std::vector<LiveFileMetaData> fmetas;
+ db_->GetLiveFilesMetaData(&fmetas);
+ bool included_one_l1 = false;
+ for (const auto& meta : fmetas) {
+ if (meta.level == 0) {
+ inputs.emplace_back(meta.relative_filename);
+ } else if (!included_one_l1) {
+ inputs.emplace_back(meta.relative_filename);
+ included_one_l1 = true;
+ }
+ }
+ }
+ ASSERT_EQ(static_cast<size_t>(3), inputs.size());
+ {
+ std::vector<std::string> output_file_names;
+ CompactionJobInfo compaction_job_info;
+
+ ASSERT_OK(db_->CompactFiles(CompactionOptions(), inputs, /*output_level=*/1,
+ /*output_path_id=*/-1, &output_file_names,
+ &compaction_job_info));
+ ASSERT_EQ(kNumFiles * kKeysPerFile + 2, output_file_names.size());
+ ASSERT_EQ(kNumFiles * kKeysPerFile + 2,
+ static_cast<int>(compaction_job_info.input_files.size()));
+ }
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_with_timestamp_test_util.cc b/src/rocksdb/db/db_with_timestamp_test_util.cc
new file mode 100644
index 000000000..f562bcb48
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_test_util.cc
@@ -0,0 +1,96 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_with_timestamp_test_util.h"
+
+namespace ROCKSDB_NAMESPACE {
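+// Encodes k with PutFixed64 (little-endian) and reverses the bytes, so the
+// resulting keys sort in ascending numeric order under a bytewise comparator.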
+std::string DBBasicTestWithTimestampBase::Key1(uint64_t k) {
+ std::string ret;
+ PutFixed64(&ret, k);
+ std::reverse(ret.begin(), ret.end());
+ return ret;
+}
+
+std::string DBBasicTestWithTimestampBase::KeyWithPrefix(std::string prefix,
+ uint64_t k) {
+ std::string ret;
+ PutFixed64(&ret, k);
+ std::reverse(ret.begin(), ret.end());
+ return prefix + ret;
+}
+
+std::vector<Slice> DBBasicTestWithTimestampBase::ConvertStrToSlice(
+ std::vector<std::string>& strings) {
+ std::vector<Slice> ret;
+ for (const auto& s : strings) {
+ ret.emplace_back(s);
+ }
+ return ret;
+}
+
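+// Builds a 16-byte timestamp: the low 64 bits followed by the high 64 bits,
+// each encoded with PutFixed64.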
+std::string DBBasicTestWithTimestampBase::Timestamp(uint64_t low,
+ uint64_t high) {
+ std::string ts;
+ PutFixed64(&ts, low);
+ PutFixed64(&ts, high);
+ return ts;
+}
+
+void DBBasicTestWithTimestampBase::CheckIterUserEntry(
+ const Iterator* it, const Slice& expected_key,
+ ValueType expected_value_type, const Slice& expected_value,
+ const Slice& expected_ts) const {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ ASSERT_EQ(expected_key, it->key());
+ if (kTypeValue == expected_value_type) {
+ ASSERT_EQ(expected_value, it->value());
+ }
+ ASSERT_EQ(expected_ts, it->timestamp());
+}
+
+void DBBasicTestWithTimestampBase::CheckIterEntry(
+ const Iterator* it, const Slice& expected_ukey, SequenceNumber expected_seq,
+ ValueType expected_val_type, const Slice& expected_value,
+ const Slice& expected_ts) const {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ std::string ukey_and_ts;
+ ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
+ ukey_and_ts.append(expected_ts.data(), expected_ts.size());
+ ParsedInternalKey parsed_ikey;
+ ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */));
+ ASSERT_EQ(ukey_and_ts, parsed_ikey.user_key);
+ ASSERT_EQ(expected_val_type, parsed_ikey.type);
+ ASSERT_EQ(expected_seq, parsed_ikey.sequence);
+ if (expected_val_type == kTypeValue) {
+ ASSERT_EQ(expected_value, it->value());
+ }
+ ASSERT_EQ(expected_ts, it->timestamp());
+}
+
+void DBBasicTestWithTimestampBase::CheckIterEntry(
+ const Iterator* it, const Slice& expected_ukey, ValueType expected_val_type,
+ const Slice& expected_value, const Slice& expected_ts) const {
+ ASSERT_TRUE(it->Valid());
+ ASSERT_OK(it->status());
+ std::string ukey_and_ts;
+ ukey_and_ts.assign(expected_ukey.data(), expected_ukey.size());
+ ukey_and_ts.append(expected_ts.data(), expected_ts.size());
+
+ ParsedInternalKey parsed_ikey;
+ ASSERT_OK(ParseInternalKey(it->key(), &parsed_ikey, true /* log_err_key */));
+ ASSERT_EQ(expected_val_type, parsed_ikey.type);
+ ASSERT_EQ(Slice(ukey_and_ts), parsed_ikey.user_key);
+ if (expected_val_type == kTypeValue) {
+ ASSERT_EQ(expected_value, it->value());
+ }
+ ASSERT_EQ(expected_ts, it->timestamp());
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_with_timestamp_test_util.h b/src/rocksdb/db/db_with_timestamp_test_util.h
new file mode 100644
index 000000000..8a0d8e4e3
--- /dev/null
+++ b/src/rocksdb/db/db_with_timestamp_test_util.h
@@ -0,0 +1,126 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+class DBBasicTestWithTimestampBase : public DBTestBase {
+ public:
+ explicit DBBasicTestWithTimestampBase(const std::string& dbname)
+ : DBTestBase(dbname, /*env_do_fsync=*/true) {}
+
+ protected:
+ static std::string Key1(uint64_t k);
+
+ static std::string KeyWithPrefix(std::string prefix, uint64_t k);
+
+ static std::vector<Slice> ConvertStrToSlice(
+ std::vector<std::string>& strings);
+
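+ // Comparator that treats the last 16 bytes of a user key as a (low, high)
+ // timestamp. Keys are ordered ascending by the key prefix; for equal
+ // prefixes, newer (larger) timestamps sort first.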
+ class TestComparator : public Comparator {
+ private:
+ const Comparator* cmp_without_ts_;
+
+ public:
+ explicit TestComparator(size_t ts_sz)
+ : Comparator(ts_sz), cmp_without_ts_(nullptr) {
+ cmp_without_ts_ = BytewiseComparator();
+ }
+
+ const char* Name() const override { return "TestComparator"; }
+
+ void FindShortSuccessor(std::string*) const override {}
+
+ void FindShortestSeparator(std::string*, const Slice&) const override {}
+
+ int Compare(const Slice& a, const Slice& b) const override {
+ int r = CompareWithoutTimestamp(a, b);
+ if (r != 0 || 0 == timestamp_size()) {
+ return r;
+ }
+ return -CompareTimestamp(
+ Slice(a.data() + a.size() - timestamp_size(), timestamp_size()),
+ Slice(b.data() + b.size() - timestamp_size(), timestamp_size()));
+ }
+
+ using Comparator::CompareWithoutTimestamp;
+ int CompareWithoutTimestamp(const Slice& a, bool a_has_ts, const Slice& b,
+ bool b_has_ts) const override {
+ if (a_has_ts) {
+ assert(a.size() >= timestamp_size());
+ }
+ if (b_has_ts) {
+ assert(b.size() >= timestamp_size());
+ }
+ Slice lhs = a_has_ts ? StripTimestampFromUserKey(a, timestamp_size()) : a;
+ Slice rhs = b_has_ts ? StripTimestampFromUserKey(b, timestamp_size()) : b;
+ return cmp_without_ts_->Compare(lhs, rhs);
+ }
+
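+ // Compares two 16-byte timestamps by their high 64 bits first, then by
+ // their low 64 bits.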
+ int CompareTimestamp(const Slice& ts1, const Slice& ts2) const override {
+ if (!ts1.data() && !ts2.data()) {
+ return 0;
+ } else if (ts1.data() && !ts2.data()) {
+ return 1;
+ } else if (!ts1.data() && ts2.data()) {
+ return -1;
+ }
+ assert(ts1.size() == ts2.size());
+ uint64_t low1 = 0;
+ uint64_t low2 = 0;
+ uint64_t high1 = 0;
+ uint64_t high2 = 0;
+ const size_t kSize = ts1.size();
+ std::unique_ptr<char[]> ts1_buf(new char[kSize]);
+ memcpy(ts1_buf.get(), ts1.data(), ts1.size());
+ std::unique_ptr<char[]> ts2_buf(new char[kSize]);
+ memcpy(ts2_buf.get(), ts2.data(), ts2.size());
+ Slice ts1_copy = Slice(ts1_buf.get(), kSize);
+ Slice ts2_copy = Slice(ts2_buf.get(), kSize);
+ auto* ptr1 = const_cast<Slice*>(&ts1_copy);
+ auto* ptr2 = const_cast<Slice*>(&ts2_copy);
+ if (!GetFixed64(ptr1, &low1) || !GetFixed64(ptr1, &high1) ||
+ !GetFixed64(ptr2, &low2) || !GetFixed64(ptr2, &high2)) {
+ assert(false);
+ }
+ if (high1 < high2) {
+ return -1;
+ } else if (high1 > high2) {
+ return 1;
+ }
+ if (low1 < low2) {
+ return -1;
+ } else if (low1 > low2) {
+ return 1;
+ }
+ return 0;
+ }
+ };
+
+ std::string Timestamp(uint64_t low, uint64_t high);
+
+ void CheckIterUserEntry(const Iterator* it, const Slice& expected_key,
+ ValueType expected_value_type,
+ const Slice& expected_value,
+ const Slice& expected_ts) const;
+
+ void CheckIterEntry(const Iterator* it, const Slice& expected_ukey,
+ SequenceNumber expected_seq, ValueType expected_val_type,
+ const Slice& expected_value,
+ const Slice& expected_ts) const;
+
+ void CheckIterEntry(const Iterator* it, const Slice& expected_ukey,
+ ValueType expected_val_type, const Slice& expected_value,
+ const Slice& expected_ts) const;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/db_write_buffer_manager_test.cc b/src/rocksdb/db/db_write_buffer_manager_test.cc
new file mode 100644
index 000000000..4c31a7824
--- /dev/null
+++ b/src/rocksdb/db/db_write_buffer_manager_test.cc
@@ -0,0 +1,862 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/db_test_util.h"
+#include "db/write_thread.h"
+#include "port/stack_trace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBWriteBufferManagerTest : public DBTestBase,
+ public testing::WithParamInterface<bool> {
+ public:
+ DBWriteBufferManagerTest()
+ : DBTestBase("db_write_buffer_manager_test", /*env_do_fsync=*/false) {}
+ bool cost_cache_;
+};
+
+TEST_P(DBWriteBufferManagerTest, SharedBufferAcrossCFs1) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
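+ // WriteBufferManager(buffer_size, cache, allow_stall): a 100000-byte limit
+ // shared by every CF (and DB) using this manager; when cost_cache_ is set,
+ // memtable memory is also charged against the block cache. The final `true`
+ // enables write stalls once the limit is exceeded.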
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(3));
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(0));
+
+ // Write to "Default", "cf2" and "cf3".
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+
+ ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
+ // WriteBufferManager::buffer_size_ has been exceeded after the previous
+ // write completed.
+
+ // This makes sure the write goes through and, if a stall was in effect,
+ // that it ends.
+ ASSERT_OK(Put(0, Key(2), DummyString(1), wo));
+}
+
+// Test that a single DB with multiple writer threads gets blocked when
+// WriteBufferManager exceeds buffer_size_ and a flush is waiting to be
+// finished.
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferAcrossCFs2) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(3));
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(0));
+
+ // Write to "Default", "cf2" and "cf3". No flush will be triggered.
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+
+ ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
+ // WriteBufferManager::buffer_size_ has been exceeded after the previous
+ // write completed.
+
+ std::unordered_set<WriteThread::Writer*> w_set;
+ std::vector<port::Thread> threads;
+ int wait_count_db = 0;
+ int num_writers = 4;
+ InstrumentedMutex mutex;
+ InstrumentedCondVar cv(&mutex);
+ std::atomic<int> thread_num(0);
+
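+ // The dependency below keeps the background flush from starting until the
+ // test hits the SharedWriteBufferAcrossCFs:0 sync point, i.e. until all
+ // writer threads are blocked.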
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+ "DBImpl::BackgroundCallFlush:start"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WBMStallInterface::BlockDB", [&](void*) {
+ InstrumentedMutexLock lock(&mutex);
+ wait_count_db++;
+ cv.SignalAll();
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::WriteStall::Wait", [&](void* arg) {
+ InstrumentedMutexLock lock(&mutex);
+ WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ w_set.insert(w);
+ // Allow the flush to continue if all writer threads are blocked.
+ if (w_set.size() == (unsigned long)num_writers) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool s = true;
+
+ std::function<void(int)> writer = [&](int cf) {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ Status tmp = Put(cf, Slice(key), DummyString(1), wo);
+ InstrumentedMutexLock lock(&mutex);
+ s = s && tmp.ok();
+ };
+
+ // Flow:
+ // The main_writer thread will write but will be blocked (the flush is on
+ // hold and buffer_size_ has been exceeded, so a stall is in effect).
+ // |
+ // |
+ // multiple writer threads will be created to write across multiple columns
+ // and they will be blocked.
+ // |
+ // |
+ // The last writer thread will write, and once it is blocked it will signal
+ // the flush to continue and clear the stall.
+
+ threads.emplace_back(writer, 1);
+ // Wait until the first thread (main_writer) writing to the DB is blocked,
+ // then create the other writers, which will be blocked from getting added to
+ // the queue because a stall is in effect.
+ {
+ InstrumentedMutexLock lock(&mutex);
+ while (wait_count_db != 1) {
+ cv.Wait();
+ }
+ }
+ for (int i = 0; i < num_writers; i++) {
+ threads.emplace_back(writer, i % 4);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ASSERT_TRUE(s);
+
+ // Number of DBs blocked.
+ ASSERT_EQ(wait_count_db, 1);
+ // Number of Writer threads blocked.
+ ASSERT_EQ(w_set.size(), num_writers);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test that multiple DBs get blocked when the WriteBufferManager limit is
+// exceeded and a flush is waiting to finish while the DBs keep writing.
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB) {
+ std::vector<std::string> dbnames;
+ std::vector<DB*> dbs;
+ int num_dbs = 3;
+
+ for (int i = 0; i < num_dbs; i++) {
+ dbs.push_back(nullptr);
+ dbnames.push_back(
+ test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
+ }
+
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+ CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
+ }
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
+ }
+ // Insert to db_.
+ ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
+
+ // WriteBufferManager Limit exceeded.
+ std::vector<port::Thread> threads;
+ int wait_count_db = 0;
+ InstrumentedMutex mutex;
+ InstrumentedCondVar cv(&mutex);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+ "DBImpl::BackgroundCallFlush:start"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WBMStallInterface::BlockDB", [&](void*) {
+ {
+ InstrumentedMutexLock lock(&mutex);
+ wait_count_db++;
+ cv.Signal();
+ // Once the last DB is blocked, signal the flush to continue.
+ if (wait_count_db == num_dbs + 1) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool s = true;
+
+ // Write to DB.
+ std::function<void(DB*)> write_db = [&](DB* db) {
+ Status tmp = db->Put(wo, Key(3), DummyString(1));
+ InstrumentedMutexLock lock(&mutex);
+ s = s && tmp.ok();
+ };
+
+ // Flow:
+ // db_ will write and will be blocked (the flush is on hold, which keeps the
+ // stall in effect).
+ // |
+ // writer threads will be created to write to the other DBs, and they will
+ // be blocked.
+ // |
+ // |
+ // The last writer will write, and once it is blocked it will signal the
+ // flush to continue and clear the stall.
+
+ threads.emplace_back(write_db, db_);
+ // Wait until the first DB is blocked, then create the writers for the other
+ // DBs, which will be blocked from getting added to the queue because a stall
+ // is in effect.
+ {
+ InstrumentedMutexLock lock(&mutex);
+ while (wait_count_db != 1) {
+ cv.Wait();
+ }
+ }
+ for (int i = 0; i < num_dbs; i++) {
+ threads.emplace_back(write_db, dbs[i]);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ASSERT_TRUE(s);
+ ASSERT_EQ(num_dbs + 1, wait_count_db);
+ // Clean up DBs.
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Close());
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ delete dbs[i];
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test that multiple threads writing across multiple DBs and multiple column
+// families get blocked when a stall by the WriteBufferManager is in effect.
+TEST_P(DBWriteBufferManagerTest, SharedWriteBufferLimitAcrossDB1) {
+ std::vector<std::string> dbnames;
+ std::vector<DB*> dbs;
+ int num_dbs = 3;
+
+ for (int i = 0; i < num_dbs; i++) {
+ dbs.push_back(nullptr);
+ dbnames.push_back(
+ test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
+ }
+
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+ CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
+ }
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
+ }
+ // Insert to db_.
+ ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
+
+ // WriteBufferManager::buffer_size_ has been exceeded after the previous
+ // write to db_ completed.
+ std::vector<port::Thread> threads;
+ int wait_count_db = 0;
+ InstrumentedMutex mutex;
+ InstrumentedCondVar cv(&mutex);
+ std::unordered_set<WriteThread::Writer*> w_set;
+ std::vector<port::Thread> writer_threads;
+ std::atomic<int> thread_num(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+ "DBImpl::BackgroundCallFlush:start"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WBMStallInterface::BlockDB", [&](void*) {
+ {
+ InstrumentedMutexLock lock(&mutex);
+ wait_count_db++;
+ thread_num.fetch_add(1);
+ cv.Signal();
+ // Allow the flush to continue if all writer threads are blocked.
+ if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::WriteStall::Wait", [&](void* arg) {
+ WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ {
+ InstrumentedMutexLock lock(&mutex);
+ w_set.insert(w);
+ thread_num.fetch_add(1);
+ // Allow the flush to continue if all writer threads are blocked.
+ if (thread_num.load(std::memory_order_relaxed) == 2 * num_dbs + 1) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool s1 = true, s2 = true;
+ // Write to multiple columns of db_.
+ std::function<void(int)> write_cf = [&](int cf) {
+ Status tmp = Put(cf, Key(3), DummyString(1), wo);
+ InstrumentedMutexLock lock(&mutex);
+ s1 = s1 && tmp.ok();
+ };
+ // Write to multiple DBs.
+ std::function<void(DB*)> write_db = [&](DB* db) {
+ Status tmp = db->Put(wo, Key(3), DummyString(1));
+ InstrumentedMutexLock lock(&mutex);
+ s2 = s2 && tmp.ok();
+ };
+
+ // Flow:
+ // A thread will write to db_ and will be blocked (the flush is on hold and
+ // buffer_size_ has been exceeded, so a stall is in effect).
+ // |
+ // |
+ // multiple writer threads writing to different DBs and to db_ across
+ // multiple column families will be created, and they will be blocked by the
+ // stall.
+ // |
+ // |
+ // The last writer thread will write, and once it is blocked it will signal
+ // the flush to continue and clear the stall.
+ threads.emplace_back(write_db, db_);
+ // Wait until the first thread is blocked, then create the remaining writer
+ // threads.
+ {
+ InstrumentedMutexLock lock(&mutex);
+ while (wait_count_db != 1) {
+ cv.Wait();
+ }
+ }
+
+ for (int i = 0; i < num_dbs; i++) {
+ // Write to multiple columns of db_.
+ writer_threads.emplace_back(write_cf, i % 3);
+ // Write to different dbs.
+ threads.emplace_back(write_db, dbs[i]);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+ for (auto& t : writer_threads) {
+ t.join();
+ }
+
+ ASSERT_TRUE(s1);
+ ASSERT_TRUE(s2);
+
+ // Number of DBs blocked.
+ ASSERT_EQ(num_dbs + 1, wait_count_db);
+ // Number of Writer threads blocked.
+ ASSERT_EQ(w_set.size(), num_dbs);
+ // Clean up DBs.
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Close());
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ delete dbs[i];
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test multiple threads writing across multiple column families of db_ with
+// different values of WriteOptions.no_slowdown.
+TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsSingleDB) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ CreateAndReopenWithCF({"cf1", "cf2", "cf3"}, options);
+
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(3));
+ ASSERT_OK(Put(3, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(1), wo));
+ ASSERT_OK(Flush(0));
+
+ // Write to "Default", "cf2" and "cf3". No flush will be triggered.
+ ASSERT_OK(Put(3, Key(1), DummyString(30000), wo));
+ ASSERT_OK(Put(0, Key(1), DummyString(40000), wo));
+ ASSERT_OK(Put(2, Key(1), DummyString(1), wo));
+ ASSERT_OK(Put(3, Key(2), DummyString(40000), wo));
+
+ // WriteBufferManager::buffer_size_ has been exceeded after the previous
+ // write to db_ completed.
+
+ std::unordered_set<WriteThread::Writer*> w_slowdown_set;
+ std::vector<port::Thread> threads;
+ int wait_count_db = 0;
+ int num_writers = 4;
+ InstrumentedMutex mutex;
+ InstrumentedCondVar cv(&mutex);
+ std::atomic<int> thread_num(0);
+ std::atomic<int> w_no_slowdown(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+ "DBImpl::BackgroundCallFlush:start"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WBMStallInterface::BlockDB", [&](void*) {
+ {
+ InstrumentedMutexLock lock(&mutex);
+ wait_count_db++;
+ cv.SignalAll();
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::WriteStall::Wait", [&](void* arg) {
+ {
+ InstrumentedMutexLock lock(&mutex);
+ WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ w_slowdown_set.insert(w);
+ // Allow the flush to continue if all writer threads are blocked.
+ if (w_slowdown_set.size() + (unsigned long)w_no_slowdown.load(
+ std::memory_order_relaxed) ==
+ (unsigned long)num_writers) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool s1 = true, s2 = true;
+
+ std::function<void(int)> write_slow_down = [&](int cf) {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions write_op;
+ write_op.no_slowdown = false;
+ Status tmp = Put(cf, Slice(key), DummyString(1), write_op);
+ InstrumentedMutexLock lock(&mutex);
+ s1 = s1 && tmp.ok();
+ };
+
+ std::function<void(int)> write_no_slow_down = [&](int cf) {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions write_op;
+ write_op.no_slowdown = true;
+ Status tmp = Put(cf, Slice(key), DummyString(1), write_op);
+ {
+ InstrumentedMutexLock lock(&mutex);
+ s2 = s2 && !tmp.ok();
+ w_no_slowdown.fetch_add(1);
+ // Allow the flush to continue if all writer threads are blocked.
+ if (w_slowdown_set.size() +
+ (unsigned long)w_no_slowdown.load(std::memory_order_relaxed) ==
+ (unsigned long)num_writers) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ };
+
+ // Flow:
+ // The main_writer thread will write but will be blocked (the flush is on
+ // hold and buffer_size_ has been exceeded, so a stall is in effect).
+ // |
+ // |
+ // multiple writer threads will be created to write across multiple columns
+ // with different values of WriteOptions.no_slowdown. Some of them will
+ // be blocked and some of them will return with Incomplete status.
+ // |
+ // |
+ // The last writer thread will write, and once it is blocked or has returned
+ // it will signal the flush to continue and clear the stall.
+ threads.emplace_back(write_slow_down, 1);
+ // Wait until the first thread (main_writer) writing to the DB is blocked,
+ // then create the other writers, which will be blocked from getting added to
+ // the queue because a stall is in effect.
+ {
+ InstrumentedMutexLock lock(&mutex);
+ while (wait_count_db != 1) {
+ cv.Wait();
+ }
+ }
+
+ for (int i = 0; i < num_writers; i += 2) {
+ threads.emplace_back(write_no_slow_down, (i) % 4);
+ threads.emplace_back(write_slow_down, (i + 1) % 4);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ASSERT_TRUE(s1);
+ ASSERT_TRUE(s2);
+ // Number of DBs blocked.
+ ASSERT_EQ(wait_count_db, 1);
+ // Number of Writer threads blocked.
+ ASSERT_EQ(w_slowdown_set.size(), num_writers / 2);
+ // Number of Writer threads with WriteOptions.no_slowdown = true.
+ ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_writers / 2);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+// Test multiple threads writing across multiple column families of db_ and
+// different DBs with different values of WriteOptions.no_slowdown.
+TEST_P(DBWriteBufferManagerTest, MixedSlowDownOptionsMultipleDB) {
+ std::vector<std::string> dbnames;
+ std::vector<DB*> dbs;
+ int num_dbs = 4;
+
+ for (int i = 0; i < num_dbs; i++) {
+ dbs.push_back(nullptr);
+ dbnames.push_back(
+ test::PerThreadDBPath("db_shared_wb_db" + std::to_string(i)));
+ }
+
+ Options options = CurrentOptions();
+ options.arena_block_size = 4096;
+ options.write_buffer_size = 500000; // this is never hit
+ std::shared_ptr<Cache> cache = NewLRUCache(4 * 1024 * 1024, 2);
+ ASSERT_LT(cache->GetUsage(), 256 * 1024);
+ cost_cache_ = GetParam();
+
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, cache, true));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(100000, nullptr, true));
+ }
+ CreateAndReopenWithCF({"cf1", "cf2"}, options);
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ ASSERT_OK(DB::Open(options, dbnames[i], &(dbs[i])));
+ }
+ WriteOptions wo;
+ wo.disableWAL = true;
+
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Put(wo, Key(1), DummyString(20000)));
+ }
+ // Insert to db_.
+ ASSERT_OK(Put(0, Key(1), DummyString(30000), wo));
+
+ // WriteBufferManager::buffer_size_ has been exceeded after the previous
+ // write to db_ completed.
+ std::vector<port::Thread> threads;
+ int wait_count_db = 0;
+ InstrumentedMutex mutex;
+ InstrumentedCondVar cv(&mutex);
+ std::unordered_set<WriteThread::Writer*> w_slowdown_set;
+ std::vector<port::Thread> writer_threads;
+ std::atomic<int> thread_num(0);
+ std::atomic<int> w_no_slowdown(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0",
+ "DBImpl::BackgroundCallFlush:start"}});
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WBMStallInterface::BlockDB", [&](void*) {
+ InstrumentedMutexLock lock(&mutex);
+ wait_count_db++;
+ cv.Signal();
+ // Allow the flush to continue if all writer threads are blocked.
+ if (w_slowdown_set.size() +
+ (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
+ wait_count_db) ==
+ (unsigned long)(2 * num_dbs + 1)) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::WriteStall::Wait", [&](void* arg) {
+ WriteThread::Writer* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ InstrumentedMutexLock lock(&mutex);
+ w_slowdown_set.insert(w);
+ // Allow the flush to continue if all writer threads are blocked.
+ if (w_slowdown_set.size() +
+ (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
+ wait_count_db) ==
+ (unsigned long)(2 * num_dbs + 1)) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ bool s1 = true, s2 = true;
+ std::function<void(DB*)> write_slow_down = [&](DB* db) {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions write_op;
+ write_op.no_slowdown = false;
+ Status tmp = db->Put(write_op, Slice(key), DummyString(1));
+ InstrumentedMutexLock lock(&mutex);
+ s1 = s1 && tmp.ok();
+ };
+
+ std::function<void(DB*)> write_no_slow_down = [&](DB* db) {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions write_op;
+ write_op.no_slowdown = true;
+ Status tmp = db->Put(write_op, Slice(key), DummyString(1));
+ {
+ InstrumentedMutexLock lock(&mutex);
+ s2 = s2 && !tmp.ok();
+ w_no_slowdown.fetch_add(1);
+ if (w_slowdown_set.size() +
+ (unsigned long)(w_no_slowdown.load(std::memory_order_relaxed) +
+ wait_count_db) ==
+ (unsigned long)(2 * num_dbs + 1)) {
+ TEST_SYNC_POINT(
+ "DBWriteBufferManagerTest::SharedWriteBufferAcrossCFs:0");
+ }
+ }
+ };
+
+ // Flow:
+ // The first thread will write but will be blocked (the flush is on hold and
+ // buffer_size_ has been exceeded, so a stall is in effect).
+ // |
+ // |
+ // multiple writer threads will be created to write across multiple columns
+ // of db_ and different DBs with different values of
+ // WriteOptions.no_slowdown. Some of them will be blocked and some of them
+ // will return with Incomplete status.
+ // |
+ // |
+ // The last writer thread will write, and once it is blocked or has returned
+ // it will signal the flush to continue and clear the stall.
+ threads.emplace_back(write_slow_down, db_);
+ // Wait until the first thread writing to the DB is blocked, then create the
+ // remaining writers.
+ {
+ InstrumentedMutexLock lock(&mutex);
+ while (wait_count_db != 1) {
+ cv.Wait();
+ }
+ }
+
+ for (int i = 0; i < num_dbs; i += 2) {
+ // Write to multiple columns of db_.
+ writer_threads.emplace_back(write_slow_down, db_);
+ writer_threads.emplace_back(write_no_slow_down, db_);
+ // Write to different DBs.
+ threads.emplace_back(write_slow_down, dbs[i]);
+ threads.emplace_back(write_no_slow_down, dbs[i + 1]);
+ }
+
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ for (auto& t : writer_threads) {
+ t.join();
+ }
+
+ ASSERT_TRUE(s1);
+ ASSERT_TRUE(s2);
+ // Number of DBs blocked.
+ ASSERT_EQ((num_dbs / 2) + 1, wait_count_db);
+ // Number of writer threads writing to db_ blocked from getting added to the
+ // queue.
+ ASSERT_EQ(w_slowdown_set.size(), num_dbs / 2);
+ // Number of threads with WriteOptions.no_slowdown = true.
+ ASSERT_EQ(w_no_slowdown.load(std::memory_order_relaxed), num_dbs);
+
+ // Clean up DBs.
+ for (int i = 0; i < num_dbs; i++) {
+ ASSERT_OK(dbs[i]->Close());
+ ASSERT_OK(DestroyDB(dbnames[i], options));
+ delete dbs[i];
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+#ifndef ROCKSDB_LITE
+
+// Tests that a `WriteBufferManager` constructed with `allow_stall == false`
+// does not thrash memtable switching when full and a CF receives multiple
+// writes. Instead, we expect a CF's memtable to be switched for flush only
+// when that CF does not have any pending or running flush.
+//
+// This test uses multiple DBs each with a single CF instead of a single DB
+// with multiple CFs. That way we can control which CF is considered for switch
+// by writing to that CF's DB.
+//
+// Not supported in LITE mode because `GetProperty()` is unavailable.
+TEST_P(DBWriteBufferManagerTest, StopSwitchingMemTablesOnceFlushing) {
+ Options options = CurrentOptions();
+ options.arena_block_size = 4 << 10; // 4KB
+ options.write_buffer_size = 1 << 20; // 1MB
+ std::shared_ptr<Cache> cache =
+ NewLRUCache(4 << 20 /* capacity (4MB) */, 2 /* num_shard_bits */);
+ ASSERT_LT(cache->GetUsage(), 256 << 10 /* 256KB */);
+ cost_cache_ = GetParam();
+ if (cost_cache_) {
+ options.write_buffer_manager.reset(new WriteBufferManager(
+ 512 << 10 /* buffer_size (512KB) */, cache, false /* allow_stall */));
+ } else {
+ options.write_buffer_manager.reset(
+ new WriteBufferManager(512 << 10 /* buffer_size (512KB) */,
+ nullptr /* cache */, false /* allow_stall */));
+ }
+
+ Reopen(options);
+ std::string dbname = test::PerThreadDBPath("db_shared_wbm_db");
+ DB* shared_wbm_db = nullptr;
+
+ ASSERT_OK(DestroyDB(dbname, options));
+ ASSERT_OK(DB::Open(options, dbname, &shared_wbm_db));
+
+ // The last write will make WBM need flush, but it won't flush yet.
+ ASSERT_OK(Put(Key(1), DummyString(256 << 10 /* 256KB */), WriteOptions()));
+ ASSERT_FALSE(options.write_buffer_manager->ShouldFlush());
+ ASSERT_OK(Put(Key(1), DummyString(256 << 10 /* 256KB */), WriteOptions()));
+ ASSERT_TRUE(options.write_buffer_manager->ShouldFlush());
+
+ // Flushes will be pending, not running because flush threads are blocked.
+ test::SleepingBackgroundTask sleeping_task_high;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_high, Env::Priority::HIGH);
+
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(
+ shared_wbm_db->Put(WriteOptions(), Key(1), DummyString(1 /* len */)));
+ std::string prop;
+ ASSERT_TRUE(
+ shared_wbm_db->GetProperty("rocksdb.num-immutable-mem-table", &prop));
+ ASSERT_EQ(std::to_string(i > 0 ? 1 : 0), prop);
+ ASSERT_TRUE(
+ shared_wbm_db->GetProperty("rocksdb.mem-table-flush-pending", &prop));
+ ASSERT_EQ(std::to_string(i > 0 ? 1 : 0), prop);
+ }
+
+ // Clean up DBs.
+ sleeping_task_high.WakeUp();
+ sleeping_task_high.WaitUntilDone();
+ ASSERT_OK(shared_wbm_db->Close());
+ ASSERT_OK(DestroyDB(dbname, options));
+ delete shared_wbm_db;
+}
+
+#endif // ROCKSDB_LITE
+
+INSTANTIATE_TEST_CASE_P(DBWriteBufferManagerTest, DBWriteBufferManagerTest,
+ testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/db_write_test.cc b/src/rocksdb/db/db_write_test.cc
new file mode 100644
index 000000000..1011d5c9e
--- /dev/null
+++ b/src/rocksdb/db/db_write_test.cc
@@ -0,0 +1,679 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <atomic>
+#include <fstream>
+#include <memory>
+#include <thread>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/write_batch_internal.h"
+#include "db/write_thread.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Test variations of WriteImpl.
+class DBWriteTest : public DBTestBase, public testing::WithParamInterface<int> {
+ public:
+ DBWriteTest() : DBTestBase("db_write_test", /*env_do_fsync=*/true) {}
+
+ Options GetOptions() { return DBTestBase::GetOptions(GetParam()); }
+
+ void Open() { DBTestBase::Reopen(GetOptions()); }
+};
+
+class DBWriteTestUnparameterized : public DBTestBase {
+ public:
+ explicit DBWriteTestUnparameterized()
+ : DBTestBase("pipelined_write_test", /*env_do_fsync=*/false) {}
+};
+
+// It is invalid to do a sync write while the WAL is disabled.
+TEST_P(DBWriteTest, SyncAndDisableWAL) {
+ WriteOptions write_options;
+ write_options.sync = true;
+ write_options.disableWAL = true;
+ ASSERT_TRUE(dbfull()->Put(write_options, "foo", "bar").IsInvalidArgument());
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("foo", "bar"));
+ ASSERT_TRUE(dbfull()->Write(write_options, &batch).IsInvalidArgument());
+}
+
+TEST_P(DBWriteTest, WriteStallRemoveNoSlowdownWrite) {
+ Options options = GetOptions();
+ options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger =
+ 4;
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+ port::Mutex mutex;
+ port::CondVar cv(&mutex);
+ // Guarded by mutex
+ int writers = 0;
+
+ Reopen(options);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ Status s = dbfull()->Put(wo, key, "bar");
+ ASSERT_TRUE(s.ok() || s.IsIncomplete());
+ };
+ std::function<void(void*)> unblock_main_thread_func = [&](void*) {
+ mutex.Lock();
+ ++writers;
+ cv.SignalAll();
+ mutex.Unlock();
+ };
+
+ // Create 3 L0 files and schedule 4th without waiting
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteTest::WriteStallRemoveNoSlowdownWrite:1",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:2",
+ "DBImplWrite::PipelinedWriteImpl:AfterJoinBatchGroup"},
+ // Make compaction start wait for the write stall to be detected and
+ // implemented by a write group leader
+ {"DBWriteTest::WriteStallRemoveNoSlowdownWrite:3",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Schedule creation of 4th L0 file without waiting. This will seal the
+ // memtable and then wait for a sync point before writing the file. We need
+ // to do it this way because SwitchMemtable() needs to enter the
+ // write_thread
+ FlushOptions fopt;
+ fopt.wait = false;
+ ASSERT_OK(dbfull()->Flush(fopt));
+
+ // Create a mix of slowdown/no_slowdown write threads
+ mutex.Lock();
+ // First leader
+ threads.emplace_back(write_slowdown_func);
+ while (writers != 1) {
+ cv.Wait();
+ }
+
+ // Second leader. Will stall writes
+ // Build a writers list with no slowdown in the middle:
+ // +-------------+
+ // | slowdown +<----+ newest
+ // +--+----------+
+ // |
+ // v
+ // +--+----------+
+ // | no slowdown |
+ // +--+----------+
+ // |
+ // v
+ // +--+----------+
+ // | slowdown +
+ // +-------------+
+ threads.emplace_back(write_slowdown_func);
+ while (writers != 2) {
+ cv.Wait();
+ }
+ threads.emplace_back(write_no_slowdown_func);
+ while (writers != 3) {
+ cv.Wait();
+ }
+ threads.emplace_back(write_slowdown_func);
+ while (writers != 4) {
+ cv.Wait();
+ }
+
+ mutex.Unlock();
+
+ TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:1");
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr));
+ // This would have triggered a write stall. Unblock the write group leader
+ TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:2");
+ // The leader is going to create missing newer links. When the leader
+ // finishes, the next leader is going to delay writes and fail writers with
+ // no_slowdown
+
+ TEST_SYNC_POINT("DBWriteTest::WriteStallRemoveNoSlowdownWrite:3");
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBWriteTest, WriteThreadHangOnWriteStall) {
+ Options options = GetOptions();
+ options.level0_stop_writes_trigger = options.level0_slowdown_writes_trigger =
+ 4;
+ std::vector<port::Thread> threads;
+ std::atomic<int> thread_num(0);
+ port::Mutex mutex;
+ port::CondVar cv(&mutex);
+ // Guarded by mutex
+ int writers = 0;
+
+ Reopen(options);
+
+ std::function<void()> write_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = false;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ };
+ std::function<void()> write_no_slowdown_func = [&]() {
+ int a = thread_num.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ wo.no_slowdown = true;
+ Status s = dbfull()->Put(wo, key, "bar");
+ ASSERT_TRUE(s.ok() || s.IsIncomplete());
+ };
+ std::function<void(void*)> unblock_main_thread_func = [&](void*) {
+ mutex.Lock();
+ ++writers;
+ cv.SignalAll();
+ mutex.Unlock();
+ };
+
+ // Create 3 L0 files and schedule 4th without waiting
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("foo" + std::to_string(thread_num.fetch_add(1)), "bar"));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Start", unblock_main_thread_func);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBWriteTest::WriteThreadHangOnWriteStall:1",
+ "DBImpl::BackgroundCallFlush:start"},
+ {"DBWriteTest::WriteThreadHangOnWriteStall:2",
+ "DBImpl::WriteImpl:BeforeLeaderEnters"},
+ // Make compaction start wait for the write stall to be detected and
+ // implemented by a write group leader
+ {"DBWriteTest::WriteThreadHangOnWriteStall:3",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Schedule creation of 4th L0 file without waiting. This will seal the
+ // memtable and then wait for a sync point before writing the file. We need
+ // to do it this way because SwitchMemtable() needs to enter the
+ // write_thread
+ FlushOptions fopt;
+ fopt.wait = false;
+ ASSERT_OK(dbfull()->Flush(fopt));
+
+ // Create a mix of slowdown/no_slowdown write threads
+ mutex.Lock();
+ // First leader
+ threads.emplace_back(write_slowdown_func);
+ while (writers != 1) {
+ cv.Wait();
+ }
+ // Second leader. Will stall writes
+ threads.emplace_back(write_slowdown_func);
+ threads.emplace_back(write_no_slowdown_func);
+ threads.emplace_back(write_slowdown_func);
+ threads.emplace_back(write_no_slowdown_func);
+ threads.emplace_back(write_slowdown_func);
+ while (writers != 6) {
+ cv.Wait();
+ }
+ mutex.Unlock();
+
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:1");
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(nullptr));
+ // This would have triggered a write stall. Unblock the write group leader
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:2");
+ // The leader is going to create missing newer links. When the leader
+ // finishes, the next leader is going to delay writes and fail writers with
+ // no_slowdown
+
+ TEST_SYNC_POINT("DBWriteTest::WriteThreadHangOnWriteStall:3");
+ for (auto& t : threads) {
+ t.join();
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWritePropagateToWriteThreadFollower) {
+ constexpr int kNumThreads = 5;
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ Reopen(options);
+ std::atomic<int> ready_count{0};
+ std::atomic<int> leader_count{0};
+ std::vector<port::Thread> threads;
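+ // Deactivate the filesystem so that every subsequent WAL write fails with an
+ // injected IO error.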
+ mock_env->SetFilesystemActive(false);
+
+ // Wait until all threads are linked into the write thread, to make sure
+ // they all join the same batch group.
+ SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+ ready_count++;
+ auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ if (w->state == WriteThread::STATE_GROUP_LEADER) {
+ leader_count++;
+ while (ready_count < kNumThreads) {
+ // busy waiting
+ }
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ for (int i = 0; i < kNumThreads; i++) {
+ threads.push_back(port::Thread(
+ [&](int index) {
+ // All threads should fail.
+ auto res = Put("key" + std::to_string(index), "value");
+ if (options.manual_wal_flush) {
+ ASSERT_TRUE(res.ok());
+ // we should see fs error when we do the flush
+
+ // TSAN reports a false alarm for lock-order-inversion but Open and
+ // FlushWAL are not run concurrently. Disabling this until TSAN is
+ // fixed.
+ // res = dbfull()->FlushWAL(false);
+ // ASSERT_FALSE(res.ok());
+ } else {
+ ASSERT_FALSE(res.ok());
+ }
+ },
+ i));
+ }
+ for (int i = 0; i < kNumThreads; i++) {
+ threads[i].join();
+ }
+ ASSERT_EQ(1, leader_count);
+
+ // The failed Put operations can cause a BG error to be set.
+ // Mark it as checked for ASSERT_STATUS_CHECKED builds.
+ dbfull()->Resume().PermitUncheckedError();
+
+ // Close before mock_env destruct.
+ Close();
+}
+
+TEST_F(DBWriteTestUnparameterized, PipelinedWriteRace) {
+ // This test was written to trigger a race in ExitAsBatchGroupLeader in case
+ // enable_pipelined_write_ was true.
+ // Writers for which ShouldWriteToMemtable() evaluates to false are removed
+ // from the write_group via CompleteFollower/ CompleteLeader. Writers in the
+ // middle of the group are fully unlinked, but if that writer is the
+ // last_writer, then we did not update the predecessor's link_older, i.e.,
+ // this writer was still reachable via newest_writer_.
+ //
+ // But the problem was that CompleteFollower already wakes up the thread
+ // owning that writer before the writer has been removed. This resulted in a
+ // race - if the leader thread was fast enough, then everything was fine.
+ // However, if the woken up thread finished the current write operation and
+ // then performed yet another write, then a new writer instance was added
+ // to newest_writer_. It is possible that the new writer is located on the
+ // same address on stack, and if this happened, then we had a problem,
+ // because the old code tried to find the last_writer in the list to unlink
+ // it, which in this case produced a cycle in the list.
+ // Whether two invocations of PipelinedWriteImpl() by the same thread actually
+ // allocate the writer on the same address depends on the OS and/or compiler,
+ // so it is rather hard to create a deterministic test for this.
+
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.enable_pipelined_write = true;
+ std::vector<port::Thread> threads;
+
+ std::atomic<int> write_counter{0};
+ std::atomic<int> active_writers{0};
+ std::atomic<bool> second_write_starting{false};
+ std::atomic<bool> second_write_in_progress{false};
+ std::atomic<WriteThread::Writer*> leader{nullptr};
+ std::atomic<bool> finished_WAL_write{false};
+
+ DestroyAndReopen(options);
+
+ auto write_one_doc = [&]() {
+ int a = write_counter.fetch_add(1);
+ std::string key = "foo" + std::to_string(a);
+ WriteOptions wo;
+ ASSERT_OK(dbfull()->Put(wo, key, "bar"));
+ --active_writers;
+ };
+
+ auto write_two_docs = [&]() {
+ write_one_doc();
+ second_write_starting = true;
+ write_one_doc();
+ };
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+ if (second_write_starting.load()) {
+ second_write_in_progress = true;
+ return;
+ }
+ auto* w = reinterpret_cast<WriteThread::Writer*>(arg);
+ if (w->state == WriteThread::STATE_GROUP_LEADER) {
+ active_writers++;
+ if (leader.load() == nullptr) {
+ leader.store(w);
+ while (active_writers.load() < 2) {
+ // wait for another thread to join the write_group
+ }
+ }
+ } else {
+ // we disable the memtable for all followers so that they are removed
+ // from the write_group before it is enqueued for the memtable write
+ w->disable_memtable = true;
+ active_writers++;
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::ExitAsBatchGroupLeader:Start", [&](void* arg) {
+ auto* wg = reinterpret_cast<WriteThread::WriteGroup*>(arg);
+ if (wg->leader == leader && !finished_WAL_write) {
+ finished_WAL_write = true;
+ while (active_writers.load() < 3) {
+ // wait for the new writer to be enqueued
+ }
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters",
+ [&](void* arg) {
+ auto* wg = reinterpret_cast<WriteThread::WriteGroup*>(arg);
+ if (wg->leader == leader) {
+ while (!second_write_in_progress.load()) {
+ // wait for the old follower thread to start the next write
+ }
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // start leader + one follower
+ threads.emplace_back(write_one_doc);
+ while (leader.load() == nullptr) {
+ // wait for leader
+ }
+
+ // we perform two writes in the follower, so that for the second write
+ // the thread reinserts a Writer with the same address
+ threads.emplace_back(write_two_docs);
+
+ // wait for the leader to enter ExitAsBatchGroupLeader
+ while (!finished_WAL_write.load()) {
+ // wait for write_group to have finished the WAL writes
+ }
+
+ // start another writer thread to be enqueued before the leader can
+ // complete the writers from its write_group
+ threads.emplace_back(write_one_doc);
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(DBWriteTest, ManualWalFlushInEffect) {
+ Options options = GetOptions();
+ Reopen(options);
+ // try the 1st WAL created during open
+ ASSERT_TRUE(Put("key" + std::to_string(0), "value").ok());
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+ ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+ ASSERT_TRUE(dbfull()->WALBufferIsEmpty());
+ // try the 2nd wal created during SwitchWAL
+ ASSERT_OK(dbfull()->TEST_SwitchWAL());
+ ASSERT_TRUE(Put("key" + std::to_string(0), "value").ok());
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+ ASSERT_TRUE(dbfull()->FlushWAL(false).ok());
+ ASSERT_TRUE(dbfull()->WALBufferIsEmpty());
+}
+
+TEST_P(DBWriteTest, UnflushedPutRaceWithTrackedWalSync) {
+ // Repro race condition bug where unflushed WAL data extended the synced size
+ // recorded to MANIFEST despite being unrecoverable.
+ Options options = GetOptions();
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ options.env = fault_env.get();
+ options.manual_wal_flush = true;
+ options.track_and_verify_wals_in_manifest = true;
+ Reopen(options);
+
+ ASSERT_OK(Put("key1", "val1"));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SyncWAL:Begin",
+ [this](void* /* arg */) { ASSERT_OK(Put("key2", "val2")); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+ // Ensure callback ran.
+ ASSERT_EQ("val2", Get("key2"));
+
+ Close();
+
+ // Simulate full loss of unsynced data. This drops "key2" -> "val2" from the
+ // DB WAL.
+ fault_env->DropUnsyncedFileData();
+
+ Reopen(options);
+
+ // Need to close before `fault_env` goes out of scope.
+ Close();
+}
+
+TEST_P(DBWriteTest, InactiveWalFullySyncedBeforeUntracked) {
+ // Repro bug where a WAL is appended and switched after
+ // `FlushWAL(true /* sync */)`'s sync finishes and before it untracks fully
+ // synced inactive logs. Previously such a WAL would be wrongly untracked
+ // so the final append would never be synced.
+ Options options = GetOptions();
+ std::unique_ptr<FaultInjectionTestEnv> fault_env(
+ new FaultInjectionTestEnv(env_));
+ options.env = fault_env.get();
+ Reopen(options);
+
+ ASSERT_OK(Put("key1", "val1"));
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SyncWAL:BeforeMarkLogsSynced:1", [this](void* /* arg */) {
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(dbfull()->TEST_SwitchMemtable());
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_OK(Put("key3", "val3"));
+
+ ASSERT_OK(db_->FlushWAL(true /* sync */));
+
+ Close();
+
+ // Simulate full loss of unsynced data. This should drop nothing since we did
+ // `FlushWAL(true /* sync */)` before `Close()`.
+ fault_env->DropUnsyncedFileData();
+
+ Reopen(options);
+
+ ASSERT_EQ("val1", Get("key1"));
+ ASSERT_EQ("val2", Get("key2"));
+ ASSERT_EQ("val3", Get("key3"));
+
+ // Need to close before `fault_env` goes out of scope.
+ Close();
+}
+
+TEST_P(DBWriteTest, IOErrorOnWALWriteTriggersReadOnlyMode) {
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ Reopen(options);
+ for (int i = 0; i < 2; i++) {
+ // Forcibly fail WAL write for the first Put only. Subsequent Puts should
+ // fail due to read-only mode
+ mock_env->SetFilesystemActive(i != 0);
+ auto res = Put("key" + std::to_string(i), "value");
+ // TSAN reports a false alarm for lock-order-inversion but Open and
+ // FlushWAL are not run concurrently. Disabling this until TSAN is
+ // fixed.
+ /*
+ if (options.manual_wal_flush && i == 0) {
+ // even with manual_wal_flush the 2nd Put should return error because of
+ // the read-only mode
+ ASSERT_TRUE(res.ok());
+ // we should see fs error when we do the flush
+ res = dbfull()->FlushWAL(false);
+ }
+ */
+ if (!options.manual_wal_flush) {
+ ASSERT_NOK(res);
+ } else {
+ ASSERT_OK(res);
+ }
+ }
+ // Close before mock_env destruct.
+ Close();
+}
+
+TEST_P(DBWriteTest, IOErrorOnSwitchMemtable) {
+ Random rnd(301);
+ std::unique_ptr<FaultInjectionTestEnv> mock_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = GetOptions();
+ options.env = mock_env.get();
+ options.writable_file_max_buffer_size = 4 * 1024 * 1024;
+ options.write_buffer_size = 3 * 512 * 1024;
+ options.wal_bytes_per_sync = 256 * 1024;
+ options.manual_wal_flush = true;
+ Reopen(options);
+ mock_env->SetFilesystemActive(false, Status::IOError("Not active"));
+ Status s;
+ for (int i = 0; i < 4 * 512; ++i) {
+ s = Put(Key(i), rnd.RandomString(1024));
+ if (!s.ok()) {
+ break;
+ }
+ }
+ ASSERT_EQ(s.severity(), Status::Severity::kFatalError);
+
+ mock_env->SetFilesystemActive(true);
+ // Close before mock_env destruct.
+ Close();
+}
+
+// Test that db->LockWAL() flushes the WAL after locking.
+TEST_P(DBWriteTest, LockWalInEffect) {
+ Options options = GetOptions();
+ Reopen(options);
+ // try the 1st WAL created during open
+ ASSERT_OK(Put("key" + std::to_string(0), "value"));
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+ ASSERT_OK(dbfull()->LockWAL());
+ ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false));
+ ASSERT_OK(dbfull()->UnlockWAL());
+ // try the 2nd wal created during SwitchWAL
+ ASSERT_OK(dbfull()->TEST_SwitchWAL());
+ ASSERT_OK(Put("key" + std::to_string(0), "value"));
+ ASSERT_TRUE(options.manual_wal_flush != dbfull()->WALBufferIsEmpty());
+ ASSERT_OK(dbfull()->LockWAL());
+ ASSERT_TRUE(dbfull()->WALBufferIsEmpty(false));
+ ASSERT_OK(dbfull()->UnlockWAL());
+}
+
+TEST_P(DBWriteTest, ConcurrentlyDisabledWAL) {
+ Options options = GetOptions();
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ options.statistics->set_stats_level(StatsLevel::kAll);
+ Reopen(options);
+ std::string wal_key_prefix = "WAL_KEY_";
+ std::string no_wal_key_prefix = "K_";
+ // 100 KB value each for NO-WAL operation
+ std::string no_wal_value(1024 * 100, 'X');
+ // 1B value each for WAL operation
+ std::string wal_value = "0";
+ std::thread threads[10];
+ for (int t = 0; t < 10; t++) {
+ threads[t] = std::thread([t, wal_key_prefix, wal_value, no_wal_key_prefix,
+ no_wal_value, this] {
+ for (int i = 0; i < 10; i++) {
+ ROCKSDB_NAMESPACE::WriteOptions write_option_disable;
+ write_option_disable.disableWAL = true;
+ ROCKSDB_NAMESPACE::WriteOptions write_option_default;
+ std::string no_wal_key =
+ no_wal_key_prefix + std::to_string(t) + "_" + std::to_string(i);
+ ASSERT_OK(this->Put(no_wal_key, no_wal_value, write_option_disable));
+ std::string wal_key =
+ wal_key_prefix + std::to_string(i) + "_" + std::to_string(i);
+ ASSERT_OK(this->Put(wal_key, wal_value, write_option_default));
+ ASSERT_OK(dbfull()->SyncWAL());
+ }
+ return;
+ });
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+ uint64_t bytes_num = options.statistics->getTickerCount(
+ ROCKSDB_NAMESPACE::Tickers::WAL_FILE_BYTES);
+  // written WAL size should be less than 100KB (even including HEADER &
+  // FOOTER overhead)
+ ASSERT_LE(bytes_num, 1024 * 100);
+}
+
+INSTANTIATE_TEST_CASE_P(DBWriteTestInstance, DBWriteTest,
+ testing::Values(DBTestBase::kDefault,
+ DBTestBase::kConcurrentWALWrites,
+ DBTestBase::kPipelinedWrite));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/dbformat.cc b/src/rocksdb/db/dbformat.cc
new file mode 100644
index 000000000..b0ac6c339
--- /dev/null
+++ b/src/rocksdb/db/dbformat.cc
@@ -0,0 +1,188 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/dbformat.h"
+
+#include <stdio.h>
+
+#include <cinttypes>
+
+#include "db/lookup_key.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// kValueTypeForSeek defines the ValueType that should be passed when
+// constructing a ParsedInternalKey object for seeking to a particular
+// sequence number (since we sort sequence numbers in decreasing order
+// and the value type is embedded as the low 8 bits in the sequence
+// number in internal keys, we need to use the highest-numbered
+// ValueType, not the lowest).
+const ValueType kValueTypeForSeek = kTypeWideColumnEntity;
+const ValueType kValueTypeForSeekForPrev = kTypeDeletion;
+const std::string kDisableUserTimestamp("");
+
+EntryType GetEntryType(ValueType value_type) {
+ switch (value_type) {
+ case kTypeValue:
+ return kEntryPut;
+ case kTypeDeletion:
+ return kEntryDelete;
+ case kTypeDeletionWithTimestamp:
+ return kEntryDeleteWithTimestamp;
+ case kTypeSingleDeletion:
+ return kEntrySingleDelete;
+ case kTypeMerge:
+ return kEntryMerge;
+ case kTypeRangeDeletion:
+ return kEntryRangeDeletion;
+ case kTypeBlobIndex:
+ return kEntryBlobIndex;
+ case kTypeWideColumnEntity:
+ return kEntryWideColumnEntity;
+ default:
+ return kEntryOther;
+ }
+}
+
+void AppendInternalKey(std::string* result, const ParsedInternalKey& key) {
+ result->append(key.user_key.data(), key.user_key.size());
+ PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+void AppendInternalKeyWithDifferentTimestamp(std::string* result,
+ const ParsedInternalKey& key,
+ const Slice& ts) {
+ assert(key.user_key.size() >= ts.size());
+ result->append(key.user_key.data(), key.user_key.size() - ts.size());
+ result->append(ts.data(), ts.size());
+ PutFixed64(result, PackSequenceAndType(key.sequence, key.type));
+}
+
+void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+ ValueType t) {
+ PutFixed64(result, PackSequenceAndType(s, t));
+}
+
+void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz) {
+ assert(ts_sz > 0);
+ const std::string kTsMin(ts_sz, static_cast<unsigned char>(0));
+ result->append(key.data(), key.size());
+ result->append(kTsMin.data(), ts_sz);
+}
+
+void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz) {
+ assert(ts_sz > 0);
+ const std::string kTsMax(ts_sz, static_cast<unsigned char>(0xff));
+ result->append(key.data(), key.size());
+ result->append(kTsMax.data(), ts_sz);
+}
+
+void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz) {
+ assert(ts_sz > 0);
+ result->append(key.data(), key.size() - ts_sz);
+
+ static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ if (ts_sz < strlen(kTsMax)) {
+ result->append(kTsMax, ts_sz);
+ } else {
+ result->append(std::string(ts_sz, '\xff'));
+ }
+}
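+
+// Illustrative behavior of the three helpers above (a sketch derived from the
+// code, not an additional API): with a std::string r and ts_sz == 2,
+//   AppendKeyWithMinTimestamp(&r, "k", 2) appends "k\x00\x00" to r,
+//   AppendKeyWithMaxTimestamp(&r, "k", 2) appends "k\xff\xff" to r, and
+//   AppendUserKeyWithMaxTimestamp(&r, "k\x01\x02", 2) strips the existing
+//   2-byte timestamp and appends "k\xff\xff" to r.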
+
+std::string ParsedInternalKey::DebugString(bool log_err_key, bool hex) const {
+ std::string result = "'";
+ if (log_err_key) {
+ result += user_key.ToString(hex);
+ } else {
+ result += "<redacted>";
+ }
+
+ char buf[50];
+ snprintf(buf, sizeof(buf), "' seq:%" PRIu64 ", type:%d", sequence,
+ static_cast<int>(type));
+
+ result += buf;
+ return result;
+}
+
+std::string InternalKey::DebugString(bool hex) const {
+ std::string result;
+ ParsedInternalKey parsed;
+ if (ParseInternalKey(rep_, &parsed, false /* log_err_key */).ok()) {
+ result = parsed.DebugString(true /* log_err_key */, hex); // TODO
+ } else {
+ result = "(bad)";
+ result.append(EscapeString(rep_));
+ }
+ return result;
+}
+
+int InternalKeyComparator::Compare(const ParsedInternalKey& a,
+ const ParsedInternalKey& b) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ // decreasing type (though sequence# should be enough to disambiguate)
+ int r = user_comparator_.Compare(a.user_key, b.user_key);
+ if (r == 0) {
+ if (a.sequence > b.sequence) {
+ r = -1;
+ } else if (a.sequence < b.sequence) {
+ r = +1;
+ } else if (a.type > b.type) {
+ r = -1;
+ } else if (a.type < b.type) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s,
+ const Slice* ts) {
+ size_t usize = _user_key.size();
+ size_t ts_sz = (nullptr == ts) ? 0 : ts->size();
+ size_t needed = usize + ts_sz + 13; // A conservative estimate
+ char* dst;
+ if (needed <= sizeof(space_)) {
+ dst = space_;
+ } else {
+ dst = new char[needed];
+ }
+ start_ = dst;
+  // NOTE: We don't support user keys of more than 2GB :)
+ dst = EncodeVarint32(dst, static_cast<uint32_t>(usize + ts_sz + 8));
+ kstart_ = dst;
+ memcpy(dst, _user_key.data(), usize);
+ dst += usize;
+ if (nullptr != ts) {
+ memcpy(dst, ts->data(), ts_sz);
+ dst += ts_sz;
+ }
+ EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek));
+ dst += 8;
+ end_ = dst;
+}
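+
+// For reference, the buffer assembled above is laid out as follows (a sketch
+// based on the code; the `13` in `needed` is a conservative bound covering a
+// 5-byte varint32 plus the 8-byte footer):
+//
+//   start_  -> varint32(usize + ts_sz + 8)
+//   kstart_ -> user key bytes (usize), then optional timestamp (ts_sz),
+//              then fixed64((s << 8) | kValueTypeForSeek)
+//   end_    -> one past the footer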
+
+void IterKey::EnlargeBuffer(size_t key_size) {
+ // If size is smaller than buffer size, continue using current buffer,
+ // or the static allocated one, as default
+ assert(key_size > buf_size_);
+ // Need to enlarge the buffer.
+ ResetBuffer();
+ buf_ = new char[key_size];
+ buf_size_ = key_size;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat.h b/src/rocksdb/db/dbformat.h
new file mode 100644
index 000000000..8c1fc7055
--- /dev/null
+++ b/src/rocksdb/db/dbformat.h
@@ -0,0 +1,865 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdio.h>
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "rocksdb/comparator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
+#include "util/coding.h"
+#include "util/user_comparator_wrapper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This file declares data structures and functions that deal with internal
+// keys.
+// Each internal key contains a user key, a sequence number (SequenceNumber)
+// and a type (ValueType); these are usually encoded together.
+// There are also some related helper classes here.
+
+class InternalKey;
+
+// Value types encoded as the last component of internal keys.
+// DO NOT CHANGE THESE ENUM VALUES: they are embedded in the on-disk
+// data structures.
+// The highest bit of the value type needs to be reserved to SST tables
+// for them to do more flexible encoding.
+enum ValueType : unsigned char {
+ kTypeDeletion = 0x0,
+ kTypeValue = 0x1,
+ kTypeMerge = 0x2,
+ kTypeLogData = 0x3, // WAL only.
+ kTypeColumnFamilyDeletion = 0x4, // WAL only.
+ kTypeColumnFamilyValue = 0x5, // WAL only.
+ kTypeColumnFamilyMerge = 0x6, // WAL only.
+ kTypeSingleDeletion = 0x7,
+ kTypeColumnFamilySingleDeletion = 0x8, // WAL only.
+ kTypeBeginPrepareXID = 0x9, // WAL only.
+ kTypeEndPrepareXID = 0xA, // WAL only.
+ kTypeCommitXID = 0xB, // WAL only.
+ kTypeRollbackXID = 0xC, // WAL only.
+ kTypeNoop = 0xD, // WAL only.
+ kTypeColumnFamilyRangeDeletion = 0xE, // WAL only.
+ kTypeRangeDeletion = 0xF, // meta block
+ kTypeColumnFamilyBlobIndex = 0x10, // Blob DB only
+ kTypeBlobIndex = 0x11, // Blob DB only
+ // When the prepared record is also persisted in db, we use a different
+ // record. This is to ensure that the WAL that is generated by a WritePolicy
+  // is not mistakenly read by another, which would result in data
+  // inconsistency.
+ kTypeBeginPersistedPrepareXID = 0x12, // WAL only.
+ // Similar to kTypeBeginPersistedPrepareXID, this is to ensure that WAL
+ // generated by WriteUnprepared write policy is not mistakenly read by
+ // another.
+ kTypeBeginUnprepareXID = 0x13, // WAL only.
+ kTypeDeletionWithTimestamp = 0x14,
+ kTypeCommitXIDAndTimestamp = 0x15, // WAL only
+ kTypeWideColumnEntity = 0x16,
+ kTypeColumnFamilyWideColumnEntity = 0x17, // WAL only
+ kTypeMaxValid, // Should be after the last valid type, only used for
+ // validation
+ kMaxValue = 0x7F // Not used for storing records.
+};
+
+// Defined in dbformat.cc
+extern const ValueType kValueTypeForSeek;
+extern const ValueType kValueTypeForSeekForPrev;
+
+// Checks whether a type is an inline value type
+// (i.e. a type used in memtable skiplist and sst file datablock).
+inline bool IsValueType(ValueType t) {
+ return t <= kTypeMerge || kTypeSingleDeletion == t || kTypeBlobIndex == t ||
+ kTypeDeletionWithTimestamp == t || kTypeWideColumnEntity == t;
+}
+
+// Checks whether a type is from user operation
+// kTypeRangeDeletion is in meta block so this API is separated from above
+inline bool IsExtendedValueType(ValueType t) {
+ return IsValueType(t) || t == kTypeRangeDeletion;
+}
+
+// We leave eight bits empty at the bottom so a type and sequence#
+// can be packed together into 64-bits.
+static const SequenceNumber kMaxSequenceNumber = ((0x1ull << 56) - 1);
+
+static const SequenceNumber kDisableGlobalSequenceNumber =
+ std::numeric_limits<uint64_t>::max();
+
+constexpr uint64_t kNumInternalBytes = 8;
+
+// Defined in dbformat.cc
+extern const std::string kDisableUserTimestamp;
+
+// The data structure that represents an internal key with the user_key,
+// sequence number and type stored as separate fields.
+struct ParsedInternalKey {
+ Slice user_key;
+ SequenceNumber sequence;
+ ValueType type;
+
+ ParsedInternalKey()
+ : sequence(kMaxSequenceNumber),
+ type(kTypeDeletion) // Make code analyzer happy
+ {} // Intentionally left uninitialized (for speed)
+ // u contains timestamp if user timestamp feature is enabled.
+ ParsedInternalKey(const Slice& u, const SequenceNumber& seq, ValueType t)
+ : user_key(u), sequence(seq), type(t) {}
+ std::string DebugString(bool log_err_key, bool hex) const;
+
+ void clear() {
+ user_key.clear();
+ sequence = 0;
+ type = kTypeDeletion;
+ }
+
+ void SetTimestamp(const Slice& ts) {
+ assert(ts.size() <= user_key.size());
+ const char* addr = user_key.data() + user_key.size() - ts.size();
+ memcpy(const_cast<char*>(addr), ts.data(), ts.size());
+ }
+
+ Slice GetTimestamp(size_t ts_sz) {
+ assert(ts_sz <= user_key.size());
+ const char* addr = user_key.data() + user_key.size() - ts_sz;
+ return Slice(const_cast<char*>(addr), ts_sz);
+ }
+};
+
+// Return the length of the encoding of "key".
+inline size_t InternalKeyEncodingLength(const ParsedInternalKey& key) {
+ return key.user_key.size() + kNumInternalBytes;
+}
+
+// Pack a sequence number and a ValueType into a uint64_t
+inline uint64_t PackSequenceAndType(uint64_t seq, ValueType t) {
+ assert(seq <= kMaxSequenceNumber);
+ // kTypeMaxValid is used in TruncatedRangeDelIterator, see its constructor.
+ assert(IsExtendedValueType(t) || t == kTypeMaxValid);
+ return (seq << 8) | t;
+}
+
+// Given the result of PackSequenceAndType, store the sequence number in *seq
+// and the ValueType in *t.
+inline void UnPackSequenceAndType(uint64_t packed, uint64_t* seq,
+ ValueType* t) {
+ *seq = packed >> 8;
+ *t = static_cast<ValueType>(packed & 0xff);
+
+ // Commented the following two assertions in order to test key-value checksum
+ // on corrupted keys without crashing ("DbKvChecksumTest").
+ // assert(*seq <= kMaxSequenceNumber);
+ // assert(IsExtendedValueType(*t));
+}
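+
+// Minimal round-trip sketch of the packing above (illustrative comment only):
+//
+//   uint64_t packed = PackSequenceAndType(/*seq=*/42, kTypeValue);
+//   // packed == (42ull << 8) | 0x1
+//   uint64_t seq = 0;
+//   ValueType type = kTypeDeletion;
+//   UnPackSequenceAndType(packed, &seq, &type);
+//   // seq == 42, type == kTypeValue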
+
+EntryType GetEntryType(ValueType value_type);
+
+// Append the serialization of "key" to *result.
+extern void AppendInternalKey(std::string* result,
+ const ParsedInternalKey& key);
+
+// Append the serialization of "key" to *result, replacing the original
+// timestamp with argument ts.
+extern void AppendInternalKeyWithDifferentTimestamp(
+ std::string* result, const ParsedInternalKey& key, const Slice& ts);
+
+// Serialized internal key consists of user key followed by footer.
+// This function appends the footer to *result, assuming that *result already
+// contains the user key at the end.
+extern void AppendInternalKeyFooter(std::string* result, SequenceNumber s,
+ ValueType t);
+
+// Append the key and a minimal timestamp to *result
+extern void AppendKeyWithMinTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz);
+
+// Append the key and a maximal timestamp to *result
+extern void AppendKeyWithMaxTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz);
+
+// `key` is a user key with timestamp. Append the user key without timestamp
+// and the maximal timestamp to *result.
+extern void AppendUserKeyWithMaxTimestamp(std::string* result, const Slice& key,
+ size_t ts_sz);
+
+// Attempt to parse an internal key from "internal_key". On success,
+// stores the parsed data in "*result" and returns Status::OK().
+//
+// On error, returns a non-OK status and leaves "*result" in an undefined
+// state.
+extern Status ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result, bool log_err_key);
+
+// Returns the user key portion of an internal key.
+inline Slice ExtractUserKey(const Slice& internal_key) {
+ assert(internal_key.size() >= kNumInternalBytes);
+ return Slice(internal_key.data(), internal_key.size() - kNumInternalBytes);
+}
+
+inline Slice ExtractUserKeyAndStripTimestamp(const Slice& internal_key,
+ size_t ts_sz) {
+ Slice ret = internal_key;
+ ret.remove_suffix(kNumInternalBytes + ts_sz);
+ return ret;
+}
+
+inline Slice StripTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+ Slice ret = user_key;
+ ret.remove_suffix(ts_sz);
+ return ret;
+}
+
+inline Slice ExtractTimestampFromUserKey(const Slice& user_key, size_t ts_sz) {
+ assert(user_key.size() >= ts_sz);
+ return Slice(user_key.data() + user_key.size() - ts_sz, ts_sz);
+}
+
+inline Slice ExtractTimestampFromKey(const Slice& internal_key, size_t ts_sz) {
+ const size_t key_size = internal_key.size();
+ assert(key_size >= kNumInternalBytes + ts_sz);
+ return Slice(internal_key.data() + key_size - ts_sz - kNumInternalBytes,
+ ts_sz);
+}
+
+inline uint64_t ExtractInternalKeyFooter(const Slice& internal_key) {
+ assert(internal_key.size() >= kNumInternalBytes);
+ const size_t n = internal_key.size();
+ return DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+}
+
+inline ValueType ExtractValueType(const Slice& internal_key) {
+ uint64_t num = ExtractInternalKeyFooter(internal_key);
+ unsigned char c = num & 0xff;
+ return static_cast<ValueType>(c);
+}
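+
+// For reference, the byte layout assumed by the extractors above (and produced
+// by AppendInternalKey()) is:
+//
+//   user_key (n - 8 bytes, possibly ending in a timestamp),
+//   followed by an 8-byte little-endian fixed64 of (sequence << 8) | type.
+//
+// Illustrative use (a sketch, not part of the build):
+//
+//   std::string ikey;
+//   AppendInternalKey(&ikey, ParsedInternalKey("foo", /*seq=*/7, kTypeValue));
+//   Slice user = ExtractUserKey(ikey);     // "foo"
+//   ValueType t = ExtractValueType(ikey);  // kTypeValue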
+
+// A comparator for internal keys that uses a specified comparator for
+// the user key portion and breaks ties by decreasing sequence number.
+class InternalKeyComparator
+#ifdef NDEBUG
+ final
+#endif
+ : public CompareInterface {
+ private:
+ UserComparatorWrapper user_comparator_;
+
+ public:
+ // `InternalKeyComparator`s constructed with the default constructor are not
+ // usable and will segfault on any attempt to use them for comparisons.
+ InternalKeyComparator() = default;
+
+  // Wraps the user comparator `c`, which must remain valid for the lifetime
+  // of this object.
+ explicit InternalKeyComparator(const Comparator* c) : user_comparator_(c) {}
+ virtual ~InternalKeyComparator() {}
+
+ int Compare(const Slice& a, const Slice& b) const override;
+
+ bool Equal(const Slice& a, const Slice& b) const {
+ // TODO Use user_comparator_.Equal(). Perhaps compare seqno before
+ // comparing the user key too.
+ return Compare(a, b) == 0;
+ }
+
+ // Same as Compare except that it excludes the value type from comparison
+ int CompareKeySeq(const Slice& a, const Slice& b) const;
+
+ const Comparator* user_comparator() const {
+ return user_comparator_.user_comparator();
+ }
+
+ int Compare(const InternalKey& a, const InternalKey& b) const;
+ int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
+ // In this `Compare()` overload, the sequence numbers provided in
+ // `a_global_seqno` and `b_global_seqno` override the sequence numbers in `a`
+ // and `b`, respectively. To disable sequence number override(s), provide the
+ // value `kDisableGlobalSequenceNumber`.
+ int Compare(const Slice& a, SequenceNumber a_global_seqno, const Slice& b,
+ SequenceNumber b_global_seqno) const;
+};
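+
+// Ordering sketch (illustrative): with equal user keys, a larger sequence
+// number compares *smaller*, so the newest entry for a key comes first:
+//
+//   InternalKeyComparator icmp(BytewiseComparator());
+//   InternalKey newer("foo", /*s=*/200, kTypeValue);
+//   InternalKey older("foo", /*s=*/100, kTypeValue);
+//   assert(icmp.Compare(newer, older) < 0);  // newer sorts before older
+//   InternalKey other("goo", /*s=*/1, kTypeValue);
+//   assert(icmp.Compare(older, other) < 0);  // user key order dominates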
+
+// This class represents the internal key in encoded form.
+class InternalKey {
+ private:
+ std::string rep_;
+
+ public:
+ InternalKey() {} // Leave rep_ as empty to indicate it is invalid
+ InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) {
+ AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t));
+ }
+ InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t, Slice ts) {
+ AppendInternalKeyWithDifferentTimestamp(
+ &rep_, ParsedInternalKey(_user_key, s, t), ts);
+ }
+
+  // Sets the internal key to be bigger than or equal to all internal keys
+  // with this user key.
+ void SetMaxPossibleForUserKey(const Slice& _user_key) {
+ AppendInternalKey(
+ &rep_, ParsedInternalKey(_user_key, 0, static_cast<ValueType>(0)));
+ }
+
+  // Sets the internal key to be smaller than or equal to all internal keys
+  // with this user key.
+ void SetMinPossibleForUserKey(const Slice& _user_key) {
+ AppendInternalKey(&rep_, ParsedInternalKey(_user_key, kMaxSequenceNumber,
+ kValueTypeForSeek));
+ }
+
+ bool Valid() const {
+ ParsedInternalKey parsed;
+ return (ParseInternalKey(Slice(rep_), &parsed, false /* log_err_key */)
+ .ok()); // TODO
+ }
+
+ void DecodeFrom(const Slice& s) { rep_.assign(s.data(), s.size()); }
+ Slice Encode() const {
+ assert(!rep_.empty());
+ return rep_;
+ }
+
+ Slice user_key() const { return ExtractUserKey(rep_); }
+ size_t size() const { return rep_.size(); }
+
+ void Set(const Slice& _user_key, SequenceNumber s, ValueType t) {
+ SetFrom(ParsedInternalKey(_user_key, s, t));
+ }
+
+ void Set(const Slice& _user_key_with_ts, SequenceNumber s, ValueType t,
+ const Slice& ts) {
+ ParsedInternalKey pik = ParsedInternalKey(_user_key_with_ts, s, t);
+ // Should not call pik.SetTimestamp() directly as it overwrites the buffer
+ // containing _user_key.
+ SetFrom(pik, ts);
+ }
+
+ void SetFrom(const ParsedInternalKey& p) {
+ rep_.clear();
+ AppendInternalKey(&rep_, p);
+ }
+
+ void SetFrom(const ParsedInternalKey& p, const Slice& ts) {
+ rep_.clear();
+ AppendInternalKeyWithDifferentTimestamp(&rep_, p, ts);
+ }
+
+ void Clear() { rep_.clear(); }
+
+ // The underlying representation.
+ // Intended only to be used together with ConvertFromUserKey().
+ std::string* rep() { return &rep_; }
+
+ // Assuming that *rep() contains a user key, this method makes internal key
+ // out of it in-place. This saves a memcpy compared to Set()/SetFrom().
+ void ConvertFromUserKey(SequenceNumber s, ValueType t) {
+ AppendInternalKeyFooter(&rep_, s, t);
+ }
+
+ std::string DebugString(bool hex) const;
+};
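+
+// Note how the seek bounds above exploit the inverted sequence ordering
+// (illustrative; `icmp` as in the ordering sketch following
+// InternalKeyComparator):
+//
+//   InternalKey lo, hi;
+//   lo.SetMinPossibleForUserKey("foo");  // seq = kMaxSequenceNumber
+//   hi.SetMaxPossibleForUserKey("foo");  // seq = 0, type = 0
+//   // For any InternalKey k("foo", s, t):
+//   //   icmp.Compare(lo, k) <= 0 and icmp.Compare(k, hi) <= 0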
+
+inline int InternalKeyComparator::Compare(const InternalKey& a,
+ const InternalKey& b) const {
+ return Compare(a.Encode(), b.Encode());
+}
+
+inline Status ParseInternalKey(const Slice& internal_key,
+ ParsedInternalKey* result, bool log_err_key) {
+ const size_t n = internal_key.size();
+
+ if (n < kNumInternalBytes) {
+ return Status::Corruption("Corrupted Key: Internal Key too small. Size=" +
+ std::to_string(n) + ". ");
+ }
+
+ uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+ unsigned char c = num & 0xff;
+ result->sequence = num >> 8;
+ result->type = static_cast<ValueType>(c);
+ assert(result->type <= ValueType::kMaxValue);
+ result->user_key = Slice(internal_key.data(), n - kNumInternalBytes);
+
+ if (IsExtendedValueType(result->type)) {
+ return Status::OK();
+ } else {
+ return Status::Corruption("Corrupted Key",
+ result->DebugString(log_err_key, true));
+ }
+}
+
+// Update the sequence number in the internal key.
+// Guarantees not to invalidate ikey.data().
+inline void UpdateInternalKey(std::string* ikey, uint64_t seq, ValueType t) {
+ size_t ikey_sz = ikey->size();
+ assert(ikey_sz >= kNumInternalBytes);
+ uint64_t newval = (seq << 8) | t;
+
+ // Note: Since C++11, strings are guaranteed to be stored contiguously and
+ // string::operator[]() is guaranteed not to change ikey.data().
+ EncodeFixed64(&(*ikey)[ikey_sz - kNumInternalBytes], newval);
+}
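+
+// Illustrative use of UpdateInternalKey() (mirrors the encoding above; also
+// exercised by FormatTest.UpdateInternalKey):
+//
+//   std::string ikey;
+//   AppendInternalKey(&ikey, ParsedInternalKey("k", /*seq=*/100, kTypeValue));
+//   UpdateInternalKey(&ikey, /*seq=*/200, kTypeDeletion);
+//   // GetInternalKeySeqno(ikey) == 200, ExtractValueType(ikey) ==
+//   // kTypeDeletion, and ikey.size() is unchanged.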
+
+// Get the sequence number from the internal key
+inline uint64_t GetInternalKeySeqno(const Slice& internal_key) {
+ const size_t n = internal_key.size();
+ assert(n >= kNumInternalBytes);
+ uint64_t num = DecodeFixed64(internal_key.data() + n - kNumInternalBytes);
+ return num >> 8;
+}
+
+// A class to store keys in an efficient way. It allows:
+// 1. Users to either copy the key into it, or have it point to an unowned
+//    address.
+// 2. A short inline buffer for copied keys, to reduce memory allocation for
+//    smaller keys.
+// 3. Tracking whether it holds a user key or an internal key, and conversion
+//    between them.
+class IterKey {
+ public:
+ IterKey()
+ : buf_(space_),
+ key_(buf_),
+ key_size_(0),
+ buf_size_(sizeof(space_)),
+ is_user_key_(true) {}
+ // No copying allowed
+ IterKey(const IterKey&) = delete;
+ void operator=(const IterKey&) = delete;
+
+ ~IterKey() { ResetBuffer(); }
+
+ // The bool will be picked up by the next calls to SetKey
+ void SetIsUserKey(bool is_user_key) { is_user_key_ = is_user_key; }
+
+  // Returns the key in whichever format it was provided to IterKey.
+  // If user-defined timestamp is enabled, then the timestamp is included in
+  // the returned result.
+ Slice GetKey() const { return Slice(key_, key_size_); }
+
+ Slice GetInternalKey() const {
+ assert(!IsUserKey());
+ return Slice(key_, key_size_);
+ }
+
+ // If user-defined timestamp is enabled, then timestamp is included in the
+ // return result of GetUserKey();
+ Slice GetUserKey() const {
+ if (IsUserKey()) {
+ return Slice(key_, key_size_);
+ } else {
+ assert(key_size_ >= kNumInternalBytes);
+ return Slice(key_, key_size_ - kNumInternalBytes);
+ }
+ }
+
+ size_t Size() const { return key_size_; }
+
+ void Clear() { key_size_ = 0; }
+
+  // Append "non_shared_data" after the first "shared_len" bytes of the
+  // existing key. This function is used in Block::Iter::ParseNextKey.
+  // shared_len: bytes in [0, shared_len-1] of the existing key are retained
+  // non_shared_data: data to be appended; its length must be >= non_shared_len
+ void TrimAppend(const size_t shared_len, const char* non_shared_data,
+ const size_t non_shared_len) {
+ assert(shared_len <= key_size_);
+ size_t total_size = shared_len + non_shared_len;
+
+ if (IsKeyPinned() /* key is not in buf_ */) {
+ // Copy the key from external memory to buf_ (copy shared_len bytes)
+ EnlargeBufferIfNeeded(total_size);
+ memcpy(buf_, key_, shared_len);
+ } else if (total_size > buf_size_) {
+ // Need to allocate space, delete previous space
+ char* p = new char[total_size];
+ memcpy(p, key_, shared_len);
+
+ if (buf_ != space_) {
+ delete[] buf_;
+ }
+
+ buf_ = p;
+ buf_size_ = total_size;
+ }
+
+ memcpy(buf_ + shared_len, non_shared_data, non_shared_len);
+ key_ = buf_;
+ key_size_ = total_size;
+ }
+
+ Slice SetKey(const Slice& key, bool copy = true) {
+ // is_user_key_ expected to be set already via SetIsUserKey
+ return SetKeyImpl(key, copy);
+ }
+
+ // If user-defined timestamp is enabled, then `key` includes timestamp.
+  // TODO(yanqin) this is also used to set prefixes, which do not include
+  // timestamps. Should be handled.
+ Slice SetUserKey(const Slice& key, bool copy = true) {
+ is_user_key_ = true;
+ return SetKeyImpl(key, copy);
+ }
+
+ Slice SetInternalKey(const Slice& key, bool copy = true) {
+ is_user_key_ = false;
+ return SetKeyImpl(key, copy);
+ }
+
+ // Copies the content of key, updates the reference to the user key in ikey
+ // and returns a Slice referencing the new copy.
+ Slice SetInternalKey(const Slice& key, ParsedInternalKey* ikey) {
+ size_t key_n = key.size();
+ assert(key_n >= kNumInternalBytes);
+ SetInternalKey(key);
+ ikey->user_key = Slice(key_, key_n - kNumInternalBytes);
+ return Slice(key_, key_n);
+ }
+
+  // Copy the key into IterKey's own buf_
+ void OwnKey() {
+ assert(IsKeyPinned() == true);
+
+ Reserve(key_size_);
+ memcpy(buf_, key_, key_size_);
+ key_ = buf_;
+ }
+
+ // Update the sequence number in the internal key. Guarantees not to
+ // invalidate slices to the key (and the user key).
+ void UpdateInternalKey(uint64_t seq, ValueType t, const Slice* ts = nullptr) {
+ assert(!IsKeyPinned());
+ assert(key_size_ >= kNumInternalBytes);
+ if (ts) {
+ assert(key_size_ >= kNumInternalBytes + ts->size());
+ memcpy(&buf_[key_size_ - kNumInternalBytes - ts->size()], ts->data(),
+ ts->size());
+ }
+ uint64_t newval = (seq << 8) | t;
+ EncodeFixed64(&buf_[key_size_ - kNumInternalBytes], newval);
+ }
+
+ bool IsKeyPinned() const { return (key_ != buf_); }
+
+ // If `ts` is provided, user_key should not contain timestamp,
+ // and `ts` is appended after user_key.
+ // TODO: more efficient storage for timestamp.
+ void SetInternalKey(const Slice& key_prefix, const Slice& user_key,
+ SequenceNumber s,
+ ValueType value_type = kValueTypeForSeek,
+ const Slice* ts = nullptr) {
+ size_t psize = key_prefix.size();
+ size_t usize = user_key.size();
+ size_t ts_sz = (ts != nullptr ? ts->size() : 0);
+ EnlargeBufferIfNeeded(psize + usize + sizeof(uint64_t) + ts_sz);
+ if (psize > 0) {
+ memcpy(buf_, key_prefix.data(), psize);
+ }
+ memcpy(buf_ + psize, user_key.data(), usize);
+ if (ts) {
+ memcpy(buf_ + psize + usize, ts->data(), ts_sz);
+ }
+ EncodeFixed64(buf_ + usize + psize + ts_sz,
+ PackSequenceAndType(s, value_type));
+
+ key_ = buf_;
+ key_size_ = psize + usize + sizeof(uint64_t) + ts_sz;
+ is_user_key_ = false;
+ }
+
+ void SetInternalKey(const Slice& user_key, SequenceNumber s,
+ ValueType value_type = kValueTypeForSeek,
+ const Slice* ts = nullptr) {
+ SetInternalKey(Slice(), user_key, s, value_type, ts);
+ }
+
+ void Reserve(size_t size) {
+ EnlargeBufferIfNeeded(size);
+ key_size_ = size;
+ }
+
+ void SetInternalKey(const ParsedInternalKey& parsed_key) {
+ SetInternalKey(Slice(), parsed_key);
+ }
+
+ void SetInternalKey(const Slice& key_prefix,
+ const ParsedInternalKey& parsed_key_suffix) {
+ SetInternalKey(key_prefix, parsed_key_suffix.user_key,
+ parsed_key_suffix.sequence, parsed_key_suffix.type);
+ }
+
+ void EncodeLengthPrefixedKey(const Slice& key) {
+ auto size = key.size();
+ EnlargeBufferIfNeeded(size + static_cast<size_t>(VarintLength(size)));
+ char* ptr = EncodeVarint32(buf_, static_cast<uint32_t>(size));
+ memcpy(ptr, key.data(), size);
+ key_ = buf_;
+ is_user_key_ = true;
+ }
+
+ bool IsUserKey() const { return is_user_key_; }
+
+ private:
+ char* buf_;
+ const char* key_;
+ size_t key_size_;
+ size_t buf_size_;
+ char space_[32]; // Avoid allocation for short keys
+ bool is_user_key_;
+
+ Slice SetKeyImpl(const Slice& key, bool copy) {
+ size_t size = key.size();
+ if (copy) {
+ // Copy key to buf_
+ EnlargeBufferIfNeeded(size);
+ memcpy(buf_, key.data(), size);
+ key_ = buf_;
+ } else {
+ // Update key_ to point to external memory
+ key_ = key.data();
+ }
+ key_size_ = size;
+ return Slice(key_, key_size_);
+ }
+
+ void ResetBuffer() {
+ if (buf_ != space_) {
+ delete[] buf_;
+ buf_ = space_;
+ }
+ buf_size_ = sizeof(space_);
+ key_size_ = 0;
+ }
+
+ // Enlarge the buffer size if needed based on key_size.
+  // By default, the statically allocated buffer is used. Once a key larger
+  // than the static buffer is seen, another buffer is dynamically allocated
+  // and kept until an even larger key buffer is requested. In that case, we
+  // reallocate the buffer and delete the old one.
+ void EnlargeBufferIfNeeded(size_t key_size) {
+ // If size is smaller than buffer size, continue using current buffer,
+ // or the static allocated one, as default
+ if (key_size > buf_size_) {
+ EnlargeBuffer(key_size);
+ }
+ }
+
+ void EnlargeBuffer(size_t key_size);
+};
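+
+// Minimal IterKey usage sketch (illustrative): keys whose encoded size fits in
+// the 32-byte space_ buffer stay inline; larger keys trigger EnlargeBuffer().
+//
+//   IterKey ik;
+//   ik.SetInternalKey("user_key", /*s=*/5, kTypeValue);
+//   Slice ukey = ik.GetUserKey();      // "user_key"
+//   Slice ikey = ik.GetInternalKey();  // "user_key" plus the 8-byte footer
+//   ik.UpdateInternalKey(/*seq=*/6, kTypeValue);  // rewrites footer in place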
+
+// Convert from a SliceTransform of user keys, to a SliceTransform of
+// internal keys.
+class InternalKeySliceTransform : public SliceTransform {
+ public:
+ explicit InternalKeySliceTransform(const SliceTransform* transform)
+ : transform_(transform) {}
+
+ virtual const char* Name() const override { return transform_->Name(); }
+
+ virtual Slice Transform(const Slice& src) const override {
+ auto user_key = ExtractUserKey(src);
+ return transform_->Transform(user_key);
+ }
+
+ virtual bool InDomain(const Slice& src) const override {
+ auto user_key = ExtractUserKey(src);
+ return transform_->InDomain(user_key);
+ }
+
+ virtual bool InRange(const Slice& dst) const override {
+ auto user_key = ExtractUserKey(dst);
+ return transform_->InRange(user_key);
+ }
+
+ const SliceTransform* user_prefix_extractor() const { return transform_; }
+
+ private:
+  // Like the comparator, InternalKeySliceTransform does not take care of the
+  // deletion of transform_
+ const SliceTransform* const transform_;
+};
+
+// Read the key of a record from a write batch.
+// if this record represent the default column family then cf_record
+// must be passed as false, otherwise it must be passed as true.
+extern bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key,
+ bool cf_record);
+
+// Read record from a write batch piece from input.
+// tag, column_family, key, value and blob are return values. Callers own the
+// slice they point to.
+// Tag is defined as ValueType.
+// input will be advanced to after the record.
+// If user-defined timestamp is enabled for a column family, then the `key`
+// resulting from this call will include timestamp.
+extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+ uint32_t* column_family, Slice* key,
+ Slice* value, Slice* blob, Slice* xid);
+
+// When a user calls DeleteRange() to delete a range of keys,
+// we store a serialized RangeTombstone in MemTable and SST.
+// The struct here is an easier-to-understand form of that record:
+// start_key_/end_key_ are the start/end user keys of the range to be deleted.
+struct RangeTombstone {
+ Slice start_key_;
+ Slice end_key_;
+ SequenceNumber seq_;
+ // TODO: we should optimize the storage here when user-defined timestamp
+ // is NOT enabled: they currently take up (16 + 32 + 32) bytes per tombstone.
+ Slice ts_;
+ std::string pinned_start_key_;
+ std::string pinned_end_key_;
+
+ RangeTombstone() = default;
+ RangeTombstone(Slice sk, Slice ek, SequenceNumber sn)
+ : start_key_(sk), end_key_(ek), seq_(sn) {}
+
+  // When user-defined timestamp is enabled, `sk` and `ek` should be user keys
+  // with timestamp, and `ts` will replace the timestamps in `sk` and
+  // `ek`.
+ RangeTombstone(Slice sk, Slice ek, SequenceNumber sn, Slice ts)
+ : seq_(sn), ts_(ts) {
+ assert(!ts.empty());
+ pinned_start_key_.reserve(sk.size());
+ pinned_start_key_.append(sk.data(), sk.size() - ts.size());
+ pinned_start_key_.append(ts.data(), ts.size());
+ pinned_end_key_.reserve(ek.size());
+ pinned_end_key_.append(ek.data(), ek.size() - ts.size());
+ pinned_end_key_.append(ts.data(), ts.size());
+ start_key_ = pinned_start_key_;
+ end_key_ = pinned_end_key_;
+ }
+
+ RangeTombstone(ParsedInternalKey parsed_key, Slice value) {
+ start_key_ = parsed_key.user_key;
+ seq_ = parsed_key.sequence;
+ end_key_ = value;
+ }
+
+  // Be careful when using Serialize(): it allocates new memory.
+ std::pair<InternalKey, Slice> Serialize() const {
+ auto key = InternalKey(start_key_, seq_, kTypeRangeDeletion);
+ return std::make_pair(std::move(key), end_key_);
+ }
+
+  // Be careful when using SerializeKey(): it allocates new memory.
+ InternalKey SerializeKey() const {
+ return InternalKey(start_key_, seq_, kTypeRangeDeletion);
+ }
+
+ // The tombstone end-key is exclusive, so we generate an internal-key here
+ // which has a similar property. Using kMaxSequenceNumber guarantees that
+ // the returned internal-key will compare less than any other internal-key
+ // with the same user-key. This in turn guarantees that the serialized
+ // end-key for a tombstone such as [a-b] will compare less than the key "b".
+ //
+  // Be careful when using SerializeEndKey(): it allocates new memory.
+ InternalKey SerializeEndKey() const {
+ if (!ts_.empty()) {
+ static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+ if (ts_.size() <= strlen(kTsMax)) {
+ return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
+ Slice(kTsMax, ts_.size()));
+ } else {
+ return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion,
+ std::string(ts_.size(), '\xff'));
+ }
+ }
+ return InternalKey(end_key_, kMaxSequenceNumber, kTypeRangeDeletion);
+ }
+};
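+
+// Illustrative consequence of the SerializeEndKey() comment above (also
+// exercised by FormatTest.RangeTombstoneSerializeEndKey): for a tombstone
+// covering ["a", "b"), the serialized end key sorts before every real "b" key.
+//
+//   RangeTombstone t("a", "b", /*sn=*/2);
+//   InternalKey k("b", /*s=*/3, kTypeValue);
+//   InternalKeyComparator icmp(BytewiseComparator());
+//   assert(icmp.Compare(t.SerializeEndKey(), k) < 0);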
+
+inline int InternalKeyComparator::Compare(const Slice& akey,
+ const Slice& bkey) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ // decreasing type (though sequence# should be enough to disambiguate)
+ int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+ if (r == 0) {
+ const uint64_t anum =
+ DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes);
+ const uint64_t bnum =
+ DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes);
+ if (anum > bnum) {
+ r = -1;
+ } else if (anum < bnum) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+inline int InternalKeyComparator::CompareKeySeq(const Slice& akey,
+ const Slice& bkey) const {
+ // Order by:
+ // increasing user key (according to user-supplied comparator)
+ // decreasing sequence number
+ int r = user_comparator_.Compare(ExtractUserKey(akey), ExtractUserKey(bkey));
+ if (r == 0) {
+ // Shift the number to exclude the last byte which contains the value type
+ const uint64_t anum =
+ DecodeFixed64(akey.data() + akey.size() - kNumInternalBytes) >> 8;
+ const uint64_t bnum =
+ DecodeFixed64(bkey.data() + bkey.size() - kNumInternalBytes) >> 8;
+ if (anum > bnum) {
+ r = -1;
+ } else if (anum < bnum) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+inline int InternalKeyComparator::Compare(const Slice& a,
+ SequenceNumber a_global_seqno,
+ const Slice& b,
+ SequenceNumber b_global_seqno) const {
+ int r = user_comparator_.Compare(ExtractUserKey(a), ExtractUserKey(b));
+ if (r == 0) {
+ uint64_t a_footer, b_footer;
+ if (a_global_seqno == kDisableGlobalSequenceNumber) {
+ a_footer = ExtractInternalKeyFooter(a);
+ } else {
+ a_footer = PackSequenceAndType(a_global_seqno, ExtractValueType(a));
+ }
+ if (b_global_seqno == kDisableGlobalSequenceNumber) {
+ b_footer = ExtractInternalKeyFooter(b);
+ } else {
+ b_footer = PackSequenceAndType(b_global_seqno, ExtractValueType(b));
+ }
+ if (a_footer > b_footer) {
+ r = -1;
+ } else if (a_footer < b_footer) {
+ r = +1;
+ }
+ }
+ return r;
+}
+
+// Wrap InternalKeyComparator as a comparator class for ParsedInternalKey.
+struct ParsedInternalKeyComparator {
+ explicit ParsedInternalKeyComparator(const InternalKeyComparator* c)
+ : cmp(c) {}
+
+ bool operator()(const ParsedInternalKey& a,
+ const ParsedInternalKey& b) const {
+ return cmp->Compare(a, b) < 0;
+ }
+
+ const InternalKeyComparator* cmp;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/dbformat_test.cc b/src/rocksdb/db/dbformat_test.cc
new file mode 100644
index 000000000..8dc3387df
--- /dev/null
+++ b/src/rocksdb/db/dbformat_test.cc
@@ -0,0 +1,214 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/dbformat.h"
+
+#include "table/block_based/index_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string IKey(const std::string& user_key, uint64_t seq,
+ ValueType vt) {
+ std::string encoded;
+ AppendInternalKey(&encoded, ParsedInternalKey(user_key, seq, vt));
+ return encoded;
+}
+
+static std::string Shorten(const std::string& s, const std::string& l) {
+ std::string result = s;
+ ShortenedIndexBuilder::FindShortestInternalKeySeparator(*BytewiseComparator(),
+ &result, l);
+ return result;
+}
+
+static std::string ShortSuccessor(const std::string& s) {
+ std::string result = s;
+ ShortenedIndexBuilder::FindShortInternalKeySuccessor(*BytewiseComparator(),
+ &result);
+ return result;
+}
+
+static void TestKey(const std::string& key, uint64_t seq, ValueType vt) {
+ std::string encoded = IKey(key, seq, vt);
+
+ Slice in(encoded);
+ ParsedInternalKey decoded("", 0, kTypeValue);
+
+ ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */));
+ ASSERT_EQ(key, decoded.user_key.ToString());
+ ASSERT_EQ(seq, decoded.sequence);
+ ASSERT_EQ(vt, decoded.type);
+
+ ASSERT_NOK(ParseInternalKey(Slice("bar"), &decoded, true /* log_err_key */));
+}
+
+class FormatTest : public testing::Test {};
+
+TEST_F(FormatTest, InternalKey_EncodeDecode) {
+ const char* keys[] = {"", "k", "hello", "longggggggggggggggggggggg"};
+ const uint64_t seq[] = {1,
+ 2,
+ 3,
+ (1ull << 8) - 1,
+ 1ull << 8,
+ (1ull << 8) + 1,
+ (1ull << 16) - 1,
+ 1ull << 16,
+ (1ull << 16) + 1,
+ (1ull << 32) - 1,
+ 1ull << 32,
+ (1ull << 32) + 1};
+ for (unsigned int k = 0; k < sizeof(keys) / sizeof(keys[0]); k++) {
+ for (unsigned int s = 0; s < sizeof(seq) / sizeof(seq[0]); s++) {
+ TestKey(keys[k], seq[s], kTypeValue);
+ TestKey("hello", 1, kTypeDeletion);
+ }
+ }
+}
+
+TEST_F(FormatTest, InternalKeyShortSeparator) {
+ // When user keys are same
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 99, kTypeValue)));
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 101, kTypeValue)));
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foo", 100, kTypeDeletion)));
+
+ // When user keys are misordered
+ ASSERT_EQ(IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("bar", 99, kTypeValue)));
+
+ // When user keys are different, but correctly ordered
+ ASSERT_EQ(
+ IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("hello", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("ABC2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("ABC1AAAAA", 100, kTypeValue),
+ IKey("ABC2ABB", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue),
+ IKey("AAA2AA", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA4", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA1B", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue), IKey("AAA2", 200, kTypeValue)));
+
+ ASSERT_EQ(IKey("AAA2", kMaxSequenceNumber, kValueTypeForSeek),
+ Shorten(IKey("AAA1AAA", 100, kTypeValue),
+ IKey("AAA2A", 200, kTypeValue)));
+
+ ASSERT_EQ(
+ IKey("AAA1", 100, kTypeValue),
+ Shorten(IKey("AAA1", 100, kTypeValue), IKey("AAA2", 200, kTypeValue)));
+
+ // When start user key is prefix of limit user key
+ ASSERT_EQ(
+ IKey("foo", 100, kTypeValue),
+ Shorten(IKey("foo", 100, kTypeValue), IKey("foobar", 200, kTypeValue)));
+
+ // When limit user key is prefix of start user key
+ ASSERT_EQ(
+ IKey("foobar", 100, kTypeValue),
+ Shorten(IKey("foobar", 100, kTypeValue), IKey("foo", 200, kTypeValue)));
+}
+
+TEST_F(FormatTest, InternalKeyShortestSuccessor) {
+ ASSERT_EQ(IKey("g", kMaxSequenceNumber, kValueTypeForSeek),
+ ShortSuccessor(IKey("foo", 100, kTypeValue)));
+ ASSERT_EQ(IKey("\xff\xff", 100, kTypeValue),
+ ShortSuccessor(IKey("\xff\xff", 100, kTypeValue)));
+}
+
+TEST_F(FormatTest, IterKeyOperation) {
+ IterKey k;
+ const char p[] = "abcdefghijklmnopqrstuvwxyz";
+ const char q[] = "0123456789";
+
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string(""));
+
+ k.TrimAppend(0, p, 3);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abc"));
+
+ k.TrimAppend(1, p, 3);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("aabc"));
+
+ k.TrimAppend(0, p, 26);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz"));
+
+ k.TrimAppend(26, q, 10);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0123456789"));
+
+ k.TrimAppend(36, q, 1);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz01234567890"));
+
+ k.TrimAppend(26, q, 1);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0"));
+
+ // Size going up, memory allocation is triggered
+ k.TrimAppend(27, p, 26);
+ ASSERT_EQ(std::string(k.GetUserKey().data(), k.GetUserKey().size()),
+ std::string("abcdefghijklmnopqrstuvwxyz0"
+ "abcdefghijklmnopqrstuvwxyz"));
+}
+
+TEST_F(FormatTest, UpdateInternalKey) {
+ std::string user_key("abcdefghijklmnopqrstuvwxyz");
+ uint64_t new_seq = 0x123456;
+ ValueType new_val_type = kTypeDeletion;
+
+ std::string ikey;
+ AppendInternalKey(&ikey, ParsedInternalKey(user_key, 100U, kTypeValue));
+ size_t ikey_size = ikey.size();
+ UpdateInternalKey(&ikey, new_seq, new_val_type);
+ ASSERT_EQ(ikey_size, ikey.size());
+
+ Slice in(ikey);
+ ParsedInternalKey decoded;
+ ASSERT_OK(ParseInternalKey(in, &decoded, true /* log_err_key */));
+ ASSERT_EQ(user_key, decoded.user_key.ToString());
+ ASSERT_EQ(new_seq, decoded.sequence);
+ ASSERT_EQ(new_val_type, decoded.type);
+}
+
+TEST_F(FormatTest, RangeTombstoneSerializeEndKey) {
+ RangeTombstone t("a", "b", 2);
+ InternalKey k("b", 3, kTypeValue);
+ const InternalKeyComparator cmp(BytewiseComparator());
+ ASSERT_LT(cmp.Compare(t.SerializeEndKey(), k), 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/deletefile_test.cc b/src/rocksdb/db/deletefile_test.cc
new file mode 100644
index 000000000..34925e828
--- /dev/null
+++ b/src/rocksdb/db/deletefile_test.cc
@@ -0,0 +1,614 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DeleteFileTest : public DBTestBase {
+ public:
+ const int numlevels_;
+ const std::string wal_dir_;
+
+ DeleteFileTest()
+ : DBTestBase("deletefile_test", /*env_do_fsync=*/true),
+ numlevels_(7),
+ wal_dir_(dbname_ + "/wal_files") {}
+
+ void SetOptions(Options* options) {
+ ASSERT_NE(options, nullptr);
+ options->delete_obsolete_files_period_micros = 0; // always do full purge
+ options->enable_thread_tracking = true;
+ options->write_buffer_size = 1024 * 1024 * 1000;
+ options->target_file_size_base = 1024 * 1024 * 1000;
+ options->max_bytes_for_level_base = 1024 * 1024 * 1000;
+ options->WAL_ttl_seconds = 300; // Used to test log files
+ options->WAL_size_limit_MB = 1024; // Used to test log files
+ options->wal_dir = wal_dir_;
+ }
+
+ void AddKeys(int numkeys, int startkey = 0) {
+ WriteOptions options;
+ options.sync = false;
+ ReadOptions roptions;
+ for (int i = startkey; i < (numkeys + startkey); i++) {
+ std::string temp = std::to_string(i);
+ Slice key(temp);
+ Slice value(temp);
+ ASSERT_OK(db_->Put(options, key, value));
+ }
+ }
+
+ int numKeysInLevels(std::vector<LiveFileMetaData>& metadata,
+ std::vector<int>* keysperlevel = nullptr) {
+ if (keysperlevel != nullptr) {
+ keysperlevel->resize(numlevels_);
+ }
+
+ int numKeys = 0;
+ for (size_t i = 0; i < metadata.size(); i++) {
+ int startkey = atoi(metadata[i].smallestkey.c_str());
+ int endkey = atoi(metadata[i].largestkey.c_str());
+ int numkeysinfile = (endkey - startkey + 1);
+ numKeys += numkeysinfile;
+ if (keysperlevel != nullptr) {
+ (*keysperlevel)[(int)metadata[i].level] += numkeysinfile;
+ }
+ fprintf(stderr, "level %d name %s smallest %s largest %s\n",
+ metadata[i].level, metadata[i].name.c_str(),
+ metadata[i].smallestkey.c_str(), metadata[i].largestkey.c_str());
+ }
+ return numKeys;
+ }
+
+ void CreateTwoLevels() {
+ AddKeys(50000, 10000);
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ for (int i = 0; i < 2; ++i) {
+ ASSERT_OK(dbfull()->TEST_CompactRange(i, nullptr, nullptr));
+ }
+
+ AddKeys(50000, 10000);
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int required_log,
+ int required_sst, int required_manifest) {
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dir, &filenames));
+
+ int log_cnt = 0, sst_cnt = 0, manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kWalFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ if (required_log >= 0) {
+ ASSERT_EQ(required_log, log_cnt);
+ }
+ if (required_sst >= 0) {
+ ASSERT_EQ(required_sst, sst_cnt);
+ }
+ if (required_manifest >= 0) {
+ ASSERT_EQ(required_manifest, manifest_cnt);
+ }
+ }
+
+ static void DoSleep(void* arg) {
+ auto test = reinterpret_cast<DeleteFileTest*>(arg);
+ test->env_->SleepForMicroseconds(2 * 1000 * 1000);
+ }
+
+  // An empty job used to ensure that all preceding jobs have been processed
+ static void GuardFinish(void* /*arg*/) {
+ TEST_SYNC_POINT("DeleteFileTest::GuardFinish");
+ }
+};
+
+TEST_F(DeleteFileTest, AddKeysAndQueryLevels) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ std::string level1file = "";
+ int level1keycount = 0;
+ std::string level2file = "";
+ int level2keycount = 0;
+ int level1index = 0;
+ int level2index = 1;
+
+ ASSERT_EQ((int)metadata.size(), 2);
+ if (metadata[0].level == 2) {
+ level1index = 1;
+ level2index = 0;
+ }
+
+ level1file = metadata[level1index].name;
+ int startkey = atoi(metadata[level1index].smallestkey.c_str());
+ int endkey = atoi(metadata[level1index].largestkey.c_str());
+ level1keycount = (endkey - startkey + 1);
+ level2file = metadata[level2index].name;
+ startkey = atoi(metadata[level2index].smallestkey.c_str());
+ endkey = atoi(metadata[level2index].largestkey.c_str());
+ level2keycount = (endkey - startkey + 1);
+
+  // Controlled setup. Levels 1 and 2 should both have 50K keys.
+ // This is a little fragile as it depends on the current
+ // compaction heuristics.
+ ASSERT_EQ(level1keycount, 50000);
+ ASSERT_EQ(level2keycount, 50000);
+
+ Status status = db_->DeleteFile("0.sst");
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ // intermediate level files cannot be deleted.
+ status = db_->DeleteFile(level1file);
+ ASSERT_TRUE(status.IsInvalidArgument());
+
+ // Lowest level file deletion should succeed.
+ status = db_->DeleteFile(level2file);
+ ASSERT_OK(status);
+}
+
+TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ // there should be only one (empty) log file because CreateTwoLevels()
+ // flushes the memtables to disk
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+ // 2 ssts, 1 manifest
+ CheckFileTypeCounts(dbname_, 0, 2, 1);
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+ ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+ // 1 sst after compaction
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ // this time, we keep an iterator alive
+ Reopen(options);
+ Iterator* itr = nullptr;
+ CreateTwoLevels();
+ itr = db_->NewIterator(ReadOptions());
+ ASSERT_OK(itr->status());
+ ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+ ASSERT_OK(itr->status());
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ delete itr;
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ Iterator* itr = nullptr;
+ CreateTwoLevels();
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ itr = db_->NewIterator(read_options);
+ ASSERT_OK(itr->status());
+ ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ test::SleepingBackgroundTask sleeping_task_before;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_before, Env::Priority::HIGH);
+ delete itr;
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+
+  // Make sure no purges are executed in the foreground
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ sleeping_task_before.WakeUp();
+ sleeping_task_before.WaitUntilDone();
+
+ // Make sure all background purges are executed
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, PurgeDuringOpen) {
+ Options options = CurrentOptions();
+ CheckFileTypeCounts(dbname_, -1, 0, -1);
+ Close();
+ std::unique_ptr<WritableFile> file;
+ ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file,
+ EnvOptions()));
+ ASSERT_OK(file->Close());
+ CheckFileTypeCounts(dbname_, -1, 1, -1);
+ options.avoid_unnecessary_blocking_io = false;
+ options.create_if_missing = false;
+ Reopen(options);
+ CheckFileTypeCounts(dbname_, -1, 0, -1);
+ Close();
+
+ // test background purge
+ options.avoid_unnecessary_blocking_io = true;
+ options.create_if_missing = false;
+ ASSERT_OK(options.env->NewWritableFile(dbname_ + "/000002.sst", &file,
+ EnvOptions()));
+ ASSERT_OK(file->Close());
+ CheckFileTypeCounts(dbname_, -1, 1, -1);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DeleteFileTest::PurgeDuringOpen:1", "DBImpl::BGWorkPurge:start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ Reopen(options);
+  // The obsolete file is not deleted until the background purge job has run
+ CheckFileTypeCounts(dbname_, -1, 1, -1);
+ TEST_SYNC_POINT("DeleteFileTest::PurgeDuringOpen:1");
+ ASSERT_OK(dbfull()->TEST_WaitForPurge());
+ CheckFileTypeCounts(dbname_, -1, 0, -1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ auto do_test = [&](bool bg_purge) {
+ ColumnFamilyOptions co;
+ co.max_write_buffer_size_to_maintain =
+ static_cast<int64_t>(co.write_buffer_size);
+ WriteOptions wo;
+ FlushOptions fo;
+ ColumnFamilyHandle* cfh = nullptr;
+
+ ASSERT_OK(db_->CreateColumnFamily(co, "dropme", &cfh));
+
+ ASSERT_OK(db_->Put(wo, cfh, "pika", "chu"));
+ ASSERT_OK(db_->Flush(fo, cfh));
+ // Expect 1 sst file.
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ ASSERT_OK(db_->DropColumnFamily(cfh));
+ // Still 1 file, it won't be deleted while ColumnFamilyHandle is alive.
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+
+ delete cfh;
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+ // If background purge is enabled, the file should still be there.
+ CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1);
+ TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeCFDropTest:1");
+
+ // Execute background purges.
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // The file should have been deleted.
+ CheckFileTypeCounts(dbname_, 0, 0, 1);
+ };
+
+ {
+ SCOPED_TRACE("avoid_unnecessary_blocking_io = false");
+ do_test(false);
+ }
+
+ options.avoid_unnecessary_blocking_io = true;
+ options.create_if_missing = false;
+ Reopen(options);
+ ASSERT_OK(dbfull()->TEST_WaitForPurge());
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DeleteFileTest::BackgroundPurgeCFDropTest:1",
+ "DBImpl::BGWorkPurge:start"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ {
+ SCOPED_TRACE("avoid_unnecessary_blocking_io = true");
+ do_test(true);
+ }
+}
+
+// This test reproduces a bug where the iterator cleanup function read an
+// invalid (already destroyed) ReadOptions
+TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ Iterator* itr = nullptr;
+ CreateTwoLevels();
+ {
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ itr = db_->NewIterator(read_options);
+ ASSERT_OK(itr->status());
+    // The ReadOptions object goes out of scope here, but the iterator cleanup
+    // function should not be affected
+ }
+
+ ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+ // 3 sst after compaction with live iterator
+ CheckFileTypeCounts(dbname_, 0, 3, 1);
+ delete itr;
+
+ test::SleepingBackgroundTask sleeping_task_after;
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask,
+ &sleeping_task_after, Env::Priority::HIGH);
+
+ // Make sure all background purges are executed
+ sleeping_task_after.WakeUp();
+ sleeping_task_after.WaitUntilDone();
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, BackgroundPurgeTestMultipleJobs) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ std::string first("0"), last("999999");
+ CompactRangeOptions compact_options;
+ compact_options.change_level = true;
+ compact_options.target_level = 2;
+ Slice first_slice(first), last_slice(last);
+
+ // We keep an iterator alive
+ CreateTwoLevels();
+ ReadOptions read_options;
+ read_options.background_purge_on_iterator_cleanup = true;
+ Iterator* itr1 = db_->NewIterator(read_options);
+ ASSERT_OK(itr1->status());
+ CreateTwoLevels();
+ Iterator* itr2 = db_->NewIterator(read_options);
+ ASSERT_OK(itr2->status());
+ ASSERT_OK(db_->CompactRange(compact_options, &first_slice, &last_slice));
+ // 5 sst files after 2 compactions with 2 live iterators
+ CheckFileTypeCounts(dbname_, 0, 5, 1);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+  // ~DBImpl should wait until all BGWorkPurge jobs are finished
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::~DBImpl:WaitJob", "DBImpl::BGWorkPurge"},
+ {"DeleteFileTest::GuardFinish",
+ "DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ delete itr1;
+ env_->Schedule(&DeleteFileTest::DoSleep, this, Env::Priority::HIGH);
+ delete itr2;
+ env_->Schedule(&DeleteFileTest::GuardFinish, nullptr, Env::Priority::HIGH);
+ Close();
+
+ TEST_SYNC_POINT("DeleteFileTest::BackgroundPurgeTestMultipleJobs:DBClose");
+ // 1 sst after iterator deletion
+ CheckFileTypeCounts(dbname_, 0, 1, 1);
+}
+
+TEST_F(DeleteFileTest, DeleteFileWithIterator) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ CreateTwoLevels();
+ ReadOptions read_options;
+ Iterator* it = db_->NewIterator(read_options);
+ ASSERT_OK(it->status());
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+
+ std::string level2file;
+
+ ASSERT_EQ(metadata.size(), static_cast<size_t>(2));
+ if (metadata[0].level == 1) {
+ level2file = metadata[1].name;
+ } else {
+ level2file = metadata[0].name;
+ }
+
+ Status status = db_->DeleteFile(level2file);
+ fprintf(stdout, "Deletion status %s: %s\n", level2file.c_str(),
+ status.ToString().c_str());
+ ASSERT_OK(status);
+ it->SeekToFirst();
+ int numKeysIterated = 0;
+ while (it->Valid()) {
+ numKeysIterated++;
+ it->Next();
+ }
+ ASSERT_EQ(numKeysIterated, 50000);
+ delete it;
+}
+
+TEST_F(DeleteFileTest, DeleteLogFiles) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+
+ AddKeys(10, 0);
+ VectorLogPtr logfiles;
+ ASSERT_OK(db_->GetSortedWalFiles(logfiles));
+ ASSERT_GT(logfiles.size(), 0UL);
+ // Take the last log file which is expected to be alive and try to delete it
+ // Should not succeed because live logs are not allowed to be deleted
+ std::unique_ptr<LogFile> alive_log = std::move(logfiles.back());
+ ASSERT_EQ(alive_log->Type(), kAliveLogFile);
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+ fprintf(stdout, "Deleting alive log file %s\n",
+ alive_log->PathName().c_str());
+ ASSERT_NOK(db_->DeleteFile(alive_log->PathName()));
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + alive_log->PathName()));
+ logfiles.clear();
+
+  // Call Flush to bring about a new working log file and add more keys.
+  // Call Flush again to flush out the memtable and move the alive log to the
+  // archived log, then try to delete the archived log file.
+ FlushOptions fopts;
+ ASSERT_OK(db_->Flush(fopts));
+ AddKeys(10, 0);
+ ASSERT_OK(db_->Flush(fopts));
+ ASSERT_OK(db_->GetSortedWalFiles(logfiles));
+ ASSERT_GT(logfiles.size(), 0UL);
+ std::unique_ptr<LogFile> archived_log = std::move(logfiles.front());
+ ASSERT_EQ(archived_log->Type(), kArchivedLogFile);
+ ASSERT_OK(env_->FileExists(wal_dir_ + "/" + archived_log->PathName()));
+ fprintf(stdout, "Deleting archived log file %s\n",
+ archived_log->PathName().c_str());
+ ASSERT_OK(db_->DeleteFile(archived_log->PathName()));
+ ASSERT_TRUE(
+ env_->FileExists(wal_dir_ + "/" + archived_log->PathName()).IsNotFound());
+}
+
+TEST_F(DeleteFileTest, DeleteNonDefaultColumnFamily) {
+ Options options = CurrentOptions();
+ SetOptions(&options);
+ Destroy(options);
+ options.create_if_missing = true;
+ Reopen(options);
+ CreateAndReopenWithCF({"new_cf"}, options);
+
+ Random rnd(5);
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+ test::RandomKey(&rnd, 10)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+ for (int i = 0; i < 1000; ++i) {
+ ASSERT_OK(db_->Put(WriteOptions(), handles_[1], test::RandomKey(&rnd, 10),
+ test::RandomKey(&rnd, 10)));
+ }
+ ASSERT_OK(db_->Flush(FlushOptions(), handles_[1]));
+
+ std::vector<LiveFileMetaData> metadata;
+ db_->GetLiveFilesMetaData(&metadata);
+ ASSERT_EQ(2U, metadata.size());
+ ASSERT_EQ("new_cf", metadata[0].column_family_name);
+ ASSERT_EQ("new_cf", metadata[1].column_family_name);
+ auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno
+ ? metadata[0].name
+ : metadata[1].name;
+ auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno
+ ? metadata[0].name
+ : metadata[1].name;
+ ASSERT_TRUE(db_->DeleteFile(new_file).IsInvalidArgument());
+ ASSERT_OK(db_->DeleteFile(old_file));
+
+ {
+ std::unique_ptr<Iterator> itr(db_->NewIterator(ReadOptions(), handles_[1]));
+ ASSERT_OK(itr->status());
+ int count = 0;
+ for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+ ASSERT_OK(itr->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 1000);
+ }
+
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "new_cf"}, options);
+
+ {
+ std::unique_ptr<Iterator> itr(db_->NewIterator(ReadOptions(), handles_[1]));
+ int count = 0;
+ for (itr->SeekToFirst(); itr->Valid(); itr->Next()) {
+ ASSERT_OK(itr->status());
+ ++count;
+ }
+ ASSERT_EQ(count, 1000);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/error_handler.cc b/src/rocksdb/db/error_handler.cc
new file mode 100644
index 000000000..7f68bb026
--- /dev/null
+++ b/src/rocksdb/db/error_handler.cc
@@ -0,0 +1,819 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/error_handler.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/event_helpers.h"
+#include "file/sst_file_manager_impl.h"
+#include "logging/logging.h"
+#include "port/lang.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Maps to help decide the severity of an error based on the
+// BackgroundErrorReason, Code, SubCode and whether db_options.paranoid_checks
+// is set or not. There are 3 maps, going from most specific to least specific
+// (i.e. from all 4 fields in a tuple down to only the BackgroundErrorReason
+// and paranoid_checks). The less specific maps serve as a catch-all in case we
+// miss a specific error code or subcode.
+std::map<std::tuple<BackgroundErrorReason, Status::Code, Status::SubCode, bool>,
+ Status::Severity>
+ ErrorSeverityMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kSoftError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kSpaceLimit,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ false),
+ Status::Severity::kFatalError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kNoSpace, true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kNoSpace, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kSpaceLimit, true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kIOFenced, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ Status::SubCode::kIOFenced, false),
+ Status::Severity::kFatalError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ false),
+ Status::Severity::kFatalError},
+ // Errors during MANIFEST write
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ false),
+ Status::Severity::kFatalError},
+ // Errors during BG flush with WAL disabled
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, Status::SubCode::kSpaceLimit,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ false),
+ Status::Severity::kFatalError},
+ // Errors during MANIFEST write when WAL is disabled
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ true),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, Status::SubCode::kNoSpace,
+ false),
+ Status::Severity::kHardError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, Status::SubCode::kIOFenced,
+ false),
+ Status::Severity::kFatalError},
+
+};
+
+std::map<std::tuple<BackgroundErrorReason, Status::Code, bool>,
+ Status::Severity>
+ DefaultErrorSeverityMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kFlush,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, Status::Code::kIOError,
+ false),
+ Status::Severity::kNoError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWrite,
+ Status::Code::kIOError, false),
+ Status::Severity::kFatalError},
+ // Errors during BG flush with WAL disabled
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kCorruption, true),
+ Status::Severity::kUnrecoverableError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kCorruption, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlushNoWAL,
+ Status::Code::kIOError, false),
+ Status::Severity::kNoError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kManifestWriteNoWAL,
+ Status::Code::kIOError, false),
+ Status::Severity::kFatalError},
+};
+
+std::map<std::tuple<BackgroundErrorReason, bool>, Status::Severity>
+ DefaultReasonMap = {
+ // Errors during BG compaction
+ {std::make_tuple(BackgroundErrorReason::kCompaction, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kCompaction, false),
+ Status::Severity::kNoError},
+ // Errors during BG flush
+ {std::make_tuple(BackgroundErrorReason::kFlush, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kFlush, false),
+ Status::Severity::kNoError},
+ // Errors during Write
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kWriteCallback, false),
+ Status::Severity::kFatalError},
+ // Errors during Memtable update
+ {std::make_tuple(BackgroundErrorReason::kMemTable, true),
+ Status::Severity::kFatalError},
+ {std::make_tuple(BackgroundErrorReason::kMemTable, false),
+ Status::Severity::kFatalError},
+};
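+
+// A minimal sketch of the lookup cascade implied by the three maps above.
+// This is an illustration only (LookupSeverity is a hypothetical helper, not
+// part of this file); HandleKnownErrors() below performs the equivalent
+// lookups inline, defaulting to kFatalError when no map has an entry.
+//
+//   Status::Severity LookupSeverity(BackgroundErrorReason reason,
+//                                   const Status& s, bool paranoid) {
+//     auto it = ErrorSeverityMap.find(
+//         std::make_tuple(reason, s.code(), s.subcode(), paranoid));
+//     if (it != ErrorSeverityMap.end()) return it->second;
+//     auto def = DefaultErrorSeverityMap.find(
+//         std::make_tuple(reason, s.code(), paranoid));
+//     if (def != DefaultErrorSeverityMap.end()) return def->second;
+//     auto by_reason = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
+//     return by_reason != DefaultReasonMap.end()
+//                ? by_reason->second
+//                : Status::Severity::kFatalError;
+//   }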
+
+void ErrorHandler::CancelErrorRecovery() {
+#ifndef ROCKSDB_LITE
+ db_mutex_->AssertHeld();
+
+ // We'll release the lock before calling sfm, so make sure no new
+ // recovery gets scheduled at that point
+ auto_recovery_ = false;
+ SstFileManagerImpl* sfm =
+ reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+ if (sfm) {
+ // This may or may not cancel a pending recovery
+ db_mutex_->Unlock();
+ bool cancelled = sfm->CancelErrorRecovery(this);
+ db_mutex_->Lock();
+ if (cancelled) {
+ recovery_in_prog_ = false;
+ }
+ }
+
+  // If auto recovery is also running to resume from the retryable error,
+  // we should wait for it to finish and end the auto recovery.
+ EndAutoRecovery();
+#endif
+}
+
+STATIC_AVOID_DESTRUCTION(const Status, kOkStatus){Status::OK()};
+
+// This is the main function for looking at an error during a background
+// operation and deciding the severity and the error recovery strategy. The
+// high level algorithm is as follows -
+// 1. Classify the severity of the error based on the ErrorSeverityMap,
+// DefaultErrorSeverityMap and DefaultReasonMap defined earlier
+// 2. Call a Status code specific override function to adjust the severity
+// if needed. The reason for this is that our ability to recover may depend
+// on the exact options enabled in DBOptions
+// 3. Determine if auto recovery is possible. A listener notification callback
+// is called, which can disable the auto recovery even if we decide it is
+// feasible
+// 4. For Status::NoSpace() errors, rely on SstFileManagerImpl to control
+// the actual recovery. If no sst file manager is specified in DBOptions,
+// a default one is allocated during DB::Open(), so there will always be
+// one.
+// This can also get called as part of a recovery operation. In that case, we
+// also track the error separately in recovery_error_ so we can tell at the
+// end whether recovery succeeded or not.
+const Status& ErrorHandler::HandleKnownErrors(const Status& bg_err,
+ BackgroundErrorReason reason) {
+ db_mutex_->AssertHeld();
+ if (bg_err.ok()) {
+ return kOkStatus;
+ }
+
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+ }
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "ErrorHandler: Set regular background error\n");
+
+ bool paranoid = db_options_.paranoid_checks;
+ Status::Severity sev = Status::Severity::kFatalError;
+ Status new_bg_err;
+ DBRecoverContext context;
+ bool found = false;
+
+ {
+ auto entry = ErrorSeverityMap.find(
+ std::make_tuple(reason, bg_err.code(), bg_err.subcode(), paranoid));
+ if (entry != ErrorSeverityMap.end()) {
+ sev = entry->second;
+ found = true;
+ }
+ }
+
+ if (!found) {
+ auto entry = DefaultErrorSeverityMap.find(
+ std::make_tuple(reason, bg_err.code(), paranoid));
+ if (entry != DefaultErrorSeverityMap.end()) {
+ sev = entry->second;
+ found = true;
+ }
+ }
+
+ if (!found) {
+ auto entry = DefaultReasonMap.find(std::make_tuple(reason, paranoid));
+ if (entry != DefaultReasonMap.end()) {
+ sev = entry->second;
+ }
+ }
+
+ new_bg_err = Status(bg_err, sev);
+
+ // Check if recovery is currently in progress. If it is, we will save this
+ // error so we can check it at the end to see if recovery succeeded or not
+ if (recovery_in_prog_ && recovery_error_.ok()) {
+ recovery_error_ = new_bg_err;
+ }
+
+ bool auto_recovery = auto_recovery_;
+ if (new_bg_err.severity() >= Status::Severity::kFatalError && auto_recovery) {
+ auto_recovery = false;
+ }
+
+ // Allow some error specific overrides
+ if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
+ new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
+ new_bg_err = OverrideNoSpaceError(new_bg_err, &auto_recovery);
+ }
+
+ if (!new_bg_err.ok()) {
+ Status s = new_bg_err;
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason, &s,
+ db_mutex_, &auto_recovery);
+ if (!s.ok() && (s.severity() > bg_error_.severity())) {
+ bg_error_ = s;
+ } else {
+      // This error is less severe than a previously encountered error. Don't
+      // take any further action
+ return bg_error_;
+ }
+ }
+
+ recover_context_ = context;
+ if (auto_recovery) {
+ recovery_in_prog_ = true;
+
+ // Kick-off error specific recovery
+ if (new_bg_err.subcode() == IOStatus::SubCode::kNoSpace ||
+ new_bg_err.subcode() == IOStatus::SubCode::kSpaceLimit) {
+ RecoverFromNoSpace();
+ }
+ }
+ if (bg_error_.severity() >= Status::Severity::kHardError) {
+ is_db_stopped_.store(true, std::memory_order_release);
+ }
+ return bg_error_;
+}
+
+// This is the main function for looking at an IO related error during
+// background operations. The main logic is:
+// Note: a file scope IO error is treated as a retryable IO error in the write
+// path. In RocksDB, if a file has a write IO error at file scope, RocksDB
+// never writes to the same file again. RocksDB will create a new file and
+// rewrite the whole content. Thus, it is retryable.
+// 1) If the error is caused by data loss, the error is mapped to an
+// unrecoverable error. The application/user must take action to handle
+// this situation (the file scope case is excluded).
+// 2) If the error is a retryable IO error (i.e., it is a file scope IO error,
+// or its retryable flag is set and it is not a data loss error), auto resume
+// will be called and the auto resume can be controlled by the resume count
+// and resume interval options. There are three sub-cases:
+// a) If the error happens during compaction, it is mapped to a soft error.
+// The compaction thread will reschedule a new compaction.
+// b) If the error happens during flush and the WAL is empty, it is mapped
+// to a soft error. Note that this includes the case where the IO error
+// happens in the SST or manifest write during flush.
+// c) All other errors are mapped to hard error.
+// 3) For other cases, HandleKnownErrors(const Status& bg_err,
+// BackgroundErrorReason reason) is called to handle them.
+const Status& ErrorHandler::SetBGError(const Status& bg_status,
+ BackgroundErrorReason reason) {
+ db_mutex_->AssertHeld();
+ Status tmp_status = bg_status;
+ IOStatus bg_io_err = status_to_io_status(std::move(tmp_status));
+
+ if (bg_io_err.ok()) {
+ return kOkStatus;
+ }
+ ROCKS_LOG_WARN(db_options_.info_log, "Background IO error %s",
+ bg_io_err.ToString().c_str());
+
+ if (recovery_in_prog_ && recovery_io_error_.ok()) {
+ recovery_io_error_ = bg_io_err;
+ }
+ if (BackgroundErrorReason::kManifestWrite == reason ||
+ BackgroundErrorReason::kManifestWriteNoWAL == reason) {
+ // Always returns ok
+ ROCKS_LOG_INFO(db_options_.info_log, "Disabling File Deletions");
+ db_->DisableFileDeletionsWithLock().PermitUncheckedError();
+ }
+
+ Status new_bg_io_err = bg_io_err;
+ DBRecoverContext context;
+ if (bg_io_err.GetScope() != IOStatus::IOErrorScope::kIOErrorScopeFile &&
+ bg_io_err.GetDataLoss()) {
+    // First, data loss (non file scope) is treated as an unrecoverable error,
+    // so it can directly overwrite any existing bg_error_.
+ bool auto_recovery = false;
+ Status bg_err(new_bg_io_err, Status::Severity::kUnrecoverableError);
+ CheckAndSetRecoveryAndBGError(bg_err);
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+ }
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "ErrorHandler: Set background IO error as unrecoverable error\n");
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
+ &bg_err, db_mutex_, &auto_recovery);
+ recover_context_ = context;
+ return bg_error_;
+ } else if (bg_io_err.subcode() != IOStatus::SubCode::kNoSpace &&
+ (bg_io_err.GetScope() ==
+ IOStatus::IOErrorScope::kIOErrorScopeFile ||
+ bg_io_err.GetRetryable())) {
+    // Second, check if the error is a retryable IO error (a file scope IO
+    // error is also treated as a retryable IO error in the RocksDB write
+    // path). If it is a retryable error and its severity is higher than
+    // bg_error_, overwrite bg_error_ with the new error. At the current stage,
+    // a retryable IO error from compaction is treated as a soft error; in
+    // other cases, the retryable IO error is treated as a hard error. Note
+    // that all NoSpace errors should be handled by
+    // SstFileManager::StartErrorRecovery(); therefore, whether they are
+    // retryable or file scope, this logic is bypassed for them.
+ bool auto_recovery = false;
+ EventHelpers::NotifyOnBackgroundError(db_options_.listeners, reason,
+ &new_bg_io_err, db_mutex_,
+ &auto_recovery);
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_ERROR_COUNT);
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+ RecordTick(bg_error_stats_.get(),
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT);
+ }
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "ErrorHandler: Set background retryable IO error\n");
+ if (BackgroundErrorReason::kCompaction == reason) {
+      // We map the retryable IO error during compaction to a soft error, since
+      // compaction can reschedule itself. We will not set the BG error in
+      // this case.
+      // TODO: find a better way to set or clear the retryable IO error which
+      // happens during the compaction SST file write.
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
+ }
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "ErrorHandler: Compaction will schedule by itself to resume\n");
+ return bg_error_;
+ } else if (BackgroundErrorReason::kFlushNoWAL == reason ||
+ BackgroundErrorReason::kManifestWriteNoWAL == reason) {
+      // When the BG retryable IO error reason is flush without WAL,
+      // we map it to a soft error. At the same time, all background work
+      // should be stopped except the BG work from recovery. Therefore, we
+      // set soft_error_no_bg_work_ to true. Also, since the DB continues to
+      // receive writes while the BG error is a soft error, to avoid too many
+      // small memtables being generated during auto resume, the flush reason
+      // is set to kErrorRecoveryRetryFlush.
+ Status bg_err(new_bg_io_err, Status::Severity::kSoftError);
+ CheckAndSetRecoveryAndBGError(bg_err);
+ soft_error_no_bg_work_ = true;
+ context.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
+ recover_context_ = context;
+ return StartRecoverFromRetryableBGIOError(bg_io_err);
+ } else {
+ Status bg_err(new_bg_io_err, Status::Severity::kHardError);
+ CheckAndSetRecoveryAndBGError(bg_err);
+ recover_context_ = context;
+ return StartRecoverFromRetryableBGIOError(bg_io_err);
+ }
+ } else {
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_BG_IO_ERROR_COUNT);
+ }
+ // HandleKnownErrors() will use recovery_error_, so ignore
+ // recovery_io_error_.
+ // TODO: Do some refactoring and use only one recovery_error_
+ recovery_io_error_.PermitUncheckedError();
+ return HandleKnownErrors(new_bg_io_err, reason);
+ }
+}
+
+Status ErrorHandler::OverrideNoSpaceError(const Status& bg_error,
+ bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+ if (bg_error.severity() >= Status::Severity::kFatalError) {
+ return bg_error;
+ }
+
+ if (db_options_.sst_file_manager.get() == nullptr) {
+    // We rely on SFM to poll for enough disk space and recover; without one,
+    // auto recovery is not possible.
+ *auto_recovery = false;
+ return bg_error;
+ }
+
+ if (db_options_.allow_2pc &&
+ (bg_error.severity() <= Status::Severity::kSoftError)) {
+ // Don't know how to recover, as the contents of the current WAL file may
+ // be inconsistent, and it may be needed for 2PC. If 2PC is not enabled,
+ // we can just flush the memtable and discard the log
+ *auto_recovery = false;
+ return Status(bg_error, Status::Severity::kFatalError);
+ }
+
+ {
+ uint64_t free_space;
+ if (db_options_.env->GetFreeSpace(db_options_.db_paths[0].path,
+ &free_space) == Status::NotSupported()) {
+ *auto_recovery = false;
+ }
+ }
+
+ return bg_error;
+#else
+ (void)auto_recovery;
+ return Status(bg_error, Status::Severity::kFatalError);
+#endif
+}
+
+void ErrorHandler::RecoverFromNoSpace() {
+#ifndef ROCKSDB_LITE
+ SstFileManagerImpl* sfm =
+ reinterpret_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
+
+ // Inform SFM of the error, so it can kick-off the recovery
+ if (sfm) {
+ sfm->StartErrorRecovery(this, bg_error_);
+ }
+#endif
+}
+
+Status ErrorHandler::ClearBGError() {
+#ifndef ROCKSDB_LITE
+ db_mutex_->AssertHeld();
+
+ // Signal that recovery succeeded
+ if (recovery_error_.ok()) {
+ Status old_bg_error = bg_error_;
+ // old_bg_error is only for notifying listeners, so may not be checked
+ old_bg_error.PermitUncheckedError();
+ // Clear and check the recovery IO and BG error
+ bg_error_ = Status::OK();
+ recovery_io_error_ = IOStatus::OK();
+ bg_error_.PermitUncheckedError();
+ recovery_io_error_.PermitUncheckedError();
+ recovery_in_prog_ = false;
+ soft_error_no_bg_work_ = false;
+ EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, old_bg_error,
+ bg_error_, db_mutex_);
+ }
+ return recovery_error_;
+#else
+ return bg_error_;
+#endif
+}
+
+Status ErrorHandler::RecoverFromBGError(bool is_manual) {
+#ifndef ROCKSDB_LITE
+ InstrumentedMutexLock l(db_mutex_);
+ bool no_bg_work_original_flag = soft_error_no_bg_work_;
+ if (is_manual) {
+    // If it is a manual recovery and there's a background recovery in
+    // progress, return a busy status.
+ if (recovery_in_prog_) {
+ return Status::Busy();
+ }
+ recovery_in_prog_ = true;
+
+    // In manual resume, we allow the bg work to run. If it is an auto resume,
+    // the bg work should respect this flag.
+ soft_error_no_bg_work_ = false;
+
+    // In manual resume, if the bg error is a soft error that also requires
+    // no bg work, the error must be recovered by calling flush with
+    // flush reason kErrorRecoveryRetryFlush. Otherwise, the flush
+    // reason is set to kErrorRecovery.
+ if (no_bg_work_original_flag) {
+ recover_context_.flush_reason = FlushReason::kErrorRecoveryRetryFlush;
+ } else {
+ recover_context_.flush_reason = FlushReason::kErrorRecovery;
+ }
+ }
+
+ if (bg_error_.severity() == Status::Severity::kSoftError &&
+ recover_context_.flush_reason == FlushReason::kErrorRecovery) {
+ // Simply clear the background error and return
+ recovery_error_ = Status::OK();
+ return ClearBGError();
+ }
+
+ // Reset recovery_error_. We will use this to record any errors that happen
+ // during the recovery process. While recovering, the only operations that
+ // can generate background errors should be the flush operations
+ recovery_error_ = Status::OK();
+ recovery_error_.PermitUncheckedError();
+ Status s = db_->ResumeImpl(recover_context_);
+ if (s.ok()) {
+ soft_error_no_bg_work_ = false;
+ } else {
+ soft_error_no_bg_work_ = no_bg_work_original_flag;
+ }
+
+  // For manual recovery, shutdown, and fatal error cases, set
+  // recovery_in_prog_ to false. For automatic background recovery, leave it
+  // as is regardless of success or failure, as it will be retried.
+ if (is_manual || s.IsShutdownInProgress() ||
+ bg_error_.severity() >= Status::Severity::kFatalError) {
+ recovery_in_prog_ = false;
+ }
+ return s;
+#else
+ (void)is_manual;
+ return bg_error_;
+#endif
+}
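+
+// From an application's point of view, the manual path above is typically
+// reached via DB::Resume() (see dbfull()->Resume() in the tests); exactly how
+// DBImpl forwards that call is outside this file, so the snippet below is
+// only a hedged usage sketch.
+//
+//   Status s = db->Resume();   // attempt a manual recovery from bg_error_
+//   if (s.IsBusy()) {
+//     // a background recovery is already in progress (see the check above)
+//   }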
+
+const Status& ErrorHandler::StartRecoverFromRetryableBGIOError(
+ const IOStatus& io_error) {
+#ifndef ROCKSDB_LITE
+ db_mutex_->AssertHeld();
+ if (bg_error_.ok()) {
+ return bg_error_;
+ } else if (io_error.ok()) {
+ return kOkStatus;
+ } else if (db_options_.max_bgerror_resume_count <= 0 || recovery_in_prog_) {
+    // Auto resume is not enabled, or a recovery is already in progress;
+    // directly return bg_error_.
+ return bg_error_;
+ }
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(), ERROR_HANDLER_AUTORESUME_COUNT);
+ }
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "ErrorHandler: Call StartRecoverFromRetryableBGIOError to resume\n");
+ if (recovery_thread_) {
+    // In this case, if recovery_in_prog_ is false, the current thread should
+    // wait for the previous recovery thread to finish and create a new thread
+    // to recover from the bg error.
+ db_mutex_->Unlock();
+ recovery_thread_->join();
+ db_mutex_->Lock();
+ }
+
+ recovery_in_prog_ = true;
+ recovery_thread_.reset(
+ new port::Thread(&ErrorHandler::RecoverFromRetryableBGIOError, this));
+
+ if (recovery_io_error_.ok() && recovery_error_.ok()) {
+ return recovery_error_;
+ } else {
+ return bg_error_;
+ }
+#else
+ (void)io_error;
+ return bg_error_;
+#endif
+}
+
+// Automatic recovery from a retryable BG IO error. Must be called after the
+// db mutex is released.
+void ErrorHandler::RecoverFromRetryableBGIOError() {
+#ifndef ROCKSDB_LITE
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeStart");
+ InstrumentedMutexLock l(db_mutex_);
+ if (end_recovery_) {
+ EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+ Status::ShutdownInProgress(),
+ db_mutex_);
+ return;
+ }
+ DBRecoverContext context = recover_context_;
+ int resume_count = db_options_.max_bgerror_resume_count;
+ uint64_t wait_interval = db_options_.bgerror_resume_retry_interval;
+ uint64_t retry_count = 0;
+  // Recover from the retryable error in the loop below. This function runs
+  // on the dedicated thread created by StartRecoverFromRetryableBGIOError().
+ while (resume_count > 0) {
+ if (end_recovery_) {
+ EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+ Status::ShutdownInProgress(),
+ db_mutex_);
+ return;
+ }
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume0");
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeResume1");
+ recovery_io_error_ = IOStatus::OK();
+ recovery_error_ = Status::OK();
+ retry_count++;
+ Status s = db_->ResumeImpl(context);
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT);
+ }
+ if (s.IsShutdownInProgress() ||
+ bg_error_.severity() >= Status::Severity::kFatalError) {
+      // If DB shutdown is in progress or the error severity is higher than
+      // hard error, stop auto resume and return.
+ recovery_in_prog_ = false;
+ if (bg_error_stats_ != nullptr) {
+ RecordInHistogram(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+ }
+ EventHelpers::NotifyOnErrorRecoveryEnd(db_options_.listeners, bg_error_,
+ bg_error_, db_mutex_);
+ return;
+ }
+ if (!recovery_io_error_.ok() &&
+ recovery_error_.severity() <= Status::Severity::kHardError &&
+ recovery_io_error_.GetRetryable()) {
+      // If a new BG IO error happens during auto recovery and it is retryable
+      // and its severity is hard error or lower, auto resume sleeps for
+      // a period of time and retries if allowed.
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait0");
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:BeforeWait1");
+ int64_t wait_until = db_options_.clock->NowMicros() + wait_interval;
+ cv_.TimedWait(wait_until);
+ } else {
+      // There are three possibilities: 1) recovery_io_error_ is set during
+      // resume and the error is not retryable, 2) recovery is successful, 3)
+      // another error happens during resume and cannot be resumed here.
+ if (recovery_io_error_.ok() && recovery_error_.ok() && s.ok()) {
+        // Recovered from the retryable IO error with no other BG errors. Clear
+        // bg_error_ and notify the user.
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:RecoverSuccess");
+ Status old_bg_error = bg_error_;
+ is_db_stopped_.store(false, std::memory_order_release);
+ bg_error_ = Status::OK();
+ bg_error_.PermitUncheckedError();
+ EventHelpers::NotifyOnErrorRecoveryEnd(
+ db_options_.listeners, old_bg_error, bg_error_, db_mutex_);
+ if (bg_error_stats_ != nullptr) {
+ RecordTick(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT);
+ RecordInHistogram(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+ }
+ recovery_in_prog_ = false;
+ if (soft_error_no_bg_work_) {
+ soft_error_no_bg_work_ = false;
+ }
+ return;
+ } else {
+        // In this case, either 1) recovery_io_error_ is more serious or not
+        // retryable, or 2) another non-IO recovery_error_ happens. The auto
+        // recovery stops.
+ recovery_in_prog_ = false;
+ if (bg_error_stats_ != nullptr) {
+ RecordInHistogram(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+ }
+ EventHelpers::NotifyOnErrorRecoveryEnd(
+ db_options_.listeners, bg_error_,
+ !recovery_io_error_.ok()
+ ? recovery_io_error_
+ : (!recovery_error_.ok() ? recovery_error_ : s),
+ db_mutex_);
+ return;
+ }
+ }
+ resume_count--;
+ }
+ recovery_in_prog_ = false;
+ EventHelpers::NotifyOnErrorRecoveryEnd(
+ db_options_.listeners, bg_error_,
+ Status::Aborted("Exceeded resume retry count"), db_mutex_);
+ TEST_SYNC_POINT("RecoverFromRetryableBGIOError:LoopOut");
+ if (bg_error_stats_ != nullptr) {
+ RecordInHistogram(bg_error_stats_.get(),
+ ERROR_HANDLER_AUTORESUME_RETRY_COUNT, retry_count);
+ }
+ return;
+#else
+ return;
+#endif
+}
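+
+// The retry loop above is bounded by two DBOptions. A small configuration
+// sketch (the values mirror those used in error_handler_fs_test.cc and are
+// illustrative rather than recommendations):
+//
+//   Options options;
+//   options.max_bgerror_resume_count = 2;            // at most 2 auto-resume attempts
+//   options.bgerror_resume_retry_interval = 100000;  // wait 0.1 second between attempts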
+
+void ErrorHandler::CheckAndSetRecoveryAndBGError(const Status& bg_err) {
+ if (recovery_in_prog_ && recovery_error_.ok()) {
+ recovery_error_ = bg_err;
+ }
+ if (bg_err.severity() > bg_error_.severity()) {
+ bg_error_ = bg_err;
+ }
+ if (bg_error_.severity() >= Status::Severity::kHardError) {
+ is_db_stopped_.store(true, std::memory_order_release);
+ }
+ return;
+}
+
+void ErrorHandler::EndAutoRecovery() {
+ db_mutex_->AssertHeld();
+ if (!end_recovery_) {
+ end_recovery_ = true;
+ }
+ cv_.SignalAll();
+ db_mutex_->Unlock();
+ if (recovery_thread_) {
+ recovery_thread_->join();
+ }
+ db_mutex_->Lock();
+ return;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler.h b/src/rocksdb/db/error_handler.h
new file mode 100644
index 000000000..34e08a525
--- /dev/null
+++ b/src/rocksdb/db/error_handler.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+
+// This structure is used to store the DB recovery context. The context is
+// the information related to the recovery actions. For example, it contains
+// FlushReason, which tells the flush job why the flush is being called.
+struct DBRecoverContext {
+ FlushReason flush_reason;
+
+ DBRecoverContext() : flush_reason(FlushReason::kErrorRecovery) {}
+
+ DBRecoverContext(FlushReason reason) : flush_reason(reason) {}
+};
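+
+// A minimal usage sketch (illustrative only): the context simply carries the
+// flush reason that a recovery-triggered flush should report.
+//
+//   DBRecoverContext ctx;  // defaults to FlushReason::kErrorRecovery
+//   ctx.flush_reason = FlushReason::kErrorRecoveryRetryFlush;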
+
+class ErrorHandler {
+ public:
+ ErrorHandler(DBImpl* db, const ImmutableDBOptions& db_options,
+ InstrumentedMutex* db_mutex)
+ : db_(db),
+ db_options_(db_options),
+ cv_(db_mutex),
+ end_recovery_(false),
+ recovery_thread_(nullptr),
+ db_mutex_(db_mutex),
+ auto_recovery_(false),
+ recovery_in_prog_(false),
+ soft_error_no_bg_work_(false),
+ is_db_stopped_(false),
+ bg_error_stats_(db_options.statistics) {
+ // Clear the checked flag for uninitialized errors
+ bg_error_.PermitUncheckedError();
+ recovery_error_.PermitUncheckedError();
+ recovery_io_error_.PermitUncheckedError();
+ }
+
+ void EnableAutoRecovery() { auto_recovery_ = true; }
+
+ Status::Severity GetErrorSeverity(BackgroundErrorReason reason,
+ Status::Code code, Status::SubCode subcode);
+
+ const Status& SetBGError(const Status& bg_err, BackgroundErrorReason reason);
+
+ Status GetBGError() const { return bg_error_; }
+
+ Status GetRecoveryError() const { return recovery_error_; }
+
+ Status ClearBGError();
+
+ bool IsDBStopped() { return is_db_stopped_.load(std::memory_order_acquire); }
+
+ bool IsBGWorkStopped() {
+ assert(db_mutex_);
+ db_mutex_->AssertHeld();
+ return !bg_error_.ok() &&
+ (bg_error_.severity() >= Status::Severity::kHardError ||
+ !auto_recovery_ || soft_error_no_bg_work_);
+ }
+
+ bool IsSoftErrorNoBGWork() { return soft_error_no_bg_work_; }
+
+ bool IsRecoveryInProgress() { return recovery_in_prog_; }
+
+ Status RecoverFromBGError(bool is_manual = false);
+ void CancelErrorRecovery();
+
+ void EndAutoRecovery();
+
+ private:
+ DBImpl* db_;
+ const ImmutableDBOptions& db_options_;
+ Status bg_error_;
+  // A separate Status variable used to record any errors that occur
+  // during the process of recovering from hard errors
+ Status recovery_error_;
+ // A separate IO Status variable used to record any IO errors during
+ // the recovery process. At the same time, recovery_error_ is also set.
+ IOStatus recovery_io_error_;
+  // The condition variable used with db_mutex_ during auto resume for
+  // timed waits.
+ InstrumentedCondVar cv_;
+ bool end_recovery_;
+ std::unique_ptr<port::Thread> recovery_thread_;
+
+ InstrumentedMutex* db_mutex_;
+ // A flag indicating whether automatic recovery from errors is enabled
+ bool auto_recovery_;
+ bool recovery_in_prog_;
+  // A flag to indicate that, for a soft error, we should not allow any
+  // background work except work that is part of recovery.
+ bool soft_error_no_bg_work_;
+
+  // Used to store the recovery context, such as the flush reason.
+ DBRecoverContext recover_context_;
+ std::atomic<bool> is_db_stopped_;
+
+  // Pointer to the DB statistics.
+ std::shared_ptr<Statistics> bg_error_stats_;
+
+ const Status& HandleKnownErrors(const Status& bg_err,
+ BackgroundErrorReason reason);
+ Status OverrideNoSpaceError(const Status& bg_error, bool* auto_recovery);
+ void RecoverFromNoSpace();
+ const Status& StartRecoverFromRetryableBGIOError(const IOStatus& io_error);
+ void RecoverFromRetryableBGIOError();
+  // First, if recovery is in progress and recovery_error_ is ok, set
+  // recovery_error_ to bg_err. Second, if the severity of bg_err is higher
+  // than that of the current bg_error_, overwrite bg_error_ with it.
+ void CheckAndSetRecoveryAndBGError(const Status& bg_err);
+};
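+
+// A rough sketch of how DBImpl is expected to drive this class. The call
+// sites below are hypothetical (the real wiring lives in db_impl*.cc, not
+// shown here); they only illustrate the public surface declared above.
+//
+//   // A background job hits an error while holding the DB mutex:
+//   error_handler_.SetBGError(io_status, BackgroundErrorReason::kFlush);
+//
+//   // Scheduling checks before starting new background work or writes:
+//   if (error_handler_.IsDBStopped() || error_handler_.IsBGWorkStopped()) {
+//     // back off until recovery clears bg_error_
+//   }
+//
+//   // Manual resume path:
+//   Status s = error_handler_.RecoverFromBGError(/*is_manual=*/true);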
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/error_handler_fs_test.cc b/src/rocksdb/db/error_handler_fs_test.cc
new file mode 100644
index 000000000..153f3b79e
--- /dev/null
+++ b/src/rocksdb/db/error_handler_fs_test.cc
@@ -0,0 +1,2875 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#ifndef ROCKSDB_LITE
+
+#include "db/db_test_util.h"
+#include "file/sst_file_manager_impl.h"
+#include "port/stack_trace.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/sst_file_manager.h"
+#if !defined(ROCKSDB_LITE)
+#include "test_util/sync_point.h"
+#endif
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+#include "utilities/fault_injection_fs.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBErrorHandlingFSTest : public DBTestBase {
+ public:
+ DBErrorHandlingFSTest()
+ : DBTestBase("db_error_handling_fs_test", /*env_do_fsync=*/true) {
+ fault_fs_.reset(new FaultInjectionTestFS(env_->GetFileSystem()));
+ fault_env_.reset(new CompositeEnvWrapper(env_, fault_fs_));
+ }
+
+ std::string GetManifestNameFromLiveFiles() {
+ std::vector<std::string> live_files;
+ uint64_t manifest_size;
+
+ Status s = dbfull()->GetLiveFiles(live_files, &manifest_size, false);
+ if (!s.ok()) {
+ return "";
+ }
+ for (auto& file : live_files) {
+ uint64_t num = 0;
+ FileType type;
+ if (ParseFileName(file, &num, &type) && type == kDescriptorFile) {
+ return file;
+ }
+ }
+ return "";
+ }
+
+ std::shared_ptr<FaultInjectionTestFS> fault_fs_;
+ std::unique_ptr<Env> fault_env_;
+};
+
+class ErrorHandlerFSListener : public EventListener {
+ public:
+ ErrorHandlerFSListener()
+ : mutex_(),
+ cv_(&mutex_),
+ no_auto_recovery_(false),
+ recovery_complete_(false),
+ file_creation_started_(false),
+ override_bg_error_(false),
+ file_count_(0),
+ fault_fs_(nullptr) {}
+ ~ErrorHandlerFSListener() {
+ file_creation_error_.PermitUncheckedError();
+ bg_error_.PermitUncheckedError();
+ new_bg_error_.PermitUncheckedError();
+ }
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& /*ti*/) override {
+ InstrumentedMutexLock l(&mutex_);
+ file_creation_started_ = true;
+ if (file_count_ > 0) {
+ if (--file_count_ == 0) {
+ fault_fs_->SetFilesystemActive(false, file_creation_error_);
+ file_creation_error_ = IOStatus::OK();
+ }
+ }
+ cv_.SignalAll();
+ }
+
+ void OnErrorRecoveryBegin(BackgroundErrorReason /*reason*/, Status bg_error,
+ bool* auto_recovery) override {
+ bg_error.PermitUncheckedError();
+ if (*auto_recovery && no_auto_recovery_) {
+ *auto_recovery = false;
+ }
+ }
+
+ void OnErrorRecoveryEnd(const BackgroundErrorRecoveryInfo& info) override {
+ InstrumentedMutexLock l(&mutex_);
+ recovery_complete_ = true;
+ cv_.SignalAll();
+ new_bg_error_ = info.new_bg_error;
+ }
+
+ bool WaitForRecovery(uint64_t /*abs_time_us*/) {
+ InstrumentedMutexLock l(&mutex_);
+ while (!recovery_complete_) {
+ cv_.Wait(/*abs_time_us*/);
+ }
+ if (recovery_complete_) {
+ recovery_complete_ = false;
+ return true;
+ }
+ return false;
+ }
+
+ void WaitForTableFileCreationStarted(uint64_t /*abs_time_us*/) {
+ InstrumentedMutexLock l(&mutex_);
+ while (!file_creation_started_) {
+ cv_.Wait(/*abs_time_us*/);
+ }
+ file_creation_started_ = false;
+ }
+
+ void OnBackgroundError(BackgroundErrorReason /*reason*/,
+ Status* bg_error) override {
+ if (override_bg_error_) {
+ *bg_error = bg_error_;
+ override_bg_error_ = false;
+ }
+ }
+
+ void EnableAutoRecovery(bool enable = true) { no_auto_recovery_ = !enable; }
+
+ void OverrideBGError(Status bg_err) {
+ bg_error_ = bg_err;
+ override_bg_error_ = true;
+ }
+
+ void InjectFileCreationError(FaultInjectionTestFS* fs, int file_count,
+ IOStatus io_s) {
+ fault_fs_ = fs;
+ file_count_ = file_count;
+ file_creation_error_ = io_s;
+ }
+
+ Status new_bg_error() { return new_bg_error_; }
+
+ private:
+ InstrumentedMutex mutex_;
+ InstrumentedCondVar cv_;
+ bool no_auto_recovery_;
+ bool recovery_complete_;
+ bool file_creation_started_;
+ bool override_bg_error_;
+ int file_count_;
+ IOStatus file_creation_error_;
+ Status bg_error_;
+ Status new_bg_error_;
+ FaultInjectionTestFS* fault_fs_;
+};
+
+TEST_F(DBErrorHandlingFSTest, FLushWriteError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ Destroy(options);
+}
+
+// All NoSpace IOErrors are handled as regular BG errors regardless of whether
+// the retryable flag is set or not, so auto resume for retryable IO errors
+// will not be triggered. Also, the error is mapped to a hard error.
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoSpaceError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::NoSpace("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(1), "val1"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(1), "val1"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+ Reopen(options);
+ ASSERT_EQ("val1", Get(Key(1)));
+
+ ASSERT_OK(Put(Key(2), "val2"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeSyncTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val2", Get(Key(2)));
+
+ ASSERT_OK(Put(Key(3), "val3"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeCloseTableFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWriteFileScopeError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error");
+ error_msg.SetDataLoss(true);
+ error_msg.SetScope(
+ ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile);
+ error_msg.SetRetryable(false);
+
+ ASSERT_OK(Put(Key(1), "val1"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val1", Get(Key(1)));
+
+ ASSERT_OK(Put(Key(2), "val2"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeSyncTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val2", Get(Key(2)));
+
+ ASSERT_OK(Put(Key(3), "val3"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeCloseTableFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+  // Not file scope, but the retryable flag is set
+ error_msg.SetDataLoss(false);
+ error_msg.SetScope(
+ ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFileSystem);
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(3), "val3"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeCloseTableFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Reopen(options);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWALWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ listener->EnableAutoRecovery(false);
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SyncClosedLogs:Start",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = false;
+ ASSERT_OK(Put(Key(1), "val1", wo));
+
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
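+  // Drop the extra column family before calling Resume(); the drop status is
+  // not checked.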
+ auto cfh = dbfull()->GetColumnFamilyHandle(1);
+ s = dbfull()->DropColumnFamily(cfh);
+
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ ASSERT_EQ("val3", Get(Key(3)));
+ s = Flush();
+ ASSERT_OK(s);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWALAtomicWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ options.atomic_flush = true;
+ Status s;
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ listener->EnableAutoRecovery(false);
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::SyncClosedLogs:Start",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateAndReopenWithCF({"pikachu, sdfsdfsdf"}, options);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = false;
+ ASSERT_OK(Put(Key(1), "val1", wo));
+
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
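+  // As in the non-atomic test above, drop the extra column family before
+  // calling Resume().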
+ auto cfh = dbfull()->GetColumnFamilyHandle(1);
+ s = dbfull()->DropColumnFamily(cfh);
+
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ ASSERT_EQ("val3", Get(Key(3)));
+ s = Flush();
+ ASSERT_OK(s);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+// The flush error is injected before we finish the table build
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableError1) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+ ASSERT_OK(Put(Key(1), "val1", wo));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
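+  // The flush fails with a soft error (asserted below), so foreground writes
+  // are still accepted before Resume().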
+ ASSERT_OK(Put(Key(2), "val2", wo));
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ ASSERT_EQ("val2", Get(Key(2)));
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ("val2", Get(Key(2)));
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ ASSERT_EQ("val3", Get(Key(3)));
+ s = Flush();
+ ASSERT_OK(s);
+ ASSERT_EQ("val3", Get(Key(3)));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+
+ Destroy(options);
+}
+
+// The retryable IO error is injected before we sync the table
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError2) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+
+ ASSERT_OK(Put(Key(1), "val1", wo));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeSyncTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_OK(Put(Key(2), "val2", wo));
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ ASSERT_EQ("val2", Get(Key(2)));
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ("val2", Get(Key(2)));
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ ASSERT_EQ("val3", Get(Key(3)));
+ s = Flush();
+ ASSERT_OK(s);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+// The retryable IO error is injected before we close the table file
+TEST_F(DBErrorHandlingFSTest, FLushWriteNoWALRetryableError3) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+
+ ASSERT_OK(Put(Key(1), "val1", wo));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeCloseTableFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_OK(Put(Key(2), "val2", wo));
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ ASSERT_EQ("val2", Get(Key(2)));
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ("val2", Get(Key(2)));
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ ASSERT_EQ("val3", Get(Key(3)));
+ s = Flush();
+ ASSERT_OK(s);
+ ASSERT_EQ("val3", Get(Key(3)));
+
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteFileScopeError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error");
+ error_msg.SetDataLoss(true);
+ error_msg.SetScope(
+ ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile);
+ error_msg.SetRetryable(false);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+ ASSERT_OK(Put(Key(0), "val", wo));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val", wo));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, DoubleManifestWriteError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ fault_fs_->SetFilesystemActive(true);
+
+ // This Resume() will attempt to create a new manifest file and fail again
+ s = dbfull()->Resume();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // A successful Resume() will create a new manifest file
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteError) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+ std::atomic<bool> fail_manifest(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Put(Key(2), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ // Wait for flush of 2nd L0 file before starting compaction
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ // Wait for compaction to detect manifest write error
+ {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"},
+ // Make compaction thread wait for error to be cleared
+ {"CompactionManifestWriteError:1",
+ "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"},
+ // Wait for DB instance to clear bg_error before calling
+ // TEST_WaitForCompact
+ {"SstFileManagerImpl::ErrorCleared", "CompactionManifestWriteError:2"}});
+ // trigger manifest write failure in compaction thread
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ if (fail_manifest.load()) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ // This Flush will trigger a compaction, which will fail when appending to
+ // the manifest
+ s = Flush();
+ ASSERT_OK(s);
+
+ TEST_SYNC_POINT("CompactionManifestWriteError:0");
+ // Clear all errors so when the compaction is retried, it will succeed
+ fault_fs_->SetFilesystemActive(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("CompactionManifestWriteError:1");
+ TEST_SYNC_POINT("CompactionManifestWriteError:2");
+
+ s = dbfull()->TEST_WaitForCompact();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ ASSERT_EQ("val", Get(Key(2)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionManifestWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+ std::atomic<bool> fail_manifest(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Put(Key(2), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ // Wait for flush of 2nd L0 file before starting compaction
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ // Wait for compaction to detect manifest write error
+ {"BackgroundCallCompaction:1", "CompactionManifestWriteError:0"},
+ // Make compaction thread wait for error to be cleared
+ {"CompactionManifestWriteError:1",
+ "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"}});
+ // trigger manifest write failure in compaction thread
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ if (fail_manifest.load()) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ TEST_SYNC_POINT("CompactionManifestWriteError:0");
+ TEST_SYNC_POINT("CompactionManifestWriteError:1");
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ ASSERT_EQ("val", Get(Key(2)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionWriteError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ Status s;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(
+ Status(Status::NoSpace(), Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::OpenCompactionOutputFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Finish",
+ [&](void*) { CancelAllBackgroundWork(dbfull()); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_GetBGError();
+ ASSERT_OK(s);
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, DISABLED_CompactionWriteFileScopeError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 0;
+ Status s;
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("File Scope Data Loss Error");
+ error_msg.SetDataLoss(true);
+ error_msg.SetScope(
+ ROCKSDB_NAMESPACE::IOStatus::IOErrorScope::kIOErrorScopeFile);
+ error_msg.SetRetryable(false);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::OpenCompactionOutputFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Finish",
+ [&](void*) { CancelAllBackgroundWork(dbfull()); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_GetBGError();
+ ASSERT_OK(s);
+
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ s = dbfull()->Resume();
+ ASSERT_OK(s);
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, CorruptionError) {
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ Status s;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::Corruption("Corruption"));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(),
+ ROCKSDB_NAMESPACE::Status::Severity::kUnrecoverableError);
+
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_NOK(s);
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, AutoRecoverFlushError) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
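+  // Block until the background auto recovery completes.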
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+ s = Put(Key(1), "val");
+ ASSERT_OK(s);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FailRecoverFlushError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ Status s;
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::NoSpace("Out of space"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+  // We should be able to shut down the database while auto recovery is going
+  // on in the background
+ Close();
+ DestroyDB(dbname_, options).PermitUncheckedError();
+}
+
+TEST_F(DBErrorHandlingFSTest, WALWriteError) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ Status s;
+ Random rnd(301);
+
+ listener->EnableAutoRecovery();
+ DestroyAndReopen(options);
+
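+  // First batch: all writes succeed and are synced before any fault is
+  // injected.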
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
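+  // Second batch: let the first two WAL appends go through, then fail
+  // subsequent appends with NoSpace.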
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i < 199; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_EQ(s, s.NoSpace());
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ // `ClearAllCallBacks()` is needed in addition to `DisableProcessing()` to
+ // drain all callbacks. Otherwise, a pending callback in the background
+ // could re-disable `fault_fs_` after we enable it below.
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
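+  // Keys from the first batch should be present; keys from the failed second
+  // batch should not.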
+ for (auto i = 0; i < 199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Reopen(options);
+ for (auto i = 0; i < 199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, WALWriteRetryableError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = true;
+ options.max_bgerror_resume_count = 0;
+ Random rnd(301);
+
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+  // For the first batch, the write succeeds; sync is requested
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+  // For the second batch, the first 2 file Appends succeed, then the
+  // following Appends fail due to a retryable file system IOError.
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i < 200; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ Status s = dbfull()->Write(wopts, &batch);
+ ASSERT_TRUE(s.IsIOError());
+ }
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+
+  // Data in the corrupted WAL is not stored
+ for (auto i = 0; i < 199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+
+  // Resume and write a new batch; it should be persisted in the WAL
+ ASSERT_OK(dbfull()->Resume());
+ {
+ WriteBatch batch;
+
+ for (auto i = 200; i < 300; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ Reopen(options);
+ for (auto i = 0; i < 300; ++i) {
+ if (i < 100 || i >= 200) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, MultiCFWALWriteError) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ Random rnd(301);
+
+ listener->EnableAutoRecovery();
+ CreateAndReopenWithCF({"one", "two", "three"}, options);
+
+ {
+ WriteBatch batch;
+
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 100; ++j) {
+ ASSERT_OK(batch.Put(handles_[i], Key(j), rnd.RandomString(1024)));
+ }
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ // Write to one CF
+ for (auto i = 100; i < 199; ++i) {
+ ASSERT_OK(batch.Put(handles_[2], Key(i), rnd.RandomString(1024)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::NoSpace("Out of space"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ Status s = dbfull()->Write(wopts, &batch);
+ ASSERT_TRUE(s.IsNoSpace());
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ // `ClearAllCallBacks()` is needed in addition to `DisableProcessing()` to
+ // drain all callbacks. Otherwise, a pending callback in the background
+ // could re-disable `fault_fs_` after we enable it below.
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+ for (auto i = 1; i < 4; ++i) {
+ // Every CF should have been flushed
+ ASSERT_EQ(NumTableFilesAtLevel(0, i), 1);
+ }
+
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 199; ++j) {
+ if (j < 100) {
+ ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+ }
+ }
+ }
+ ReopenWithColumnFamilies({"default", "one", "two", "three"}, options);
+ for (auto i = 1; i < 4; ++i) {
+ for (auto j = 0; j < 199; ++j) {
+ if (j < 100) {
+ ASSERT_NE(Get(i, Key(j)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(i, Key(j)), "NOT_FOUND");
+ }
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, MultiDBCompactionError) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_);
+ std::vector<std::unique_ptr<Env>> fault_envs;
+ std::vector<FaultInjectionTestFS*> fault_fs;
+ std::vector<Options> options;
+ std::vector<std::shared_ptr<ErrorHandlerFSListener>> listener;
+ std::vector<DB*> db;
+ std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+ int kNumDbInstances = 3;
+ Random rnd(301);
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ listener.emplace_back(new ErrorHandlerFSListener());
+ options.emplace_back(GetDefaultOptions());
+ fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem()));
+ std::shared_ptr<FileSystem> fs(fault_fs.back());
+ fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs));
+ options[i].env = fault_envs.back().get();
+ options[i].create_if_missing = true;
+ options[i].level0_file_num_compaction_trigger = 2;
+ options[i].writable_file_max_buffer_size = 32768;
+ options[i].listeners.emplace_back(listener[i]);
+ options[i].sst_file_manager = sfm;
+ DB* dbptr;
+ char buf[16];
+
+ listener[i]->EnableAutoRecovery();
+ // Setup for returning error for the 3rd SST, which would be level 1
+ listener[i]->InjectFileCreationError(fault_fs[i], 3,
+ IOStatus::NoSpace("Out of space"));
+ snprintf(buf, sizeof(buf), "_%d", i);
+ ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+ ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr));
+ db.emplace_back(dbptr);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+ for (auto j = 0; j <= 100; ++j) {
+ ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(db[i]->Write(wopts, &batch));
+ ASSERT_OK(db[i]->Flush(FlushOptions()));
+ }
+
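+  // Make the shared base Env, which also backs the SstFileManager, return
+  // NoSpace as well.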
+ def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+    // Write a second batch of keys
+ for (auto j = 100; j < 199; ++j) {
+ ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(db[i]->Write(wopts, &batch));
+ ASSERT_OK(db[i]->Flush(FlushOptions()));
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+ ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+ fault_fs[i]->SetFilesystemActive(true);
+ }
+
+ def_env->SetFilesystemActive(true);
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ std::string prop;
+ ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+ ASSERT_OK(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true));
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(0), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 0);
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(1), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 1);
+ }
+
+ SstFileManagerImpl* sfmImpl =
+ static_cast_with_check<SstFileManagerImpl>(sfm.get());
+ sfmImpl->Close();
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "_%d", i);
+ delete db[i];
+ fault_fs[i]->SetFilesystemActive(true);
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+ } else {
+ ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+ }
+ }
+ options.clear();
+ sfm.reset();
+ delete def_env;
+}
+
+TEST_F(DBErrorHandlingFSTest, MultiDBVariousErrors) {
+ if (mem_env_ != nullptr) {
+ ROCKSDB_GTEST_SKIP("Test requires non-mock environment");
+ return;
+ }
+ FaultInjectionTestEnv* def_env = new FaultInjectionTestEnv(env_);
+ std::vector<std::unique_ptr<Env>> fault_envs;
+ std::vector<FaultInjectionTestFS*> fault_fs;
+ std::vector<Options> options;
+ std::vector<std::shared_ptr<ErrorHandlerFSListener>> listener;
+ std::vector<DB*> db;
+ std::shared_ptr<SstFileManager> sfm(NewSstFileManager(def_env));
+ int kNumDbInstances = 3;
+ Random rnd(301);
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ listener.emplace_back(new ErrorHandlerFSListener());
+ options.emplace_back(GetDefaultOptions());
+ fault_fs.emplace_back(new FaultInjectionTestFS(env_->GetFileSystem()));
+ std::shared_ptr<FileSystem> fs(fault_fs.back());
+ fault_envs.emplace_back(new CompositeEnvWrapper(def_env, fs));
+ options[i].env = fault_envs.back().get();
+ options[i].create_if_missing = true;
+ options[i].level0_file_num_compaction_trigger = 2;
+ options[i].writable_file_max_buffer_size = 32768;
+ options[i].listeners.emplace_back(listener[i]);
+ options[i].sst_file_manager = sfm;
+ DB* dbptr;
+ char buf[16];
+
+ listener[i]->EnableAutoRecovery();
+ switch (i) {
+ case 0:
+ // Setup for returning error for the 3rd SST, which would be level 1
+ listener[i]->InjectFileCreationError(fault_fs[i], 3,
+ IOStatus::NoSpace("Out of space"));
+ break;
+ case 1:
+ // Setup for returning error after the 1st SST, which would result
+ // in a hard error
+ listener[i]->InjectFileCreationError(fault_fs[i], 2,
+ IOStatus::NoSpace("Out of space"));
+ break;
+ default:
+ break;
+ }
+ snprintf(buf, sizeof(buf), "_%d", i);
+ ASSERT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+ ASSERT_OK(DB::Open(options[i], dbname_ + std::string(buf), &dbptr));
+ db.emplace_back(dbptr);
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+ for (auto j = 0; j <= 100; ++j) {
+ ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(db[i]->Write(wopts, &batch));
+ ASSERT_OK(db[i]->Flush(FlushOptions()));
+ }
+
+ def_env->SetFilesystemActive(false, Status::NoSpace("Out of space"));
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ WriteBatch batch;
+
+    // Write a second batch of keys
+ for (auto j = 100; j < 199; ++j) {
+ ASSERT_OK(batch.Put(Key(j), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(db[i]->Write(wopts, &batch));
+ if (i != 1) {
+ ASSERT_OK(db[i]->Flush(FlushOptions()));
+ } else {
+ ASSERT_TRUE(db[i]->Flush(FlushOptions()).IsNoSpace());
+ }
+ }
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ Status s = static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true);
+ switch (i) {
+ case 0:
+ ASSERT_EQ(s.severity(), Status::Severity::kSoftError);
+ break;
+ case 1:
+ ASSERT_EQ(s.severity(), Status::Severity::kHardError);
+ break;
+ case 2:
+ ASSERT_OK(s);
+ break;
+ }
+ fault_fs[i]->SetFilesystemActive(true);
+ }
+
+ def_env->SetFilesystemActive(true);
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ std::string prop;
+ if (i < 2) {
+ ASSERT_EQ(listener[i]->WaitForRecovery(5000000), true);
+ }
+ if (i == 1) {
+ ASSERT_OK(static_cast<DBImpl*>(db[i])->TEST_WaitForCompact(true));
+ }
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(0), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 0);
+ EXPECT_TRUE(db[i]->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(1), &prop));
+ EXPECT_EQ(atoi(prop.c_str()), 1);
+ }
+
+ SstFileManagerImpl* sfmImpl =
+ static_cast_with_check<SstFileManagerImpl>(sfm.get());
+ sfmImpl->Close();
+
+ for (auto i = 0; i < kNumDbInstances; ++i) {
+ char buf[16];
+ snprintf(buf, sizeof(buf), "_%d", i);
+ fault_fs[i]->SetFilesystemActive(true);
+ delete db[i];
+ if (getenv("KEEP_DB")) {
+ printf("DB is still at %s%s\n", dbname_.c_str(), buf);
+ } else {
+ EXPECT_OK(DestroyDB(dbname_ + std::string(buf), options[i]));
+ }
+ }
+ options.clear();
+ delete def_env;
+}
+
+// The KV pairs are put with the write option set to disable the WAL.
+// If a retryable error happens under this condition, the bg error is mapped
+// to a soft error and auto resume is triggered. During auto resume,
+// SwitchMemtable is disabled to avoid small SST files. Writes can still be
+// applied before the bg error is cleared, unless the memtable is full.
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover1) {
+  // Keep the FS inactive while auto resume retries, so auto resume fails; the
+  // FS is reactivated only after the retry loop has exited.
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+ ASSERT_OK(Put(Key(1), "val1", wo));
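+  // Block the test at sync point 1 below until the auto resume retry loop has
+  // exited.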
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"RecoverFromRetryableBGIOError:LoopOut",
+ "FLushWritNoWALRetryableeErrorAutoRecover1:1"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ TEST_SYNC_POINT("FLushWritNoWALRetryableeErrorAutoRecover1:1");
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ("val1", Get(Key(1)));
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(3, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+ HistogramData autoresume_retry;
+ options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+ &autoresume_retry);
+ ASSERT_GE(autoresume_retry.max, 0);
+ ASSERT_OK(Put(Key(2), "val2", wo));
+ s = Flush();
+  // Since auto resume fails, the bg error is not cleared; Flush will return
+  // the previously set bg_error.
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ ASSERT_EQ("val2", Get(Key(2)));
+
+  // Manually call Resume() to clear the bg error
+ ASSERT_OK(dbfull()->Resume());
+ ASSERT_OK(Put(Key(3), "val3", wo));
+ // After resume is successful, the flush should be ok.
+ ASSERT_OK(Flush());
+ ASSERT_EQ("val3", Get(Key(3)));
+ Destroy(options);
+}
+
+TEST_F(DBErrorHandlingFSTest, FLushWritNoWALRetryableErrorAutoRecover2) {
+ // Activate the FS before the first resume
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+ ASSERT_OK(Put(Key(1), "val1", wo));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_SUCCESS_COUNT));
+ HistogramData autoresume_retry;
+ options.statistics->histogramData(ERROR_HANDLER_AUTORESUME_RETRY_COUNT,
+ &autoresume_retry);
+ ASSERT_GE(autoresume_retry.max, 0);
+ ASSERT_OK(Put(Key(2), "val2", wo));
+ s = Flush();
+  // Since auto resume succeeds, the bg error is cleared and the flush
+  // succeeds.
+ ASSERT_OK(s);
+ ASSERT_EQ("val2", Get(Key(2)));
+ Destroy(options);
+}
+
+// Auto resume from the flush retryable IO error. Activate the FS before the
+// first resume. Resume is successful.
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover1) {
+ // Activate the FS before the first resume
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(1), "val1"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+
+ ASSERT_EQ("val1", Get(Key(1)));
+ Reopen(options);
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_OK(Put(Key(2), "val2"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("val2", Get(Key(2)));
+
+ Destroy(options);
+}
+
+// Auto resume from the flush retryable IO error with the retry limit count
+// set. Never activate the FS, so auto resume should fail in the end.
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAutoRecover2) {
+  // Fail all the resume attempts and let the user resume manually
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(1), "val1"));
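+  // Hold background recovery until the test reaches sync point 0, and block
+  // the test at sync point 1 until the recovery loop has exited.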
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"FLushWritRetryableeErrorAutoRecover2:0",
+ "RecoverFromRetryableBGIOError:BeforeStart"},
+ {"RecoverFromRetryableBGIOError:LoopOut",
+ "FLushWritRetryableeErrorAutoRecover2:1"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:0");
+ TEST_SYNC_POINT("FLushWritRetryableeErrorAutoRecover2:1");
+ fault_fs_->SetFilesystemActive(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ ASSERT_EQ("val1", Get(Key(1)));
+  // Auto resume fails because the FS does not recover during resume. The user
+  // calls Resume() manually here.
+ s = dbfull()->Resume();
+ ASSERT_EQ("val1", Get(Key(1)));
+ ASSERT_OK(s);
+ ASSERT_OK(Put(Key(2), "val2"));
+ ASSERT_OK(Flush());
+ ASSERT_EQ("val2", Get(Key(2)));
+
+ Destroy(options);
+}
+
+// Auto resume from the retryable IO error hit during the manifest write, with
+// the retry limit count set. Fail the first resume and let the second resume
+// succeed.
+TEST_F(DBErrorHandlingFSTest, ManifestWriteRetryableErrorAutoRecover) {
+ // Fail the first resume and let the second resume be successful
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"RecoverFromRetryableBGIOError:BeforeStart",
+ "ManifestWriteRetryableErrorAutoRecover:0"},
+ {"ManifestWriteRetryableErrorAutoRecover:1",
+ "RecoverFromRetryableBGIOError:BeforeWait1"},
+ {"RecoverFromRetryableBGIOError:RecoverSuccess",
+ "ManifestWriteRetryableErrorAutoRecover:2"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:0");
+ fault_fs_->SetFilesystemActive(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:1");
+ TEST_SYNC_POINT("ManifestWriteRetryableErrorAutoRecover:2");
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, ManifestWriteNoWALRetryableErrorAutoRecover) {
+ // Fail the first resume and let the second resume be successful
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ WriteOptions wo = WriteOptions();
+ wo.disableWAL = true;
+ ASSERT_OK(Put(Key(0), "val", wo));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val", wo));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"RecoverFromRetryableBGIOError:BeforeStart",
+ "ManifestWriteNoWALRetryableErrorAutoRecover:0"},
+ {"ManifestWriteNoWALRetryableErrorAutoRecover:1",
+ "RecoverFromRetryableBGIOError:BeforeWait1"},
+ {"RecoverFromRetryableBGIOError:RecoverSuccess",
+ "ManifestWriteNoWALRetryableErrorAutoRecover:2"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:0");
+ fault_fs_->SetFilesystemActive(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:1");
+ TEST_SYNC_POINT("ManifestWriteNoWALRetryableErrorAutoRecover:2");
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ Close();
+}
+
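+// A rough note on the ErrorHandlerFSListener helpers used in these tests (the
+// listener is defined earlier in this file): OverrideBGError() makes the
+// listener's OnBackgroundError() callback substitute the given status and
+// severity for the background error, EnableAutoRecovery() controls whether
+// the listener permits the legacy auto-recovery path in
+// OnErrorRecoveryBegin() (the retryable-error resume loop is driven
+// separately by max_bgerror_resume_count), and WaitForRecovery() blocks until
+// the recovery-end callback fires.
+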
+TEST_F(DBErrorHandlingFSTest,
+ CompactionManifestWriteRetryableErrorAutoRecover) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+ std::atomic<bool> fail_manifest(false);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Put(Key(2), "val"));
+ ASSERT_OK(Flush());
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ // Wait for flush of 2nd L0 file before starting compaction
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ // Wait for compaction to detect manifest write error
+ {"BackgroundCallCompaction:1", "CompactionManifestWriteErrorAR:0"},
+ // Make compaction thread wait for error to be cleared
+ {"CompactionManifestWriteErrorAR:1",
+ "DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"},
+ {"CompactionManifestWriteErrorAR:2",
+ "RecoverFromRetryableBGIOError:BeforeStart"},
+ // Fail the first resume, before the wait in resume
+ {"RecoverFromRetryableBGIOError:BeforeResume0",
+ "CompactionManifestWriteErrorAR:3"},
+ // Activate the FS before the second resume
+ {"CompactionManifestWriteErrorAR:4",
+ "RecoverFromRetryableBGIOError:BeforeResume1"},
+ // Wait for the auto resume to be successful
+ {"RecoverFromRetryableBGIOError:RecoverSuccess",
+ "CompactionManifestWriteErrorAR:5"}});
+ // Trigger the manifest write failure in the compaction thread
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_manifest.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ if (fail_manifest.load()) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:0");
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:1");
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:2");
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:3");
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:4");
+ TEST_SYNC_POINT("CompactionManifestWriteErrorAR:5");
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ new_manifest = GetManifestNameFromLiveFiles();
+ ASSERT_NE(new_manifest, old_manifest);
+
+ Reopen(options);
+ ASSERT_EQ("val", Get(Key(0)));
+ ASSERT_EQ("val", Get(Key(1)));
+ ASSERT_EQ("val", Get(Key(2)));
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionWriteRetryableErrorAutoRecover) {
+ // In this test, the FS is set to error in the first round of compaction, so
+ // the first compaction fails with a retryable IO error that is mapped to a
+ // soft error. Compaction is then rescheduled; in the second round the FS is
+ // set to active again and the compaction succeeds, so the test hits the
+ // CompactionJob::FinishCompactionOutputFile1 sync point.
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ Status s;
+ std::atomic<bool> fail_first(false);
+ std::atomic<bool> fail_second(true);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ {"CompactionJob::FinishCompactionOutputFile1",
+ "CompactionWriteRetryableErrorAutoRecover0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void*) { fault_fs_->SetFilesystemActive(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::OpenCompactionOutputFile", [&](void*) {
+ if (fail_first.load() && fail_second.load()) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ fail_second.store(false);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_OK(s);
+ TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0");
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ Destroy(options);
+}
+
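+// The two WAL auto-recovery tests below follow the same pattern: a first
+// batch (keys 0..99) is written and synced successfully; for a second batch
+// (keys 100..199) the "WritableFileWriter::Append:BeforePrepareWrite"
+// callback counts Append calls and fails the file system after the first two,
+// so the WAL write fails partway through and those keys are never stored;
+// after the error handler recovers, a third batch (keys 200..299) is written
+// and must survive a Reopen().
+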
+TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover1) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = true;
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+ Random rnd(301);
+
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ // The first batch is written successfully, with sync required.
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ // For the second batch, the first two file Appends succeed, then the
+ // following Appends fail with a retryable file system IOError, so this
+ // batch is never persisted.
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i < 200; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"WALWriteErrorDone", "RecoverFromRetryableBGIOError:BeforeStart"},
+ {"RecoverFromRetryableBGIOError:BeforeResume0", "WALWriteError1:0"},
+ {"WALWriteError1:1", "RecoverFromRetryableBGIOError:BeforeResume1"},
+ {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError1:2"}});
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_EQ(true, s.IsIOError());
+ TEST_SYNC_POINT("WALWriteErrorDone");
+
+ TEST_SYNC_POINT("WALWriteError1:0");
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("WALWriteError1:1");
+ TEST_SYNC_POINT("WALWriteError1:2");
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // Data in the corrupted WAL is not stored
+ for (auto i = 0; i < 199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+
+ // Resume and write a new batch; it should be persisted in the WAL
+ {
+ WriteBatch batch;
+
+ for (auto i = 200; i < 300; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ Reopen(options);
+ for (auto i = 0; i < 300; ++i) {
+ if (i < 100 || i >= 200) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Close();
+}
+
+TEST_F(DBErrorHandlingFSTest, WALWriteRetryableErrorAutoRecover2) {
+ // Fail the first recovery attempt and try a second time.
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = true;
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+ Random rnd(301);
+
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ // The first batch is written successfully, with sync required.
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ // For the second batch, the first two file Appends succeed, then the
+ // following Appends fail with a retryable file system IOError.
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i < 200; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"RecoverFromRetryableBGIOError:BeforeWait0", "WALWriteError2:0"},
+ {"WALWriteError2:1", "RecoverFromRetryableBGIOError:BeforeWait1"},
+ {"RecoverFromRetryableBGIOError:RecoverSuccess", "WALWriteError2:2"}});
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_EQ(true, s.IsIOError());
+
+ TEST_SYNC_POINT("WALWriteError2:0");
+ fault_fs_->SetFilesystemActive(true);
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ TEST_SYNC_POINT("WALWriteError2:1");
+ TEST_SYNC_POINT("WALWriteError2:2");
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // Data in the corrupted WAL is not stored
+ for (auto i = 0; i < 199; ++i) {
+ if (i < 100) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+
+ // Resume and write a new batch; it should be persisted in the WAL
+ {
+ WriteBatch batch;
+
+ for (auto i = 200; i < 300; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ Reopen(options);
+ for (auto i = 0; i < 300; ++i) {
+ if (i < 100 || i >= 200) {
+ ASSERT_NE(Get(Key(i)), "NOT_FOUND");
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+ Close();
+}
+
+// Fail auto resume from a flush retryable error and verify that the
+// OnErrorRecoveryEnd listener callback is called
+TEST_F(DBErrorHandlingFSTest, FLushWritRetryableErrorAbortRecovery) {
+ // Never activate the FS, so every resume attempt fails and recovery aborts
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.max_bgerror_resume_count = 2;
+ options.bgerror_resume_retry_interval = 100000; // 0.1 second
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ ASSERT_OK(Put(Key(1), "val1"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeFinishBuildTable",
+ [&](void*) { fault_fs_->SetFilesystemActive(false, error_msg); });
+
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ ASSERT_EQ(listener->new_bg_error(), Status::Aborted());
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+
+ Destroy(options);
+}
+
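+// The flush read-error tests below additionally assert on the error-handler
+// statistics tickers (ERROR_HANDLER_BG_ERROR_COUNT,
+// ERROR_HANDLER_BG_IO_ERROR_COUNT, ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT,
+// ERROR_HANDLER_AUTORESUME_COUNT and
+// ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT), read and cleared through
+// Statistics::getAndResetTickerCount().
+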
+TEST_F(DBErrorHandlingFSTest, FlushReadError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener =
+ std::make_shared<ErrorHandlerFSListener>();
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeOutputValidation", [&](void*) {
+ IOStatus st = IOStatus::IOError();
+ st.SetRetryable(true);
+ st.SetScope(IOStatus::IOErrorScope::kIOErrorScopeFile);
+ fault_fs_->SetFilesystemActive(false, st);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeDeleteFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ s = dbfull()->TEST_GetBGError();
+ ASSERT_OK(s);
+
+ Reopen(GetDefaultOptions());
+ ASSERT_EQ("val", Get(Key(0)));
+}
+
+TEST_F(DBErrorHandlingFSTest, AtomicFlushReadError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener =
+ std::make_shared<ErrorHandlerFSListener>();
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(false);
+ options.atomic_flush = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(0, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeOutputValidation", [&](void*) {
+ IOStatus st = IOStatus::IOError();
+ st.SetRetryable(true);
+ st.SetScope(IOStatus::IOErrorScope::kIOErrorScopeFile);
+ fault_fs_->SetFilesystemActive(false, st);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeDeleteFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush({0, 1});
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kSoftError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ ASSERT_EQ(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_RETRYABLE_IO_ERROR_COUNT));
+ ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_COUNT));
+ ASSERT_LE(0, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_AUTORESUME_RETRY_TOTAL_COUNT));
+ s = dbfull()->TEST_GetBGError();
+ ASSERT_OK(s);
+
+ TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+ GetDefaultOptions());
+ ASSERT_EQ("val", Get(Key(0)));
+}
+
+TEST_F(DBErrorHandlingFSTest, AtomicFlushNoSpaceError) {
+ std::shared_ptr<ErrorHandlerFSListener> listener =
+ std::make_shared<ErrorHandlerFSListener>();
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.statistics = CreateDBStatistics();
+ Status s;
+
+ listener->EnableAutoRecovery(true);
+ options.atomic_flush = true;
+ CreateAndReopenWithCF({"pikachu"}, options);
+
+ ASSERT_OK(Put(0, Key(0), "val"));
+ ASSERT_OK(Put(1, Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack("BuildTable:create_file", [&](void*) {
+ IOStatus st = IOStatus::NoSpace();
+ fault_fs_->SetFilesystemActive(false, st);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BuildTable:BeforeDeleteFile",
+ [&](void*) { fault_fs_->SetFilesystemActive(true, IOStatus::OK()); });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush({0, 1});
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kHardError);
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ ASSERT_EQ(listener->WaitForRecovery(5000000), true);
+ ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_ERROR_COUNT));
+ ASSERT_LE(1, options.statistics->getAndResetTickerCount(
+ ERROR_HANDLER_BG_IO_ERROR_COUNT));
+ s = dbfull()->TEST_GetBGError();
+ ASSERT_OK(s);
+
+ TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"},
+ GetDefaultOptions());
+ ASSERT_EQ("val", Get(Key(0)));
+}
+
+TEST_F(DBErrorHandlingFSTest, CompactionReadRetryableErrorAutoRecover) {
+ // In this test, the FS is set to error in the first round of compaction, so
+ // the first compaction fails with a retryable IO error that is mapped to a
+ // soft error. Compaction is then rescheduled; in the second round the FS is
+ // set to active again and the compaction succeeds, so the test hits the
+ // CompactionJob::FinishCompactionOutputFile1 sync point.
+ std::shared_ptr<ErrorHandlerFSListener> listener =
+ std::make_shared<ErrorHandlerFSListener>();
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ BlockBasedTableOptions table_options;
+ table_options.no_block_cache = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Status s;
+ std::atomic<bool> fail_first(false);
+ std::atomic<bool> fail_second(true);
+ Random rnd(301);
+ DestroyAndReopen(options);
+
+ IOStatus error_msg = IOStatus::IOError("Retryable IO Error");
+ error_msg.SetRetryable(true);
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(Key(i), rnd.RandomString(1024)));
+ }
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->OverrideBGError(Status(error_msg, Status::Severity::kHardError));
+ listener->EnableAutoRecovery(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"},
+ {"CompactionJob::FinishCompactionOutputFile1",
+ "CompactionWriteRetryableErrorAutoRecover0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void*) { fault_fs_->SetFilesystemActive(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) { fail_first.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():PausingManualCompaction:2", [&](void*) {
+ if (fail_first.load() && fail_second.load()) {
+ fault_fs_->SetFilesystemActive(false, error_msg);
+ fail_second.store(false);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_OK(s);
+ TEST_SYNC_POINT("CompactionWriteRetryableErrorAutoRecover0");
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ Reopen(GetDefaultOptions());
+}
+
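+// The parameterized tests below inject IOStatus::IOFenced() errors. A fenced
+// IO error is treated as unrecoverable: the background error is raised to
+// kFatalError regardless of the paranoid_checks test parameter, and Resume()
+// keeps returning an IOFenced status even after the file system is
+// reactivated.
+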
+class DBErrorHandlingFencingTest : public DBErrorHandlingFSTest,
+ public testing::WithParamInterface<bool> {};
+
+TEST_P(DBErrorHandlingFencingTest, FLushWriteFenced) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = GetParam();
+ Status s;
+
+ listener->EnableAutoRecovery(true);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "val"));
+ SyncPoint::GetInstance()->SetCallBack("FlushJob::Start", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ ASSERT_TRUE(s.IsIOFenced());
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_TRUE(s.IsIOFenced());
+ Destroy(options);
+}
+
+TEST_P(DBErrorHandlingFencingTest, ManifestWriteFenced) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = GetParam();
+ Status s;
+ std::string old_manifest;
+ std::string new_manifest;
+
+ listener->EnableAutoRecovery(true);
+ DestroyAndReopen(options);
+ old_manifest = GetManifestNameFromLiveFiles();
+
+ ASSERT_OK(Put(Key(0), "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put(Key(1), "val"));
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced"));
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ s = Flush();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ ASSERT_TRUE(s.IsIOFenced());
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_TRUE(s.IsIOFenced());
+ Close();
+}
+
+TEST_P(DBErrorHandlingFencingTest, CompactionWriteFenced) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = GetParam();
+ Status s;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put(Key(0), "va;"));
+ ASSERT_OK(Put(Key(2), "va;"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ listener->EnableAutoRecovery(true);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::FlushMemTable:FlushMemTableFinished",
+ "BackgroundCallCompaction:0"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BackgroundCallCompaction:0", [&](void*) {
+ fault_fs_->SetFilesystemActive(false, IOStatus::IOFenced("IO fenced"));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(Put(Key(1), "val"));
+ s = Flush();
+ ASSERT_OK(s);
+
+ s = dbfull()->TEST_WaitForCompact();
+ ASSERT_EQ(s.severity(), ROCKSDB_NAMESPACE::Status::Severity::kFatalError);
+ ASSERT_TRUE(s.IsIOFenced());
+
+ fault_fs_->SetFilesystemActive(true);
+ s = dbfull()->Resume();
+ ASSERT_TRUE(s.IsIOFenced());
+ Destroy(options);
+}
+
+TEST_P(DBErrorHandlingFencingTest, WALWriteFenced) {
+ std::shared_ptr<ErrorHandlerFSListener> listener(
+ new ErrorHandlerFSListener());
+ Options options = GetDefaultOptions();
+ options.env = fault_env_.get();
+ options.create_if_missing = true;
+ options.writable_file_max_buffer_size = 32768;
+ options.listeners.emplace_back(listener);
+ options.paranoid_checks = GetParam();
+ Status s;
+ Random rnd(301);
+
+ listener->EnableAutoRecovery(true);
+ DestroyAndReopen(options);
+
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ ASSERT_OK(dbfull()->Write(wopts, &batch));
+ };
+
+ {
+ WriteBatch batch;
+ int write_error = 0;
+
+ for (auto i = 100; i < 199; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "WritableFileWriter::Append:BeforePrepareWrite", [&](void*) {
+ write_error++;
+ if (write_error > 2) {
+ fault_fs_->SetFilesystemActive(false,
+ IOStatus::IOFenced("IO fenced"));
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_TRUE(s.IsIOFenced());
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ fault_fs_->SetFilesystemActive(true);
+ {
+ WriteBatch batch;
+
+ for (auto i = 0; i < 100; ++i) {
+ ASSERT_OK(batch.Put(Key(i), rnd.RandomString(1024)));
+ }
+
+ WriteOptions wopts;
+ wopts.sync = true;
+ s = dbfull()->Write(wopts, &batch);
+ ASSERT_TRUE(s.IsIOFenced());
+ }
+ Close();
+}
+
+INSTANTIATE_TEST_CASE_P(DBErrorHandlingFSTest, DBErrorHandlingFencingTest,
+ ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as error handling is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/event_helpers.cc b/src/rocksdb/db/event_helpers.cc
new file mode 100644
index 000000000..7987b8ec6
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.cc
@@ -0,0 +1,371 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/event_helpers.h"
+
+#include "rocksdb/convenience.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/utilities/customizable_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+#ifndef ROCKSDB_LITE
+Status EventListener::CreateFromString(const ConfigOptions& config_options,
+ const std::string& id,
+ std::shared_ptr<EventListener>* result) {
+ return LoadSharedObject<EventListener>(config_options, id, nullptr, result);
+}
+#endif // ROCKSDB_LITE
+
+namespace {
+template <class T>
+inline T SafeDivide(T a, T b) {
+ return b == 0 ? 0 : a / b;
+}
+} // anonymous namespace
+
+void EventHelpers::AppendCurrentTime(JSONWriter* jwriter) {
+ *jwriter << "time_micros"
+ << std::chrono::duration_cast<std::chrono::microseconds>(
+ std::chrono::system_clock::now().time_since_epoch())
+ .count();
+}
+
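+// The Log*Finished and *Deletion helpers below build one JSON record per
+// event by streaming alternating key/value tokens into a JSONWriter and
+// handing the finished record to EventLogger::Log(). In the info log this
+// typically appears as a single line, roughly of the form (abbreviated,
+// illustrative values):
+//
+//   EVENT_LOG_v1 {"time_micros": 1617..., "cf_name": "default", "job": 7,
+//                 "event": "table_file_creation", "file_number": 12, ...}
+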
+#ifndef ROCKSDB_LITE
+void EventHelpers::NotifyTableFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, TableFileCreationReason reason) {
+ if (listeners.empty()) {
+ return;
+ }
+ TableFileCreationBriefInfo info;
+ info.db_name = db_name;
+ info.cf_name = cf_name;
+ info.file_path = file_path;
+ info.job_id = job_id;
+ info.reason = reason;
+ for (auto& listener : listeners) {
+ listener->OnTableFileCreationStarted(info);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+void EventHelpers::NotifyOnBackgroundError(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ BackgroundErrorReason reason, Status* bg_error, InstrumentedMutex* db_mutex,
+ bool* auto_recovery) {
+#ifndef ROCKSDB_LITE
+ if (listeners.empty()) {
+ return;
+ }
+ db_mutex->AssertHeld();
+ // release lock while notifying events
+ db_mutex->Unlock();
+ for (auto& listener : listeners) {
+ listener->OnBackgroundError(reason, bg_error);
+ bg_error->PermitUncheckedError();
+ if (*auto_recovery) {
+ listener->OnErrorRecoveryBegin(reason, *bg_error, auto_recovery);
+ }
+ }
+ db_mutex->Lock();
+#else
+ (void)listeners;
+ (void)reason;
+ (void)bg_error;
+ (void)db_mutex;
+ (void)auto_recovery;
+#endif // ROCKSDB_LITE
+}
+
+void EventHelpers::LogAndNotifyTableFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, const FileDescriptor& fd,
+ uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+ TableFileCreationReason reason, const Status& s,
+ const std::string& file_checksum,
+ const std::string& file_checksum_func_name) {
+ if (s.ok() && event_logger) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+ jwriter << "cf_name" << cf_name << "job" << job_id << "event"
+ << "table_file_creation"
+ << "file_number" << fd.GetNumber() << "file_size"
+ << fd.GetFileSize() << "file_checksum"
+ << Slice(file_checksum).ToString(true) << "file_checksum_func_name"
+ << file_checksum_func_name << "smallest_seqno" << fd.smallest_seqno
+ << "largest_seqno" << fd.largest_seqno;
+
+ // table_properties
+ {
+ jwriter << "table_properties";
+ jwriter.StartObject();
+
+ // basic properties:
+ jwriter << "data_size" << table_properties.data_size << "index_size"
+ << table_properties.index_size << "index_partitions"
+ << table_properties.index_partitions << "top_level_index_size"
+ << table_properties.top_level_index_size
+ << "index_key_is_user_key"
+ << table_properties.index_key_is_user_key
+ << "index_value_is_delta_encoded"
+ << table_properties.index_value_is_delta_encoded << "filter_size"
+ << table_properties.filter_size << "raw_key_size"
+ << table_properties.raw_key_size << "raw_average_key_size"
+ << SafeDivide(table_properties.raw_key_size,
+ table_properties.num_entries)
+ << "raw_value_size" << table_properties.raw_value_size
+ << "raw_average_value_size"
+ << SafeDivide(table_properties.raw_value_size,
+ table_properties.num_entries)
+ << "num_data_blocks" << table_properties.num_data_blocks
+ << "num_entries" << table_properties.num_entries
+ << "num_filter_entries" << table_properties.num_filter_entries
+ << "num_deletions" << table_properties.num_deletions
+ << "num_merge_operands" << table_properties.num_merge_operands
+ << "num_range_deletions" << table_properties.num_range_deletions
+ << "format_version" << table_properties.format_version
+ << "fixed_key_len" << table_properties.fixed_key_len
+ << "filter_policy" << table_properties.filter_policy_name
+ << "column_family_name" << table_properties.column_family_name
+ << "column_family_id" << table_properties.column_family_id
+ << "comparator" << table_properties.comparator_name
+ << "merge_operator" << table_properties.merge_operator_name
+ << "prefix_extractor_name"
+ << table_properties.prefix_extractor_name << "property_collectors"
+ << table_properties.property_collectors_names << "compression"
+ << table_properties.compression_name << "compression_options"
+ << table_properties.compression_options << "creation_time"
+ << table_properties.creation_time << "oldest_key_time"
+ << table_properties.oldest_key_time << "file_creation_time"
+ << table_properties.file_creation_time
+ << "slow_compression_estimated_data_size"
+ << table_properties.slow_compression_estimated_data_size
+ << "fast_compression_estimated_data_size"
+ << table_properties.fast_compression_estimated_data_size
+ << "db_id" << table_properties.db_id << "db_session_id"
+ << table_properties.db_session_id << "orig_file_number"
+ << table_properties.orig_file_number << "seqno_to_time_mapping";
+
+ if (table_properties.seqno_to_time_mapping.empty()) {
+ jwriter << "N/A";
+ } else {
+ SeqnoToTimeMapping tmp;
+ Status status = tmp.Add(table_properties.seqno_to_time_mapping);
+ if (status.ok()) {
+ jwriter << tmp.ToHumanString();
+ } else {
+ jwriter << "Invalid";
+ }
+ }
+
+ // user collected properties
+ for (const auto& prop : table_properties.readable_properties) {
+ jwriter << prop.first << prop.second;
+ }
+ jwriter.EndObject();
+ }
+
+ if (oldest_blob_file_number != kInvalidBlobFileNumber) {
+ jwriter << "oldest_blob_file_number" << oldest_blob_file_number;
+ }
+
+ jwriter.EndObject();
+
+ event_logger->Log(jwriter);
+ }
+
+#ifndef ROCKSDB_LITE
+ if (listeners.empty()) {
+ return;
+ }
+ TableFileCreationInfo info;
+ info.db_name = db_name;
+ info.cf_name = cf_name;
+ info.file_path = file_path;
+ info.file_size = fd.file_size;
+ info.job_id = job_id;
+ info.table_properties = table_properties;
+ info.reason = reason;
+ info.status = s;
+ info.file_checksum = file_checksum;
+ info.file_checksum_func_name = file_checksum_func_name;
+ for (auto& listener : listeners) {
+ listener->OnTableFileCreated(info);
+ }
+ info.status.PermitUncheckedError();
+#else
+ (void)listeners;
+ (void)db_name;
+ (void)cf_name;
+ (void)file_path;
+ (void)reason;
+#endif // !ROCKSDB_LITE
+}
+
+void EventHelpers::LogAndNotifyTableFileDeletion(
+ EventLogger* event_logger, int job_id, uint64_t file_number,
+ const std::string& file_path, const Status& status,
+ const std::string& dbname,
+ const std::vector<std::shared_ptr<EventListener>>& listeners) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+
+ jwriter << "job" << job_id << "event"
+ << "table_file_deletion"
+ << "file_number" << file_number;
+ if (!status.ok()) {
+ jwriter << "status" << status.ToString();
+ }
+
+ jwriter.EndObject();
+
+ event_logger->Log(jwriter);
+
+#ifndef ROCKSDB_LITE
+ if (listeners.empty()) {
+ return;
+ }
+ TableFileDeletionInfo info;
+ info.db_name = dbname;
+ info.job_id = job_id;
+ info.file_path = file_path;
+ info.status = status;
+ for (auto& listener : listeners) {
+ listener->OnTableFileDeleted(info);
+ }
+ info.status.PermitUncheckedError();
+#else
+ (void)file_path;
+ (void)dbname;
+ (void)listeners;
+#endif // !ROCKSDB_LITE
+}
+
+void EventHelpers::NotifyOnErrorRecoveryEnd(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const Status& old_bg_error, const Status& new_bg_error,
+ InstrumentedMutex* db_mutex) {
+#ifndef ROCKSDB_LITE
+ if (!listeners.empty()) {
+ db_mutex->AssertHeld();
+ // release lock while notifying events
+ db_mutex->Unlock();
+ for (auto& listener : listeners) {
+ BackgroundErrorRecoveryInfo info;
+ info.old_bg_error = old_bg_error;
+ info.new_bg_error = new_bg_error;
+ listener->OnErrorRecoveryCompleted(old_bg_error);
+ listener->OnErrorRecoveryEnd(info);
+ info.old_bg_error.PermitUncheckedError();
+ info.new_bg_error.PermitUncheckedError();
+ }
+ db_mutex->Lock();
+ }
+#else
+ (void)listeners;
+ (void)old_bg_error;
+ (void)new_bg_error;
+ (void)db_mutex;
+#endif // ROCKSDB_LITE
+}
+
+#ifndef ROCKSDB_LITE
+void EventHelpers::NotifyBlobFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id,
+ BlobFileCreationReason creation_reason) {
+ if (listeners.empty()) {
+ return;
+ }
+ BlobFileCreationBriefInfo info(db_name, cf_name, file_path, job_id,
+ creation_reason);
+ for (const auto& listener : listeners) {
+ listener->OnBlobFileCreationStarted(info);
+ }
+}
+#endif // !ROCKSDB_LITE
+
+void EventHelpers::LogAndNotifyBlobFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, uint64_t file_number,
+ BlobFileCreationReason creation_reason, const Status& s,
+ const std::string& file_checksum,
+ const std::string& file_checksum_func_name, uint64_t total_blob_count,
+ uint64_t total_blob_bytes) {
+ if (s.ok() && event_logger) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+ jwriter << "cf_name" << cf_name << "job" << job_id << "event"
+ << "blob_file_creation"
+ << "file_number" << file_number << "total_blob_count"
+ << total_blob_count << "total_blob_bytes" << total_blob_bytes
+ << "file_checksum" << file_checksum << "file_checksum_func_name"
+ << file_checksum_func_name << "status" << s.ToString();
+
+ jwriter.EndObject();
+ event_logger->Log(jwriter);
+ }
+
+#ifndef ROCKSDB_LITE
+ if (listeners.empty()) {
+ return;
+ }
+ BlobFileCreationInfo info(db_name, cf_name, file_path, job_id,
+ creation_reason, total_blob_count, total_blob_bytes,
+ s, file_checksum, file_checksum_func_name);
+ for (const auto& listener : listeners) {
+ listener->OnBlobFileCreated(info);
+ }
+ info.status.PermitUncheckedError();
+#else
+ (void)listeners;
+ (void)db_name;
+ (void)file_path;
+ (void)creation_reason;
+#endif
+}
+
+void EventHelpers::LogAndNotifyBlobFileDeletion(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners, int job_id,
+ uint64_t file_number, const std::string& file_path, const Status& status,
+ const std::string& dbname) {
+ if (event_logger) {
+ JSONWriter jwriter;
+ AppendCurrentTime(&jwriter);
+
+ jwriter << "job" << job_id << "event"
+ << "blob_file_deletion"
+ << "file_number" << file_number;
+ if (!status.ok()) {
+ jwriter << "status" << status.ToString();
+ }
+
+ jwriter.EndObject();
+ event_logger->Log(jwriter);
+ }
+#ifndef ROCKSDB_LITE
+ if (listeners.empty()) {
+ return;
+ }
+ BlobFileDeletionInfo info(dbname, file_path, job_id, status);
+ for (const auto& listener : listeners) {
+ listener->OnBlobFileDeleted(info);
+ }
+ info.status.PermitUncheckedError();
+#else
+ (void)listeners;
+ (void)dbname;
+ (void)file_path;
+#endif // !ROCKSDB_LITE
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/event_helpers.h b/src/rocksdb/db/event_helpers.h
new file mode 100644
index 000000000..68d819fe6
--- /dev/null
+++ b/src/rocksdb/db/event_helpers.h
@@ -0,0 +1,82 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/version_edit.h"
+#include "logging/event_logger.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
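+// Static helpers that emit structured event-log records (via EventLogger and
+// JSONWriter) and fan the corresponding notifications out to the registered
+// EventListeners, covering table/blob file creation and deletion, background
+// errors, and error-recovery completion.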
+class EventHelpers {
+ public:
+ static void AppendCurrentTime(JSONWriter* json_writer);
+#ifndef ROCKSDB_LITE
+ static void NotifyTableFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, TableFileCreationReason reason);
+#endif // !ROCKSDB_LITE
+ static void NotifyOnBackgroundError(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ BackgroundErrorReason reason, Status* bg_error,
+ InstrumentedMutex* db_mutex, bool* auto_recovery);
+ static void LogAndNotifyTableFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, const FileDescriptor& fd,
+ uint64_t oldest_blob_file_number, const TableProperties& table_properties,
+ TableFileCreationReason reason, const Status& s,
+ const std::string& file_checksum,
+ const std::string& file_checksum_func_name);
+ static void LogAndNotifyTableFileDeletion(
+ EventLogger* event_logger, int job_id, uint64_t file_number,
+ const std::string& file_path, const Status& status,
+ const std::string& db_name,
+ const std::vector<std::shared_ptr<EventListener>>& listeners);
+ static void NotifyOnErrorRecoveryEnd(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const Status& old_bg_error, const Status& new_bg_error,
+ InstrumentedMutex* db_mutex);
+
+#ifndef ROCKSDB_LITE
+ static void NotifyBlobFileCreationStarted(
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id,
+ BlobFileCreationReason creation_reason);
+#endif // !ROCKSDB_LITE
+
+ static void LogAndNotifyBlobFileCreationFinished(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const std::string& db_name, const std::string& cf_name,
+ const std::string& file_path, int job_id, uint64_t file_number,
+ BlobFileCreationReason creation_reason, const Status& s,
+ const std::string& file_checksum,
+ const std::string& file_checksum_func_name, uint64_t total_blob_count,
+ uint64_t total_blob_bytes);
+
+ static void LogAndNotifyBlobFileDeletion(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners, int job_id,
+ uint64_t file_number, const std::string& file_path, const Status& status,
+ const std::string& db_name);
+
+ private:
+ static void LogAndNotifyTableFileCreation(
+ EventLogger* event_logger,
+ const std::vector<std::shared_ptr<EventListener>>& listeners,
+ const FileDescriptor& fd, const TableFileCreationInfo& info);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/experimental.cc b/src/rocksdb/db/experimental.cc
new file mode 100644
index 000000000..d838ebde5
--- /dev/null
+++ b/src/rocksdb/db/experimental.cc
@@ -0,0 +1,155 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/experimental.h"
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_util.h"
+#include "logging/logging.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace experimental {
+
+#ifndef ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, ColumnFamilyHandle* column_family,
+ const Slice* begin, const Slice* end) {
+ if (db == nullptr) {
+ return Status::InvalidArgument("DB is empty");
+ }
+
+ return db->SuggestCompactRange(column_family, begin, end);
+}
+
+Status PromoteL0(DB* db, ColumnFamilyHandle* column_family, int target_level) {
+ if (db == nullptr) {
+ return Status::InvalidArgument("Didn't recognize DB object");
+ }
+ return db->PromoteL0(column_family, target_level);
+}
+
+#else // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+ const Slice* /*begin*/, const Slice* /*end*/) {
+ return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+Status PromoteL0(DB* /*db*/, ColumnFamilyHandle* /*column_family*/,
+ int /*target_level*/) {
+ return Status::NotSupported("Not supported in RocksDB LITE");
+}
+
+#endif // ROCKSDB_LITE
+
+Status SuggestCompactRange(DB* db, const Slice* begin, const Slice* end) {
+ return SuggestCompactRange(db, db->DefaultColumnFamily(), begin, end);
+}
+
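+// Bring the manifest in line with the current on-disk state of the live SST
+// files: recover the version set offline, walk every level of each
+// initialized, non-dropped column family, open each table file through the
+// file system with Temperature::kUnknown (so all tiers are searched), and,
+// when opts.update_temperatures is set and the reported temperature disagrees
+// with the manifest, re-add the file entry with the corrected temperature via
+// a per-column-family VersionEdit.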
+Status UpdateManifestForFilesState(
+ const DBOptions& db_opts, const std::string& db_name,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const UpdateManifestForFilesStateOptions& opts) {
+ OfflineManifestWriter w(db_opts, db_name);
+ Status s = w.Recover(column_families);
+
+ size_t files_updated = 0;
+ size_t cfs_updated = 0;
+ auto fs = db_opts.env->GetFileSystem();
+
+ for (auto cfd : *w.Versions().GetColumnFamilySet()) {
+ if (!s.ok()) {
+ break;
+ }
+ assert(cfd);
+
+ if (cfd->IsDropped() || !cfd->initialized()) {
+ continue;
+ }
+
+ const auto* current = cfd->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+
+ /* SST files */
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ if (!s.ok()) {
+ break;
+ }
+ const auto& level_files = vstorage->LevelFiles(level);
+
+ for (const auto& lf : level_files) {
+ assert(lf);
+
+ uint64_t number = lf->fd.GetNumber();
+ std::string fname =
+ TableFileName(w.IOptions().db_paths, number, lf->fd.GetPathId());
+
+ std::unique_ptr<FSSequentialFile> f;
+ FileOptions fopts;
+ // Use kUnknown to signal the FileSystem to search all tiers for the
+ // file.
+ fopts.temperature = Temperature::kUnknown;
+
+ IOStatus file_ios =
+ fs->NewSequentialFile(fname, fopts, &f, /*dbg*/ nullptr);
+ if (file_ios.ok()) {
+ if (opts.update_temperatures) {
+ Temperature temp = f->GetTemperature();
+ if (temp != Temperature::kUnknown && temp != lf->temperature) {
+ // Current state inconsistent with manifest
+ ++files_updated;
+ edit.DeleteFile(level, number);
+ edit.AddFile(
+ level, number, lf->fd.GetPathId(), lf->fd.GetFileSize(),
+ lf->smallest, lf->largest, lf->fd.smallest_seqno,
+ lf->fd.largest_seqno, lf->marked_for_compaction, temp,
+ lf->oldest_blob_file_number, lf->oldest_ancester_time,
+ lf->file_creation_time, lf->file_checksum,
+ lf->file_checksum_func_name, lf->unique_id);
+ }
+ }
+ } else {
+ s = file_ios;
+ break;
+ }
+ }
+ }
+
+ if (s.ok() && edit.NumEntries() > 0) {
+ std::unique_ptr<FSDirectory> db_dir;
+ s = fs->NewDirectory(db_name, IOOptions(), &db_dir, nullptr);
+ if (s.ok()) {
+ s = w.LogAndApply(cfd, &edit, db_dir.get());
+ }
+ if (s.ok()) {
+ ++cfs_updated;
+ }
+ }
+ }
+
+ if (cfs_updated > 0) {
+ ROCKS_LOG_INFO(db_opts.info_log,
+ "UpdateManifestForFilesState: updated %zu files in %zu CFs",
+ files_updated, cfs_updated);
+ } else if (s.ok()) {
+ ROCKS_LOG_INFO(db_opts.info_log,
+ "UpdateManifestForFilesState: no updates needed");
+ }
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_opts.info_log, "UpdateManifestForFilesState failed: %s",
+ s.ToString().c_str());
+ }
+
+ return s;
+}
+
+} // namespace experimental
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_basic_test.cc b/src/rocksdb/db/external_sst_file_basic_test.cc
new file mode 100644
index 000000000..665c89869
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_basic_test.cc
@@ -0,0 +1,1997 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "db/version_edit.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class ExternalSSTFileBasicTest
+ : public DBTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternalSSTFileBasicTest()
+ : DBTestBase("external_sst_file_basic_test", /*env_do_fsync=*/true) {
+ sst_files_dir_ = dbname_ + "_sst_files/";
+ fault_injection_test_env_.reset(new FaultInjectionTestEnv(env_));
+ DestroyAndRecreateExternalSSTFilesDir();
+
+ // Check if the Env supports RandomRWFile
+ std::string file_path = sst_files_dir_ + "test_random_rw_file";
+ std::unique_ptr<WritableFile> wfile;
+ assert(env_->NewWritableFile(file_path, &wfile, EnvOptions()).ok());
+ wfile.reset();
+ std::unique_ptr<RandomRWFile> rwfile;
+ Status s = env_->NewRandomRWFile(file_path, &rwfile, EnvOptions());
+ if (s.IsNotSupported()) {
+ random_rwfile_supported_ = false;
+ } else {
+ EXPECT_OK(s);
+ random_rwfile_supported_ = true;
+ }
+ rwfile.reset();
+ EXPECT_OK(env_->DeleteFile(file_path));
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ ASSERT_OK(DestroyDir(env_, sst_files_dir_));
+ ASSERT_OK(env_->CreateDir(sst_files_dir_));
+ }
+
+ Status DeprecatedAddFile(const std::vector<std::string>& files,
+ bool move_files = false,
+ bool skip_snapshot_check = false) {
+ IngestExternalFileOptions opts;
+ opts.move_files = move_files;
+ opts.snapshot_consistency = !skip_snapshot_check;
+ opts.allow_global_seqno = false;
+ opts.allow_blocking_flush = false;
+ return db_->IngestExternalFile(files, opts);
+ }
+
+ Status AddFileWithFileChecksum(
+ const std::vector<std::string>& files,
+ const std::vector<std::string>& files_checksums,
+ const std::vector<std::string>& files_checksum_func_names,
+ bool verify_file_checksum = true, bool move_files = false,
+ bool skip_snapshot_check = false, bool write_global_seqno = true) {
+ IngestExternalFileOptions opts;
+ opts.move_files = move_files;
+ opts.snapshot_consistency = !skip_snapshot_check;
+ opts.allow_global_seqno = false;
+ opts.allow_blocking_flush = false;
+ opts.write_global_seqno = write_global_seqno;
+ opts.verify_file_checksum = verify_file_checksum;
+
+ IngestExternalFileArg arg;
+ arg.column_family = db_->DefaultColumnFamily();
+ arg.external_files = files;
+ arg.options = opts;
+ arg.files_checksums = files_checksums;
+ arg.files_checksum_func_names = files_checksum_func_names;
+ return db_->IngestExternalFiles({arg});
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys,
+ const std::vector<ValueType>& value_types,
+ std::vector<std::pair<int, int>> range_deletions, int file_id,
+ bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ assert(value_types.size() == 1 || keys.size() == value_types.size());
+ std::string file_path = sst_files_dir_ + std::to_string(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (size_t i = 0; i < range_deletions.size(); i++) {
+ // Account for the effect of range deletions on true_data before all
+ // point operations, even though sst_file_writer.DeleteRange must be
+ // called before the other sst_file_writer methods. This is because
+ // point writes take precedence over range deletions in the same
+ // ingested sst.
+ std::string start_key = Key(range_deletions[i].first);
+ std::string end_key = Key(range_deletions[i].second);
+ s = sst_file_writer.DeleteRange(start_key, end_key);
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ auto start_key_it = true_data->find(start_key);
+ if (start_key_it == true_data->end()) {
+ start_key_it = true_data->upper_bound(start_key);
+ }
+ auto end_key_it = true_data->find(end_key);
+ if (end_key_it == true_data->end()) {
+ end_key_it = true_data->upper_bound(end_key);
+ }
+ true_data->erase(start_key_it, end_key_it);
+ }
+ for (size_t i = 0; i < keys.size(); i++) {
+ std::string key = Key(keys[i]);
+ std::string value = Key(keys[i]) + std::to_string(file_id);
+ ValueType value_type =
+ (value_types.size() == 1 ? value_types[0] : value_types[i]);
+ switch (value_type) {
+ case ValueType::kTypeValue:
+ s = sst_file_writer.Put(key, value);
+ (*true_data)[key] = value;
+ break;
+ case ValueType::kTypeMerge:
+ s = sst_file_writer.Merge(key, value);
+ // we only use TestPutOperator in this test
+ (*true_data)[key] = value;
+ break;
+ case ValueType::kTypeDeletion:
+ s = sst_file_writer.Delete(key);
+ true_data->erase(key);
+ break;
+ default:
+ return Status::InvalidArgument("Value type is not supported");
+ }
+ if (!s.ok()) {
+ sst_file_writer.Finish();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+
+ if (s.ok()) {
+ IngestExternalFileOptions ifo;
+ ifo.allow_global_seqno = true;
+ ifo.write_global_seqno = write_global_seqno;
+ ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+ s = db_->IngestExternalFile({file_path}, ifo);
+ }
+ return s;
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys,
+ const std::vector<ValueType>& value_types, int file_id,
+ bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ return GenerateAndAddExternalFile(
+ options, keys, value_types, {}, file_id, write_global_seqno,
+ verify_checksums_before_ingest, true_data);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys, const ValueType value_type,
+ int file_id, bool write_global_seqno, bool verify_checksums_before_ingest,
+ std::map<std::string, std::string>* true_data) {
+ return GenerateAndAddExternalFile(
+ options, keys, std::vector<ValueType>(1, value_type), file_id,
+ write_global_seqno, verify_checksums_before_ingest, true_data);
+ }
+
+ ~ExternalSSTFileBasicTest() override {
+ DestroyDir(env_, sst_files_dir_).PermitUncheckedError();
+ }
+
+ protected:
+ std::string sst_files_dir_;
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_test_env_;
+ bool random_rwfile_supported_;
+};
+
+TEST_F(ExternalSSTFileBasicTest, Basic) {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Current file size should be 0 after sst_file_writer is initialized and
+ // before a file is opened.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s) << s.ToString();
+
+ // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ ASSERT_EQ(file1_info.num_range_del_entries, 0);
+ ASSERT_EQ(file1_info.smallest_range_del_key, "");
+ ASSERT_EQ(file1_info.largest_range_del_key, "");
+ ASSERT_EQ(file1_info.file_checksum, kUnknownFileChecksum);
+ ASSERT_EQ(file1_info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+ // sst_file_writer has already finished; no more entries can be added.
+ s = sst_file_writer.Put(Key(100), "bad_val");
+ ASSERT_NOK(s) << s.ToString();
+ s = sst_file_writer.DeleteRange(Key(100), Key(200));
+ ASSERT_NOK(s) << s.ToString();
+
+ DestroyAndReopen(options);
+ // Add file using file path
+ s = DeprecatedAddFile({file1});
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 100; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ DestroyAndRecreateExternalSSTFilesDir();
+}
+
+class ChecksumVerifyHelper {
+ private:
+ Options options_;
+
+ public:
+ ChecksumVerifyHelper(Options& options) : options_(options) {}
+ ~ChecksumVerifyHelper() {}
+
+ Status GetSingleFileChecksumAndFuncName(
+ const std::string& file_path, std::string* file_checksum,
+ std::string* file_checksum_func_name) {
+ Status s;
+ EnvOptions soptions;
+ std::unique_ptr<SequentialFile> file_reader;
+ s = options_.env->NewSequentialFile(file_path, &file_reader, soptions);
+ if (!s.ok()) {
+ return s;
+ }
+ std::unique_ptr<char[]> scratch(new char[2048]);
+ Slice result;
+ FileChecksumGenFactory* file_checksum_gen_factory =
+ options_.file_checksum_gen_factory.get();
+ if (file_checksum_gen_factory == nullptr) {
+ *file_checksum = kUnknownFileChecksum;
+ *file_checksum_func_name = kUnknownFileChecksumFuncName;
+ return Status::OK();
+ } else {
+ FileChecksumGenContext gen_context;
+ std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
+ file_checksum_gen_factory->CreateFileChecksumGenerator(gen_context);
+ *file_checksum_func_name = file_checksum_gen->Name();
+ s = file_reader->Read(2048, &result, scratch.get());
+ if (!s.ok()) {
+ return s;
+ }
+ while (result.size() != 0) {
+ file_checksum_gen->Update(scratch.get(), result.size());
+ s = file_reader->Read(2048, &result, scratch.get());
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ file_checksum_gen->Finalize();
+ *file_checksum = file_checksum_gen->GetChecksum();
+ }
+ return Status::OK();
+ }
+};
+
+TEST_F(ExternalSSTFileBasicTest, BasicWithFileChecksumCrc32c) {
+ Options options = CurrentOptions();
+ options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ ChecksumVerifyHelper checksum_helper(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Current file size should be 0 after sst_file_writer is initialized and
+ // before a file is opened.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s) << s.ToString();
+ std::string file_checksum, file_checksum_func_name;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file1, &file_checksum, &file_checksum_func_name));
+
+ // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ ASSERT_EQ(file1_info.num_range_del_entries, 0);
+ ASSERT_EQ(file1_info.smallest_range_del_key, "");
+ ASSERT_EQ(file1_info.largest_range_del_key, "");
+ ASSERT_EQ(file1_info.file_checksum, file_checksum);
+ ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name);
+ // sst_file_writer has already finished; no more entries can be added.
+ s = sst_file_writer.Put(Key(100), "bad_val");
+ ASSERT_NOK(s) << s.ToString();
+ s = sst_file_writer.DeleteRange(Key(100), Key(200));
+ ASSERT_NOK(s) << s.ToString();
+
+ DestroyAndReopen(options);
+ // Add file using file path
+ s = DeprecatedAddFile({file1});
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 100; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ DestroyAndRecreateExternalSSTFilesDir();
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestFileWithFileChecksum) {
+ Options old_options = CurrentOptions();
+ Options options = CurrentOptions();
+ options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory();
+ const ImmutableCFOptions ioptions(options);
+ ChecksumVerifyHelper checksum_helper(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file01.sst (1000 => 1099)
+ std::string file1 = sst_files_dir_ + "file01.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 1000; k < 1100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(1000));
+ ASSERT_EQ(file1_info.largest_key, Key(1099));
+ std::string file_checksum1, file_checksum_func_name1;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file1, &file_checksum1, &file_checksum_func_name1));
+ ASSERT_EQ(file1_info.file_checksum, file_checksum1);
+ ASSERT_EQ(file1_info.file_checksum_func_name, file_checksum_func_name1);
+
+ // file02.sst (1100 => 1299)
+ std::string file2 = sst_files_dir_ + "file02.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 1100; k < 1300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 200);
+ ASSERT_EQ(file2_info.smallest_key, Key(1100));
+ ASSERT_EQ(file2_info.largest_key, Key(1299));
+ std::string file_checksum2, file_checksum_func_name2;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file2, &file_checksum2, &file_checksum_func_name2));
+ ASSERT_EQ(file2_info.file_checksum, file_checksum2);
+ ASSERT_EQ(file2_info.file_checksum_func_name, file_checksum_func_name2);
+
+ // file03.sst (1300 => 1499)
+ std::string file3 = sst_files_dir_ + "file03.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 1300; k < 1500; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 200);
+ ASSERT_EQ(file3_info.smallest_key, Key(1300));
+ ASSERT_EQ(file3_info.largest_key, Key(1499));
+ std::string file_checksum3, file_checksum_func_name3;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file3, &file_checksum3, &file_checksum_func_name3));
+ ASSERT_EQ(file3_info.file_checksum, file_checksum3);
+ ASSERT_EQ(file3_info.file_checksum_func_name, file_checksum_func_name3);
+
+ // file04.sst (1500 => 1799)
+ std::string file4 = sst_files_dir_ + "file04.sst";
+ ASSERT_OK(sst_file_writer.Open(file4));
+ for (int k = 1500; k < 1800; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file4_info;
+ s = sst_file_writer.Finish(&file4_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file4_info.file_path, file4);
+ ASSERT_EQ(file4_info.num_entries, 300);
+ ASSERT_EQ(file4_info.smallest_key, Key(1500));
+ ASSERT_EQ(file4_info.largest_key, Key(1799));
+ std::string file_checksum4, file_checksum_func_name4;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file4, &file_checksum4, &file_checksum_func_name4));
+ ASSERT_EQ(file4_info.file_checksum, file_checksum4);
+ ASSERT_EQ(file4_info.file_checksum_func_name, file_checksum_func_name4);
+
+ // file05.sst (1800 => 1999)
+ std::string file5 = sst_files_dir_ + "file05.sst";
+ ASSERT_OK(sst_file_writer.Open(file5));
+ for (int k = 1800; k < 2000; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file5_info;
+ s = sst_file_writer.Finish(&file5_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file5_info.file_path, file5);
+ ASSERT_EQ(file5_info.num_entries, 200);
+ ASSERT_EQ(file5_info.smallest_key, Key(1800));
+ ASSERT_EQ(file5_info.largest_key, Key(1999));
+ std::string file_checksum5, file_checksum_func_name5;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file5, &file_checksum5, &file_checksum_func_name5));
+ ASSERT_EQ(file5_info.file_checksum, file_checksum5);
+ ASSERT_EQ(file5_info.file_checksum_func_name, file_checksum_func_name5);
+
+ // file06.sst (2000 => 2199)
+ std::string file6 = sst_files_dir_ + "file06.sst";
+ ASSERT_OK(sst_file_writer.Open(file6));
+ for (int k = 2000; k < 2200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file6_info;
+ s = sst_file_writer.Finish(&file6_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file6_info.file_path, file6);
+ ASSERT_EQ(file6_info.num_entries, 200);
+ ASSERT_EQ(file6_info.smallest_key, Key(2000));
+ ASSERT_EQ(file6_info.largest_key, Key(2199));
+ std::string file_checksum6, file_checksum_func_name6;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ file6, &file_checksum6, &file_checksum_func_name6));
+ ASSERT_EQ(file6_info.file_checksum, file_checksum6);
+ ASSERT_EQ(file6_info.file_checksum_func_name, file_checksum_func_name6);
+
+ s = AddFileWithFileChecksum({file1}, {file_checksum1, "xyz"},
+ {file_checksum1}, true, false, false, false);
+ // The checksum input is ignored since the DB does not have file checksums
+ // enabled.
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file1));
+ std::vector<LiveFileMetaData> live_files;
+ dbfull()->GetLiveFilesMetaData(&live_files);
+ std::set<std::string> set1;
+ for (auto f : live_files) {
+ set1.insert(f.name);
+ ASSERT_EQ(f.file_checksum, kUnknownFileChecksum);
+ ASSERT_EQ(f.file_checksum_func_name, kUnknownFileChecksumFuncName);
+ }
+
+ // check the temperature of the file being ingested
+ ColumnFamilyMetaData metadata;
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[6].files[0].temperature);
+ auto size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+
+ // Reopen the DB with file checksums enabled (options carries the crc32c
+ // checksum factory set above).
+ Reopen(options);
+ // verify_file_checksum is enabled.
+ // The number of checksums does not match the number of files, so the
+ // ingestion fails.
+ s = AddFileWithFileChecksum({file2}, {file_checksum2, "xyz"},
+ {file_checksum_func_name2}, true, false, false,
+ false);
+ ASSERT_NOK(s) << s.ToString();
+
+ // verify_file_checksum is enabled.
+ // The checksum function name does not match, so the ingestion fails.
+ s = AddFileWithFileChecksum({file2}, {file_checksum2}, {"xyz"}, true, false,
+ false, false);
+ ASSERT_NOK(s) << s.ToString();
+
+ // verify_file_checksum is enabled.
+ // The checksum value does not match, so the ingestion fails.
+ s = AddFileWithFileChecksum({file2}, {"xyz"}, {file_checksum_func_name2},
+ true, false, false, false);
+ ASSERT_NOK(s) << s.ToString();
+
+ // verify_file_checksum is enabled.
+ // Everything matches, so the ingestion succeeds.
+ s = AddFileWithFileChecksum({file2}, {file_checksum2},
+ {file_checksum_func_name2}, true, false, false,
+ false);
+ ASSERT_OK(s) << s.ToString();
+ std::vector<LiveFileMetaData> live_files1;
+ dbfull()->GetLiveFilesMetaData(&live_files1);
+ for (auto f : live_files1) {
+ if (set1.find(f.name) == set1.end()) {
+ ASSERT_EQ(f.file_checksum, file_checksum2);
+ ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name2);
+ set1.insert(f.name);
+ }
+ }
+ ASSERT_OK(env_->FileExists(file2));
+
+ // verify_file_checksum is enabled.
+ // No checksum information is provided, so it is generated during ingestion.
+ std::vector<std::string> checksum, checksum_func;
+ s = AddFileWithFileChecksum({file3}, checksum, checksum_func, true, false,
+ false, false);
+ ASSERT_OK(s) << s.ToString();
+ std::vector<LiveFileMetaData> live_files2;
+ dbfull()->GetLiveFilesMetaData(&live_files2);
+ for (auto f : live_files2) {
+ if (set1.find(f.name) == set1.end()) {
+ ASSERT_EQ(f.file_checksum, file_checksum3);
+ ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name3);
+ set1.insert(f.name);
+ }
+ }
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file3));
+
+ // verify_file_checksum is disabled.
+ // The checksum function name does not match, so the ingestion fails.
+ s = AddFileWithFileChecksum({file4}, {file_checksum4}, {"xyz"}, false, false,
+ false, false);
+ ASSERT_NOK(s) << s.ToString();
+
+ // verify_file_checksum is disabled.
+ // The checksum function name matches, so the checksum passed in is stored.
+ s = AddFileWithFileChecksum({file4}, {"asd"}, {file_checksum_func_name4},
+ false, false, false, false);
+ ASSERT_OK(s) << s.ToString();
+ std::vector<LiveFileMetaData> live_files3;
+ dbfull()->GetLiveFilesMetaData(&live_files3);
+ for (auto f : live_files3) {
+ if (set1.find(f.name) == set1.end()) {
+ ASSERT_FALSE(f.file_checksum == file_checksum4);
+ ASSERT_EQ(f.file_checksum, "asd");
+ ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name4);
+ set1.insert(f.name);
+ }
+ }
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file4));
+
+ // verify_file_checksum is enabled, the DB has file checksums enabled, and
+ // write_global_seqno is enabled, so the stored checksum differs from the
+ // ingested one because the global sequence number is rewritten in the file.
+ s = AddFileWithFileChecksum({file5}, {file_checksum5},
+ {file_checksum_func_name5}, true, false, false,
+ true);
+ ASSERT_OK(s) << s.ToString();
+ std::vector<LiveFileMetaData> live_files4;
+ dbfull()->GetLiveFilesMetaData(&live_files4);
+ for (auto f : live_files4) {
+ if (set1.find(f.name) == set1.end()) {
+ std::string cur_checksum5, cur_checksum_func_name5;
+ ASSERT_OK(checksum_helper.GetSingleFileChecksumAndFuncName(
+ dbname_ + f.name, &cur_checksum5, &cur_checksum_func_name5));
+ ASSERT_EQ(f.file_checksum, cur_checksum5);
+ ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name5);
+ set1.insert(f.name);
+ }
+ }
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file5));
+
+ // verify_file_checksum is disabled and no checksum information is provided
+ // for the ingested file, so the DB generates the checksum and stores it in
+ // the MANIFEST.
+ std::vector<std::string> files_c6, files_name6;
+ s = AddFileWithFileChecksum({file6}, files_c6, files_name6, false, false,
+ false, false);
+ ASSERT_OK(s) << s.ToString();
+ std::vector<LiveFileMetaData> live_files6;
+ dbfull()->GetLiveFilesMetaData(&live_files6);
+ for (auto f : live_files6) {
+ if (set1.find(f.name) == set1.end()) {
+ ASSERT_EQ(f.file_checksum, file_checksum6);
+ ASSERT_EQ(f.file_checksum_func_name, file_checksum_func_name6);
+ set1.insert(f.name);
+ }
+ }
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file6));
+ db_->GetColumnFamilyMetaData(&metadata);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+}
+
+TEST_F(ExternalSSTFileBasicTest, NoCopy) {
+ Options options = CurrentOptions();
+ const ImmutableCFOptions ioptions(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+
+ // file2.sst (100 => 299)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file2_info;
+ s = sst_file_writer.Finish(&file2_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 200);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(299));
+
+ // file3.sst (110 => 124) .. overlaps with file2.sst
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 110; k < 125; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ s = sst_file_writer.Finish(&file3_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 15);
+ ASSERT_EQ(file3_info.smallest_key, Key(110));
+ ASSERT_EQ(file3_info.largest_key, Key(124));
+
+ s = DeprecatedAddFile({file1}, true /* move file */);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(Status::NotFound(), env_->FileExists(file1));
+
+ s = DeprecatedAddFile({file2}, false /* copy file */);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file2));
+
+ // This file overlaps with existing data, so adding it fails.
+ s = DeprecatedAddFile({file3}, true /* move file */);
+ ASSERT_NOK(s) << s.ToString();
+ ASSERT_OK(env_->FileExists(file3));
+
+ for (int k = 0; k < 300; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithGlobalSeqnoPickedSeqno) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, ValueType::kTypeValue, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
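+ // Each Put above consumed a sequence number; capture the current value as
+ // a baseline for the seqno checks below.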
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // A seqno is needed for the file regardless of whether it overwrites
+ // keys in the DB, because a snapshot is held.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMultipleValueType) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ options.merge_operator.reset(new TestPutOperator());
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120}, {ValueType::kTypeValue}, {{120, 135}}, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{110, 120}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // The range deletion ends on an existing key, but it doesn't actually
+ // delete that key because the end key of the range is exclusive. It
+ // still counts as an overlap, so a new seqno is assigned.
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{100, 109}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40}, ValueType::kTypeDeletion, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // A seqno is needed for the file regardless of whether it overwrites
+ // keys in the DB, because a snapshot is held.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150}, ValueType::kTypeMerge, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithMixedValueType) {
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ options.merge_operator.reset(new TestPutOperator());
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ int file_id = 1;
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+ ValueType::kTypeMerge, ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 11, 12, 13},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue,
+ ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 0);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 4, 6},
+ {ValueType::kTypeDeletion, ValueType::kTypeValue,
+ ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {11, 15, 19},
+ {ValueType::kTypeDeletion, ValueType::kTypeMerge,
+ ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {120, 130}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 2);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 130}, {ValueType::kTypeMerge, ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {150, 151, 152},
+ {ValueType::kTypeValue, ValueType::kTypeMerge,
+ ValueType::kTypeDeletion},
+ {{150, 160}, {180, 190}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {150, 151, 152},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+ {{200, 250}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {300, 301, 302},
+ {ValueType::kTypeValue, ValueType::kTypeMerge,
+ ValueType::kTypeDeletion},
+ {{1, 2}, {152, 154}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), 5);
+
+ // Write some keys through normal write path
+ for (int i = 0; i < 50; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 61, 62},
+ {ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File doesn't overwrite any keys, no seqno needed
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {40, 41, 42},
+ {ValueType::kTypeValue, ValueType::kTypeDeletion,
+ ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 1);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {20, 30, 40},
+ {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+ ValueType::kTypeDeletion},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // File overwrites some keys, a seqno will be assigned
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 2);
+
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // A seqno is needed for the file regardless of whether it overwrites
+ // keys in the DB, because a snapshot is held.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1000, 1002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 3);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {2000, 3002}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 4);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 20, 40, 100, 150},
+ {ValueType::kTypeDeletion, ValueType::kTypeDeletion,
+ ValueType::kTypeValue, ValueType::kTypeMerge, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // A global seqno will be assigned anyway because of the snapshot
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ db_->ReleaseSnapshot(snapshot);
+
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {5000, 5001}, {ValueType::kTypeValue, ValueType::kTypeMerge},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ // No snapshot anymore, no need to assign a seqno
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno + 5);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, FadviseTrigger) {
+ Options options = CurrentOptions();
+ const int kNumKeys = 10000;
+
+ size_t total_fadvised_bytes = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "SstFileWriter::Rep::InvalidatePageCache", [&](void* arg) {
+ size_t fadvise_size = *(reinterpret_cast<size_t*>(arg));
+ total_fadvised_bytes += fadvise_size;
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ std::unique_ptr<SstFileWriter> sst_file_writer;
+
+ std::string sst_file_path = sst_files_dir_ + "file_fadvise_disable.sst";
+ sst_file_writer.reset(
+ new SstFileWriter(EnvOptions(), options, nullptr, false));
+ ASSERT_OK(sst_file_writer->Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
+ // fadvise disabled
+ ASSERT_EQ(total_fadvised_bytes, 0);
+
+ sst_file_path = sst_files_dir_ + "file_fadvise_enable.sst";
+ sst_file_writer.reset(
+ new SstFileWriter(EnvOptions(), options, nullptr, true));
+ ASSERT_OK(sst_file_writer->Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer->Put(Key(i), Key(i)));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
+ // fadvise enabled
+ ASSERT_EQ(total_fadvised_bytes, sst_file_writer->FileSize());
+ ASSERT_GT(total_fadvised_bytes, 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileBasicTest, SyncFailure) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = fault_injection_test_env_.get();
+
+ std::vector<std::pair<std::string, std::string>> test_cases = {
+ {"ExternalSstFileIngestionJob::BeforeSyncIngestedFile",
+ "ExternalSstFileIngestionJob::AfterSyncIngestedFile"},
+ {"ExternalSstFileIngestionJob::BeforeSyncDir",
+ "ExternalSstFileIngestionJob::AfterSyncDir"},
+ {"ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno",
+ "ExternalSstFileIngestionJob::AfterSyncGlobalSeqno"}};
+
+ for (size_t i = 0; i < test_cases.size(); i++) {
+ bool no_sync = false;
+ SyncPoint::GetInstance()->SetCallBack(test_cases[i].first, [&](void*) {
+ fault_injection_test_env_->SetFilesystemActive(false);
+ });
+ SyncPoint::GetInstance()->SetCallBack(test_cases[i].second, [&](void*) {
+ fault_injection_test_env_->SetFilesystemActive(true);
+ });
+ if (i == 0) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* s) {
+ Status* status = static_cast<Status*>(s);
+ if (status->IsNotSupported()) {
+ no_sync = true;
+ }
+ });
+ }
+ if (i == 2) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::NewRandomRWFile", [&](void* s) {
+ Status* status = static_cast<Status*>(s);
+ if (status->IsNotSupported()) {
+ no_sync = true;
+ }
+ });
+ }
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+ if (i == 2) {
+ ASSERT_OK(Put("foo", "v1"));
+ }
+
+ Options sst_file_writer_options;
+ sst_file_writer_options.env = fault_injection_test_env_.get();
+ std::unique_ptr<SstFileWriter> sst_file_writer(
+ new SstFileWriter(EnvOptions(), sst_file_writer_options));
+ std::string file_name =
+ sst_files_dir_ + "sync_failure_test_" + std::to_string(i) + ".sst";
+ ASSERT_OK(sst_file_writer->Open(file_name));
+ ASSERT_OK(sst_file_writer->Put("bar", "v2"));
+ ASSERT_OK(sst_file_writer->Finish());
+
+ IngestExternalFileOptions ingest_opt;
+ if (i == 0) {
+ ingest_opt.move_files = true;
+ }
+ const Snapshot* snapshot = db_->GetSnapshot();
+ if (i == 2) {
+ ingest_opt.write_global_seqno = true;
+ }
+ Status s = db_->IngestExternalFile({file_name}, ingest_opt);
+ if (no_sync) {
+ ASSERT_OK(s);
+ } else {
+ ASSERT_NOK(s);
+ }
+ db_->ReleaseSnapshot(snapshot);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Destroy(options);
+ }
+}
+
+TEST_F(ExternalSSTFileBasicTest, ReopenNotSupported) {
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+
+ SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:Reopen", [&](void* arg) {
+ Status* s = static_cast<Status*>(arg);
+ *s = Status::NotSupported();
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ DestroyAndReopen(options);
+
+ Options sst_file_writer_options;
+ sst_file_writer_options.env = env_;
+ std::unique_ptr<SstFileWriter> sst_file_writer(
+ new SstFileWriter(EnvOptions(), sst_file_writer_options));
+ std::string file_name =
+ sst_files_dir_ + "reopen_not_supported_test_" + ".sst";
+ ASSERT_OK(sst_file_writer->Open(file_name));
+ ASSERT_OK(sst_file_writer->Put("bar", "v2"));
+ ASSERT_OK(sst_file_writer->Finish());
+
+ IngestExternalFileOptions ingest_opt;
+ ingest_opt.move_files = true;
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+ db_->ReleaseSnapshot(snapshot);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ Destroy(options);
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifyChecksumReadahead) {
+ Options options;
+ options.create_if_missing = true;
+ SpecialEnv senv(env_);
+ options.env = &senv;
+ DestroyAndReopen(options);
+
+ Options sst_file_writer_options;
+ sst_file_writer_options.env = env_;
+ std::unique_ptr<SstFileWriter> sst_file_writer(
+ new SstFileWriter(EnvOptions(), sst_file_writer_options));
+ std::string file_name = sst_files_dir_ + "verify_checksum_readahead_test.sst";
+ ASSERT_OK(sst_file_writer->Open(file_name));
+ Random rnd(301);
+ std::string value = rnd.RandomString(4000);
+ for (int i = 0; i < 5000; i++) {
+ ASSERT_OK(sst_file_writer->Put(DBTestBase::Key(i), value));
+ }
+ ASSERT_OK(sst_file_writer->Finish());
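+ // 5000 values of ~4000 bytes each make the file roughly 20 MB.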
+
+ // Ingest it once without verifying checksums to see the baseline
+ // preads.
+ IngestExternalFileOptions ingest_opt;
+ ingest_opt.move_files = false;
+ senv.count_random_reads_ = true;
+ senv.random_read_bytes_counter_ = 0;
+ ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+ auto base_num_reads = senv.random_read_counter_.Read();
+ // Make sure the counter is enabled.
+ ASSERT_GT(base_num_reads, 0);
+
+ // Ingest again and observe the reads made for readahead.
+ ingest_opt.move_files = false;
+ ingest_opt.verify_checksums_before_ingest = true;
+ ingest_opt.verify_checksums_readahead_size = size_t{2 * 1024 * 1024};
+
+ senv.count_random_reads_ = true;
+ senv.random_read_bytes_counter_ = 0;
+ ASSERT_OK(db_->IngestExternalFile({file_name}, ingest_opt));
+
+ // Make sure the counter is enabled.
+ ASSERT_GT(senv.random_read_counter_.Read() - base_num_reads, 0);
+
+ // The SST file is about 20 MB and the readahead size is 2 MB.
+ // Allowing a conservative 15 reads for metadata blocks, the number of
+ // random reads should be around 20 MB / 2 MB + 15 = 25; the assertion
+ // below leaves extra headroom.
+ ASSERT_LE(senv.random_read_counter_.Read() - base_num_reads, 40);
+
+ Destroy(options);
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestRangeDeletionTombstoneWithGlobalSeqno) {
+ for (int i = 5; i < 25; i++) {
+ ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), Key(i),
+ Key(i) + "_val"));
+ }
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file.sst (delete 0 => 30)
+ std::string file = sst_files_dir_ + "file.sst";
+ ASSERT_OK(sst_file_writer.Open(file));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(30)));
+ ExternalSstFileInfo file_info;
+ ASSERT_OK(sst_file_writer.Finish(&file_info));
+ ASSERT_EQ(file_info.file_path, file);
+ ASSERT_EQ(file_info.num_entries, 0);
+ ASSERT_EQ(file_info.smallest_key, "");
+ ASSERT_EQ(file_info.largest_key, "");
+ ASSERT_EQ(file_info.num_range_del_entries, 1);
+ ASSERT_EQ(file_info.smallest_range_del_key, Key(0));
+ ASSERT_EQ(file_info.largest_range_del_key, Key(30));
+
+ IngestExternalFileOptions ifo;
+ ifo.move_files = true;
+ ifo.snapshot_consistency = true;
+ ifo.allow_global_seqno = true;
+ ifo.write_global_seqno = true;
+ ifo.verify_checksums_before_ingest = false;
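+ // The tombstone [0, 30) overlaps the existing keys 5..24, so the file needs
+ // a global seqno; write_global_seqno makes ingestion write it into the file.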
+ ASSERT_OK(db_->IngestExternalFile({file}, ifo));
+
+ for (int i = 5; i < 25; i++) {
+ std::string res;
+ ASSERT_TRUE(db_->Get(ReadOptions(), Key(i), &res).IsNotFound());
+ }
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestionWithRangeDeletions) {
+ int kNumLevels = 7;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = kNumLevels;
+ Reopen(options);
+
+ std::map<std::string, std::string> true_data;
+ int file_id = 1;
+ // prevent range deletions from being dropped due to becoming obsolete.
+ const Snapshot* snapshot = db_->GetSnapshot();
+
+ // range del [0, 50) in L6 file, [50, 100) in L0 file, [100, 150) in memtable
+ for (int i = 0; i < 3; i++) {
+ if (i != 0) {
+ db_->Flush(FlushOptions());
+ if (i == 1) {
+ MoveFilesToLevel(kNumLevels - 1);
+ }
+ }
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ Key(50 * i), Key(50 * (i + 1))));
+ }
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 1));
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // overlaps with L0 file but not memtable, so flush is skipped and file is
+ // ingested into L0
+ SequenceNumber last_seqno = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {60, 90}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ {{65, 70}, {70, 85}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(0, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // overlaps with L6 file but not memtable or L0 file, so flush is skipped and
+ // file is ingested into L5
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {10, 40}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // overlaps with L5 file but not memtable or L0 file, so flush is skipped and
+ // file is ingested into L4
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {}, {}, {{5, 15}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // The ingested file overlaps with the memtable, so a flush is triggered
+ // before the file is ingested so that the ingested data is considered the
+ // newest. The L0 file count therefore increases by two.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {100, 140}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ file_id++, write_global_seqno, verify_checksums_before_ingest,
+ &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), ++last_seqno);
+ ASSERT_EQ(4, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(1, NumTableFilesAtLevel(options.num_levels - 1));
+
+ // snapshot unneeded now that all range deletions are persisted
+ db_->ReleaseSnapshot(snapshot);
+
+ // Overlaps with nothing, so the file is placed at the bottom level and no
+ // seqnum is consumed.
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {151, 175}, {ValueType::kTypeValue, ValueType::kTypeValue},
+ {{160, 200}}, file_id++, write_global_seqno,
+ verify_checksums_before_ingest, &true_data));
+ ASSERT_EQ(dbfull()->GetLatestSequenceNumber(), last_seqno);
+ ASSERT_EQ(4, NumTableFilesAtLevel(0));
+ ASSERT_EQ(1, NumTableFilesAtLevel(kNumLevels - 2));
+ ASSERT_EQ(2, NumTableFilesAtLevel(options.num_levels - 1));
+}
+
+TEST_F(ExternalSSTFileBasicTest, AdjacentRangeDeletionTombstones) {
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file8.sst (delete 300 => 400)
+ std::string file8 = sst_files_dir_ + "file8.sst";
+ ASSERT_OK(sst_file_writer.Open(file8));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(300), Key(400)));
+ ExternalSstFileInfo file8_info;
+ Status s = sst_file_writer.Finish(&file8_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file8_info.file_path, file8);
+ ASSERT_EQ(file8_info.num_entries, 0);
+ ASSERT_EQ(file8_info.smallest_key, "");
+ ASSERT_EQ(file8_info.largest_key, "");
+ ASSERT_EQ(file8_info.num_range_del_entries, 1);
+ ASSERT_EQ(file8_info.smallest_range_del_key, Key(300));
+ ASSERT_EQ(file8_info.largest_range_del_key, Key(400));
+
+ // file9.sst (delete 400 => 500)
+ std::string file9 = sst_files_dir_ + "file9.sst";
+ ASSERT_OK(sst_file_writer.Open(file9));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500)));
+ ExternalSstFileInfo file9_info;
+ s = sst_file_writer.Finish(&file9_info);
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(file9_info.file_path, file9);
+ ASSERT_EQ(file9_info.num_entries, 0);
+ ASSERT_EQ(file9_info.smallest_key, "");
+ ASSERT_EQ(file9_info.largest_key, "");
+ ASSERT_EQ(file9_info.num_range_del_entries, 1);
+ ASSERT_EQ(file9_info.smallest_range_del_key, Key(400));
+ ASSERT_EQ(file9_info.largest_range_del_key, Key(500));
+
+ // Range deletion tombstones are exclusive on their end key, so these SSTs
+ // should not be considered overlapping.
+ s = DeprecatedAddFile({file8, file9});
+ ASSERT_OK(s) << s.ToString();
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ DestroyAndRecreateExternalSSTFilesDir();
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) {
+ bool change_checksum_called = false;
+ const auto& change_checksum = [&](void* arg) {
+ if (!change_checksum_called) {
+ char* buf = reinterpret_cast<char*>(arg);
+ assert(nullptr != buf);
+ buf[0] ^= 0x1;
+ change_checksum_called = true;
+ }
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteMaybeCompressedBlock:TamperWithChecksum",
+ change_checksum);
+ SyncPoint::GetInstance()->EnableProcessing();
+ int file_id = 0;
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+ Status s = GenerateAndAddExternalFile(
+ options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++,
+ write_global_seqno, verify_checksums_before_ingest, &true_data);
+ if (verify_checksums_before_ingest) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ change_checksum_called = false;
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) {
+ if (!random_rwfile_supported_) {
+ ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support");
+ return;
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+ int file_id = 0;
+ EnvOptions env_options;
+ do {
+ Options options = CurrentOptions();
+ std::string file_path = sst_files_dir_ + std::to_string(file_id++);
+ SstFileWriter sst_file_writer(env_options, options);
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i != 100; ++i) {
+ std::string key = Key(i);
+ std::string value = Key(i) + std::to_string(0);
+ ASSERT_OK(sst_file_writer.Put(key, value));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+ {
+ // Get file size
+ uint64_t file_size = 0;
+ ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+ ASSERT_GT(file_size, 8);
+ std::unique_ptr<RandomRWFile> rwfile;
+ ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+ // Manually corrupt the file.
+ // We deterministically corrupt the first byte because we currently cannot
+ // choose a random offset: the properties block is not checksummed at
+ // present, so a corruption landing there would go undetected.
+ const uint64_t offset = 0;
+ char scratch[8] = {0};
+ Slice buf;
+ ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+ scratch[0] ^= 0xff; // flip one bit
+ ASSERT_OK(rwfile->Write(offset, buf));
+ }
+ // Ingest file.
+ IngestExternalFileOptions ifo;
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ s = db_->IngestExternalFile({file_path}, ifo);
+ if (ifo.verify_checksums_before_ingest) {
+ ASSERT_NOK(s);
+ } else {
+ ASSERT_OK(s);
+ }
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_P(ExternalSSTFileBasicTest, IngestExternalFileWithCorruptedPropsBlock) {
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ if (!verify_checksums_before_ingest) {
+ ROCKSDB_GTEST_BYPASS("Bypassing test when !verify_checksums_before_ingest");
+ return;
+ }
+ if (!random_rwfile_supported_) {
+ ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support");
+ return;
+ }
+ uint64_t props_block_offset = 0;
+ size_t props_block_size = 0;
+ const auto& get_props_block_offset = [&](void* arg) {
+ props_block_offset = *reinterpret_cast<uint64_t*>(arg);
+ };
+ const auto& get_props_block_size = [&](void* arg) {
+ props_block_size = *reinterpret_cast<uint64_t*>(arg);
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockOffset",
+ get_props_block_offset);
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WritePropertiesBlock:GetPropsBlockSize",
+ get_props_block_size);
+ SyncPoint::GetInstance()->EnableProcessing();
+ int file_id = 0;
+ Random64 rand(time(nullptr));
+ do {
+ std::string file_path = sst_files_dir_ + std::to_string(file_id++);
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ Status s = sst_file_writer.Open(file_path);
+ ASSERT_OK(s);
+ for (int i = 0; i != 100; ++i) {
+ std::string key = Key(i);
+ std::string value = Key(i) + std::to_string(0);
+ ASSERT_OK(sst_file_writer.Put(key, value));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+
+ {
+ std::unique_ptr<RandomRWFile> rwfile;
+ ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions()));
+ // Manually corrupt the file
+ ASSERT_GT(props_block_size, 8);
+ uint64_t offset =
+ props_block_offset + rand.Next() % (props_block_size - 8);
+ char scratch[8] = {0};
+ Slice buf;
+ ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch));
+ scratch[0] ^= 0xff; // flip one bit
+ ASSERT_OK(rwfile->Write(offset, buf));
+ }
+
+ // Ingest file.
+ IngestExternalFileOptions ifo;
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ ifo.verify_checksums_before_ingest = true;
+ s = db_->IngestExternalFile({file_path}, ifo);
+ ASSERT_NOK(s);
+ } while (ChangeOptionsForFileIngestionTest());
+}
+
+TEST_F(ExternalSSTFileBasicTest, OverlappingFiles) {
+ Options options = CurrentOptions();
+
+ std::vector<std::string> files;
+ {
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ ASSERT_OK(sst_file_writer.Put("a", "z"));
+ ASSERT_OK(sst_file_writer.Put("i", "m"));
+ ExternalSstFileInfo file1_info;
+ ASSERT_OK(sst_file_writer.Finish(&file1_info));
+ files.push_back(std::move(file1));
+ }
+ {
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ ASSERT_OK(sst_file_writer.Put("i", "k"));
+ ExternalSstFileInfo file2_info;
+ ASSERT_OK(sst_file_writer.Finish(&file2_info));
+ files.push_back(std::move(file2));
+ }
+
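+ // The two files overlap on key "i". Ingesting them together, file2's value
+ // wins (Get("i") returns "k") and both land as separate L0 files.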
+ IngestExternalFileOptions ifo;
+ ASSERT_OK(db_->IngestExternalFile(files, ifo));
+ ASSERT_EQ(Get("a"), "z");
+ ASSERT_EQ(Get("i"), "k");
+
+ int total_keys = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ total_keys++;
+ }
+ delete iter;
+ ASSERT_EQ(total_keys, 2);
+
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestFileAfterDBPut) {
+ // Repro for https://github.com/facebook/rocksdb/issues/6245.
+ // Flush three files to L0, then ingest one more file to trigger an L0->L1
+ // compaction via trivial move. The bug occurred when L1 files were
+ // incorrectly sorted, causing `Get()` to return an old value for "k".
+ Options options = CurrentOptions();
+
+ ASSERT_OK(Put("k", "a"));
+ Flush();
+ ASSERT_OK(Put("k", "a"));
+ Flush();
+ ASSERT_OK(Put("k", "a"));
+ Flush();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Current file size should be 0 after sst_file_writer init and before
+ // opening a file.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ ASSERT_OK(sst_file_writer.Put("k", "b"));
+
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s) << s.ToString();
+
+ // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ IngestExternalFileOptions ifo;
+ s = db_->IngestExternalFile({file1}, ifo);
+ ASSERT_OK(s);
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(Get("k"), "b");
+}
+
+TEST_F(ExternalSSTFileBasicTest, IngestWithTemperature) {
+ Options options = CurrentOptions();
+ const ImmutableCFOptions ioptions(options);
+ options.bottommost_temperature = Temperature::kWarm;
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ options.level0_file_num_compaction_trigger = 2;
+ Reopen(options);
+
+ auto size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+
+ // create file01.sst (1000 => 1099) and ingest it
+ std::string file1 = sst_files_dir_ + "file01.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 1000; k < 1100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ Status s = sst_file_writer.Finish(&file1_info);
+ ASSERT_OK(s);
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(1000));
+ ASSERT_EQ(file1_info.largest_key, Key(1099));
+
+ std::vector<std::string> files;
+ std::vector<std::string> files_checksums;
+ std::vector<std::string> files_checksum_func_names;
+ Temperature file_temperature = Temperature::kWarm;
+
+ files.push_back(file1);
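+ // Ingest through IngestExternalFiles() so the request can carry per-file
+ // checksums and an explicit file temperature.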
+ IngestExternalFileOptions in_opts;
+ in_opts.move_files = false;
+ in_opts.snapshot_consistency = true;
+ in_opts.allow_global_seqno = false;
+ in_opts.allow_blocking_flush = false;
+ in_opts.write_global_seqno = true;
+ in_opts.verify_file_checksum = false;
+ IngestExternalFileArg arg;
+ arg.column_family = db_->DefaultColumnFamily();
+ arg.external_files = files;
+ arg.options = in_opts;
+ arg.files_checksums = files_checksums;
+ arg.files_checksum_func_names = files_checksum_func_names;
+ arg.file_temperature = file_temperature;
+ s = db_->IngestExternalFiles({arg});
+ ASSERT_OK(s);
+
+ // check the temperature of the file being ingested
+ ColumnFamilyMetaData metadata;
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(1, metadata.file_count);
+ ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 1);
+
+ // non-bottommost file still has unknown temperature
+ ASSERT_OK(Put("foo", "bar"));
+ ASSERT_OK(Put("bar", "bar"));
+ ASSERT_OK(Flush());
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ // reopen and check the information is persisted
+ Reopen(options);
+ db_->GetColumnFamilyMetaData(&metadata);
+ ASSERT_EQ(2, metadata.file_count);
+ ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature);
+ ASSERT_EQ(Temperature::kWarm, metadata.levels[6].files[0].temperature);
+ size = GetSstSizeHelper(Temperature::kUnknown);
+ ASSERT_GT(size, 0);
+ size = GetSstSizeHelper(Temperature::kWarm);
+ ASSERT_GT(size, 0);
+
+ // check other non-existent temperatures
+ size = GetSstSizeHelper(Temperature::kHot);
+ ASSERT_EQ(size, 0);
+ size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_EQ(size, 0);
+ std::string prop;
+ ASSERT_TRUE(dbfull()->GetProperty(
+ DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22),
+ &prop));
+ ASSERT_EQ(std::atoi(prop.c_str()), 0);
+}
+
+TEST_F(ExternalSSTFileBasicTest, FailIfNotBottommostLevel) {
+ Options options = GetDefaultOptions();
+
+ std::string file_path = sst_files_dir_ + std::to_string(1);
+ SstFileWriter sfw(EnvOptions(), options);
+
+ ASSERT_OK(sfw.Open(file_path));
+ ASSERT_OK(sfw.Put("b", "dontcare"));
+ ASSERT_OK(sfw.Finish());
+
+ // Test universal compaction + ingest with snapshot consistency
+ options.create_if_missing = true;
+ options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+ DestroyAndReopen(options);
+ {
+ const Snapshot* snapshot = db_->GetSnapshot();
+ ManagedSnapshot snapshot_guard(db_, snapshot);
+ IngestExternalFileOptions ifo;
+ ifo.fail_if_not_bottommost_level = true;
+ ifo.snapshot_consistency = true;
+ const Status s = db_->IngestExternalFile({file_path}, ifo);
+ ASSERT_TRUE(s.IsTryAgain());
+ }
+
+ // Test level compaction
+ options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+ ASSERT_OK(db_->Put(WriteOptions(), "a", "dontcare"));
+ ASSERT_OK(db_->Put(WriteOptions(), "c", "dontcare"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
+ ASSERT_OK(db_->Put(WriteOptions(), "b", "dontcare"));
+ ASSERT_OK(db_->Put(WriteOptions(), "d", "dontcare"));
+ ASSERT_OK(db_->Flush(FlushOptions()));
+
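+ // After forcing everything into the bottommost level, the file keyed "b"
+ // overlaps that level, so it cannot be placed there and ingestion must
+ // return TryAgain.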
+ {
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ IngestExternalFileOptions ifo;
+ ifo.fail_if_not_bottommost_level = true;
+ const Status s = db_->IngestExternalFile({file_path}, ifo);
+ ASSERT_TRUE(s.IsTryAgain());
+ }
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifyChecksum) {
+ const std::string kPutVal = "put_val";
+ const std::string kIngestedVal = "ingested_val";
+
+ ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+ ASSERT_OK(Flush());
+
+ std::string external_file = sst_files_dir_ + "/file_to_ingest.sst";
+ {
+ SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+
+ ASSERT_OK(sst_file_writer.Open(external_file));
+ ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+
+ ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+ IngestExternalFileOptions()));
+
+ ASSERT_OK(db_->VerifyChecksum());
+}
+
+TEST_F(ExternalSSTFileBasicTest, VerifySstUniqueId) {
+ const std::string kPutVal = "put_val";
+ const std::string kIngestedVal = "ingested_val";
+
+ ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+ ASSERT_OK(Flush());
+
+ std::string external_file = sst_files_dir_ + "/file_to_ingest.sst";
+ {
+ SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+
+ ASSERT_OK(sst_file_writer.Open(external_file));
+ ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+
+ ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+ IngestExternalFileOptions()));
+
+ // Test ingesting a file without session_id and db_id (for example, one
+ // generated by an older version of the sst writer)
+ SyncPoint::GetInstance()->SetCallBack(
+ "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
+ auto props = static_cast<TableProperties*>(props_vs);
+ // clear the table properties' session_id and db_id
+ props->db_session_id = "";
+ props->db_id = "";
+ });
+ std::atomic_int skipped = 0, passed = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::SkippedVerifyUniqueId",
+ [&](void* /*arg*/) { skipped++; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::PassedVerifyUniqueId",
+ [&](void* /*arg*/) { passed++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
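+ // Reopen so both existing table files (one flushed, one ingested) are
+ // opened again and their unique ids verified against the manifest.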
+ auto options = CurrentOptions();
+ ASSERT_TRUE(options.verify_sst_unique_id_in_manifest);
+ Reopen(options);
+ ASSERT_EQ(skipped, 0);
+ ASSERT_EQ(passed, 2); // one flushed + one ingested
+
+ external_file = sst_files_dir_ + "/file_to_ingest2.sst";
+ {
+ SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+
+ ASSERT_OK(sst_file_writer.Open(external_file));
+ ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+
+ ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+ IngestExternalFileOptions()));
+
+ // Two table file opens skipping verification:
+ // * ExternalSstFileIngestionJob::GetIngestedFileInfo
+ // * TableCache::GetTableReader
+ ASSERT_EQ(skipped, 2);
+ ASSERT_EQ(passed, 2);
+
+ // Check same after re-open (except no GetIngestedFileInfo)
+ skipped = 0;
+ passed = 0;
+ Reopen(options);
+ ASSERT_EQ(skipped, 1);
+ ASSERT_EQ(passed, 2);
+}
+
+TEST_F(ExternalSSTFileBasicTest, StableSnapshotWhileLoggingToManifest) {
+ const std::string kPutVal = "put_val";
+ const std::string kIngestedVal = "ingested_val";
+
+ ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+ ASSERT_OK(Flush());
+
+ std::string external_file = sst_files_dir_ + "/file_to_ingest.sst";
+ {
+ SstFileWriter sst_file_writer{EnvOptions(), CurrentOptions()};
+ ASSERT_OK(sst_file_writer.Open(external_file));
+ ASSERT_OK(sst_file_writer.Put("k", kIngestedVal));
+ ASSERT_OK(sst_file_writer.Finish());
+ }
+
+ const Snapshot* snapshot = nullptr;
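+ // Capture a snapshot from inside the ingestion's manifest write, before the
+ // ingested file becomes visible, and verify it still sees the pre-ingestion
+ // value.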
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void* /* arg */) {
+ // prevent the background compaction job from calling this callback
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ snapshot = db_->GetSnapshot();
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, "k", &value));
+ ASSERT_EQ(kPutVal, value);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(db_->IngestExternalFile(db_->DefaultColumnFamily(), {external_file},
+ IngestExternalFileOptions()));
+ auto ingested_file_seqno = db_->GetLatestSequenceNumber();
+ ASSERT_NE(nullptr, snapshot);
+ // snapshot is taken before SST ingestion is done
+ ASSERT_EQ(ingested_file_seqno, snapshot->GetSequenceNumber() + 1);
+
+ ReadOptions read_opts;
+ read_opts.snapshot = snapshot;
+ std::string value;
+ ASSERT_OK(db_->Get(read_opts, "k", &value));
+ ASSERT_EQ(kPutVal, value);
+ db_->ReleaseSnapshot(snapshot);
+
+ // After reopen, the sequence number should be caught up such that the
+ // ingested value is read
+ Reopen(CurrentOptions());
+ ASSERT_OK(db_->Get(ReadOptions(), "k", &value));
+ ASSERT_EQ(kIngestedVal, value);
+
+ // A new write should get a higher seqno than the ingested file
+ ASSERT_OK(Put("k", kPutVal, WriteOptions()));
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), ingested_file_seqno + 1);
+}
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileBasicTest, ExternalSSTFileBasicTest,
+ testing::Values(std::make_tuple(true, true),
+ std::make_tuple(true, false),
+ std::make_tuple(false, true),
+ std::make_tuple(false, false)));
+
+#endif // ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.cc b/src/rocksdb/db/external_sst_file_ingestion_job.cc
new file mode 100644
index 000000000..ba1277eab
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.cc
@@ -0,0 +1,1020 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/external_sst_file_ingestion_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ExternalSstFileIngestionJob::Prepare(
+ const std::vector<std::string>& external_files_paths,
+ const std::vector<std::string>& files_checksums,
+ const std::vector<std::string>& files_checksum_func_names,
+ const Temperature& file_temperature, uint64_t next_file_number,
+ SuperVersion* sv) {
+ Status status;
+
+ // Read the information of files we are ingesting
+ for (const std::string& file_path : external_files_paths) {
+ IngestedFileInfo file_to_ingest;
+ status =
+ GetIngestedFileInfo(file_path, next_file_number++, &file_to_ingest, sv);
+ if (!status.ok()) {
+ return status;
+ }
+
+ if (file_to_ingest.cf_id !=
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily &&
+ file_to_ingest.cf_id != cfd_->GetID()) {
+ return Status::InvalidArgument(
+ "External file column family id don't match");
+ }
+
+ if (file_to_ingest.num_entries == 0 &&
+ file_to_ingest.num_range_deletions == 0) {
+ return Status::InvalidArgument("File contain no entries");
+ }
+
+ if (!file_to_ingest.smallest_internal_key.Valid() ||
+ !file_to_ingest.largest_internal_key.Valid()) {
+ return Status::Corruption("Generated table have corrupted keys");
+ }
+
+ files_to_ingest_.emplace_back(std::move(file_to_ingest));
+ }
+
+ const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+ auto num_files = files_to_ingest_.size();
+ if (num_files == 0) {
+ return Status::InvalidArgument("The list of files is empty");
+ } else if (num_files > 1) {
+ // Verify that passed files don't have overlapping ranges
+ autovector<const IngestedFileInfo*> sorted_files;
+ for (size_t i = 0; i < num_files; i++) {
+ sorted_files.push_back(&files_to_ingest_[i]);
+ }
+
+ std::sort(
+ sorted_files.begin(), sorted_files.end(),
+ [&ucmp](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
+ return sstableKeyCompare(ucmp, info1->smallest_internal_key,
+ info2->smallest_internal_key) < 0;
+ });
+
+ for (size_t i = 0; i + 1 < num_files; i++) {
+ if (sstableKeyCompare(ucmp, sorted_files[i]->largest_internal_key,
+ sorted_files[i + 1]->smallest_internal_key) >= 0) {
+ files_overlap_ = true;
+ break;
+ }
+ }
+ }
+
+ // Handle the file temperature
+ for (size_t i = 0; i < num_files; i++) {
+ files_to_ingest_[i].file_temperature = file_temperature;
+ }
+
+ if (ingestion_options_.ingest_behind && files_overlap_) {
+ return Status::NotSupported("Files have overlapping ranges");
+ }
+
+ // Copy/Move external files into DB
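+ // Track which cf_paths ids receive new files so each data directory can be
+ // fsynced once afterwards.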
+ std::unordered_set<size_t> ingestion_path_ids;
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ f.copy_file = false;
+ const std::string path_outside_db = f.external_file_path;
+ const std::string path_inside_db = TableFileName(
+ cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+ if (ingestion_options_.move_files) {
+ status =
+ fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+ if (status.ok()) {
+ // It is unsafe to assume the application has synced the file and its
+ // directory before ingesting it. For the integrity of RocksDB we need
+ // to sync the file.
+ std::unique_ptr<FSWritableFile> file_to_sync;
+ Status s = fs_->ReopenWritableFile(path_inside_db, env_options_,
+ &file_to_sync, nullptr);
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:Reopen",
+ &s);
+ // Some file systems (especially remote/distributed) don't support
+ // reopening a file for writing and don't require reopening and
+ // syncing the file. Ignore the NotSupported error in that case.
+ if (!s.IsNotSupported()) {
+ status = s;
+ if (status.ok()) {
+ TEST_SYNC_POINT(
+ "ExternalSstFileIngestionJob::BeforeSyncIngestedFile");
+ status = SyncIngestedFile(file_to_sync.get());
+ TEST_SYNC_POINT(
+ "ExternalSstFileIngestionJob::AfterSyncIngestedFile");
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync ingested file %s: %s",
+ path_inside_db.c_str(), status.ToString().c_str());
+ }
+ }
+ }
+ } else if (status.IsNotSupported() &&
+ ingestion_options_.failed_move_fall_back_to_copy) {
+ // Original file is on a different FS, use copy instead of hard linking.
+ f.copy_file = true;
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Triy to link file %s but it's not supported : %s",
+ path_outside_db.c_str(), status.ToString().c_str());
+ }
+ } else {
+ f.copy_file = true;
+ }
+
+ if (f.copy_file) {
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Prepare:CopyFile",
+ nullptr);
+ // CopyFile also syncs the new file.
+ status =
+ CopyFile(fs_.get(), path_outside_db, path_inside_db, 0,
+ db_options_.use_fsync, io_tracer_, Temperature::kUnknown);
+ }
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::Prepare:FileAdded");
+ if (!status.ok()) {
+ break;
+ }
+ f.internal_file_path = path_inside_db;
+ // Initialize the checksum information of ingested files.
+ f.file_checksum = kUnknownFileChecksum;
+ f.file_checksum_func_name = kUnknownFileChecksumFuncName;
+ ingestion_path_ids.insert(f.fd.GetPathId());
+ }
+
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncDir");
+ if (status.ok()) {
+ for (auto path_id : ingestion_path_ids) {
+ status = directories_->GetDataDir(path_id)->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync directory %" ROCKSDB_PRIszt
+ " while ingest file: %s",
+ path_id, status.ToString().c_str());
+ break;
+ }
+ }
+ }
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncDir");
+
+ // Generate and check the sst file checksum. Note that, if
+ // IngestExternalFileOptions::write_global_seqno is true, we will not update
+ // the checksum information in files_to_ingest_ here, since the file is
+ // updated with the new global_seqno. After the global_seqno is updated, DB
+ // will generate the new checksum and store it in the Manifest. Additionally,
+ // if ingestion_options_.write_global_seqno == true and
+ // verify_file_checksum is false, we only check the checksum function name.
+ if (status.ok() && db_options_.file_checksum_gen_factory != nullptr) {
+ if (ingestion_options_.verify_file_checksum == false &&
+ files_checksums.size() == files_to_ingest_.size() &&
+ files_checksum_func_names.size() == files_to_ingest_.size()) {
+ // Only when verify_file_checksum == false and checksums for the ingested
+ // files are provided does DB use the provided checksums instead of
+ // generating checksums for the ingested files.
+ need_generate_file_checksum_ = false;
+ } else {
+ need_generate_file_checksum_ = true;
+ }
+ FileChecksumGenContext gen_context;
+ std::unique_ptr<FileChecksumGenerator> file_checksum_gen =
+ db_options_.file_checksum_gen_factory->CreateFileChecksumGenerator(
+ gen_context);
+ std::vector<std::string> generated_checksums;
+ std::vector<std::string> generated_checksum_func_names;
+ // Step 1: generate the checksum for the ingested sst files.
+ if (need_generate_file_checksum_) {
+ for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+ std::string generated_checksum;
+ std::string generated_checksum_func_name;
+ std::string requested_checksum_func_name;
+ // TODO: rate limit file reads for checksum calculation during file
+ // ingestion.
+ IOStatus io_s = GenerateOneFileChecksum(
+ fs_.get(), files_to_ingest_[i].internal_file_path,
+ db_options_.file_checksum_gen_factory.get(),
+ requested_checksum_func_name, &generated_checksum,
+ &generated_checksum_func_name,
+ ingestion_options_.verify_checksums_readahead_size,
+ db_options_.allow_mmap_reads, io_tracer_,
+ db_options_.rate_limiter.get(),
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!io_s.ok()) {
+ status = io_s;
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Sst file checksum generation of file: %s failed: %s",
+ files_to_ingest_[i].internal_file_path.c_str(),
+ status.ToString().c_str());
+ break;
+ }
+ if (ingestion_options_.write_global_seqno == false) {
+ files_to_ingest_[i].file_checksum = generated_checksum;
+ files_to_ingest_[i].file_checksum_func_name =
+ generated_checksum_func_name;
+ }
+ generated_checksums.push_back(generated_checksum);
+ generated_checksum_func_names.push_back(generated_checksum_func_name);
+ }
+ }
+
+ // Step 2: based on the verify_file_checksum and ingested checksum
+ // information, do the verification.
+ if (status.ok()) {
+ if (files_checksums.size() == files_to_ingest_.size() &&
+ files_checksum_func_names.size() == files_to_ingest_.size()) {
+ // Verify the checksum and checksum function name.
+ if (ingestion_options_.verify_file_checksum) {
+ for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+ if (files_checksum_func_names[i] !=
+ generated_checksum_func_names[i]) {
+ status = Status::InvalidArgument(
+ "Checksum function name does not match with the checksum "
+ "function name of this DB");
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Sst file checksum verification of file: %s failed: %s",
+ external_files_paths[i].c_str(), status.ToString().c_str());
+ break;
+ }
+ if (files_checksums[i] != generated_checksums[i]) {
+ status = Status::Corruption(
+ "Ingested checksum does not match with the generated "
+ "checksum");
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Sst file checksum verification of file: %s failed: %s",
+ files_to_ingest_[i].internal_file_path.c_str(),
+ status.ToString().c_str());
+ break;
+ }
+ }
+ } else {
+ // If verify_file_checksum is not enabled, we only verify the
+ // checksum function name. If it does not match, fail the ingestion.
+ // If it matches, we trust the ingested checksum information and store
+ // it in the Manifest.
+ for (size_t i = 0; i < files_to_ingest_.size(); i++) {
+ if (files_checksum_func_names[i] != file_checksum_gen->Name()) {
+ status = Status::InvalidArgument(
+ "Checksum function name does not match with the checksum "
+ "function name of this DB");
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Sst file checksum verification of file: %s failed: %s",
+ external_files_paths[i].c_str(), status.ToString().c_str());
+ break;
+ }
+ files_to_ingest_[i].file_checksum = files_checksums[i];
+ files_to_ingest_[i].file_checksum_func_name =
+ files_checksum_func_names[i];
+ }
+ }
+ } else if (files_checksums.size() != files_checksum_func_names.size() ||
+ (files_checksums.size() == files_checksum_func_names.size() &&
+ files_checksums.size() != 0)) {
+ // The checksum and checksum function name vectors are not both empty,
+ // yet they are incomplete.
+ status = Status::InvalidArgument(
+ "The checksum information of ingested sst files are nonempty and "
+ "the size of checksums or the size of the checksum function "
+ "names "
+ "does not match with the number of ingested sst files");
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "The ingested sst files checksum information is incomplete: %s",
+ status.ToString().c_str());
+ }
+ }
+ }
+
+ // TODO: The following is duplicated with Cleanup().
+ if (!status.ok()) {
+ IOOptions io_opts;
+ // We failed, remove all files that we copied into the db
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ if (f.internal_file_path.empty()) {
+ continue;
+ }
+ Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+
+ return status;
+}
+
+Status ExternalSstFileIngestionJob::NeedsFlush(bool* flush_needed,
+ SuperVersion* super_version) {
+ autovector<Range> ranges;
+ autovector<std::string> keys;
+ size_t ts_sz = cfd_->user_comparator()->timestamp_size();
+ if (ts_sz) {
+ // Check all ranges [begin, end] inclusively. Add maximum
+ // timestamp to include all `begin` keys, and add minimal timestamp to
+ // include all `end` keys.
+ for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+ std::string begin_str;
+ std::string end_str;
+ AppendUserKeyWithMaxTimestamp(
+ &begin_str, file_to_ingest.smallest_internal_key.user_key(), ts_sz);
+ AppendKeyWithMinTimestamp(
+ &end_str, file_to_ingest.largest_internal_key.user_key(), ts_sz);
+ keys.emplace_back(std::move(begin_str));
+ keys.emplace_back(std::move(end_str));
+ }
+ for (size_t i = 0; i < files_to_ingest_.size(); ++i) {
+ ranges.emplace_back(keys[2 * i], keys[2 * i + 1]);
+ }
+ } else {
+ for (const IngestedFileInfo& file_to_ingest : files_to_ingest_) {
+ ranges.emplace_back(file_to_ingest.smallest_internal_key.user_key(),
+ file_to_ingest.largest_internal_key.user_key());
+ }
+ }
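+ // Check the assembled ranges against the memtables; needing a flush is an
+ // error unless blocking flush is allowed.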
+ Status status = cfd_->RangesOverlapWithMemtables(
+ ranges, super_version, db_options_.allow_data_in_errors, flush_needed);
+ if (status.ok() && *flush_needed &&
+ !ingestion_options_.allow_blocking_flush) {
+ status = Status::InvalidArgument("External file requires flush");
+ }
+ return status;
+}
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ExternalSstFileIngestionJob::Run() {
+ Status status;
+ SuperVersion* super_version = cfd_->GetSuperVersion();
+#ifndef NDEBUG
+ // We should never run the job with a memtable that is overlapping
+ // with the files we are ingesting
+ bool need_flush = false;
+ status = NeedsFlush(&need_flush, super_version);
+ if (!status.ok()) {
+ return status;
+ }
+ if (need_flush) {
+ return Status::TryAgain();
+ }
+ assert(status.ok() && need_flush == false);
+#endif
+
+ bool force_global_seqno = false;
+
+ if (ingestion_options_.snapshot_consistency && !db_snapshots_->empty()) {
+ // We need to assign a global sequence number to all the files even
+ // if they don't overlap with any ranges, since we have snapshots
+ force_global_seqno = true;
+ }
+ // It is safe to use this instead of LastAllocatedSequence since we are
+ // the only active writer, and hence they are equal
+ SequenceNumber last_seqno = versions_->LastSequence();
+ edit_.SetColumnFamily(cfd_->GetID());
+ // The levels that the files will be ingested into
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ SequenceNumber assigned_seqno = 0;
+ if (ingestion_options_.ingest_behind) {
+ status = CheckLevelForIngestedBehindFile(&f);
+ } else {
+ status = AssignLevelAndSeqnoForIngestedFile(
+ super_version, force_global_seqno, cfd_->ioptions()->compaction_style,
+ last_seqno, &f, &assigned_seqno);
+ }
+
+ // Modify the smallest/largest internal key to include the sequence number
+ // that we just learned. Only overwrite sequence number zero. There could
+ // be a nonzero sequence number already to indicate a range tombstone's
+ // exclusive endpoint.
+ ParsedInternalKey smallest_parsed, largest_parsed;
+ if (status.ok()) {
+ status = ParseInternalKey(*f.smallest_internal_key.rep(),
+ &smallest_parsed, false /* log_err_key */);
+ }
+ if (status.ok()) {
+ status = ParseInternalKey(*f.largest_internal_key.rep(), &largest_parsed,
+ false /* log_err_key */);
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ if (smallest_parsed.sequence == 0) {
+ UpdateInternalKey(f.smallest_internal_key.rep(), assigned_seqno,
+ smallest_parsed.type);
+ }
+ if (largest_parsed.sequence == 0) {
+ UpdateInternalKey(f.largest_internal_key.rep(), assigned_seqno,
+ largest_parsed.type);
+ }
+
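+ // Write the chosen global seqno into the file when write_global_seqno is
+ // set; otherwise just record it on the file info.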
+ status = AssignGlobalSeqnoForIngestedFile(&f, assigned_seqno);
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::Run",
+ &assigned_seqno);
+ if (assigned_seqno > last_seqno) {
+ assert(assigned_seqno == last_seqno + 1);
+ last_seqno = assigned_seqno;
+ ++consumed_seqno_count_;
+ }
+ if (!status.ok()) {
+ return status;
+ }
+
+ status = GenerateChecksumForIngestedFile(&f);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // We use the import time as the ancester time. This is the time the data
+ // is written to the database.
+ int64_t temp_current_time = 0;
+ uint64_t current_time = kUnknownFileCreationTime;
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+ if (clock_->GetCurrentTime(&temp_current_time).ok()) {
+ current_time = oldest_ancester_time =
+ static_cast<uint64_t>(temp_current_time);
+ }
+ FileMetaData f_metadata(
+ f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(),
+ f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno,
+ f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber,
+ oldest_ancester_time, current_time, f.file_checksum,
+ f.file_checksum_func_name, f.unique_id);
+ f_metadata.temperature = f.file_temperature;
+ edit_.AddFile(f.picked_level, f_metadata);
+ }
+ return status;
+}
+
+void ExternalSstFileIngestionJob::UpdateStats() {
+ // Update internal stats for new ingested files
+ uint64_t total_keys = 0;
+ uint64_t total_l0_files = 0;
+ uint64_t total_time = clock_->NowMicros() - job_start_time_;
+
+ EventLoggerStream stream = event_logger_->Log();
+ stream << "event"
+ << "ingest_finished";
+ stream << "files_ingested";
+ stream.StartArray();
+
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ InternalStats::CompactionStats stats(
+ CompactionReason::kExternalSstIngestion, 1);
+ stats.micros = total_time;
+ // If actual copy occurred for this file, then we need to count the file
+ // size as the actual bytes written. If the file was linked, then we ignore
+ // the bytes written for file metadata.
+ // TODO (yanqin) maybe account for file metadata bytes for exact accuracy?
+ if (f.copy_file) {
+ stats.bytes_written = f.fd.GetFileSize();
+ } else {
+ stats.bytes_moved = f.fd.GetFileSize();
+ }
+ stats.num_output_files = 1;
+ cfd_->internal_stats()->AddCompactionStats(f.picked_level,
+ Env::Priority::USER, stats);
+ cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_INGESTED_ADD_FILE,
+ f.fd.GetFileSize());
+ total_keys += f.num_entries;
+ if (f.picked_level == 0) {
+ total_l0_files += 1;
+ }
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[AddFile] External SST file %s was ingested in L%d with path %s "
+ "(global_seqno=%" PRIu64 ")\n",
+ f.external_file_path.c_str(), f.picked_level,
+ f.internal_file_path.c_str(), f.assigned_seqno);
+ stream << "file" << f.internal_file_path << "level" << f.picked_level;
+ }
+ stream.EndArray();
+
+ stream << "lsm_state";
+ stream.StartArray();
+ auto vstorage = cfd_->current()->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_KEYS_TOTAL,
+ total_keys);
+ cfd_->internal_stats()->AddCFStats(InternalStats::INGESTED_NUM_FILES_TOTAL,
+ files_to_ingest_.size());
+ cfd_->internal_stats()->AddCFStats(
+ InternalStats::INGESTED_LEVEL0_NUM_FILES_TOTAL, total_l0_files);
+}
+
+void ExternalSstFileIngestionJob::Cleanup(const Status& status) {
+ IOOptions io_opts;
+ if (!status.ok()) {
+ // We failed to add the files to the database
+ // remove all the files we copied
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ if (f.internal_file_path.empty()) {
+ continue;
+ }
+ Status s = fs_->DeleteFile(f.internal_file_path, io_opts, nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ consumed_seqno_count_ = 0;
+ files_overlap_ = false;
+ } else if (status.ok() && ingestion_options_.move_files) {
+ // The files were moved and added successfully, remove original file links
+ for (IngestedFileInfo& f : files_to_ingest_) {
+ Status s = fs_->DeleteFile(f.external_file_path, io_opts, nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "%s was added to DB successfully but failed to remove original "
+ "file link : %s",
+ f.external_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+}
+
+Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
+ const std::string& external_file, uint64_t new_file_number,
+ IngestedFileInfo* file_to_ingest, SuperVersion* sv) {
+ file_to_ingest->external_file_path = external_file;
+
+ // Get external file size
+ Status status = fs_->GetFileSize(external_file, IOOptions(),
+ &file_to_ingest->file_size, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Assign FD with number
+ file_to_ingest->fd =
+ FileDescriptor(new_file_number, 0, file_to_ingest->file_size);
+
+ // Create TableReader for external file
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<FSRandomAccessFile> sst_file;
+ std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+ status =
+ fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+ sst_file_reader.reset(new RandomAccessFileReader(
+ std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_));
+
+ status = cfd_->ioptions()->table_factory->NewTableReader(
+ TableReaderOptions(
+ *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
+ env_options_, cfd_->internal_comparator(),
+ /*skip_filters*/ false, /*immortal*/ false,
+ /*force_direct_prefetch*/ false, /*level*/ -1,
+ /*block_cache_tracer*/ nullptr,
+ /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
+ /*cur_file_num*/ new_file_number),
+ std::move(sst_file_reader), file_to_ingest->file_size, &table_reader);
+ if (!status.ok()) {
+ return status;
+ }
+
+ if (ingestion_options_.verify_checksums_before_ingest) {
+ // If customized readahead size is needed, we can pass a user option
+ // all the way to here. Right now we just rely on the default readahead
+ // to keep things simple.
+ ReadOptions ro;
+ ro.readahead_size = ingestion_options_.verify_checksums_readahead_size;
+ status = table_reader->VerifyChecksum(
+ ro, TableReaderCaller::kExternalSSTIngestion);
+ }
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Get the external file properties
+ auto props = table_reader->GetTableProperties();
+ const auto& uprops = props->user_collected_properties;
+
+ // Get table version
+ auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
+ if (version_iter == uprops.end()) {
+ return Status::Corruption("External file version not found");
+ }
+ file_to_ingest->version = DecodeFixed32(version_iter->second.c_str());
+
+ auto seqno_iter = uprops.find(ExternalSstFilePropertyNames::kGlobalSeqno);
+ if (file_to_ingest->version == 2) {
+ // version 2 implies that we have a global sequence number
+ if (seqno_iter == uprops.end()) {
+ return Status::Corruption(
+ "External file global sequence number not found");
+ }
+
+ // Set the global sequence number
+ file_to_ingest->original_seqno = DecodeFixed64(seqno_iter->second.c_str());
+ if (props->external_sst_file_global_seqno_offset == 0) {
+ file_to_ingest->global_seqno_offset = 0;
+ return Status::Corruption("Was not able to find file global seqno field");
+ }
+ file_to_ingest->global_seqno_offset =
+ static_cast<size_t>(props->external_sst_file_global_seqno_offset);
+ } else if (file_to_ingest->version == 1) {
+ // SST file V1 should not have global seqno field
+ assert(seqno_iter == uprops.end());
+ file_to_ingest->original_seqno = 0;
+ if (ingestion_options_.allow_blocking_flush ||
+ ingestion_options_.allow_global_seqno) {
+ return Status::InvalidArgument(
+ "External SST file V1 does not support global seqno");
+ }
+ } else {
+ return Status::InvalidArgument("External file version is not supported");
+ }
+ // Get number of entries in table
+ file_to_ingest->num_entries = props->num_entries;
+ file_to_ingest->num_range_deletions = props->num_range_deletions;
+
+ ParsedInternalKey key;
+ ReadOptions ro;
+ // While reading the external file we could cache the blocks we read in the
+ // block cache; if we later change the global seqno of this file, the cache
+ // would hold blocks whose keys carry the wrong seqno.
+ // We need to disable fill_cache so that we read from the file without
+ // updating the block cache.
+ ro.fill_cache = false;
+ std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+ ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+ std::unique_ptr<InternalIterator> range_del_iter(
+ table_reader->NewRangeTombstoneIterator(ro));
+
+ // Get first (smallest) and last (largest) key from file.
+ file_to_ingest->smallest_internal_key =
+ InternalKey("", 0, ValueType::kTypeValue);
+ file_to_ingest->largest_internal_key =
+ InternalKey("", 0, ValueType::kTypeValue);
+ bool bounds_set = false;
+ bool allow_data_in_errors = db_options_.allow_data_in_errors;
+ iter->SeekToFirst();
+ if (iter->Valid()) {
+ Status pik_status =
+ ParseInternalKey(iter->key(), &key, allow_data_in_errors);
+ if (!pik_status.ok()) {
+ return Status::Corruption("Corrupted key in external file. ",
+ pik_status.getState());
+ }
+ if (key.sequence != 0) {
+ return Status::Corruption("External file has non zero sequence number");
+ }
+ file_to_ingest->smallest_internal_key.SetFrom(key);
+
+ iter->SeekToLast();
+ pik_status = ParseInternalKey(iter->key(), &key, allow_data_in_errors);
+ if (!pik_status.ok()) {
+ return Status::Corruption("Corrupted key in external file. ",
+ pik_status.getState());
+ }
+ if (key.sequence != 0) {
+ return Status::Corruption("External file has non zero sequence number");
+ }
+ file_to_ingest->largest_internal_key.SetFrom(key);
+
+ bounds_set = true;
+ }
+
+ // We may need to adjust these key bounds, depending on whether any range
+ // deletion tombstones extend past them.
+ const Comparator* ucmp = cfd_->internal_comparator().user_comparator();
+ if (range_del_iter != nullptr) {
+ for (range_del_iter->SeekToFirst(); range_del_iter->Valid();
+ range_del_iter->Next()) {
+ Status pik_status =
+ ParseInternalKey(range_del_iter->key(), &key, allow_data_in_errors);
+ if (!pik_status.ok()) {
+ return Status::Corruption("Corrupted key in external file. ",
+ pik_status.getState());
+ }
+ RangeTombstone tombstone(key, range_del_iter->value());
+
+ InternalKey start_key = tombstone.SerializeKey();
+ if (!bounds_set ||
+ sstableKeyCompare(ucmp, start_key,
+ file_to_ingest->smallest_internal_key) < 0) {
+ file_to_ingest->smallest_internal_key = start_key;
+ }
+ InternalKey end_key = tombstone.SerializeEndKey();
+ if (!bounds_set ||
+ sstableKeyCompare(ucmp, end_key,
+ file_to_ingest->largest_internal_key) > 0) {
+ file_to_ingest->largest_internal_key = end_key;
+ }
+ bounds_set = true;
+ }
+ }
+
+ file_to_ingest->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+ file_to_ingest->table_properties = *props;
+
+ auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
+ props->orig_file_number,
+ &(file_to_ingest->unique_id));
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to get SST unique id for file %s",
+ file_to_ingest->internal_file_path.c_str());
+ file_to_ingest->unique_id = kNullUniqueId64x2;
+ }
+
+ return status;
+}
+
+Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile(
+ SuperVersion* sv, bool force_global_seqno, CompactionStyle compaction_style,
+ SequenceNumber last_seqno, IngestedFileInfo* file_to_ingest,
+ SequenceNumber* assigned_seqno) {
+ Status status;
+ *assigned_seqno = 0;
+ if (force_global_seqno) {
+ *assigned_seqno = last_seqno + 1;
+ if (compaction_style == kCompactionStyleUniversal || files_overlap_) {
+ if (ingestion_options_.fail_if_not_bottommost_level) {
+ status = Status::TryAgain(
+ "Files cannot be ingested to Lmax. Please make sure key range of "
+ "Lmax does not overlap with files to ingest.");
+ return status;
+ }
+ file_to_ingest->picked_level = 0;
+ return status;
+ }
+ }
+
+ bool overlap_with_db = false;
+ Arena arena;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ int target_level = 0;
+ auto* vstorage = cfd_->current()->storage_info();
+
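+ // Walk levels from the top down, skipping empty levels between L0 and the
+ // base level. Stop at the first level that overlaps the file's key range;
+ // otherwise remember the deepest level the file fits into.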
+ for (int lvl = 0; lvl < cfd_->NumberLevels(); lvl++) {
+ if (lvl > 0 && lvl < vstorage->base_level()) {
+ continue;
+ }
+
+ if (vstorage->NumLevelFiles(lvl) > 0) {
+ bool overlap_with_level = false;
+ status = sv->current->OverlapWithLevelIterator(
+ ro, env_options_, file_to_ingest->smallest_internal_key.user_key(),
+ file_to_ingest->largest_internal_key.user_key(), lvl,
+ &overlap_with_level);
+ if (!status.ok()) {
+ return status;
+ }
+ if (overlap_with_level) {
+ // We must use L0 or any level higher than `lvl` to be able to overwrite
+ // the keys that we overlap with in this level. We also need to assign
+ // this file a seqno to overwrite the existing keys in level `lvl`.
+ overlap_with_db = true;
+ break;
+ }
+
+ if (compaction_style == kCompactionStyleUniversal && lvl != 0) {
+ const std::vector<FileMetaData*>& level_files =
+ vstorage->LevelFiles(lvl);
+ const SequenceNumber level_largest_seqno =
+ (*std::max_element(level_files.begin(), level_files.end(),
+ [](FileMetaData* f1, FileMetaData* f2) {
+ return f1->fd.largest_seqno <
+ f2->fd.largest_seqno;
+ }))
+ ->fd.largest_seqno;
+ // Only assign the current level's largest seqno to the file when
+ // the file fits in this level.
+ if (level_largest_seqno != 0 &&
+ IngestedFileFitInLevel(file_to_ingest, lvl)) {
+ *assigned_seqno = level_largest_seqno;
+ } else {
+ continue;
+ }
+ }
+ } else if (compaction_style == kCompactionStyleUniversal) {
+ continue;
+ }
+
+ // We don't overlap with any keys in this level, but we still need to check
+ // if our file can fit in it
+ if (IngestedFileFitInLevel(file_to_ingest, lvl)) {
+ target_level = lvl;
+ }
+ }
+ // If files overlap, we have to ingest them at level 0 and assign the newest
+ // sequence number
+ if (files_overlap_) {
+ target_level = 0;
+ *assigned_seqno = last_seqno + 1;
+ }
+
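+ // With fail_if_not_bottommost_level, any placement above the last level is
+ // rejected with TryAgain.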
+ if (ingestion_options_.fail_if_not_bottommost_level &&
+ target_level < cfd_->NumberLevels() - 1) {
+ status = Status::TryAgain(
+ "Files cannot be ingested to Lmax. Please make sure key range of Lmax "
+ "does not overlap with files to ingest.");
+ return status;
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+ &overlap_with_db);
+ file_to_ingest->picked_level = target_level;
+ if (overlap_with_db && *assigned_seqno == 0) {
+ *assigned_seqno = last_seqno + 1;
+ }
+ return status;
+}
+
+Status ExternalSstFileIngestionJob::CheckLevelForIngestedBehindFile(
+ IngestedFileInfo* file_to_ingest) {
+ auto* vstorage = cfd_->current()->storage_info();
+ // First, check whether the new file fits in the bottommost level
+ int bottom_lvl = cfd_->NumberLevels() - 1;
+ if (!IngestedFileFitInLevel(file_to_ingest, bottom_lvl)) {
+ return Status::InvalidArgument(
+ "Can't ingest_behind file as it doesn't fit "
+ "at the bottommost level!");
+ }
+
+ // Second, check whether, despite allow_ingest_behind=true, there are still
+ // files with 0 seqnums at some upper level
+ for (int lvl = 0; lvl < cfd_->NumberLevels() - 1; lvl++) {
+ for (auto file : vstorage->LevelFiles(lvl)) {
+ if (file->fd.smallest_seqno == 0) {
+ return Status::InvalidArgument(
+ "Can't ingest_behind file as despite allow_ingest_behind=true "
+ "there are files with 0 seqno in database at upper levels!");
+ }
+ }
+ }
+
+ file_to_ingest->picked_level = bottom_lvl;
+ return Status::OK();
+}
+
+Status ExternalSstFileIngestionJob::AssignGlobalSeqnoForIngestedFile(
+ IngestedFileInfo* file_to_ingest, SequenceNumber seqno) {
+ if (file_to_ingest->original_seqno == seqno) {
+ // This file already has the correct global seqno
+ return Status::OK();
+ } else if (!ingestion_options_.allow_global_seqno) {
+ return Status::InvalidArgument("Global seqno is required, but disabled");
+ } else if (file_to_ingest->global_seqno_offset == 0) {
+ return Status::InvalidArgument(
+ "Trying to set global seqno for a file that don't have a global seqno "
+ "field");
+ }
+
+ if (ingestion_options_.write_global_seqno) {
+ // Determine if we can write global_seqno to a given offset of file.
+ // If the file system does not support random write, then we should not.
+ // Otherwise we should.
+ std::unique_ptr<FSRandomRWFile> rwfile;
+ Status status = fs_->NewRandomRWFile(file_to_ingest->internal_file_path,
+ env_options_, &rwfile, nullptr);
+ TEST_SYNC_POINT_CALLBACK("ExternalSstFileIngestionJob::NewRandomRWFile",
+ &status);
+ if (status.ok()) {
+ FSRandomRWFilePtr fsptr(std::move(rwfile), io_tracer_,
+ file_to_ingest->internal_file_path);
+ std::string seqno_val;
+ PutFixed64(&seqno_val, seqno);
+ status = fsptr->Write(file_to_ingest->global_seqno_offset, seqno_val,
+ IOOptions(), nullptr);
+ if (status.ok()) {
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::BeforeSyncGlobalSeqno");
+ status = SyncIngestedFile(fsptr.get());
+ TEST_SYNC_POINT("ExternalSstFileIngestionJob::AfterSyncGlobalSeqno");
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Failed to sync ingested file %s after writing global "
+ "sequence number: %s",
+ file_to_ingest->internal_file_path.c_str(),
+ status.ToString().c_str());
+ }
+ }
+ if (!status.ok()) {
+ return status;
+ }
+ } else if (!status.IsNotSupported()) {
+ return status;
+ }
+ }
+
+ file_to_ingest->assigned_seqno = seqno;
+ return Status::OK();
+}
+
+IOStatus ExternalSstFileIngestionJob::GenerateChecksumForIngestedFile(
+ IngestedFileInfo* file_to_ingest) {
+ if (db_options_.file_checksum_gen_factory == nullptr ||
+ need_generate_file_checksum_ == false ||
+ ingestion_options_.write_global_seqno == false) {
+ // If file_checksum_gen_factory is not set, we are not able to generate
+ // the checksum. If write_global_seqno is false, we will use the file
+ // checksum generated during Prepare(), so this step is skipped.
+ return IOStatus::OK();
+ }
+ std::string file_checksum;
+ std::string file_checksum_func_name;
+ std::string requested_checksum_func_name;
+ // TODO: rate limit file reads for checksum calculation during file ingestion.
+ IOStatus io_s = GenerateOneFileChecksum(
+ fs_.get(), file_to_ingest->internal_file_path,
+ db_options_.file_checksum_gen_factory.get(), requested_checksum_func_name,
+ &file_checksum, &file_checksum_func_name,
+ ingestion_options_.verify_checksums_readahead_size,
+ db_options_.allow_mmap_reads, io_tracer_, db_options_.rate_limiter.get(),
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ file_to_ingest->file_checksum = file_checksum;
+ file_to_ingest->file_checksum_func_name = file_checksum_func_name;
+ return IOStatus::OK();
+}
+
+bool ExternalSstFileIngestionJob::IngestedFileFitInLevel(
+ const IngestedFileInfo* file_to_ingest, int level) {
+ if (level == 0) {
+ // Files can always fit in L0
+ return true;
+ }
+
+ auto* vstorage = cfd_->current()->storage_info();
+ Slice file_smallest_user_key(
+ file_to_ingest->smallest_internal_key.user_key());
+ Slice file_largest_user_key(file_to_ingest->largest_internal_key.user_key());
+
+ if (vstorage->OverlapInLevel(level, &file_smallest_user_key,
+ &file_largest_user_key)) {
+ // The file overlaps with other files in this level; we cannot
+ // add it to this level
+ return false;
+ }
+ if (cfd_->RangeOverlapWithCompaction(file_smallest_user_key,
+ file_largest_user_key, level)) {
+ // The file overlaps with a running compaction's output that will be
+ // stored in this level; we cannot add this file to this level
+ return false;
+ }
+
+ // The file does not overlap with this level's files or any running
+ // compaction's output
+ return true;
+}
+
+template <typename TWritableFile>
+Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) {
+ assert(file != nullptr);
+ if (db_options_.use_fsync) {
+ return file->Fsync(IOOptions(), nullptr);
+ } else {
+ return file->Sync(IOOptions(), nullptr);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/external_sst_file_ingestion_job.h b/src/rocksdb/db/external_sst_file_ingestion_job.h
new file mode 100644
index 000000000..ce50ae86d
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_ingestion_job.h
@@ -0,0 +1,201 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/internal_stats.h"
+#include "db/snapshot_impl.h"
+#include "env/file_system_tracer.h"
+#include "logging/event_logger.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Directories;
+class SystemClock;
+
+struct IngestedFileInfo {
+ // External file path
+ std::string external_file_path;
+ // Smallest internal key in external file
+ InternalKey smallest_internal_key;
+ // Largest internal key in external file
+ InternalKey largest_internal_key;
+ // Sequence number for keys in external file
+ SequenceNumber original_seqno;
+ // Offset of the global sequence number field in the file, will
+ // be zero if version is 1 (global seqno is not supported)
+ size_t global_seqno_offset;
+ // External file size
+ uint64_t file_size;
+ // total number of keys in external file
+ uint64_t num_entries;
+ // total number of range deletions in external file
+ uint64_t num_range_deletions;
+ // Id of the column family this file should be ingested into
+ uint32_t cf_id;
+ // TableProperties read from external file
+ TableProperties table_properties;
+ // Version of external file
+ int version;
+
+ // FileDescriptor for the file inside the DB
+ FileDescriptor fd;
+ // File path that we picked for the file inside the DB
+ std::string internal_file_path;
+ // Global sequence number that we picked for the file inside the DB
+ SequenceNumber assigned_seqno = 0;
+ // Level inside the DB we picked for the external file.
+ int picked_level = 0;
+ // Whether to copy or link the external sst file. copy_file will be set to
+ // false if ingestion_options.move_files is true and the underlying FS
+ // supports the link operation. A default value is needed to keep llvm's
+ // undefined-behavior sanity check happy. Since
+ // ingestion_options.move_files is false by default, copy_file is true
+ // by default.
+ bool copy_file = true;
+ // The checksum of the ingested file
+ std::string file_checksum;
+ // The name of the checksum function that generated the checksum
+ std::string file_checksum_func_name;
+ // The temperature of the file to be ingested
+ Temperature file_temperature = Temperature::kUnknown;
+ // Unique id of the file to be ingested
+ UniqueId64x2 unique_id{};
+};
+
+class ExternalSstFileIngestionJob {
+ public:
+ ExternalSstFileIngestionJob(
+ VersionSet* versions, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options, const EnvOptions& env_options,
+ SnapshotList* db_snapshots,
+ const IngestExternalFileOptions& ingestion_options,
+ Directories* directories, EventLogger* event_logger,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : clock_(db_options.clock),
+ fs_(db_options.fs, io_tracer),
+ versions_(versions),
+ cfd_(cfd),
+ db_options_(db_options),
+ env_options_(env_options),
+ db_snapshots_(db_snapshots),
+ ingestion_options_(ingestion_options),
+ directories_(directories),
+ event_logger_(event_logger),
+ job_start_time_(clock_->NowMicros()),
+ consumed_seqno_count_(0),
+ io_tracer_(io_tracer) {
+ assert(directories != nullptr);
+ }
+
+ // Prepare the job by copying external files into the DB.
+ Status Prepare(const std::vector<std::string>& external_files_paths,
+ const std::vector<std::string>& files_checksums,
+ const std::vector<std::string>& files_checksum_func_names,
+ const Temperature& file_temperature, uint64_t next_file_number,
+ SuperVersion* sv);
+
+ // Check if we need to flush the memtable before running the ingestion job
+ // This will be true if the files we are ingesting are overlapping with any
+ // key range in the memtable.
+ //
+ // @param super_version A referenced SuperVersion that will be held for the
+ // duration of this function.
+ //
+ // Thread-safe
+ Status NeedsFlush(bool* flush_needed, SuperVersion* super_version);
+
+ // Will execute the ingestion job and prepare edit() to be applied.
+ // REQUIRES: Mutex held
+ Status Run();
+
+ // Update column family stats.
+ // REQUIRES: Mutex held
+ void UpdateStats();
+
+ // Cleanup after successful/failed job
+ void Cleanup(const Status& status);
+
+ VersionEdit* edit() { return &edit_; }
+
+ const autovector<IngestedFileInfo>& files_to_ingest() const {
+ return files_to_ingest_;
+ }
+
+ // How many sequence numbers did we consume as part of the ingest job?
+ int ConsumedSequenceNumbersCount() const { return consumed_seqno_count_; }
+
+ private:
+ // Open the external file and populate `file_to_ingest` with all the
+ // external information we need to ingest this file.
+ Status GetIngestedFileInfo(const std::string& external_file,
+ uint64_t new_file_number,
+ IngestedFileInfo* file_to_ingest,
+ SuperVersion* sv);
+
+ // Assign `file_to_ingest` the appropriate sequence number and the lowest
+ // possible level that it can be ingested to according to compaction_style.
+ // REQUIRES: Mutex held
+ Status AssignLevelAndSeqnoForIngestedFile(SuperVersion* sv,
+ bool force_global_seqno,
+ CompactionStyle compaction_style,
+ SequenceNumber last_seqno,
+ IngestedFileInfo* file_to_ingest,
+ SequenceNumber* assigned_seqno);
+
+ // File that we want to ingest behind always goes to the lowest level;
+ // we just check that it fits in the level, that DB allows ingest_behind,
+ // and that we don't have 0 seqnums at the upper levels.
+ // REQUIRES: Mutex held
+ Status CheckLevelForIngestedBehindFile(IngestedFileInfo* file_to_ingest);
+
+ // Set the file global sequence number to `seqno`
+ Status AssignGlobalSeqnoForIngestedFile(IngestedFileInfo* file_to_ingest,
+ SequenceNumber seqno);
+ // Generate the file checksum and store in the IngestedFileInfo
+ IOStatus GenerateChecksumForIngestedFile(IngestedFileInfo* file_to_ingest);
+
+ // Check if `file_to_ingest` can fit in level `level`
+ // REQUIRES: Mutex held
+ bool IngestedFileFitInLevel(const IngestedFileInfo* file_to_ingest,
+ int level);
+
+ // Helper method to sync given file.
+ template <typename TWritableFile>
+ Status SyncIngestedFile(TWritableFile* file);
+
+ SystemClock* clock_;
+ FileSystemPtr fs_;
+ VersionSet* versions_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ const EnvOptions& env_options_;
+ SnapshotList* db_snapshots_;
+ autovector<IngestedFileInfo> files_to_ingest_;
+ const IngestExternalFileOptions& ingestion_options_;
+ Directories* directories_;
+ EventLogger* event_logger_;
+ VersionEdit edit_;
+ uint64_t job_start_time_;
+ int consumed_seqno_count_;
+ // Set in ExternalSstFileIngestionJob::Prepare(); if true, all files are
+ // ingested into L0
+ bool files_overlap_{false};
+ // Set in ExternalSstFileIngestionJob::Prepare(); if true and DB
+ // file_checksum_gen_factory is set, DB will generate a checksum for each
+ // file.
+ bool need_generate_file_checksum_{true};
+ std::shared_ptr<IOTracer> io_tracer_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/external_sst_file_test.cc b/src/rocksdb/db/external_sst_file_test.cc
new file mode 100644
index 000000000..d16f6a58c
--- /dev/null
+++ b/src/rocksdb/db/external_sst_file_test.cc
@@ -0,0 +1,2967 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "file/filename.h"
+#include "options/options_helper.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_reader.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/thread_guard.h"
+#include "utilities/fault_injection_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A test environment that can be configured to fail the Link operation.
+class ExternalSSTTestEnv : public EnvWrapper {
+ public:
+ ExternalSSTTestEnv(Env* t, bool fail_link)
+ : EnvWrapper(t), fail_link_(fail_link) {}
+ static const char* kClassName() { return "ExternalSSTTestEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ Status LinkFile(const std::string& s, const std::string& t) override {
+ if (fail_link_) {
+ return Status::NotSupported("Link failed");
+ }
+ return target()->LinkFile(s, t);
+ }
+
+ void set_fail_link(bool fail_link) { fail_link_ = fail_link; }
+
+ private:
+ bool fail_link_;
+};
+
+class ExternalSSTFileTestBase : public DBTestBase {
+ public:
+ ExternalSSTFileTestBase()
+ : DBTestBase("external_sst_file_test", /*env_do_fsync=*/true) {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ DestroyAndRecreateExternalSSTFilesDir();
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ ASSERT_OK(DestroyDir(env_, sst_files_dir_));
+ ASSERT_OK(env_->CreateDir(sst_files_dir_));
+ }
+
+ ~ExternalSSTFileTestBase() override {
+ DestroyDir(env_, sst_files_dir_).PermitUncheckedError();
+ }
+
+ protected:
+ std::string sst_files_dir_;
+};
+
+class ExternSSTFileLinkFailFallbackTest
+ : public ExternalSSTFileTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternSSTFileLinkFailFallbackTest()
+ : test_env_(new ExternalSSTTestEnv(env_, true)) {
+ options_ = CurrentOptions();
+ options_.disable_auto_compactions = true;
+ options_.env = test_env_;
+ }
+
+ void TearDown() override {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, options_));
+ delete test_env_;
+ test_env_ = nullptr;
+ }
+
+ protected:
+ Options options_;
+ ExternalSSTTestEnv* test_env_;
+};
+
+class ExternalSSTFileTest
+ : public ExternalSSTFileTestBase,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+ ExternalSSTFileTest() {}
+
+ Status GenerateOneExternalFile(
+ const Options& options, ColumnFamilyHandle* cfh,
+ std::vector<std::pair<std::string, std::string>>& data, int file_id,
+ bool sort_data, std::string* external_file_path,
+ std::map<std::string, std::string>* true_data) {
+ // Generate a file id if not provided
+ if (-1 == file_id) {
+ file_id = (++last_file_id_);
+ }
+ // Sort data if asked to do so
+ if (sort_data) {
+ std::sort(data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) < 0;
+ });
+ auto uniq_iter = std::unique(
+ data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) == 0;
+ });
+ data.resize(uniq_iter - data.begin());
+ }
+ std::string file_path = sst_files_dir_ + std::to_string(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (const auto& entry : data) {
+ s = sst_file_writer.Put(entry.first, entry.second);
+ if (!s.ok()) {
+ sst_file_writer.Finish().PermitUncheckedError();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+ if (s.ok() && external_file_path != nullptr) {
+ *external_file_path = file_path;
+ }
+ if (s.ok() && nullptr != true_data) {
+ for (const auto& entry : data) {
+ true_data->insert({entry.first, entry.second});
+ }
+ }
+ return s;
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options,
+ std::vector<std::pair<std::string, std::string>> data, int file_id = -1,
+ bool allow_global_seqno = false, bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ // Generate a file id if not provided
+ if (file_id == -1) {
+ file_id = last_file_id_ + 1;
+ last_file_id_++;
+ }
+
+ // Sort data if asked to do so
+ if (sort_data) {
+ std::sort(data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) < 0;
+ });
+ auto uniq_iter = std::unique(
+ data.begin(), data.end(),
+ [&](const std::pair<std::string, std::string>& e1,
+ const std::pair<std::string, std::string>& e2) {
+ return options.comparator->Compare(e1.first, e2.first) == 0;
+ });
+ data.resize(uniq_iter - data.begin());
+ }
+ std::string file_path = sst_files_dir_ + std::to_string(file_id);
+ SstFileWriter sst_file_writer(EnvOptions(), options, cfh);
+
+ Status s = sst_file_writer.Open(file_path);
+ if (!s.ok()) {
+ return s;
+ }
+ for (auto& entry : data) {
+ s = sst_file_writer.Put(entry.first, entry.second);
+ if (!s.ok()) {
+ sst_file_writer.Finish().PermitUncheckedError();
+ return s;
+ }
+ }
+ s = sst_file_writer.Finish();
+
+ if (s.ok()) {
+ IngestExternalFileOptions ifo;
+ ifo.allow_global_seqno = allow_global_seqno;
+ ifo.write_global_seqno = allow_global_seqno ? write_global_seqno : false;
+ ifo.verify_checksums_before_ingest = verify_checksums_before_ingest;
+ ifo.ingest_behind = ingest_behind;
+ if (cfh) {
+ s = db_->IngestExternalFile(cfh, {file_path}, ifo);
+ } else {
+ s = db_->IngestExternalFile({file_path}, ifo);
+ }
+ }
+
+ if (s.ok() && true_data) {
+ for (auto& entry : data) {
+ (*true_data)[entry.first] = entry.second;
+ }
+ }
+
+ return s;
+ }
+
+ Status GenerateAndAddExternalFiles(
+ const Options& options,
+ const std::vector<ColumnFamilyHandle*>& column_families,
+ const std::vector<IngestExternalFileOptions>& ifos,
+ std::vector<std::vector<std::pair<std::string, std::string>>>& data,
+ int file_id, bool sort_data,
+ std::vector<std::map<std::string, std::string>>& true_data) {
+ if (-1 == file_id) {
+ file_id = (++last_file_id_);
+ }
+ // Generate external SST files, one for each column family
+ size_t num_cfs = column_families.size();
+ assert(ifos.size() == num_cfs);
+ assert(data.size() == num_cfs);
+ std::vector<IngestExternalFileArg> args(num_cfs);
+ for (size_t i = 0; i != num_cfs; ++i) {
+ std::string external_file_path;
+ Status s = GenerateOneExternalFile(
+ options, column_families[i], data[i], file_id, sort_data,
+ &external_file_path,
+ true_data.size() == num_cfs ? &true_data[i] : nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ ++file_id;
+
+ args[i].column_family = column_families[i];
+ args[i].external_files.push_back(external_file_path);
+ args[i].options = ifos[i];
+ }
+ return db_->IngestExternalFiles(args);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<std::pair<int, std::string>> data,
+ int file_id = -1, bool allow_global_seqno = false,
+ bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ std::vector<std::pair<std::string, std::string>> file_data;
+ for (auto& entry : data) {
+ file_data.emplace_back(Key(entry.first), entry.second);
+ }
+ return GenerateAndAddExternalFile(options, file_data, file_id,
+ allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest,
+ ingest_behind, sort_data, true_data, cfh);
+ }
+
+ Status GenerateAndAddExternalFile(
+ const Options options, std::vector<int> keys, int file_id = -1,
+ bool allow_global_seqno = false, bool write_global_seqno = false,
+ bool verify_checksums_before_ingest = true, bool ingest_behind = false,
+ bool sort_data = false,
+ std::map<std::string, std::string>* true_data = nullptr,
+ ColumnFamilyHandle* cfh = nullptr) {
+ std::vector<std::pair<std::string, std::string>> file_data;
+ for (auto& k : keys) {
+ file_data.emplace_back(Key(k), Key(k) + std::to_string(file_id));
+ }
+ return GenerateAndAddExternalFile(options, file_data, file_id,
+ allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest,
+ ingest_behind, sort_data, true_data, cfh);
+ }
+
+ Status DeprecatedAddFile(const std::vector<std::string>& files,
+ bool move_files = false,
+ bool skip_snapshot_check = false,
+ bool skip_write_global_seqno = false) {
+ IngestExternalFileOptions opts;
+ opts.move_files = move_files;
+ opts.snapshot_consistency = !skip_snapshot_check;
+ opts.allow_global_seqno = false;
+ opts.allow_blocking_flush = false;
+ opts.write_global_seqno = !skip_write_global_seqno;
+ return db_->IngestExternalFile(files, opts);
+ }
+
+ protected:
+ int last_file_id_ = 0;
+};
+
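The helpers above thread an ingest_behind flag through to IngestExternalFileOptions. As a hedged caller-side sketch (not part of the patch; the file path is hypothetical, and the DB is assumed to have been opened with DBOptions::allow_ingest_behind = true so the bottommost level is reserved), ingest-behind looks like this:

  #include <rocksdb/db.h>

  rocksdb::Status IngestBehindExample(rocksdb::DB* db) {
    rocksdb::IngestExternalFileOptions ifo;
    ifo.ingest_behind = true;  // place the file behind all existing data
    return db->IngestExternalFile({"/tmp/old_data.sst"}, ifo);
  }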
+TEST_F(ExternalSSTFileTest, Basic) {
+ do {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Current file size should be 0 after sst_file_writer init and before
+ // opening a file.
+ ASSERT_EQ(sst_file_writer.FileSize(), 0);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ ASSERT_OK(sst_file_writer.Finish(&file1_info));
+
+ // Current file size should be non-zero after a successful write.
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ ASSERT_EQ(file1_info.num_range_del_entries, 0);
+ ASSERT_EQ(file1_info.smallest_range_del_key, "");
+ ASSERT_EQ(file1_info.largest_range_del_key, "");
+ // sst_file_writer already finished, cannot add this value
+ ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val"));
+
+ // file2.sst (100 => 199)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ // Cannot add this key because it's not after the last added key
+ ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val"));
+ ExternalSstFileInfo file2_info;
+ ASSERT_OK(sst_file_writer.Finish(&file2_info));
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 100);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(199));
+
+ // file3.sst (195 => 299)
+ // This file's values overlap with file2's values
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 195; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ ASSERT_OK(sst_file_writer.Finish(&file3_info));
+
+ // Current file size should be non-zero after a successful Finish().
+ ASSERT_GT(sst_file_writer.FileSize(), 0);
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 105);
+ ASSERT_EQ(file3_info.smallest_key, Key(195));
+ ASSERT_EQ(file3_info.largest_key, Key(299));
+
+ // file4.sst (30 => 39)
+ // This file's values overlap with file1's values
+ std::string file4 = sst_files_dir_ + "file4.sst";
+ ASSERT_OK(sst_file_writer.Open(file4));
+ for (int k = 30; k < 40; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file4_info;
+ ASSERT_OK(sst_file_writer.Finish(&file4_info));
+ ASSERT_EQ(file4_info.file_path, file4);
+ ASSERT_EQ(file4_info.num_entries, 10);
+ ASSERT_EQ(file4_info.smallest_key, Key(30));
+ ASSERT_EQ(file4_info.largest_key, Key(39));
+
+ // file5.sst (400 => 499)
+ std::string file5 = sst_files_dir_ + "file5.sst";
+ ASSERT_OK(sst_file_writer.Open(file5));
+ for (int k = 400; k < 500; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file5_info;
+ ASSERT_OK(sst_file_writer.Finish(&file5_info));
+ ASSERT_EQ(file5_info.file_path, file5);
+ ASSERT_EQ(file5_info.num_entries, 100);
+ ASSERT_EQ(file5_info.smallest_key, Key(400));
+ ASSERT_EQ(file5_info.largest_key, Key(499));
+
+ // file6.sst (delete 400 => 500)
+ std::string file6 = sst_files_dir_ + "file6.sst";
+ ASSERT_OK(sst_file_writer.Open(file6));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(400), Key(500)));
+ ExternalSstFileInfo file6_info;
+ ASSERT_OK(sst_file_writer.Finish(&file6_info));
+ ASSERT_EQ(file6_info.file_path, file6);
+ ASSERT_EQ(file6_info.num_entries, 0);
+ ASSERT_EQ(file6_info.smallest_key, "");
+ ASSERT_EQ(file6_info.largest_key, "");
+ ASSERT_EQ(file6_info.num_range_del_entries, 1);
+ ASSERT_EQ(file6_info.smallest_range_del_key, Key(400));
+ ASSERT_EQ(file6_info.largest_range_del_key, Key(500));
+
+ // file7.sst (delete 500 => 550 and 525 => 575, put even keys 520 => 598)
+ std::string file7 = sst_files_dir_ + "file7.sst";
+ ASSERT_OK(sst_file_writer.Open(file7));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(500), Key(550)));
+ for (int k = 520; k < 560; k += 2) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(525), Key(575)));
+ for (int k = 560; k < 600; k += 2) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file7_info;
+ ASSERT_OK(sst_file_writer.Finish(&file7_info));
+ ASSERT_EQ(file7_info.file_path, file7);
+ ASSERT_EQ(file7_info.num_entries, 40);
+ ASSERT_EQ(file7_info.smallest_key, Key(520));
+ ASSERT_EQ(file7_info.largest_key, Key(598));
+ ASSERT_EQ(file7_info.num_range_del_entries, 2);
+ ASSERT_EQ(file7_info.smallest_range_del_key, Key(500));
+ ASSERT_EQ(file7_info.largest_range_del_key, Key(575));
+
+ // file8.sst (delete 600 => 700)
+ std::string file8 = sst_files_dir_ + "file8.sst";
+ ASSERT_OK(sst_file_writer.Open(file8));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(600), Key(700)));
+ ExternalSstFileInfo file8_info;
+ ASSERT_OK(sst_file_writer.Finish(&file8_info));
+ ASSERT_EQ(file8_info.file_path, file8);
+ ASSERT_EQ(file8_info.num_entries, 0);
+ ASSERT_EQ(file8_info.smallest_key, "");
+ ASSERT_EQ(file8_info.largest_key, "");
+ ASSERT_EQ(file8_info.num_range_del_entries, 1);
+ ASSERT_EQ(file8_info.smallest_range_del_key, Key(600));
+ ASSERT_EQ(file8_info.largest_range_del_key, Key(700));
+
+ // Cannot create an empty sst file
+ std::string file_empty = sst_files_dir_ + "file_empty.sst";
+ ExternalSstFileInfo file_empty_info;
+ ASSERT_NOK(sst_file_writer.Finish(&file_empty_info));
+
+ DestroyAndReopen(options);
+ // Add file using file path
+ ASSERT_OK(DeprecatedAddFile({file1}));
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 100; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ // Add file while holding a snapshot will fail
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile({file2}));
+ db_->ReleaseSnapshot(s1);
+ }
+ // We can add the file after releasing the snapshot
+ ASSERT_OK(DeprecatedAddFile({file2}));
+
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 200; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ // This file has overlapping values with the existing data
+ ASSERT_NOK(DeprecatedAddFile({file3}));
+
+ // This file has overlapping values with the existing data
+ ASSERT_NOK(DeprecatedAddFile({file4}));
+
+ // Overwrite values of keys divisible by 5
+ for (int k = 0; k < 200; k += 5) {
+ ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+ }
+ ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+ // Key range of file5 (400 => 499) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file5}));
+
+ // This file's range tombstone overlaps with the existing data
+ ASSERT_NOK(DeprecatedAddFile({file6}));
+
+ // Key range of file7 (500 => 598) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file7}));
+
+ // Key range of file8 (600 => 700) doesn't overlap with any keys in the DB
+ ASSERT_OK(DeprecatedAddFile({file8}));
+
+ // Make sure values are correct before and after flush/compaction
+ for (int i = 0; i < 2; i++) {
+ for (int k = 0; k < 200; k++) {
+ std::string value = Key(k) + "_val";
+ if (k % 5 == 0) {
+ value += "_new";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 400; k < 500; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 500; k < 600; k++) {
+ std::string value = Key(k) + "_val";
+ if (k < 520 || k % 2 == 1) {
+ value = "NOT_FOUND";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ Close();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ // Delete keys in range (400 => 499)
+ for (int k = 400; k < 500; k++) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ // We deleted the keys in range (400 => 499) but still cannot add file5
+ // because of the deletion tombstones
+ ASSERT_NOK(DeprecatedAddFile({file5}));
+
+ // Compacting the DB will remove the tombstones
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Now we can add the file
+ ASSERT_OK(DeprecatedAddFile({file5}));
+
+ // Verify values of file5 in DB
+ for (int k = 400; k < 500; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+ kRangeDelSkipConfigs));
+}
+
+class SstFileWriterCollector : public TablePropertiesCollector {
+ public:
+ explicit SstFileWriterCollector(const std::string prefix) : prefix_(prefix) {
+ name_ = prefix_ + "_SstFileWriterCollector";
+ }
+
+ const char* Name() const override { return name_.c_str(); }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string count = std::to_string(count_);
+ *properties = UserCollectedProperties{
+ {prefix_ + "_SstFileWriterCollector", "YES"},
+ {prefix_ + "_Count", count},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& /*user_key*/, const Slice& /*value*/,
+ EntryType /*type*/, SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ ++count_;
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+ std::string prefix_;
+ std::string name_;
+};
+
+class SstFileWriterCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+ explicit SstFileWriterCollectorFactory(std::string prefix)
+ : prefix_(prefix), num_created_(0) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ num_created_++;
+ return new SstFileWriterCollector(prefix_);
+ }
+ const char* Name() const override { return "SstFileWriterCollectorFactory"; }
+
+ std::string prefix_;
+ uint32_t num_created_;
+};
+
+TEST_F(ExternalSSTFileTest, AddList) {
+ do {
+ Options options = CurrentOptions();
+
+ auto abc_collector = std::make_shared<SstFileWriterCollectorFactory>("abc");
+ auto xyz_collector = std::make_shared<SstFileWriterCollectorFactory>("xyz");
+
+ options.table_properties_collector_factories.emplace_back(abc_collector);
+ options.table_properties_collector_factories.emplace_back(xyz_collector);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ ASSERT_OK(sst_file_writer.Finish(&file1_info));
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+ // sst_file_writer already finished, cannot add this value
+ ASSERT_NOK(sst_file_writer.Put(Key(100), "bad_val"));
+
+ // file2.sst (100 => 199)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ // Cannot add this key because it's not after the last added key
+ ASSERT_NOK(sst_file_writer.Put(Key(99), "bad_val"));
+ ExternalSstFileInfo file2_info;
+ ASSERT_OK(sst_file_writer.Finish(&file2_info));
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 100);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(199));
+
+ // file3.sst (195 => 199)
+ // This file's values overlap with file2's values
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 195; k < 200; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file3_info;
+ ASSERT_OK(sst_file_writer.Finish(&file3_info));
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 5);
+ ASSERT_EQ(file3_info.smallest_key, Key(195));
+ ASSERT_EQ(file3_info.largest_key, Key(199));
+
+ // file4.sst (30 => 39)
+ // This file's values overlap with file1's values
+ std::string file4 = sst_files_dir_ + "file4.sst";
+ ASSERT_OK(sst_file_writer.Open(file4));
+ for (int k = 30; k < 40; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val_overlap"));
+ }
+ ExternalSstFileInfo file4_info;
+ ASSERT_OK(sst_file_writer.Finish(&file4_info));
+ ASSERT_EQ(file4_info.file_path, file4);
+ ASSERT_EQ(file4_info.num_entries, 10);
+ ASSERT_EQ(file4_info.smallest_key, Key(30));
+ ASSERT_EQ(file4_info.largest_key, Key(39));
+
+ // file5.sst (200 => 299)
+ std::string file5 = sst_files_dir_ + "file5.sst";
+ ASSERT_OK(sst_file_writer.Open(file5));
+ for (int k = 200; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file5_info;
+ ASSERT_OK(sst_file_writer.Finish(&file5_info));
+ ASSERT_EQ(file5_info.file_path, file5);
+ ASSERT_EQ(file5_info.num_entries, 100);
+ ASSERT_EQ(file5_info.smallest_key, Key(200));
+ ASSERT_EQ(file5_info.largest_key, Key(299));
+
+ // file6.sst (delete 0 => 100)
+ std::string file6 = sst_files_dir_ + "file6.sst";
+ ASSERT_OK(sst_file_writer.Open(file6));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(0), Key(75)));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(25), Key(100)));
+ ExternalSstFileInfo file6_info;
+ ASSERT_OK(sst_file_writer.Finish(&file6_info));
+ ASSERT_EQ(file6_info.file_path, file6);
+ ASSERT_EQ(file6_info.num_entries, 0);
+ ASSERT_EQ(file6_info.smallest_key, "");
+ ASSERT_EQ(file6_info.largest_key, "");
+ ASSERT_EQ(file6_info.num_range_del_entries, 2);
+ ASSERT_EQ(file6_info.smallest_range_del_key, Key(0));
+ ASSERT_EQ(file6_info.largest_range_del_key, Key(100));
+
+ // file7.sst (delete 99 => 201)
+ std::string file7 = sst_files_dir_ + "file7.sst";
+ ASSERT_OK(sst_file_writer.Open(file7));
+ ASSERT_OK(sst_file_writer.DeleteRange(Key(99), Key(201)));
+ ExternalSstFileInfo file7_info;
+ ASSERT_OK(sst_file_writer.Finish(&file7_info));
+ ASSERT_EQ(file7_info.file_path, file7);
+ ASSERT_EQ(file7_info.num_entries, 0);
+ ASSERT_EQ(file7_info.smallest_key, "");
+ ASSERT_EQ(file7_info.largest_key, "");
+ ASSERT_EQ(file7_info.num_range_del_entries, 1);
+ ASSERT_EQ(file7_info.smallest_range_del_key, Key(99));
+ ASSERT_EQ(file7_info.largest_range_del_key, Key(201));
+
+ // file_list1 has an internal key range conflict
+ std::vector<std::string> file_list0({file1, file2});
+ std::vector<std::string> file_list1({file3, file2, file1});
+ std::vector<std::string> file_list2({file5});
+ std::vector<std::string> file_list3({file3, file4});
+ std::vector<std::string> file_list4({file5, file7});
+ std::vector<std::string> file_list5({file6, file7});
+
+ DestroyAndReopen(options);
+
+ // The files in this list have key ranges that overlap with each other
+ ASSERT_NOK(DeprecatedAddFile(file_list1));
+ // Both of the following lists overlap with a range deletion tombstone.
+ ASSERT_NOK(DeprecatedAddFile(file_list4));
+ ASSERT_NOK(DeprecatedAddFile(file_list5));
+
+ // Add files using file path list
+ ASSERT_OK(DeprecatedAddFile(file_list0));
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 200; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ TablePropertiesCollection props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(props.size(), 2);
+ for (auto file_props : props) {
+ auto user_props = file_props.second->user_collected_properties;
+ ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["abc_Count"], "100");
+ ASSERT_EQ(user_props["xyz_Count"], "100");
+ }
+
+ // Add file while holding a snapshot will fail
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile(file_list2));
+ db_->ReleaseSnapshot(s1);
+ }
+ // We can add the file after releasing the snapshot
+ ASSERT_OK(DeprecatedAddFile(file_list2));
+ ASSERT_EQ(db_->GetLatestSequenceNumber(), 0U);
+ for (int k = 0; k < 300; k++) {
+ ASSERT_EQ(Get(Key(k)), Key(k) + "_val");
+ }
+
+ ASSERT_OK(db_->GetPropertiesOfAllTables(&props));
+ ASSERT_EQ(props.size(), 3);
+ for (auto file_props : props) {
+ auto user_props = file_props.second->user_collected_properties;
+ ASSERT_EQ(user_props["abc_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["xyz_SstFileWriterCollector"], "YES");
+ ASSERT_EQ(user_props["abc_Count"], "100");
+ ASSERT_EQ(user_props["xyz_Count"], "100");
+ }
+
+ // This file list has overlapping values with the existing data
+ ASSERT_NOK(DeprecatedAddFile(file_list3));
+
+ // Overwrite values of keys divisible by 5
+ for (int k = 0; k < 200; k += 5) {
+ ASSERT_OK(Put(Key(k), Key(k) + "_val_new"));
+ }
+ ASSERT_NE(db_->GetLatestSequenceNumber(), 0U);
+
+ // Make sure values are correct before and after flush/compaction
+ for (int i = 0; i < 2; i++) {
+ for (int k = 0; k < 200; k++) {
+ std::string value = Key(k) + "_val";
+ if (k % 5 == 0) {
+ value += "_new";
+ }
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ for (int k = 200; k < 300; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ // Delete keys in range (200 => 299)
+ for (int k = 200; k < 300; k++) {
+ ASSERT_OK(Delete(Key(k)));
+ }
+ // We deleted the keys in range (200 => 299) but still cannot add file5
+ // because of the deletion tombstones
+ ASSERT_NOK(DeprecatedAddFile(file_list2));
+
+ // Compacting the DB will remove the tombstones
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ // Now we can add the file
+ ASSERT_OK(DeprecatedAddFile(file_list2));
+
+ // Verify values of file5 in DB
+ for (int k = 200; k < 300; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction |
+ kRangeDelSkipConfigs));
+}
+
+TEST_F(ExternalSSTFileTest, AddListAtomicity) {
+ do {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // files[0].sst (0 => 99)
+ // files[1].sst (100 => 199)
+ // ...
+ // file[8].sst (800 => 899)
+ int n = 9;
+ std::vector<std::string> files(n);
+ std::vector<ExternalSstFileInfo> files_info(n);
+ for (int i = 0; i < n; i++) {
+ files[i] = sst_files_dir_ + "file" + std::to_string(i) + ".sst";
+ ASSERT_OK(sst_file_writer.Open(files[i]));
+ for (int k = i * 100; k < (i + 1) * 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ASSERT_OK(sst_file_writer.Finish(&files_info[i]));
+ ASSERT_EQ(files_info[i].file_path, files[i]);
+ ASSERT_EQ(files_info[i].num_entries, 100);
+ ASSERT_EQ(files_info[i].smallest_key, Key(i * 100));
+ ASSERT_EQ(files_info[i].largest_key, Key((i + 1) * 100 - 1));
+ }
+ files.push_back(sst_files_dir_ + "file" + std::to_string(n) + ".sst");
+ ASSERT_NOK(DeprecatedAddFile(files));
+ for (int k = 0; k < n * 100; k++) {
+ ASSERT_EQ("NOT_FOUND", Get(Key(k)));
+ }
+ files.pop_back();
+ ASSERT_OK(DeprecatedAddFile(files));
+ for (int k = 0; k < n * 100; k++) {
+ std::string value = Key(k) + "_val";
+ ASSERT_EQ(Get(Key(k)), value);
+ }
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+// This test reproduces a bug that could happen in some cases if the DB
+// started purging obsolete files while we were adding an external SST file.
+// This could result in the file being deleted while it is being added.
+TEST_F(ExternalSSTFileTest, PurgeObsoleteFilesBug) {
+ Options options = CurrentOptions();
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 500)
+ std::string sst_file_path = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(sst_file_path));
+ for (int i = 0; i < 500; i++) {
+ std::string k = Key(i);
+ ASSERT_OK(sst_file_writer.Put(k, k + "_val"));
+ }
+
+ ExternalSstFileInfo sst_file_info;
+ ASSERT_OK(sst_file_writer.Finish(&sst_file_info));
+
+ options.delete_obsolete_files_period_micros = 0;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:FileAdded", [&](void* /* arg */) {
+ ASSERT_OK(Put("aaa", "bbb"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("aaa", "xxx"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(DeprecatedAddFile({sst_file_path}));
+
+ for (int i = 0; i < 500; i++) {
+ std::string k = Key(i);
+ std::string v = k + "_val";
+ ASSERT_EQ(Get(k), v);
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, SkipSnapshot) {
+ Options options = CurrentOptions();
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // file1.sst (0 => 99)
+ std::string file1 = sst_files_dir_ + "file1.sst";
+ ASSERT_OK(sst_file_writer.Open(file1));
+ for (int k = 0; k < 100; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file1_info;
+ ASSERT_OK(sst_file_writer.Finish(&file1_info));
+ ASSERT_EQ(file1_info.file_path, file1);
+ ASSERT_EQ(file1_info.num_entries, 100);
+ ASSERT_EQ(file1_info.smallest_key, Key(0));
+ ASSERT_EQ(file1_info.largest_key, Key(99));
+
+ // file2.sst (100 => 299)
+ std::string file2 = sst_files_dir_ + "file2.sst";
+ ASSERT_OK(sst_file_writer.Open(file2));
+ for (int k = 100; k < 300; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file2_info;
+ ASSERT_OK(sst_file_writer.Finish(&file2_info));
+ ASSERT_EQ(file2_info.file_path, file2);
+ ASSERT_EQ(file2_info.num_entries, 200);
+ ASSERT_EQ(file2_info.smallest_key, Key(100));
+ ASSERT_EQ(file2_info.largest_key, Key(299));
+
+ ASSERT_OK(DeprecatedAddFile({file1}));
+
+ // Adding a file will fail while holding a snapshot and using the default
+ // skip_snapshot_check (false)
+ const Snapshot* s1 = db_->GetSnapshot();
+ if (s1 != nullptr) {
+ ASSERT_NOK(DeprecatedAddFile({file2}));
+ }
+
+ // Adding a file will succeed with skip_snapshot_check set to true, even
+ // while the DB is holding a snapshot
+ if (s1 != nullptr) {
+ ASSERT_OK(DeprecatedAddFile({file2}, false, true));
+ db_->ReleaseSnapshot(s1);
+ }
+
+ // file3.sst (300 => 399)
+ std::string file3 = sst_files_dir_ + "file3.sst";
+ ASSERT_OK(sst_file_writer.Open(file3));
+ for (int k = 300; k < 400; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k) + "_val"));
+ }
+ ExternalSstFileInfo file3_info;
+ ASSERT_OK(sst_file_writer.Finish(&file3_info));
+ ASSERT_EQ(file3_info.file_path, file3);
+ ASSERT_EQ(file3_info.num_entries, 100);
+ ASSERT_EQ(file3_info.smallest_key, Key(300));
+ ASSERT_EQ(file3_info.largest_key, Key(399));
+
+ // Key(300) does not exist yet; after ingesting file3 with
+ // skip_snapshot_check, it becomes visible even through the earlier snapshot
+ ASSERT_EQ(Get(Key(300)), "NOT_FOUND");
+ const Snapshot* s2 = db_->GetSnapshot();
+ ASSERT_OK(DeprecatedAddFile({file3}, false, true));
+ ASSERT_EQ(Get(Key(300)), Key(300) + ("_val"));
+ ASSERT_EQ(Get(Key(300), s2), Key(300) + ("_val"));
+
+ db_->ReleaseSnapshot(s2);
+}
+
+TEST_F(ExternalSSTFileTest, MultiThreaded) {
+ env_->skip_fsync_ = true;
+ // Bulk load 10 files, each containing 1000 keys
+ int num_files = 10;
+ int keys_per_file = 1000;
+
+ // Generate file names
+ std::vector<std::string> file_names;
+ for (int i = 0; i < num_files; i++) {
+ std::string file_name = "file_" + std::to_string(i) + ".sst";
+ file_names.push_back(sst_files_dir_ + file_name);
+ }
+
+ do {
+ Options options = CurrentOptions();
+
+ std::atomic<int> thread_num(0);
+ std::function<void()> write_file_func = [&]() {
+ int file_idx = thread_num.fetch_add(1);
+ int range_start = file_idx * keys_per_file;
+ int range_end = range_start + keys_per_file;
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ ASSERT_OK(sst_file_writer.Open(file_names[file_idx]));
+
+ for (int k = range_start; k < range_end; k++) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+ }
+
+ ASSERT_OK(sst_file_writer.Finish());
+ };
+ // Write num_files files in parallel
+ std::vector<port::Thread> sst_writer_threads;
+ for (int i = 0; i < num_files; ++i) {
+ sst_writer_threads.emplace_back(write_file_func);
+ }
+
+ for (auto& t : sst_writer_threads) {
+ t.join();
+ }
+
+ fprintf(stderr, "Wrote %d files (%d keys)\n", num_files,
+ num_files * keys_per_file);
+
+ thread_num.store(0);
+ std::atomic<int> files_added(0);
+ // Thread 0 -> Load {f0,f1}
+ // Thread 1 -> Load {f0,f1}
+ // Thread 2 -> Load {f2,f3}
+ // Thread 3 -> Load {f2,f3}
+ // Thread 4 -> Load {f4,f5}
+ // Thread 5 -> Load {f4,f5}
+ // ...
+ std::function<void()> load_file_func = [&]() {
+ // We intentionally add every file twice, and assert that it was added
+ // only once and the other add failed
+ int thread_id = thread_num.fetch_add(1);
+ int file_idx = (thread_id / 2) * 2;
+ // Sometimes we copy the files and sometimes we move (link) them;
+ // the result should be the same
+ bool move_file = (thread_id % 3 == 0);
+
+ std::vector<std::string> files_to_add;
+
+ files_to_add = {file_names[file_idx]};
+ if (static_cast<size_t>(file_idx + 1) < file_names.size()) {
+ files_to_add.push_back(file_names[file_idx + 1]);
+ }
+
+ Status s = DeprecatedAddFile(files_to_add, move_file);
+ if (s.ok()) {
+ files_added += static_cast<int>(files_to_add.size());
+ }
+ };
+
+ // Bulk load num_files files in parallel
+ std::vector<port::Thread> add_file_threads;
+ DestroyAndReopen(options);
+ for (int i = 0; i < num_files; ++i) {
+ add_file_threads.emplace_back(load_file_func);
+ }
+
+ for (auto& t : add_file_threads) {
+ t.join();
+ }
+ ASSERT_EQ(files_added.load(), num_files);
+ fprintf(stderr, "Loaded %d files (%d keys)\n", num_files,
+ num_files * keys_per_file);
+
+ // Overwrite values of keys divisible by 100
+ for (int k = 0; k < num_files * keys_per_file; k += 100) {
+ std::string key = Key(k);
+ ASSERT_OK(Put(key, key + "_new"));
+ }
+
+ for (int i = 0; i < 2; i++) {
+ // Make sure the values are correct before and after flush/compaction
+ for (int k = 0; k < num_files * keys_per_file; ++k) {
+ std::string key = Key(k);
+ std::string value = (k % 100 == 0) ? (key + "_new") : key;
+ ASSERT_EQ(Get(key), value);
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ fprintf(stderr, "Verified %d values\n", num_files * keys_per_file);
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+TEST_F(ExternalSSTFileTest, OverlappingRanges) {
+ env_->skip_fsync_ = true;
+ Random rnd(301);
+ SequenceNumber assigned_seqno = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Run", [&assigned_seqno](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ assigned_seqno = *(static_cast<SequenceNumber*>(arg));
+ });
+ bool need_flush = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::IngestExternalFile:NeedFlush", [&need_flush](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ need_flush = *(static_cast<bool*>(arg));
+ });
+ bool overlap_with_db = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile",
+ [&overlap_with_db](void* arg) {
+ ASSERT_TRUE(arg != nullptr);
+ overlap_with_db = *(static_cast<bool*>(arg));
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ do {
+ Options options = CurrentOptions();
+ env_->skip_fsync_ = true;
+ DestroyAndReopen(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ printf("Option config = %d\n", option_config_);
+ std::vector<std::pair<int, int>> key_ranges;
+ for (int i = 0; i < 100; i++) {
+ int range_start = rnd.Uniform(20000);
+ int keys_per_range = 10 + rnd.Uniform(41);
+
+ key_ranges.emplace_back(range_start, range_start + keys_per_range);
+ }
+
+ int memtable_add = 0;
+ int success_add_file = 0;
+ int failed_add_file = 0;
+ std::map<std::string, std::string> true_data;
+ for (size_t i = 0; i < key_ranges.size(); i++) {
+ int range_start = key_ranges[i].first;
+ int range_end = key_ranges[i].second;
+
+ Status s;
+ std::string range_val = "range_" + std::to_string(i);
+
+ // For 20% of ranges we use DB::Put, for 80% we use DB::AddFile
+ if (i && i % 5 == 0) {
+ // Use DB::Put to insert range (insert into memtable)
+ range_val += "_put";
+ for (int k = range_start; k <= range_end; k++) {
+ s = Put(Key(k), range_val);
+ ASSERT_OK(s);
+ }
+ memtable_add++;
+ } else {
+ // Use DB::AddFile to insert range
+ range_val += "_add_file";
+
+ // Generate the file containing the range
+ std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+ s = sst_file_writer.Open(file_name);
+ ASSERT_OK(s);
+ for (int k = range_start; k <= range_end; k++) {
+ s = sst_file_writer.Put(Key(k), range_val);
+ ASSERT_OK(s);
+ }
+ ExternalSstFileInfo file_info;
+ s = sst_file_writer.Finish(&file_info);
+ ASSERT_OK(s);
+
+ // Insert the generated file
+ s = DeprecatedAddFile({file_name});
+ auto it = true_data.lower_bound(Key(range_start));
+ if (option_config_ != kUniversalCompaction &&
+ option_config_ != kUniversalCompactionMultiLevel &&
+ option_config_ != kUniversalSubcompactions) {
+ if (it != true_data.end() && it->first <= Key(range_end)) {
+ // This range overlaps with data that already exists in the DB
+ ASSERT_NOK(s);
+ failed_add_file++;
+ } else {
+ ASSERT_OK(s);
+ success_add_file++;
+ }
+ } else {
+ if ((it != true_data.end() && it->first <= Key(range_end)) ||
+ need_flush || assigned_seqno > 0 || overlap_with_db) {
+ // This range overlaps with data that already exists in the DB
+ ASSERT_NOK(s);
+ failed_add_file++;
+ } else {
+ ASSERT_OK(s);
+ success_add_file++;
+ }
+ }
+ }
+
+ if (s.ok()) {
+ // Update true_data map to include the new inserted data
+ for (int k = range_start; k <= range_end; k++) {
+ true_data[Key(k)] = range_val;
+ }
+ }
+
+ // Flush / Compact the DB
+ if (i && i % 50 == 0) {
+ ASSERT_OK(Flush());
+ }
+ if (i && i % 75 == 0) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ }
+
+ printf("Total: %" ROCKSDB_PRIszt
+ " ranges\n"
+ "AddFile()|Success: %d ranges\n"
+ "AddFile()|RangeConflict: %d ranges\n"
+ "Put(): %d ranges\n",
+ key_ranges.size(), success_add_file, failed_add_file, memtable_add);
+
+ // Verify the correctness of the data
+ for (const auto& kv : true_data) {
+ ASSERT_EQ(Get(kv.first), kv.second);
+ }
+ printf("keys/values verified\n");
+ DestroyAndRecreateExternalSSTFilesDir();
+ } while (ChangeOptions(kSkipPlainTable | kSkipFIFOCompaction));
+}
+
+TEST_P(ExternalSSTFileTest, PickedLevel) {
+ env_->skip_fsync_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+
+ std::map<std::string, std::string> true_data;
+
+ // File 0 will go to the last level (L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "0,0,0,1");
+
+ // File 1 will go to level L2 (since it overlaps with file 0 in L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "0,0,1,1");
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ExternalSSTFileTest::PickedLevel:0", "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction:Start",
+ "ExternalSSTFileTest::PickedLevel:1"},
+ {"ExternalSSTFileTest::PickedLevel:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Flush 4 files containing the same keys
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK(Put(Key(3), Key(3) + "put"));
+ ASSERT_OK(Put(Key(8), Key(8) + "put"));
+ true_data[Key(3)] = Key(3) + "put";
+ true_data[Key(8)] = Key(8) + "put";
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for BackgroundCompaction() to be called
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:0");
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:1");
+
+ EXPECT_EQ(FilesPerLevel(), "4,0,1,1");
+
+ // This file overlaps with file 0 (L3), file 1 (L2) and the
+ // output of compaction going to L1
+ ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, -1, false, false, true,
+ false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,1,1");
+
+ // This file does not overlap with any file or with the running compaction
+ ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+ false, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,1,2");
+
+ // Hold compaction from finishing
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevel:2");
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ EXPECT_EQ(FilesPerLevel(), "1,1,1,2");
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, PickedLevelBug) {
+ env_->skip_fsync_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 3;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ std::vector<int> file_keys;
+
+ // file #1 in L0
+ file_keys = {0, 5, 7};
+ for (int k : file_keys) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ }
+ ASSERT_OK(Flush());
+
+ // file #2 in L0
+ file_keys = {4, 6, 8, 9};
+ for (int k : file_keys) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ }
+ ASSERT_OK(Flush());
+
+ // We have 2 overlapping files in L0
+ EXPECT_EQ(FilesPerLevel(), "2");
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::IngestExternalFile:AfterIncIngestFileCounter",
+ "ExternalSSTFileTest::PickedLevelBug:0"},
+ {"ExternalSSTFileTest::PickedLevelBug:1", "DBImpl::AddFile:MutexUnlock"},
+ {"ExternalSSTFileTest::PickedLevelBug:2",
+ "DBImpl::RunManualCompaction:0"},
+ {"ExternalSSTFileTest::PickedLevelBug:3",
+ "DBImpl::RunManualCompaction:1"}});
+
+ std::atomic<bool> bg_compact_started(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BackgroundCompaction:Start",
+ [&](void* /*arg*/) { bg_compact_started.store(true); });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Status bg_compact_status;
+ Status bg_addfile_status;
+
+ {
+ // While writing the MANIFEST start a thread that will ask for compaction
+ ThreadGuard bg_compact(port::Thread([&]() {
+ bg_compact_status =
+ db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+ }));
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:2");
+
+ // Start a thread that will ingest a new file
+ ThreadGuard bg_addfile(port::Thread([&]() {
+ file_keys = {1, 2, 3};
+ bg_addfile_status = GenerateAndAddExternalFile(options, file_keys, 1);
+ }));
+
+ // Wait for AddFile to start picking levels and writing MANIFEST
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:0");
+
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:3");
+
+ // We need to verify that no compactions can run while AddFile is
+ // ingesting the files into the levels it finds suitable. So we will
+ // wait for 2 seconds to give compactions a chance to run during
+ // this period, and then make sure that no compactions were able to run
+ env_->SleepForMicroseconds(1000000 * 2);
+ bool bg_compact_started_tmp = bg_compact_started.load();
+
+ // Hold AddFile from finishing writing the MANIFEST
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelBug:1");
+
+ // Check the statuses at the end so that, even if an ASSERT fails, the
+ // threads can still be joined before returning.
+ ASSERT_FALSE(bg_compact_started_tmp);
+ }
+
+ ASSERT_OK(bg_addfile_status);
+ ASSERT_OK(bg_compact_status);
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ int total_keys = 0;
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(iter->status());
+ total_keys++;
+ }
+ ASSERT_EQ(total_keys, 10);
+
+ delete iter;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, IngestNonExistingFile) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+
+ Status s = db_->IngestExternalFile({"non_existing_file"},
+ IngestExternalFileOptions());
+ ASSERT_NOK(s);
+
+ // Verify file deletion is not impacted (verify a bug fix)
+ ASSERT_OK(Put(Key(1), Key(1)));
+ ASSERT_OK(Put(Key(9), Key(9)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(1), Key(1)));
+ ASSERT_OK(Put(Key(9), Key(9)));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+
+ // After full compaction, there should be only 1 file.
+ std::vector<std::string> files;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ int num_sst_files = 0;
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kTableFile) {
+ num_sst_files++;
+ }
+ }
+ ASSERT_EQ(1, num_sst_files);
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_F(ExternalSSTFileTest, CompactDuringAddFileRandom) {
+ env_->skip_fsync_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 2;
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ std::function<void()> bg_compact = [&]() {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ };
+
+ int range_id = 0;
+ std::vector<int> file_keys;
+ std::function<void()> bg_addfile = [&]() {
+ ASSERT_OK(GenerateAndAddExternalFile(options, file_keys, range_id));
+ };
+
+ const int num_of_ranges = 1000;
+ std::vector<port::Thread> threads;
+ while (range_id < num_of_ranges) {
+ int range_start = range_id * 10;
+ int range_end = range_start + 10;
+
+ file_keys.clear();
+ for (int k = range_start + 1; k < range_end; k++) {
+ file_keys.push_back(k);
+ }
+ ASSERT_OK(Put(Key(range_start), Key(range_start)));
+ ASSERT_OK(Put(Key(range_end), Key(range_end)));
+ ASSERT_OK(Flush());
+
+ if (range_id % 10 == 0) {
+ threads.emplace_back(bg_compact);
+ }
+ threads.emplace_back(bg_addfile);
+
+ for (auto& t : threads) {
+ t.join();
+ }
+ threads.clear();
+
+ range_id++;
+ }
+
+ for (int rid = 0; rid < num_of_ranges; rid++) {
+ int range_start = rid * 10;
+ int range_end = range_start + 10;
+
+ ASSERT_EQ(Get(Key(range_start)), Key(range_start)) << rid;
+ ASSERT_EQ(Get(Key(range_end)), Key(range_end)) << rid;
+ for (int k = range_start + 1; k < range_end; k++) {
+ std::string v = Key(k) + std::to_string(rid);
+ ASSERT_EQ(Get(Key(k)), v) << rid;
+ }
+ }
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(ExternalSSTFileTest, PickedLevelDynamic) {
+ env_->skip_fsync_ = true;
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = false;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level_compaction_dynamic_level_bytes = true;
+ options.num_levels = 4;
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"ExternalSSTFileTest::PickedLevelDynamic:0",
+ "BackgroundCallCompaction:0"},
+ {"DBImpl::BackgroundCompaction:Start",
+ "ExternalSSTFileTest::PickedLevelDynamic:1"},
+ {"ExternalSSTFileTest::PickedLevelDynamic:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // Flush 4 files containing the same keys
+ for (int i = 0; i < 4; i++) {
+ for (int k = 20; k <= 30; k++) {
+ ASSERT_OK(Put(Key(k), Key(k) + "put"));
+ true_data[Key(k)] = Key(k) + "put";
+ }
+ for (int k = 50; k <= 60; k++) {
+ ASSERT_OK(Put(Key(k), Key(k) + "put"));
+ true_data[Key(k)] = Key(k) + "put";
+ }
+ ASSERT_OK(Flush());
+ }
+
+ // Wait for BackgroundCompaction() to be called
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:0");
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:1");
+
+ // This file overlaps with the output of the compaction (going to L3)
+ // so the file will be added to L0 since L3 is the base level
+ ASSERT_OK(GenerateAndAddExternalFile(options, {31, 32, 33, 34}, -1, false,
+ false, true, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5");
+
+ // This file does not overlap with the currently running compaction
+ ASSERT_OK(GenerateAndAddExternalFile(options, {9000, 9001}, -1, false, false,
+ true, false, false, &true_data));
+ EXPECT_EQ(FilesPerLevel(), "5,0,0,1");
+
+ // Hold compaction from finishing
+ TEST_SYNC_POINT("ExternalSSTFileTest::PickedLevelDynamic:2");
+
+ // Output of the compaction will go to L3
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ EXPECT_EQ(FilesPerLevel(), "1,0,0,2");
+
+ Close();
+ options.disable_auto_compactions = true;
+ Reopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 15, 19}, -1, false, false,
+ true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,3");
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1000, 1001, 1002}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,4");
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {500, 600, 700}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "1,0,0,5");
+
+ // File 5 overlaps with file 2 (L3 / base level)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 10}, -1, false, false, true,
+ false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "2,0,0,5");
+
+ // File 6 overlaps with file 2 (L3 / base level) and file 5 (L0)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {3, 9}, -1, false, false, true,
+ false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "3,0,0,5");
+
+ // Verify data in files
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ // Write range [5 => 10] to L0
+ for (int i = 5; i <= 10; i++) {
+ std::string k = Key(i);
+ std::string v = k + "put";
+ ASSERT_OK(Put(k, v));
+ true_data[k] = v;
+ }
+ ASSERT_OK(Flush());
+ ASSERT_EQ(FilesPerLevel(), "4,0,0,5");
+
+ // File 7 overlaps with file 4 (L3)
+ ASSERT_OK(GenerateAndAddExternalFile(options, {650, 651, 652}, -1, false,
+ false, true, false, false, &true_data));
+ ASSERT_EQ(FilesPerLevel(), "5,0,0,5");
+
+ VerifyDBFromMap(true_data, &kcnt, false);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, AddExternalSstFileWithCustomCompartor) {
+ Options options = CurrentOptions();
+ options.comparator = ReverseBytewiseComparator();
+ DestroyAndReopen(options);
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ // Generate files with these key ranges
+ // {14 -> 0}
+ // {24 -> 10}
+ // {34 -> 20}
+ // {44 -> 30}
+ // ..
+ std::vector<std::string> generated_files;
+ for (int i = 0; i < 10; i++) {
+ std::string file_name = sst_files_dir_ + env_->GenerateUniqueId();
+ ASSERT_OK(sst_file_writer.Open(file_name));
+
+ int range_end = i * 10;
+ int range_start = range_end + 15;
+ for (int k = (range_start - 1); k >= range_end; k--) {
+ ASSERT_OK(sst_file_writer.Put(Key(k), Key(k)));
+ }
+ ExternalSstFileInfo file_info;
+ ASSERT_OK(sst_file_writer.Finish(&file_info));
+ generated_files.push_back(file_name);
+ }
+
+ std::vector<std::string> in_files;
+
+ // The 2nd and 3rd files in this list overlap with each other
+ in_files = {generated_files[0], generated_files[4], generated_files[5],
+ generated_files[7]};
+ ASSERT_NOK(DeprecatedAddFile(in_files));
+
+ // These 2 files don't overlap with each other
+ in_files = {generated_files[0], generated_files[2]};
+ ASSERT_OK(DeprecatedAddFile(in_files));
+
+ // These 2 files don't overlap with each other but overlap with keys in the DB
+ in_files = {generated_files[3], generated_files[7]};
+ ASSERT_NOK(DeprecatedAddFile(in_files));
+
+ // These files don't overlap with each other or with the DB key range
+ in_files = {generated_files[4], generated_files[6], generated_files[8]};
+ ASSERT_OK(DeprecatedAddFile(in_files));
+
+ for (int i = 0; i < 100; i++) {
+ if (i % 20 <= 14) {
+ ASSERT_EQ(Get(Key(i)), Key(i));
+ } else {
+ ASSERT_EQ(Get(Key(i)), "NOT_FOUND");
+ }
+ }
+}
+
+TEST_F(ExternalSSTFileTest, AddFileTrivialMoveBug) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ options.IncreaseParallelism(20);
+ DestroyAndReopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 4}, 1)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 3}, 2)); // L2
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {10, 14}, 3)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {12, 13}, 4)); // L2
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {20, 24}, 5)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {22, 23}, 6)); // L2
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "CompactionJob::Run():Start", [&](void* /*arg*/) {
+ // The file fits in L3 but overlaps with the running compaction, so it
+ // will be added to L2; a compaction could then trivially move it to L3
+ // and break LSM consistency
+ static std::atomic<bool> called = {false};
+ if (!called) {
+ called = true;
+ ASSERT_OK(dbfull()->SetOptions({{"max_bytes_for_level_base", "1"}}));
+ ASSERT_OK(GenerateAndAddExternalFile(options, {15, 16}, 7));
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ CompactRangeOptions cro;
+ cro.exclusive_manual_compaction = false;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST_F(ExternalSSTFileTest, CompactAddedFiles) {
+ Options options = CurrentOptions();
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(GenerateAndAddExternalFile(options, {1, 10}, 1)); // L3
+ ASSERT_OK(GenerateAndAddExternalFile(options, {2, 9}, 2)); // L2
+ ASSERT_OK(GenerateAndAddExternalFile(options, {3, 8}, 3)); // L1
+ ASSERT_OK(GenerateAndAddExternalFile(options, {4, 7}, 4)); // L0
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+}
+
+TEST_F(ExternalSSTFileTest, SstFileWriterNonSharedKeys) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::string file_path = sst_files_dir_ + "/not_shared";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+ std::string suffix(100, 'X');
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("A" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("BB" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CC" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CXD" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("CZZZ" + suffix, "VAL"));
+ ASSERT_OK(sst_file_writer.Put("ZAAAX" + suffix, "VAL"));
+
+ ASSERT_OK(sst_file_writer.Finish());
+ ASSERT_OK(DeprecatedAddFile({file_path}));
+}
+
+TEST_F(ExternalSSTFileTest, WithUnorderedWrite) {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::WriteImpl:UnorderedWriteAfterWriteWAL",
+ "ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL"},
+ {"DBImpl::WaitForPendingWrites:BeforeBlock",
+ "DBImpl::WriteImpl:BeforeUnorderedWriteMemtable"}});
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::IngestExternalFile:NeedFlush", [&](void* need_flush) {
+ ASSERT_TRUE(*reinterpret_cast<bool*>(need_flush));
+ });
+
+ Options options = CurrentOptions();
+ options.unordered_write = true;
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "v1"));
+ SyncPoint::GetInstance()->EnableProcessing();
+ port::Thread writer([&]() { ASSERT_OK(Put("bar", "v2")); });
+
+ TEST_SYNC_POINT("ExternalSSTFileTest::WithUnorderedWrite:WaitWriteWAL");
+ ASSERT_OK(GenerateAndAddExternalFile(options, {{"bar", "v3"}}, -1,
+ true /* allow_global_seqno */));
+ ASSERT_EQ(Get("bar"), "v3");
+
+ writer.join();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoRandomized) {
+ env_->skip_fsync_ = true;
+ Options options = CurrentOptions();
+ options.IncreaseParallelism(20);
+ options.level0_slowdown_writes_trigger = 256;
+ options.level0_stop_writes_trigger = 256;
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ for (int iter = 0; iter < 2; iter++) {
+ bool write_to_memtable = (iter == 0);
+ DestroyAndReopen(options);
+
+ Random rnd(301);
+ std::map<std::string, std::string> true_data;
+ for (int i = 0; i < 500; i++) {
+ std::vector<std::pair<std::string, std::string>> random_data;
+ for (int j = 0; j < 100; j++) {
+ std::string k = rnd.RandomString(rnd.Next() % 20);
+ std::string v = rnd.RandomString(rnd.Next() % 50);
+ random_data.emplace_back(k, v);
+ }
+
+ if (write_to_memtable && rnd.OneIn(4)) {
+ // 25% of writes go through memtable
+ for (auto& entry : random_data) {
+ ASSERT_OK(Put(entry.first, entry.second));
+ true_data[entry.first] = entry.second;
+ }
+ } else {
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, random_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, &true_data));
+ }
+ }
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ VerifyDBFromMap(true_data, &kcnt, false);
+ }
+}
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoAssignedLevel) {
+ Options options = CurrentOptions();
+ options.num_levels = 5;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+ std::vector<std::pair<std::string, std::string>> file_data;
+ std::map<std::string, std::string> true_data;
+
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+
+ // Insert 0 -> 20 using AddFile
+ file_data.clear();
+ for (int i = 0; i <= 20; i++) {
+ file_data.emplace_back(Key(i), "L4");
+ }
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file doesn't overlap with anything in the DB, so it will go to L4
+ ASSERT_EQ("0,0,0,0,1", FilesPerLevel());
+
+ // Insert 80 -> 130 using AddFile
+ file_data.clear();
+ for (int i = 80; i <= 130; i++) {
+ file_data.emplace_back(Key(i), "L0");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file overlaps with the memtable, so ingestion will flush the
+  // memtable and add the file to L0
+ ASSERT_EQ("2,0,0,0,1", FilesPerLevel());
+
+ // Insert 30 -> 50 using AddFile
+ file_data.clear();
+ for (int i = 30; i <= 50; i++) {
+ file_data.emplace_back(Key(i), "L4");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file doesn't overlap with anything in the DB and fits in L4 as well
+ ASSERT_EQ("2,0,0,0,2", FilesPerLevel());
+
+ // Insert 10 -> 40 using AddFile
+ file_data.clear();
+ for (int i = 10; i <= 40; i++) {
+ file_data.emplace_back(Key(i), "L3");
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+
+  // This file overlaps with files in L4, so we will ingest it into L3
+ ASSERT_EQ("2,0,0,1,2", FilesPerLevel());
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFileWithGlobalSeqnoMemtableFlush) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ uint64_t entries_in_memtable;
+ std::map<std::string, std::string> true_data;
+
+ for (int k : {10, 20, 40, 80}) {
+ ASSERT_OK(Put(Key(k), "memtable"));
+ true_data[Key(k)] = "memtable";
+ }
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_GE(entries_in_memtable, 1);
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // No need for a flush
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {90, 100, 110}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_GE(entries_in_memtable, 1);
+
+ // This file will flush the memtable
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {19, 20, 21}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_EQ(entries_in_memtable, 0);
+
+ for (int k : {200, 201, 205, 206}) {
+ ASSERT_OK(Put(Key(k), "memtable"));
+ true_data[Key(k)] = "memtable";
+ }
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_GE(entries_in_memtable, 1);
+
+  // No need for a flush; this file's keys fit between the memtable keys
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {202, 203, 204}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_GE(entries_in_memtable, 1);
+
+ // This file will flush the memtable
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {206, 207}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false, &true_data));
+ ASSERT_TRUE(db_->GetIntProperty(DB::Properties::kNumEntriesActiveMemTable,
+ &entries_in_memtable));
+ ASSERT_EQ(entries_in_memtable, 0);
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
+
+TEST_P(ExternalSSTFileTest, L0SortingIssue) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+ std::map<std::string, std::string> true_data;
+
+ ASSERT_OK(Put(Key(1), "memtable"));
+ ASSERT_OK(Put(Key(10), "memtable"));
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+  // No flush needed, no global seqno needed; ingest into L1
+ ASSERT_OK(
+ GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false));
+  // No flush needed, but a global seqno is needed; ingest into L0
+ ASSERT_OK(
+ GenerateAndAddExternalFile(options, {7, 8}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, false));
+ printf("%s\n", FilesPerLevel().c_str());
+
+ // Overwrite what we added using external files
+ ASSERT_OK(Put(Key(7), "memtable"));
+ ASSERT_OK(Put(Key(8), "memtable"));
+
+ // Read values from memtable
+ ASSERT_EQ(Get(Key(7)), "memtable");
+ ASSERT_EQ(Get(Key(8)), "memtable");
+
+ // Flush and read from L0
+ ASSERT_OK(Flush());
+ printf("%s\n", FilesPerLevel().c_str());
+ ASSERT_EQ(Get(Key(7)), "memtable");
+ ASSERT_EQ(Get(Key(8)), "memtable");
+}
+
+TEST_F(ExternalSSTFileTest, CompactionDeadlock) {
+ Options options = CurrentOptions();
+ options.num_levels = 2;
+ options.level0_file_num_compaction_trigger = 4;
+ options.level0_slowdown_writes_trigger = 4;
+ options.level0_stop_writes_trigger = 4;
+ DestroyAndReopen(options);
+
+  // Atomic counter of currently running bg threads
+ std::atomic<int> running_threads(0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::DelayWrite:Wait", "ExternalSSTFileTest::DeadLock:0"},
+ {"ExternalSSTFileTest::DeadLock:1", "DBImpl::AddFile:Start"},
+ {"DBImpl::AddFile:MutexLock", "ExternalSSTFileTest::DeadLock:2"},
+ {"ExternalSSTFileTest::DeadLock:3", "BackgroundCallCompaction:0"},
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  // Start ingesting an external file in the background
+ ROCKSDB_NAMESPACE::port::Thread bg_ingest_file([&]() {
+ running_threads += 1;
+ ASSERT_OK(GenerateAndAddExternalFile(options, {5, 6}));
+ running_threads -= 1;
+ });
+
+ ASSERT_OK(Put(Key(1), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(2), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(3), "memtable"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put(Key(4), "memtable"));
+ ASSERT_OK(Flush());
+
+  // This thread will try to insert into the memtable, but since we have 4 L0
+  // files it will be blocked and will hold the writer thread
+ ROCKSDB_NAMESPACE::port::Thread bg_block_put([&]() {
+ running_threads += 1;
+ ASSERT_OK(Put(Key(10), "memtable"));
+ running_threads -= 1;
+ });
+
+ // Make sure DelayWrite is called first
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:0");
+
+  // `DBImpl::AddFile:Start` will wait until we reach this point (the
+  // sync-point ordering used here is sketched after this test)
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:1");
+
+  // Wait for IngestExternalFile() to start and acquire the mutex
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:2");
+
+ // Now let compaction start
+ TEST_SYNC_POINT("ExternalSSTFileTest::DeadLock:3");
+
+  // Wait for at most 5 seconds; if the bg threads have not finished by then,
+  // we have hit the deadlock bug
+ for (int i = 0; i < 10; i++) {
+ if (running_threads.load() == 0) {
+ break;
+ }
+ // Make sure we do a "real sleep", not a mock one.
+ SystemClock::Default()->SleepForMicroseconds(500000);
+ }
+
+ ASSERT_EQ(running_threads.load(), 0);
+
+ bg_ingest_file.join();
+ bg_block_put.join();
+}
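+
+// The sync-point dependencies used throughout this file order independent
+// threads: LoadDependency({{"A", "B"}}) makes any thread that reaches
+// TEST_SYNC_POINT("B") block until some thread has passed
+// TEST_SYNC_POINT("A"). The test below is only an illustrative sketch of
+// that mechanism (it is not part of the original test plan and is disabled
+// on purpose); the sync-point names are made up for the example.
+TEST_F(ExternalSSTFileTest, DISABLED_SyncPointOrderingSketch) {
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+      {{"SyncPointOrderingSketch:IngestDone",
+        "SyncPointOrderingSketch:Verify"}});
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+  Options options = CurrentOptions();
+  DestroyAndReopen(options);
+
+  // The ingesting thread signals the dependency once the file is added.
+  ROCKSDB_NAMESPACE::port::Thread ingest([&]() {
+    ASSERT_OK(GenerateAndAddExternalFile(options, {1, 2}));
+    TEST_SYNC_POINT("SyncPointOrderingSketch:IngestDone");
+  });
+
+  // Blocks until the ingesting thread has passed "IngestDone".
+  TEST_SYNC_POINT("SyncPointOrderingSketch:Verify");
+  ASSERT_NE(Get(Key(1)), "NOT_FOUND");
+
+  ingest.join();
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}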
+
+TEST_F(ExternalSSTFileTest, DirtyExit) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ std::string file_path = sst_files_dir_ + "/dirty_exit";
+ std::unique_ptr<SstFileWriter> sst_file_writer;
+
+ // Destruct SstFileWriter without calling Finish()
+ sst_file_writer.reset(new SstFileWriter(EnvOptions(), options));
+ ASSERT_OK(sst_file_writer->Open(file_path));
+ sst_file_writer.reset();
+
+ // Destruct SstFileWriter with a failing Finish
+ sst_file_writer.reset(new SstFileWriter(EnvOptions(), options));
+ ASSERT_OK(sst_file_writer->Open(file_path));
+ ASSERT_NOK(sst_file_writer->Finish());
+}
+
+TEST_F(ExternalSSTFileTest, FileWithCFInfo) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko", "toto"}, options);
+
+ SstFileWriter sfw_default(EnvOptions(), options, handles_[0]);
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ SstFileWriter sfw_cf2(EnvOptions(), options, handles_[2]);
+ SstFileWriter sfw_unknown(EnvOptions(), options);
+
+ // default_cf.sst
+ const std::string cf_default_sst = sst_files_dir_ + "/default_cf.sst";
+ ASSERT_OK(sfw_default.Open(cf_default_sst));
+ ASSERT_OK(sfw_default.Put("K1", "V1"));
+ ASSERT_OK(sfw_default.Put("K2", "V2"));
+ ASSERT_OK(sfw_default.Finish());
+
+ // cf1.sst
+ const std::string cf1_sst = sst_files_dir_ + "/cf1.sst";
+ ASSERT_OK(sfw_cf1.Open(cf1_sst));
+ ASSERT_OK(sfw_cf1.Put("K3", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K4", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // cf_unknown.sst
+ const std::string unknown_sst = sst_files_dir_ + "/cf_unknown.sst";
+ ASSERT_OK(sfw_unknown.Open(unknown_sst));
+ ASSERT_OK(sfw_unknown.Put("K5", "V1"));
+ ASSERT_OK(sfw_unknown.Put("K6", "V2"));
+ ASSERT_OK(sfw_unknown.Finish());
+
+ IngestExternalFileOptions ifo;
+
+ // SST CF don't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[0], {cf1_sst}, ifo));
+ // SST CF don't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf1_sst}, ifo));
+ // SST CF match
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {cf1_sst}, ifo));
+
+ // SST CF don't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[1], {cf_default_sst}, ifo));
+ // SST CF don't match
+ ASSERT_NOK(db_->IngestExternalFile(handles_[2], {cf_default_sst}, ifo));
+ // SST CF match
+ ASSERT_OK(db_->IngestExternalFile(handles_[0], {cf_default_sst}, ifo));
+
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo));
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo));
+ // SST CF unknown
+ ASSERT_OK(db_->IngestExternalFile(handles_[0], {unknown_sst}, ifo));
+
+ // Cannot ingest a file into a dropped CF
+ ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+ ASSERT_NOK(db_->IngestExternalFile(handles_[1], {unknown_sst}, ifo));
+
+ // CF was not dropped, ok to Ingest
+ ASSERT_OK(db_->IngestExternalFile(handles_[2], {unknown_sst}, ifo));
+}
+
+/*
+ * Test and verify the functionality of ingestion_options.move_files and
+ * ingestion_options.failed_move_fall_back_to_copy
+ */
+TEST_P(ExternSSTFileLinkFailFallbackTest, LinkFailFallBackExternalSst) {
+ const bool fail_link = std::get<0>(GetParam());
+ const bool failed_move_fall_back_to_copy = std::get<1>(GetParam());
+ test_env_->set_fail_link(fail_link);
+ const EnvOptions env_options;
+ DestroyAndReopen(options_);
+ const int kNumKeys = 10000;
+ IngestExternalFileOptions ifo;
+ ifo.move_files = true;
+ ifo.failed_move_fall_back_to_copy = failed_move_fall_back_to_copy;
+
+ std::string file_path = sst_files_dir_ + "file1.sst";
+ // Create SstFileWriter for default column family
+ SstFileWriter sst_file_writer(env_options, options_);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_value"));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+ uint64_t file_size = 0;
+ ASSERT_OK(env_->GetFileSize(file_path, &file_size));
+
+ bool copyfile = false;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "ExternalSstFileIngestionJob::Prepare:CopyFile",
+ [&](void* /* arg */) { copyfile = true; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ const Status s = db_->IngestExternalFile({file_path}, ifo);
+
+ ColumnFamilyHandleImpl* cfh =
+ static_cast<ColumnFamilyHandleImpl*>(dbfull()->DefaultColumnFamily());
+ ColumnFamilyData* cfd = cfh->cfd();
+ const InternalStats* internal_stats_ptr = cfd->internal_stats();
+ const std::vector<InternalStats::CompactionStats>& comp_stats =
+ internal_stats_ptr->TEST_GetCompactionStats();
+ uint64_t bytes_copied = 0;
+ uint64_t bytes_moved = 0;
+ for (const auto& stats : comp_stats) {
+ bytes_copied += stats.bytes_written;
+ bytes_moved += stats.bytes_moved;
+ }
+
+ if (!fail_link) {
+ // Link operation succeeds. External SST should be moved.
+ ASSERT_OK(s);
+ ASSERT_EQ(0, bytes_copied);
+ ASSERT_EQ(file_size, bytes_moved);
+ ASSERT_FALSE(copyfile);
+ } else {
+ // Link operation fails.
+ ASSERT_EQ(0, bytes_moved);
+ if (failed_move_fall_back_to_copy) {
+ ASSERT_OK(s);
+ // Copy file is true since a failed link falls back to copy file.
+ ASSERT_TRUE(copyfile);
+ ASSERT_EQ(file_size, bytes_copied);
+ } else {
+ ASSERT_TRUE(s.IsNotSupported());
+ // Copy file is false since a failed link does not fall back to copy file.
+ ASSERT_FALSE(copyfile);
+ ASSERT_EQ(0, bytes_copied);
+ }
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+}
+
+class TestIngestExternalFileListener : public EventListener {
+ public:
+ void OnExternalFileIngested(DB* /*db*/,
+ const ExternalFileIngestionInfo& info) override {
+ ingested_files.push_back(info);
+ }
+
+ std::vector<ExternalFileIngestionInfo> ingested_files;
+};
+
+TEST_P(ExternalSSTFileTest, IngestionListener) {
+ Options options = CurrentOptions();
+ TestIngestExternalFileListener* listener =
+ new TestIngestExternalFileListener();
+ options.listeners.emplace_back(listener);
+ CreateAndReopenWithCF({"koko", "toto"}, options);
+
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+ // Ingest into default cf
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[0]));
+ ASSERT_EQ(listener->ingested_files.size(), 1);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "default");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "default");
+
+ // Ingest into cf1
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[1]));
+ ASSERT_EQ(listener->ingested_files.size(), 2);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "koko");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 1);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "koko");
+
+ // Ingest into cf2
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, {1, 2}, -1, true, write_global_seqno,
+ verify_checksums_before_ingest, false, true, nullptr, handles_[2]));
+ ASSERT_EQ(listener->ingested_files.size(), 3);
+ ASSERT_EQ(listener->ingested_files.back().cf_name, "toto");
+ ASSERT_EQ(listener->ingested_files.back().global_seqno, 0);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_id,
+ 2);
+ ASSERT_EQ(listener->ingested_files.back().table_properties.column_family_name,
+ "toto");
+}
+
+TEST_F(ExternalSSTFileTest, SnapshotInconsistencyBug) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ const int kNumKeys = 10000;
+
+ // Insert keys using normal path and take a snapshot
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(i), Key(i) + "_V1"));
+ }
+ const Snapshot* snap = db_->GetSnapshot();
+
+ // Overwrite all keys using IngestExternalFile
+ std::string sst_file_path = sst_files_dir_ + "file1.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ ASSERT_OK(sst_file_writer.Open(sst_file_path));
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(sst_file_writer.Put(Key(i), Key(i) + "_V2"));
+ }
+ ASSERT_OK(sst_file_writer.Finish());
+
+ IngestExternalFileOptions ifo;
+ ifo.move_files = true;
+ ASSERT_OK(db_->IngestExternalFile({sst_file_path}, ifo));
+
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_EQ(Get(Key(i), snap), Key(i) + "_V1");
+ ASSERT_EQ(Get(Key(i)), Key(i) + "_V2");
+ }
+
+ db_->ReleaseSnapshot(snap);
+}
+
+TEST_P(ExternalSSTFileTest, IngestBehind) {
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = 3;
+ options.disable_auto_compactions = false;
+ DestroyAndReopen(options);
+ std::vector<std::pair<std::string, std::string>> file_data;
+ std::map<std::string, std::string> true_data;
+
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+
+  // Insert 0 -> 20 using IngestExternalFile
+ file_data.clear();
+ for (int i = 0; i <= 20; i++) {
+ file_data.emplace_back(Key(i), "ingest_behind");
+ }
+
+ bool allow_global_seqno = true;
+ bool ingest_behind = true;
+ bool write_global_seqno = std::get<0>(GetParam());
+ bool verify_checksums_before_ingest = std::get<1>(GetParam());
+
+ // Can't ingest behind since allow_ingest_behind isn't set to true
+ ASSERT_NOK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, ingest_behind, false /*sort_data*/,
+ &true_data));
+
+ options.allow_ingest_behind = true;
+  // Check that we can still open the DB; num_levels should be sanitized
+  // to 3
+ options.num_levels = 2;
+ DestroyAndReopen(options);
+
+ options.num_levels = 3;
+ DestroyAndReopen(options);
+ // Insert 100 -> 200 into the memtable
+ for (int i = 100; i <= 200; i++) {
+ ASSERT_OK(Put(Key(i), "memtable"));
+ true_data[Key(i)] = "memtable";
+ }
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+  // The universal compaction picker should place the output at the second
+  // level from the bottom (the bottom level is reserved for ingest-behind)
+ ASSERT_EQ("0,1", FilesPerLevel());
+ ASSERT_OK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, true /*ingest_behind*/,
+ false /*sort_data*/, &true_data));
+ ASSERT_EQ("0,1,1", FilesPerLevel());
+  // This time the ingest should fail as the file doesn't fit in the bottom
+  // level (the raw ingest-behind API is sketched after this test)
+ ASSERT_NOK(GenerateAndAddExternalFile(
+ options, file_data, -1, allow_global_seqno, write_global_seqno,
+ verify_checksums_before_ingest, true /*ingest_behind*/,
+ false /*sort_data*/, &true_data));
+ ASSERT_EQ("0,1,1", FilesPerLevel());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ // bottom level should be empty
+ ASSERT_EQ("0,1", FilesPerLevel());
+
+ size_t kcnt = 0;
+ VerifyDBFromMap(true_data, &kcnt, false);
+}
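+
+// A minimal sketch of the raw ingest-behind API that the helper above wraps:
+// the DB must be opened with allow_ingest_behind=true (which reserves the
+// bottommost level), and the file is ingested with
+// IngestExternalFileOptions::ingest_behind=true so it lands beneath all
+// existing data with sequence number zero. This is only an illustration, not
+// part of the original test plan, and is disabled on purpose; the file name
+// is made up for the example.
+TEST_F(ExternalSSTFileTest, DISABLED_IngestBehindApiSketch) {
+  Options options = CurrentOptions();
+  options.allow_ingest_behind = true;  // reserve the bottom level
+  DestroyAndReopen(options);
+
+  std::string file_path = sst_files_dir_ + "/ingest_behind_sketch.sst";
+  SstFileWriter sst_file_writer(EnvOptions(), options);
+  ASSERT_OK(sst_file_writer.Open(file_path));
+  ASSERT_OK(sst_file_writer.Put(Key(1), "behind"));
+  ASSERT_OK(sst_file_writer.Finish());
+
+  IngestExternalFileOptions ifo;
+  ifo.ingest_behind = true;  // place the file below all existing data
+  ASSERT_OK(db_->IngestExternalFile({file_path}, ifo));
+  ASSERT_EQ(Get(Key(1)), "behind");
+}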
+
+TEST_F(ExternalSSTFileTest, SkipBloomFilter) {
+ Options options = CurrentOptions();
+
+ BlockBasedTableOptions table_options;
+ table_options.filter_policy.reset(NewBloomFilterPolicy(10));
+ table_options.cache_index_and_filter_blocks = true;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ // Create external SST file and include bloom filters
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ {
+ std::string file_path = sst_files_dir_ + "sst_with_bloom.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(
+ db_->IngestExternalFile({file_path}, IngestExternalFileOptions()));
+
+ ASSERT_EQ(Get("Key1"), "Value1");
+ ASSERT_GE(
+ options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 1);
+ }
+
+ // Create external SST file but skip bloom filters
+ options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
+ DestroyAndReopen(options);
+ {
+ std::string file_path = sst_files_dir_ + "sst_with_no_bloom.sst";
+ SstFileWriter sst_file_writer(EnvOptions(), options, nullptr, true,
+ Env::IOPriority::IO_TOTAL,
+ true /* skip_filters */);
+ ASSERT_OK(sst_file_writer.Open(file_path));
+ ASSERT_OK(sst_file_writer.Put("Key1", "Value1"));
+ ASSERT_OK(sst_file_writer.Finish());
+
+ ASSERT_OK(
+ db_->IngestExternalFile({file_path}, IngestExternalFileOptions()));
+
+ ASSERT_EQ(Get("Key1"), "Value1");
+ ASSERT_EQ(
+ options.statistics->getTickerCount(Tickers::BLOCK_CACHE_FILTER_ADD), 0);
+ }
+}
+
+TEST_F(ExternalSSTFileTest, IngestFileWrittenWithCompressionDictionary) {
+ if (!ZSTD_Supported()) {
+ return;
+ }
+ const int kNumEntries = 1 << 10;
+ const int kNumBytesPerEntry = 1 << 10;
+ Options options = CurrentOptions();
+ options.compression = kZSTD;
+ options.compression_opts.max_dict_bytes = 1 << 14; // 16KB
+ options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB
+ DestroyAndReopen(options);
+
+ std::atomic<int> num_compression_dicts(0);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict",
+ [&](void* /* arg */) { ++num_compression_dicts; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ Random rnd(301);
+ std::vector<std::pair<std::string, std::string>> random_data;
+ for (int i = 0; i < kNumEntries; i++) {
+ std::string val = rnd.RandomString(kNumBytesPerEntry);
+ random_data.emplace_back(Key(i), std::move(val));
+ }
+ ASSERT_OK(GenerateAndAddExternalFile(options, std::move(random_data)));
+ ASSERT_EQ(1, num_compression_dicts);
+}
+
+class ExternalSSTBlockChecksumTest
+ : public ExternalSSTFileTestBase,
+ public testing::WithParamInterface<uint32_t> {};
+
+INSTANTIATE_TEST_CASE_P(FormatVersions, ExternalSSTBlockChecksumTest,
+ testing::ValuesIn(test::kFooterFormatVersionsToTest));
+
+// Very slow, not worth the cost to run regularly
+TEST_P(ExternalSSTBlockChecksumTest, DISABLED_HugeBlockChecksum) {
+ BlockBasedTableOptions table_options;
+ table_options.format_version = GetParam();
+ for (auto t : GetSupportedChecksums()) {
+ table_options.checksum = t;
+ Options options = CurrentOptions();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ SstFileWriter sst_file_writer(EnvOptions(), options);
+
+    // 2^32 - 1; this will lead to a data block with more than 2^32 bytes
+ size_t huge_size = std::numeric_limits<uint32_t>::max();
+
+ std::string f = sst_files_dir_ + "f.sst";
+ ASSERT_OK(sst_file_writer.Open(f));
+ {
+ Random64 r(123);
+ std::string huge(huge_size, 0);
+ for (size_t j = 0; j + 7 < huge_size; j += 8) {
+ EncodeFixed64(&huge[j], r.Next());
+ }
+ ASSERT_OK(sst_file_writer.Put("Huge", huge));
+ }
+
+ ExternalSstFileInfo f_info;
+ ASSERT_OK(sst_file_writer.Finish(&f_info));
+ ASSERT_GT(f_info.file_size, uint64_t{huge_size} + 10);
+
+ SstFileReader sst_file_reader(options);
+ ASSERT_OK(sst_file_reader.Open(f));
+ ASSERT_OK(sst_file_reader.VerifyChecksum());
+ }
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_Success) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+ // Exercise different situations in different column families: two are empty
+ // (so no new sequence number is needed), but at least one overlaps with the
+ // DB and needs to bump the sequence number.
+ ASSERT_OK(db_->Put(WriteOptions(), "foo1", "oldvalue"));
+
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ const std::string& value = elem.second;
+ ASSERT_EQ(value, Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest,
+ IngestFilesIntoMultipleColumnFamilies_NoMixedStateWithSnapshot) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:InstallSVForFirstCF:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "BeforeRead"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "AfterRead",
+ "DBImpl::IngestExternalFiles:InstallSVForFirstCF:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ const std::vector<std::map<std::string, std::string>> data_before_ingestion =
+ {{{"foo1", "fv1_0"}, {"foo2", "fv2_0"}, {"foo3", "fv3_0"}},
+ {{"bar1", "bv1_0"}, {"bar2", "bv2_0"}, {"bar3", "bv3_0"}},
+ {{"bar4", "bv4_0"}, {"bar5", "bv5_0"}, {"bar6", "bv6_0"}}};
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ int cf = static_cast<int>(i);
+ const auto& orig_data = data_before_ingestion[i];
+ for (const auto& kv : orig_data) {
+ ASSERT_OK(Put(cf, kv.first, kv.second));
+ }
+ ASSERT_OK(Flush(cf));
+ }
+
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ // Take snapshot before ingestion starts
+ ReadOptions read_opts;
+ read_opts.total_order_seek = true;
+ read_opts.snapshot = dbfull()->GetSnapshot();
+ std::vector<Iterator*> iters(handles_.size());
+
+ // Range scan checks first kv of each CF before ingestion starts.
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ iters[i] = dbfull()->NewIterator(read_opts, handles_[i]);
+ iters[i]->SeekToFirst();
+ ASSERT_TRUE(iters[i]->Valid());
+ const std::string& key = iters[i]->key().ToString();
+ const std::string& value = iters[i]->value().ToString();
+ const std::map<std::string, std::string>& orig_data =
+ data_before_ingestion[i];
+ std::map<std::string, std::string>::const_iterator it = orig_data.find(key);
+ ASSERT_NE(orig_data.end(), it);
+ ASSERT_EQ(it->second, value);
+ iters[i]->Next();
+ }
+ port::Thread ingest_thread([&]() {
+ ASSERT_OK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "BeforeRead");
+ // Should see only data before ingestion
+ for (size_t i = 0; i != handles_.size(); ++i) {
+ const auto& orig_data = data_before_ingestion[i];
+ for (; iters[i]->Valid(); iters[i]->Next()) {
+ const std::string& key = iters[i]->key().ToString();
+ const std::string& value = iters[i]->value().ToString();
+ std::map<std::string, std::string>::const_iterator it =
+ orig_data.find(key);
+ ASSERT_NE(orig_data.end(), it);
+ ASSERT_EQ(it->second, value);
+ }
+ }
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_MixedState:"
+ "AfterRead");
+ ingest_thread.join();
+ for (auto* iter : iters) {
+ delete iter;
+ }
+ iters.clear();
+ dbfull()->ReleaseSnapshot(read_opts.snapshot);
+
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ // Should see consistent state after ingestion for all column families even
+ // without snapshot.
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ const std::string& value = elem.second;
+ ASSERT_EQ(value, Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_PrepareFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:BeforeLastJobPrepare:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:"
+ "0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:"
+ "1",
+ "DBImpl::IngestExternalFiles:BeforeLastJobPrepare:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingest
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_PrepareFail:"
+ "0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies:PrepareFail:"
+ "1");
+ ingest_thread.join();
+
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesIntoMultipleColumnFamilies_CommitFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::IngestExternalFiles:BeforeJobsRun:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "1",
+ "DBImpl::IngestExternalFiles:BeforeJobsRun:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_CommitFail:"
+ "1");
+ ingest_thread.join();
+
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest,
+ IngestFilesIntoMultipleColumnFamilies_PartialManifestWriteFail) {
+ std::unique_ptr<FaultInjectionTestEnv> fault_injection_env(
+ new FaultInjectionTestEnv(env_));
+ Options options = CurrentOptions();
+ options.env = fault_injection_env.get();
+
+ CreateAndReopenWithCF({"pikachu", "eevee"}, options);
+
+ SyncPoint::GetInstance()->ClearTrace();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:0"},
+ {"ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:1",
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ std::vector<ColumnFamilyHandle*> column_families;
+ column_families.push_back(handles_[0]);
+ column_families.push_back(handles_[1]);
+ column_families.push_back(handles_[2]);
+ std::vector<IngestExternalFileOptions> ifos(column_families.size());
+ for (auto& ifo : ifos) {
+ ifo.allow_global_seqno = true; // Always allow global_seqno
+ // May or may not write global_seqno
+ ifo.write_global_seqno = std::get<0>(GetParam());
+ // Whether to verify block checksums before ingestion
+ ifo.verify_checksums_before_ingest = std::get<1>(GetParam());
+ }
+ std::vector<std::vector<std::pair<std::string, std::string>>> data;
+ data.push_back(
+ {std::make_pair("foo1", "fv1"), std::make_pair("foo2", "fv2")});
+ data.push_back(
+ {std::make_pair("bar1", "bv1"), std::make_pair("bar2", "bv2")});
+ data.push_back(
+ {std::make_pair("bar3", "bv3"), std::make_pair("bar4", "bv4")});
+ // Resize the true_data vector upon construction to avoid re-alloc
+ std::vector<std::map<std::string, std::string>> true_data(
+ column_families.size());
+ port::Thread ingest_thread([&]() {
+ ASSERT_NOK(GenerateAndAddExternalFiles(options, column_families, ifos, data,
+ -1, true, true_data));
+ });
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:0");
+ fault_injection_env->SetFilesystemActive(false);
+ TEST_SYNC_POINT(
+ "ExternalSSTFileTest::IngestFilesIntoMultipleColumnFamilies_"
+ "PartialManifestWriteFail:1");
+ ingest_thread.join();
+
+ ASSERT_OK(fault_injection_env->DropUnsyncedFileData());
+ fault_injection_env->SetFilesystemActive(true);
+ Close();
+ ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"},
+ options);
+ ASSERT_EQ(3, handles_.size());
+ int cf = 0;
+ for (const auto& verify_map : true_data) {
+ for (const auto& elem : verify_map) {
+ const std::string& key = elem.first;
+ ASSERT_EQ("NOT_FOUND", Get(cf, key));
+ }
+ ++cf;
+ }
+ Close();
+ Destroy(options, true /* delete_cf_paths */);
+}
+
+TEST_P(ExternalSSTFileTest, IngestFilesTriggerFlushingWithTwoWriteQueue) {
+ Options options = CurrentOptions();
+ // Use large buffer to avoid memtable flush
+ options.write_buffer_size = 1024 * 1024;
+ options.two_write_queues = true;
+ DestroyAndReopen(options);
+
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "1000", "v1"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "1001", "v1"));
+ ASSERT_OK(dbfull()->Put(WriteOptions(), "9999", "v1"));
+
+  // Put one key that overlaps with keys in the memtable. It will trigger a
+  // memtable flush, which requires this thread to currently be at the front
+  // of the 2nd write queue. We must make sure that it won't enter the 2nd
+  // write queue a second time.
+ std::vector<std::pair<std::string, std::string>> data;
+ data.push_back(std::make_pair("1001", "v2"));
+ ASSERT_OK(GenerateAndAddExternalFile(options, data, -1, true));
+}
+
+TEST_P(ExternalSSTFileTest, DeltaEncodingWhileGlobalSeqnoPresent) {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ constexpr size_t kValueSize = 8;
+ Random rnd(301);
+ std::string value = rnd.RandomString(kValueSize);
+
+  // Write some keys to make the global seqno larger than zero
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put("ab" + Key(i), value));
+ }
+ // Get a Snapshot to make RocksDB assign global seqno to ingested sst files.
+ auto snap = dbfull()->GetSnapshot();
+
+ std::string fname = sst_files_dir_ + "test_file";
+ ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options);
+ ASSERT_OK(writer.Open(fname));
+ std::string key1 = "ab";
+ std::string key2 = "ab";
+
+  // Make the prefix of key2 the same as key1 with a packed zero seqno
+  // appended. The tail of every internal key is composed as
+  // (seqno << 8 | value_type); here the packed value is 1, since kTypeValue
+  // is 1 and the seqno is 0.
+
+ PutFixed64(&key2, PackSequenceAndType(0, kTypeValue));
+ key2 += "cdefghijkl";
+
+ ASSERT_OK(writer.Put(key1, value));
+ ASSERT_OK(writer.Put(key2, value));
+
+ ExternalSstFileInfo info;
+ ASSERT_OK(writer.Finish(&info));
+
+ ASSERT_OK(dbfull()->IngestExternalFile({info.file_path},
+ IngestExternalFileOptions()));
+ dbfull()->ReleaseSnapshot(snap);
+ ASSERT_EQ(value, Get(key1));
+  // Without the fix for the delta-encoding bug, this lookup would fail here
+ ASSERT_EQ(value, Get(key2));
+}
+
+TEST_P(ExternalSSTFileTest,
+ DeltaEncodingWhileGlobalSeqnoPresentIteratorSwitch) {
+ // Regression test for bug where global seqno corrupted the shared bytes
+ // buffer when switching from reverse iteration to forward iteration.
+ constexpr size_t kValueSize = 8;
+ Options options = CurrentOptions();
+
+ Random rnd(301);
+ std::string value = rnd.RandomString(kValueSize);
+
+ std::string key0 = "aa";
+ std::string key1 = "ab";
+  // Make the prefix of key2 the same as key1 with a packed zero seqno
+  // appended. The tail of every internal key is composed as
+  // (seqno << 8 | value_type); here the packed value is 1, since kTypeValue
+  // is 1 and the seqno is 0. A worked example of this packing follows this
+  // test.
+ std::string key2 = "ab";
+ PutFixed64(&key2, PackSequenceAndType(0, kTypeValue));
+ key2 += "cdefghijkl";
+ std::string key3 = key2 + "_";
+
+  // Write a key to make the global seqno larger than zero
+ ASSERT_OK(Put(key0, value));
+
+ std::string fname = sst_files_dir_ + "test_file";
+ ROCKSDB_NAMESPACE::SstFileWriter writer(EnvOptions(), options);
+ ASSERT_OK(writer.Open(fname));
+
+  // key0 is a dummy to ensure the turnaround point (key1) comes from the
+  // Prev() cache rather than from the block (restart keys are pinned in the
+  // block).
+ ASSERT_OK(writer.Put(key0, value));
+ ASSERT_OK(writer.Put(key1, value));
+ ASSERT_OK(writer.Put(key2, value));
+ ASSERT_OK(writer.Put(key3, value));
+
+ ExternalSstFileInfo info;
+ ASSERT_OK(writer.Finish(&info));
+
+ ASSERT_OK(dbfull()->IngestExternalFile({info.file_path},
+ IngestExternalFileOptions()));
+ ReadOptions read_opts;
+  // Prevents a Seek() when switching directions; such a Seek() would
+  // circumvent the bug.
+ read_opts.total_order_seek = true;
+ Iterator* iter = db_->NewIterator(read_opts);
+ // Scan backwards to key2. File iterator will then be positioned at key1.
+ iter->Seek(key3);
+ ASSERT_EQ(key3, iter->key());
+ iter->Prev();
+ ASSERT_EQ(key2, iter->key());
+ // Scan forwards and make sure key3 is present. Previously key3 would be
+ // corrupted by the global seqno from key1.
+ iter->Next();
+ ASSERT_EQ(key3, iter->key());
+ delete iter;
+}
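+
+// A small worked example of the internal-key tail packing referenced in the
+// two tests above: the 8-byte tail is (seqno << 8) | value_type, so a zero
+// seqno combined with kTypeValue (which is 1) packs to the value 1. This is
+// only an illustration, not part of the original test plan, and is disabled
+// on purpose.
+TEST_F(ExternalSSTFileTest, DISABLED_PackedKeyTailSketch) {
+  std::string tail;
+  PutFixed64(&tail, PackSequenceAndType(0, kTypeValue));
+
+  std::string expected;
+  PutFixed64(&expected, 1);  // (0 << 8) | kTypeValue == 1
+  ASSERT_EQ(expected, tail);
+}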
+
+INSTANTIATE_TEST_CASE_P(ExternalSSTFileTest, ExternalSSTFileTest,
+ testing::Values(std::make_tuple(false, false),
+ std::make_tuple(false, true),
+ std::make_tuple(true, false),
+ std::make_tuple(true, true)));
+
+INSTANTIATE_TEST_CASE_P(ExternSSTFileLinkFailFallbackTest,
+ ExternSSTFileLinkFailFallbackTest,
+ testing::Values(std::make_tuple(true, false),
+ std::make_tuple(true, true),
+ std::make_tuple(false, false)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as External SST File Writer and Ingestion are not supported "
+ "in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/fault_injection_test.cc b/src/rocksdb/db/fault_injection_test.cc
new file mode 100644
index 000000000..ddd4b47cc
--- /dev/null
+++ b/src/rocksdb/db/fault_injection_test.cc
@@ -0,0 +1,637 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright 2014 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+// This test uses a custom Env to keep track of the state of a filesystem as of
+// the last "sync". It then checks for data loss errors by purposely dropping
+// file data (or entire files) not protected by a "sync".
+
+#include "db/db_impl/db_impl.h"
+#include "db/log_format.h"
+#include "db/version_set.h"
+#include "env/mock_env.h"
+#include "file/filename.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/table.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "utilities/fault_injection_env.h"
+#ifndef NDEBUG
+#include "utilities/fault_injection_fs.h"
+#endif
+
+namespace ROCKSDB_NAMESPACE {
+
+static const int kValueSize = 1000;
+static const int kMaxNumValues = 2000;
+static const size_t kNumIterations = 3;
+
+enum FaultInjectionOptionConfig {
+ kDefault,
+ kDifferentDataDir,
+ kWalDir,
+ kSyncWal,
+ kWalDirSyncWal,
+ kMultiLevels,
+ kEnd,
+};
+class FaultInjectionTest
+ : public testing::Test,
+ public testing::WithParamInterface<std::tuple<
+ bool, FaultInjectionOptionConfig, FaultInjectionOptionConfig>> {
+ protected:
+ int option_config_;
+ int non_inclusive_end_range_; // kEnd or equivalent to that
+  // When we need to make sure data is persistent, sync the WAL
+ bool sync_use_wal_;
+  // When we need to make sure data is persistent, call DB::CompactRange()
+ bool sync_use_compact_;
+
+ bool sequential_order_;
+
+ public:
+ enum ExpectedVerifResult { kValExpectFound, kValExpectNoError };
+ enum ResetMethod {
+ kResetDropUnsyncedData,
+ kResetDropRandomUnsyncedData,
+ kResetDeleteUnsyncedFiles,
+ kResetDropAndDeleteUnsynced
+ };
+
+ std::unique_ptr<Env> base_env_;
+ FaultInjectionTestEnv* env_;
+ std::string dbname_;
+ std::shared_ptr<Cache> tiny_cache_;
+ Options options_;
+ DB* db_;
+
+ FaultInjectionTest()
+ : option_config_(std::get<1>(GetParam())),
+ non_inclusive_end_range_(std::get<2>(GetParam())),
+ sync_use_wal_(false),
+ sync_use_compact_(true),
+ base_env_(nullptr),
+ env_(nullptr),
+ db_(nullptr) {
+ EXPECT_OK(
+ test::CreateEnvFromSystem(ConfigOptions(), &system_env_, &env_guard_));
+ EXPECT_NE(system_env_, nullptr);
+ }
+
+ ~FaultInjectionTest() override {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+ }
+
+ bool ChangeOptions() {
+ option_config_++;
+ if (option_config_ >= non_inclusive_end_range_) {
+ return false;
+ } else {
+ if (option_config_ == kMultiLevels) {
+ base_env_.reset(MockEnv::Create(system_env_));
+ }
+ return true;
+ }
+ }
+
+ // Return the current option configuration.
+ Options CurrentOptions() {
+ sync_use_wal_ = false;
+ sync_use_compact_ = true;
+ Options options;
+ switch (option_config_) {
+ case kWalDir:
+ options.wal_dir = test::PerThreadDBPath(env_, "fault_test_wal");
+ break;
+ case kDifferentDataDir:
+ options.db_paths.emplace_back(
+ test::PerThreadDBPath(env_, "fault_test_data"), 1000000U);
+ break;
+ case kSyncWal:
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ case kWalDirSyncWal:
+ options.wal_dir = test::PerThreadDBPath(env_, "/fault_test_wal");
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ case kMultiLevels:
+ options.write_buffer_size = 64 * 1024;
+ options.target_file_size_base = 64 * 1024;
+ options.level0_file_num_compaction_trigger = 2;
+ options.level0_slowdown_writes_trigger = 2;
+ options.level0_stop_writes_trigger = 4;
+ options.max_bytes_for_level_base = 128 * 1024;
+ options.max_write_buffer_number = 2;
+ options.max_background_compactions = 8;
+ options.max_background_flushes = 8;
+ sync_use_wal_ = true;
+ sync_use_compact_ = false;
+ break;
+ default:
+ break;
+ }
+ return options;
+ }
+
+ Status NewDB() {
+ assert(db_ == nullptr);
+ assert(tiny_cache_ == nullptr);
+ assert(env_ == nullptr);
+
+ env_ = new FaultInjectionTestEnv(base_env_ ? base_env_.get() : system_env_);
+
+ options_ = CurrentOptions();
+ options_.env = env_;
+ options_.paranoid_checks = true;
+
+ BlockBasedTableOptions table_options;
+ tiny_cache_ = NewLRUCache(100);
+ table_options.block_cache = tiny_cache_;
+ options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ dbname_ = test::PerThreadDBPath("fault_test");
+
+ EXPECT_OK(DestroyDB(dbname_, options_));
+
+ options_.create_if_missing = true;
+ Status s = OpenDB();
+ options_.create_if_missing = false;
+ return s;
+ }
+
+ void SetUp() override {
+ sequential_order_ = std::get<0>(GetParam());
+ ASSERT_OK(NewDB());
+ }
+
+ void TearDown() override {
+ CloseDB();
+
+ Status s = DestroyDB(dbname_, options_);
+
+ delete env_;
+ env_ = nullptr;
+
+ tiny_cache_.reset();
+
+ ASSERT_OK(s);
+ }
+
+ void Build(const WriteOptions& write_options, int start_idx, int num_vals) {
+ std::string key_space, value_space;
+ WriteBatch batch;
+ for (int i = start_idx; i < start_idx + num_vals; i++) {
+ Slice key = Key(i, &key_space);
+ batch.Clear();
+ ASSERT_OK(batch.Put(key, Value(i, &value_space)));
+ ASSERT_OK(db_->Write(write_options, &batch));
+ }
+ }
+
+ Status ReadValue(int i, std::string* val) const {
+ std::string key_space, value_space;
+ Slice key = Key(i, &key_space);
+ Value(i, &value_space);
+ ReadOptions options;
+ return db_->Get(options, key, val);
+ }
+
+ Status Verify(int start_idx, int num_vals,
+ ExpectedVerifResult expected) const {
+ std::string val;
+ std::string value_space;
+ Status s;
+ for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) {
+ Value(i, &value_space);
+ s = ReadValue(i, &val);
+ if (s.ok()) {
+ EXPECT_EQ(value_space, val);
+ }
+ if (expected == kValExpectFound) {
+ if (!s.ok()) {
+          fprintf(stderr, "Error reading %dth record (expect found): %s\n", i,
+ s.ToString().c_str());
+ return s;
+ }
+ } else if (!s.ok() && !s.IsNotFound()) {
+        fprintf(stderr, "Error reading %dth record: %s\n", i,
+ s.ToString().c_str());
+ return s;
+ }
+ }
+ return Status::OK();
+ }
+
+ // Return the ith key
+ Slice Key(int i, std::string* storage) const {
+ unsigned long long num = i;
+ if (!sequential_order_) {
+      // pseudo-random transformation of the index
+ const int m = 0x5bd1e995;
+ num *= m;
+ num ^= num << 24;
+ }
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%016d", static_cast<int>(num));
+ storage->assign(buf, strlen(buf));
+ return Slice(*storage);
+ }
+
+ // Return the value to associate with the specified key
+ Slice Value(int k, std::string* storage) const {
+ Random r(k);
+ *storage = r.RandomString(kValueSize);
+ return Slice(*storage);
+ }
+
+ void CloseDB() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ Status OpenDB() {
+ CloseDB();
+ env_->ResetState();
+ Status s = DB::Open(options_, dbname_, &db_);
+ assert(db_ != nullptr);
+ return s;
+ }
+
+ void DeleteAllData() {
+ Iterator* iter = db_->NewIterator(ReadOptions());
+ WriteOptions options;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ASSERT_OK(db_->Delete(WriteOptions(), iter->key()));
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ ASSERT_OK(db_->Flush(flush_options));
+ }
+
+ // rnd cannot be null for kResetDropRandomUnsyncedData
+ void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) {
+ env_->AssertNoOpenFile();
+ switch (reset_method) {
+ case kResetDropUnsyncedData:
+ ASSERT_OK(env_->DropUnsyncedFileData());
+ break;
+ case kResetDropRandomUnsyncedData:
+ ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd));
+ break;
+ case kResetDeleteUnsyncedFiles:
+ ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+ break;
+ case kResetDropAndDeleteUnsynced:
+ ASSERT_OK(env_->DropUnsyncedFileData());
+ ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync());
+ break;
+ default:
+ assert(false);
+ }
+ }
+
+ void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) {
+ DeleteAllData();
+
+ WriteOptions write_options;
+ write_options.sync = sync_use_wal_;
+
+ Build(write_options, 0, num_pre_sync);
+ if (sync_use_compact_) {
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+ write_options.sync = false;
+ Build(write_options, num_pre_sync, num_post_sync);
+ }
+
+ void PartialCompactTestReopenWithFault(ResetMethod reset_method,
+ int num_pre_sync, int num_post_sync,
+ Random* rnd = nullptr) {
+ env_->SetFilesystemActive(false);
+ CloseDB();
+ ResetDBState(reset_method, rnd);
+ ASSERT_OK(OpenDB());
+ ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+ ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+ FaultInjectionTest::kValExpectNoError));
+ WaitCompactionFinish();
+ ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound));
+ ASSERT_OK(Verify(num_pre_sync, num_post_sync,
+ FaultInjectionTest::kValExpectNoError));
+ }
+
+ void NoWriteTestPreFault() {}
+
+ void NoWriteTestReopenWithFault(ResetMethod reset_method) {
+ CloseDB();
+ ResetDBState(reset_method);
+ ASSERT_OK(OpenDB());
+ }
+
+ void WaitCompactionFinish() {
+ ASSERT_OK(static_cast<DBImpl*>(db_->GetRootDB())->TEST_WaitForCompact());
+ ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+ }
+
+ private:
+ Env* system_env_;
+ std::shared_ptr<Env> env_guard_;
+};
+
+class FaultInjectionTestSplitted : public FaultInjectionTest {};
+
+TEST_P(FaultInjectionTestSplitted, FaultTest) {
+ do {
+ Random rnd(301);
+
+ for (size_t idx = 0; idx < kNumIterations; idx++) {
+ int num_pre_sync = rnd.Uniform(kMaxNumValues);
+ int num_post_sync = rnd.Uniform(kMaxNumValues);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync,
+ num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData,
+ num_pre_sync, num_post_sync, &rnd);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropUnsyncedData);
+
+      // Setting a separate data path won't pass this test, as we don't sync
+      // the directory after creating new files in it.
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+ PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced,
+ num_pre_sync, num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+
+ PartialCompactTestPreFault(num_pre_sync, num_post_sync);
+      // No new files were created, so we expect all values to be present
+      // since no files will be dropped.
+ PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync,
+ num_post_sync);
+ NoWriteTestPreFault();
+ NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles);
+ }
+ } while (ChangeOptions());
+}
+
+// The previous log file is not fsynced if sync is forced after log rolling.
+TEST_P(FaultInjectionTest, WriteOptionSyncTest) {
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->SetBackgroundThreads(1, Env::HIGH);
+  // Block the job queue to prevent the flush job from running.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::HIGH);
+ sleeping_task_low.WaitUntilSleeping();
+
+ WriteOptions write_options;
+ write_options.sync = false;
+
+ std::string key_space, value_space;
+ ASSERT_OK(
+ db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(db_->Flush(flush_options));
+ write_options.sync = true;
+ ASSERT_OK(
+ db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+ ASSERT_OK(db_->FlushWAL(false));
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ ASSERT_OK(OpenDB());
+ std::string val;
+ Value(2, &value_space);
+ ASSERT_OK(ReadValue(2, &val));
+ ASSERT_EQ(value_space, val);
+
+ Value(1, &value_space);
+ ASSERT_OK(ReadValue(1, &val));
+ ASSERT_EQ(value_space, val);
+}
+
+TEST_P(FaultInjectionTest, UninstalledCompaction) {
+ options_.target_file_size_base = 32 * 1024;
+ options_.write_buffer_size = 100 << 10; // 100KB
+ options_.level0_file_num_compaction_trigger = 6;
+ options_.level0_stop_writes_trigger = 1 << 10;
+ options_.level0_slowdown_writes_trigger = 1 << 10;
+ options_.max_background_compactions = 1;
+ ASSERT_OK(OpenDB());
+
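+ // Sync-point choreography for the non-sequential case: compaction may start
+ // only after FaultTest:0 fires, FaultTest:1 waits for CompactionJob::Run()
+ // to finish writing its output, and the result is installed only after
+ // FaultTest:2. The filesystem is deactivated between :1 and :2, so the
+ // unsynced compaction output is dropped by the simulated crash below.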
+ if (!sequential_order_) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({
+ {"FaultInjectionTest::FaultTest:0", "DBImpl::BGWorkCompaction"},
+ {"CompactionJob::Run():End", "FaultInjectionTest::FaultTest:1"},
+ {"FaultInjectionTest::FaultTest:2",
+ "DBImpl::BackgroundCompaction:NonTrivial:AfterRun"},
+ });
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ int kNumKeys = 1000;
+ Build(WriteOptions(), 0, kNumKeys);
+ FlushOptions flush_options;
+ flush_options.wait = true;
+ ASSERT_OK(db_->Flush(flush_options));
+ ASSERT_OK(db_->Put(WriteOptions(), "", ""));
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:0");
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:1");
+ env_->SetFilesystemActive(false);
+ TEST_SYNC_POINT("FaultInjectionTest::FaultTest:2");
+ CloseDB();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ResetDBState(kResetDropUnsyncedData);
+
+ std::atomic<bool> opened(false);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::Open:Opened", [&](void* /*arg*/) { opened.store(true); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::BGWorkCompaction",
+ [&](void* /*arg*/) { ASSERT_TRUE(opened.load()); });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(OpenDB());
+ ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+ WaitCompactionFinish();
+ ASSERT_OK(Verify(0, kNumKeys, FaultInjectionTest::kValExpectFound));
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_P(FaultInjectionTest, ManualLogSyncTest) {
+ test::SleepingBackgroundTask sleeping_task_low;
+ env_->SetBackgroundThreads(1, Env::HIGH);
+ // Block the job queue to prevent flush job from running.
+ env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low,
+ Env::Priority::HIGH);
+ sleeping_task_low.WaitUntilSleeping();
+
+ WriteOptions write_options;
+ write_options.sync = false;
+
+ std::string key_space, value_space;
+ ASSERT_OK(
+ db_->Put(write_options, Key(1, &key_space), Value(1, &value_space)));
+ FlushOptions flush_options;
+ flush_options.wait = false;
+ ASSERT_OK(db_->Flush(flush_options));
+ ASSERT_OK(
+ db_->Put(write_options, Key(2, &key_space), Value(2, &value_space)));
+ ASSERT_OK(db_->FlushWAL(true));
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ sleeping_task_low.WakeUp();
+ sleeping_task_low.WaitUntilDone();
+
+ ASSERT_OK(OpenDB());
+ std::string val;
+ Value(2, &value_space);
+ ASSERT_OK(ReadValue(2, &val));
+ ASSERT_EQ(value_space, val);
+
+ Value(1, &value_space);
+ ASSERT_OK(ReadValue(1, &val));
+ ASSERT_EQ(value_space, val);
+}
+
+TEST_P(FaultInjectionTest, WriteBatchWalTerminationTest) {
+ ReadOptions ro;
+ Options options = CurrentOptions();
+ options.env = env_;
+
+ WriteOptions wo;
+ wo.sync = true;
+ wo.disableWAL = false;
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("cats", "dogs"));
+ batch.MarkWalTerminationPoint();
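+ // Entries added after the WAL termination point are applied to the memtable
+ // but not written to the WAL, so "boys" should be gone after the simulated
+ // crash while the synced "cats" entry survives.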
+ ASSERT_OK(batch.Put("boys", "girls"));
+ ASSERT_OK(db_->Write(wo, &batch));
+
+ env_->SetFilesystemActive(false);
+ NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced);
+ ASSERT_OK(OpenDB());
+
+ std::string val;
+ ASSERT_OK(db_->Get(ro, "cats", &val));
+ ASSERT_EQ("dogs", val);
+ ASSERT_EQ(db_->Get(ro, "boys", &val), Status::NotFound());
+}
+
+TEST_P(FaultInjectionTest, NoDuplicateTrailingEntries) {
+ auto fault_fs = std::make_shared<FaultInjectionTestFS>(FileSystem::Default());
+ fault_fs->EnableWriteErrorInjection();
+ fault_fs->SetFilesystemDirectWritable(false);
+ const std::string file_name = NormalizePath(dbname_ + "/test_file");
+ std::unique_ptr<log::Writer> log_writer = nullptr;
+ constexpr uint64_t log_number = 0;
+ {
+ std::unique_ptr<FSWritableFile> file;
+ const Status s =
+ fault_fs->NewWritableFile(file_name, FileOptions(), &file, nullptr);
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> fwriter(
+ new WritableFileWriter(std::move(file), file_name, FileOptions()));
+ log_writer.reset(new log::Writer(std::move(fwriter), log_number,
+ /*recycle_log_files=*/false));
+ }
+
+ fault_fs->SetRandomWriteError(
+ 0xdeadbeef, /*one_in=*/1, IOStatus::IOError("Injected IOError"),
+ /*inject_for_all_file_types=*/true, /*types=*/{});
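+ // With one_in == 1, every write fails while error injection is enabled, so
+ // the AddRecord() below is expected to fail.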
+
+ {
+ VersionEdit edit;
+ edit.SetColumnFamily(0);
+ std::string buf;
+ ASSERT_TRUE(edit.EncodeTo(&buf));
+ const Status s = log_writer->AddRecord(buf);
+ ASSERT_NOK(s);
+ }
+
+ fault_fs->DisableWriteErrorInjection();
+
+ // Closing the log writer will cause WritableFileWriter::Close() and flush
+ // remaining data from its buffer to underlying file.
+ log_writer.reset();
+
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ Status s =
+ fault_fs->NewSequentialFile(file_name, FileOptions(), &file, nullptr);
+ ASSERT_OK(s);
+ std::unique_ptr<SequentialFileReader> freader(
+ new SequentialFileReader(std::move(file), file_name));
+ Status log_read_s;
+ class LogReporter : public log::Reader::Reporter {
+ public:
+ Status* status_;
+ explicit LogReporter(Status* _s) : status_(_s) {}
+ void Corruption(size_t /*bytes*/, const Status& _s) override {
+ if (status_->ok()) {
+ *status_ = _s;
+ }
+ }
+ } reporter(&log_read_s);
+ std::unique_ptr<log::Reader> log_reader(new log::Reader(
+ nullptr, std::move(freader), &reporter, /*checksum=*/true, log_number));
+ Slice record;
+ std::string data;
+ size_t count = 0;
+ while (log_reader->ReadRecord(&record, &data) && log_read_s.ok()) {
+ VersionEdit edit;
+ ASSERT_OK(edit.DecodeFrom(data));
+ ++count;
+ }
+ // Verify that only one version edit exists in the file.
+ ASSERT_EQ(1, count);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ FaultTest, FaultInjectionTest,
+ ::testing::Values(std::make_tuple(false, kDefault, kEnd),
+ std::make_tuple(true, kDefault, kEnd)));
+
+INSTANTIATE_TEST_CASE_P(
+ FaultTest, FaultInjectionTestSplitted,
+ ::testing::Values(std::make_tuple(false, kDefault, kSyncWal),
+ std::make_tuple(true, kDefault, kSyncWal),
+ std::make_tuple(false, kSyncWal, kEnd),
+ std::make_tuple(true, kSyncWal, kEnd)));
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/file_indexer.cc b/src/rocksdb/db/file_indexer.cc
new file mode 100644
index 000000000..608f1cb28
--- /dev/null
+++ b/src/rocksdb/db/file_indexer.cc
@@ -0,0 +1,218 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+
+#include <algorithm>
+#include <functional>
+
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FileIndexer::FileIndexer(const Comparator* ucmp)
+ : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {}
+
+size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); }
+
+size_t FileIndexer::LevelIndexSize(size_t level) const {
+ if (level >= next_level_index_.size()) {
+ return 0;
+ }
+ return next_level_index_[level].num_index;
+}
+
+void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index,
+ const int cmp_smallest,
+ const int cmp_largest, int32_t* left_bound,
+ int32_t* right_bound) const {
+ assert(level > 0);
+
+ // Last level, no hint
+ if (level == num_levels_ - 1) {
+ *left_bound = 0;
+ *right_bound = -1;
+ return;
+ }
+
+ assert(level < num_levels_ - 1);
+ assert(static_cast<int32_t>(file_index) <= level_rb_[level]);
+
+ const IndexUnit* index_units = next_level_index_[level].index_units;
+ const auto& index = index_units[file_index];
+
+ if (cmp_smallest < 0) {
+ *left_bound = (level > 0 && file_index > 0)
+ ? index_units[file_index - 1].largest_lb
+ : 0;
+ *right_bound = index.smallest_rb;
+ } else if (cmp_smallest == 0) {
+ *left_bound = index.smallest_lb;
+ *right_bound = index.smallest_rb;
+ } else if (cmp_smallest > 0 && cmp_largest < 0) {
+ *left_bound = index.smallest_lb;
+ *right_bound = index.largest_rb;
+ } else if (cmp_largest == 0) {
+ *left_bound = index.largest_lb;
+ *right_bound = index.largest_rb;
+ } else if (cmp_largest > 0) {
+ *left_bound = index.largest_lb;
+ *right_bound = level_rb_[level + 1];
+ } else {
+ assert(false);
+ }
+
+ assert(*left_bound >= 0);
+ assert(*left_bound <= *right_bound + 1);
+ assert(*right_bound <= level_rb_[level + 1]);
+}
+
+void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels,
+ std::vector<FileMetaData*>* const files) {
+ if (files == nullptr) {
+ return;
+ }
+ if (num_levels == 0) { // unsigned 0 - 1 below would wrap and misbehave
+ num_levels_ = num_levels;
+ return;
+ }
+ assert(level_rb_ == nullptr); // level_rb_ should be initialized here
+
+ num_levels_ = num_levels;
+ next_level_index_.resize(num_levels);
+
+ char* mem = arena->AllocateAligned(num_levels_ * sizeof(int32_t));
+ level_rb_ = new (mem) int32_t[num_levels_];
+ for (size_t i = 0; i < num_levels_; i++) {
+ level_rb_[i] = -1;
+ }
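+ // level_rb_[i] ends up holding the index of the last file on level i (and
+ // stays -1 for level 0, which GetNextLevelIndex() never consults); it serves
+ // as the right bound when a key compares greater than a file's largest key.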
+
+ // L1 - Ln-1
+ for (size_t level = 1; level < num_levels_ - 1; ++level) {
+ const auto& upper_files = files[level];
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const auto& lower_files = files[level + 1];
+ level_rb_[level] = static_cast<int32_t>(upper_files.size()) - 1;
+ if (upper_size == 0) {
+ continue;
+ }
+ IndexLevel& index_level = next_level_index_[level];
+ index_level.num_index = upper_size;
+ mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit));
+ index_level.index_units = new (mem) IndexUnit[upper_size];
+
+ CalculateLB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(),
+ b->largest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->smallest_lb = f_idx; });
+ CalculateLB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->largest.user_key(),
+ b->largest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->largest_lb = f_idx; });
+ CalculateRB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->smallest.user_key(),
+ b->smallest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->smallest_rb = f_idx; });
+ CalculateRB(
+ upper_files, lower_files, &index_level,
+ [this](const FileMetaData* a, const FileMetaData* b) -> int {
+ return ucmp_->CompareWithoutTimestamp(a->largest.user_key(),
+ b->smallest.user_key());
+ },
+ [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; });
+ }
+
+ level_rb_[num_levels_ - 1] =
+ static_cast<int32_t>(files[num_levels_ - 1].size()) - 1;
+}
+
+void FileIndexer::CalculateLB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index) {
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+ int32_t upper_idx = 0;
+ int32_t lower_idx = 0;
+
+ IndexUnit* index = index_level->index_units;
+ while (upper_idx < upper_size && lower_idx < lower_size) {
+ int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+ if (cmp == 0) {
+ set_index(&index[upper_idx], lower_idx);
+ ++upper_idx;
+ } else if (cmp > 0) {
+ // Lower level's file (largest) is smaller, so a key won't hit that
+ // file. Move to the next lower file.
+ ++lower_idx;
+ } else {
+ // Lower level's file becomes larger, update the index, and
+ // move to the next upper file
+ set_index(&index[upper_idx], lower_idx);
+ ++upper_idx;
+ }
+ }
+
+ while (upper_idx < upper_size) {
+ // Lower files are exhausted, that means the remaining upper files are
+ // greater than any lower files. Set the index to be the lower level size.
+ set_index(&index[upper_idx], lower_size);
+ ++upper_idx;
+ }
+}
+
+void FileIndexer::CalculateRB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index) {
+ const int32_t upper_size = static_cast<int32_t>(upper_files.size());
+ const int32_t lower_size = static_cast<int32_t>(lower_files.size());
+ int32_t upper_idx = upper_size - 1;
+ int32_t lower_idx = lower_size - 1;
+
+ IndexUnit* index = index_level->index_units;
+ while (upper_idx >= 0 && lower_idx >= 0) {
+ int cmp = cmp_op(upper_files[upper_idx], lower_files[lower_idx]);
+
+ if (cmp == 0) {
+ set_index(&index[upper_idx], lower_idx);
+ --upper_idx;
+ } else if (cmp < 0) {
+ // Lower level's file (smallest) is larger, a key won't hit in that
+ // file. Move to next lower file.
+ --lower_idx;
+ } else {
+ // Lower level's file becomes smaller, update the index, and move to
+ // the next upper file.
+ set_index(&index[upper_idx], lower_idx);
+ --upper_idx;
+ }
+ }
+ while (upper_idx >= 0) {
+ // Lower files are exhausted, that means the remaining upper files are
+ // smaller than any lower files. Set it to -1.
+ set_index(&index[upper_idx], -1);
+ --upper_idx;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/file_indexer.h b/src/rocksdb/db/file_indexer.h
new file mode 100644
index 000000000..45cb13615
--- /dev/null
+++ b/src/rocksdb/db/file_indexer.h
@@ -0,0 +1,140 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <vector>
+
+#include "memory/arena.h"
+#include "port/port.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+struct FileMetaData;
+struct FdWithKeyRange;
+struct FileLevel;
+
+// The file tree structure in Version is prebuilt and the range of each file
+// is known. On Version::Get(), it uses binary search to find a potential file
+// and then checks if a target key can be found in the file by comparing the
+// key to the file's smallest and largest key. The results of these comparisons
+// can be reused beyond checking whether a key falls into a file's range.
+// With some pre-calculated knowledge, each key comparison that has been done
+// can serve as a hint to narrow down further searches: if a key compares
+// smaller than a file's smallest or largest key, that comparison can be used
+// to find the right bound of the next binary search. Similarly, if a key
+// compares larger than a file's smallest or largest key, it can be used to
+// find the left bound of the next binary search.
+// These hints can greatly reduce the range of the binary search, especially
+// for bottom levels, given that one file most likely overlaps with only N
+// files from the level below (where N is max_bytes_for_level_multiplier).
+// So on level L, we will only look at ~N files instead of N^L files with the
+// naive approach.
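+//
+// A minimal sketch of how a caller might consume these hints (hypothetical
+// snippet; Version::Get() is the real consumer):
+//
+//   int32_t left = 0;
+//   int32_t right = -1;
+//   indexer.GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
+//                             &left, &right);
+//   // Binary-search only files[level + 1][left .. right]; an empty range
+//   // (left > right) means no file on the next level needs to be examined.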
+class FileIndexer {
+ public:
+ explicit FileIndexer(const Comparator* ucmp);
+
+ size_t NumLevelIndex() const;
+
+ size_t LevelIndexSize(size_t level) const;
+
+ // Return a file index range in the next level to search for a key based on
+ // smallest and largest key comparison for the current file specified by
+ // level and file_index. When *left_bound <= *right_bound, both bounds are
+ // valid indices into the next level's file vector.
+ void GetNextLevelIndex(const size_t level, const size_t file_index,
+ const int cmp_smallest, const int cmp_largest,
+ int32_t* left_bound, int32_t* right_bound) const;
+
+ void UpdateIndex(Arena* arena, const size_t num_levels,
+ std::vector<FileMetaData*>* const files);
+
+ enum { kLevelMaxIndex = std::numeric_limits<int32_t>::max() };
+
+ private:
+ size_t num_levels_;
+ const Comparator* ucmp_;
+
+ struct IndexUnit {
+ IndexUnit()
+ : smallest_lb(0), largest_lb(0), smallest_rb(-1), largest_rb(-1) {}
+ // During file search, a key is compared against smallest and largest
+ // from a FileMetaData. It can have 3 possible outcomes:
+ // (1) key is smaller than smallest, implying it is also smaller than
+ // largest. The precalculated index based on "smallest < smallest"
+ // (smallest_rb) can be used to provide the right bound.
+ // (2) key is in between smallest and largest.
+ // (2)-a: the precalculated index based on "smallest > largest"
+ // (smallest_lb) can be used to provide the left bound.
+ // (2)-b: the precalculated index based on "largest < smallest"
+ // (largest_rb) can be used to provide the right bound.
+ // (3) key is larger than largest, implying it is also larger than smallest.
+ // The precalculated index based on "largest > largest" (largest_lb) can be
+ // used to provide the left bound.
+ //
+ // As a result, we will need to do:
+ // Compare smallest (<=) and largest keys from upper level file with
+ // smallest key from lower level to get a right bound.
+ // Compare smallest (>=) and largest keys from upper level file with
+ // largest key from lower level to get a left bound.
+ //
+ // Example:
+ // level 1: [50 - 60]
+ // level 2: [1 - 40], [45 - 55], [58 - 80]
+ // A key 35 compares less than 50, so the 3rd file on level 2 can be
+ // skipped according to rule (1). LB = 0, RB = 1.
+ // A key 53 sits between 50 and 60. The 1st file on level 2 can be
+ // skipped according to rule (2)-a, but the 3rd file cannot be skipped
+ // because 60 is greater than 58. LB = 1, RB = 2.
+ // A key 70 compares larger than 60, so the 1st and 2nd files can be
+ // skipped according to rule (3). LB = 2, RB = 2.
+ //
+ // Points to the left-most file in the lower level that may contain a key
+ // which compares greater than the smallest key of the FileMetaData (upper
+ // level).
+ int32_t smallest_lb;
+ // Points to the left-most file in the lower level that may contain a key
+ // which compares greater than the largest key of the FileMetaData (upper
+ // level).
+ int32_t largest_lb;
+ // Points to the right-most file in the lower level that may contain a key
+ // which compares smaller than the smallest key of the FileMetaData (upper
+ // level).
+ int32_t smallest_rb;
+ // Points to the right-most file in the lower level that may contain a key
+ // which compares smaller than the largest key of the FileMetaData (upper
+ // level).
+ int32_t largest_rb;
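+ // For the example above ([50 - 60] over [1 - 40], [45 - 55], [58 - 80]),
+ // these fields work out to smallest_lb = 1, largest_lb = 2, smallest_rb = 1
+ // and largest_rb = 2, which yields the LB/RB pairs quoted for keys 35, 53
+ // and 70.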
+ };
+
+ // Data structure to store IndexUnits in a whole level
+ struct IndexLevel {
+ size_t num_index;
+ IndexUnit* index_units;
+
+ IndexLevel() : num_index(0), index_units(nullptr) {}
+ };
+
+ void CalculateLB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index);
+
+ void CalculateRB(
+ const std::vector<FileMetaData*>& upper_files,
+ const std::vector<FileMetaData*>& lower_files, IndexLevel* index_level,
+ std::function<int(const FileMetaData*, const FileMetaData*)> cmp_op,
+ std::function<void(IndexUnit*, int32_t)> set_index);
+
+ autovector<IndexLevel> next_level_index_;
+ int32_t* level_rb_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/file_indexer_test.cc b/src/rocksdb/db/file_indexer_test.cc
new file mode 100644
index 000000000..5c82189ef
--- /dev/null
+++ b/src/rocksdb/db/file_indexer_test.cc
@@ -0,0 +1,352 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/file_indexer.h"
+
+#include <string>
+
+#include "db/dbformat.h"
+#include "db/version_edit.h"
+#include "port/stack_trace.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class IntComparator : public Comparator {
+ public:
+ int Compare(const Slice& a, const Slice& b) const override {
+ assert(a.size() == 8);
+ assert(b.size() == 8);
+ int64_t diff = *reinterpret_cast<const int64_t*>(a.data()) -
+ *reinterpret_cast<const int64_t*>(b.data());
+ if (diff < 0) {
+ return -1;
+ } else if (diff == 0) {
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+
+ const char* Name() const override { return "IntComparator"; }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+class FileIndexerTest : public testing::Test {
+ public:
+ FileIndexerTest()
+ : kNumLevels(4), files(new std::vector<FileMetaData*>[kNumLevels]) {}
+
+ ~FileIndexerTest() override {
+ ClearFiles();
+ delete[] files;
+ }
+
+ void AddFile(int level, int64_t smallest, int64_t largest) {
+ auto* f = new FileMetaData();
+ f->smallest = IntKey(smallest);
+ f->largest = IntKey(largest);
+ files[level].push_back(f);
+ }
+
+ InternalKey IntKey(int64_t v) {
+ return InternalKey(Slice(reinterpret_cast<char*>(&v), 8), 0, kTypeValue);
+ }
+
+ void ClearFiles() {
+ for (uint32_t i = 0; i < kNumLevels; ++i) {
+ for (auto* f : files[i]) {
+ delete f;
+ }
+ files[i].clear();
+ }
+ }
+
+ void GetNextLevelIndex(const uint32_t level, const uint32_t file_index,
+ const int cmp_smallest, const int cmp_largest,
+ int32_t* left_index, int32_t* right_index) {
+ *left_index = 100;
+ *right_index = 100;
+ indexer->GetNextLevelIndex(level, file_index, cmp_smallest, cmp_largest,
+ left_index, right_index);
+ }
+
+ int32_t left = 100;
+ int32_t right = 100;
+ const uint32_t kNumLevels;
+ IntComparator ucmp;
+ FileIndexer* indexer;
+
+ std::vector<FileMetaData*>* files;
+};
+
+// Case 0: Empty
+TEST_F(FileIndexerTest, Empty) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ indexer->UpdateIndex(&arena, 0, files);
+ delete indexer;
+}
+
+// Case 1: no overlap, files are on the left of next level files
+TEST_F(FileIndexerTest, no_overlap_left) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 100, 200);
+ AddFile(1, 300, 400);
+ AddFile(1, 500, 600);
+ // level 2
+ AddFile(2, 1500, 1600);
+ AddFile(2, 1601, 1699);
+ AddFile(2, 1700, 1800);
+ // level 3
+ AddFile(3, 2500, 2600);
+ AddFile(3, 2601, 2699);
+ AddFile(3, 2700, 2800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t level = 1; level < 3; ++level) {
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(level, f, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, 0, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(level, f, 1, 1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(2, right);
+ }
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+// Case 2: no overlap, files are on the right of next level files
+TEST_F(FileIndexerTest, no_overlap_right) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 2100, 2200);
+ AddFile(1, 2300, 2400);
+ AddFile(1, 2500, 2600);
+ // level 2
+ AddFile(2, 1500, 1600);
+ AddFile(2, 1501, 1699);
+ AddFile(2, 1700, 1800);
+ // level 3
+ AddFile(3, 500, 600);
+ AddFile(3, 501, 699);
+ AddFile(3, 700, 800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t level = 1; level < 3; ++level) {
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(level, f, -1, -1, &left, &right);
+ ASSERT_EQ(f == 0 ? 0 : 3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 0, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, 0, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(level, f, 1, 1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ }
+ }
+ delete indexer;
+}
+
+// Case 3: empty L2
+TEST_F(FileIndexerTest, empty_L2) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ for (uint32_t i = 1; i < kNumLevels; ++i) {
+ ASSERT_EQ(0U, indexer->LevelIndexSize(i));
+ }
+ // level 1
+ AddFile(1, 2100, 2200);
+ AddFile(1, 2300, 2400);
+ AddFile(1, 2500, 2600);
+ // level 3
+ AddFile(3, 500, 600);
+ AddFile(3, 501, 699);
+ AddFile(3, 700, 800);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ for (uint32_t f = 0; f < 3; ++f) {
+ GetNextLevelIndex(1, f, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, 0, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ GetNextLevelIndex(1, f, 1, 1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(-1, right);
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+// Case 4: mixed
+TEST_F(FileIndexerTest, mixed) {
+ Arena arena;
+ indexer = new FileIndexer(&ucmp);
+ // level 1
+ AddFile(1, 100, 200);
+ AddFile(1, 250, 400);
+ AddFile(1, 450, 500);
+ // level 2
+ AddFile(2, 100, 150); // 0
+ AddFile(2, 200, 250); // 1
+ AddFile(2, 251, 300); // 2
+ AddFile(2, 301, 350); // 3
+ AddFile(2, 500, 600); // 4
+ // level 3
+ AddFile(3, 0, 50);
+ AddFile(3, 100, 200);
+ AddFile(3, 201, 250);
+ indexer->UpdateIndex(&arena, kNumLevels, files);
+ // level 1, 0
+ GetNextLevelIndex(1, 0, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(0, right);
+ GetNextLevelIndex(1, 0, 0, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(0, right);
+ GetNextLevelIndex(1, 0, 1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 0, 1, 0, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 0, 1, 1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(4, right);
+ // level 1, 1
+ GetNextLevelIndex(1, 1, -1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 1, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(1, 1, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 1, 1, 0, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 1, 1, 1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ // level 1, 2
+ GetNextLevelIndex(1, 2, -1, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 2, 0, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(3, right);
+ GetNextLevelIndex(1, 2, 1, -1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ GetNextLevelIndex(1, 2, 1, 0, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ GetNextLevelIndex(1, 2, 1, 1, &left, &right);
+ ASSERT_EQ(4, left);
+ ASSERT_EQ(4, right);
+ // level 2, 0
+ GetNextLevelIndex(2, 0, -1, -1, &left, &right);
+ ASSERT_EQ(0, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, 0, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 0, 1, 1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(2, right);
+ // level 2, 1
+ GetNextLevelIndex(2, 1, -1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 1, 0, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(1, right);
+ GetNextLevelIndex(2, 1, 1, -1, &left, &right);
+ ASSERT_EQ(1, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, 1, 1, 0, &left, &right);
+ ASSERT_EQ(2, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, 1, 1, 1, &left, &right);
+ ASSERT_EQ(2, left);
+ ASSERT_EQ(2, right);
+ // level 2, [2 - 4], no overlap
+ for (uint32_t f = 2; f <= 4; ++f) {
+ GetNextLevelIndex(2, f, -1, -1, &left, &right);
+ ASSERT_EQ(f == 2 ? 2 : 3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 0, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, -1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, 0, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ GetNextLevelIndex(2, f, 1, 1, &left, &right);
+ ASSERT_EQ(3, left);
+ ASSERT_EQ(2, right);
+ }
+ delete indexer;
+ ClearFiles();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/filename_test.cc b/src/rocksdb/db/filename_test.cc
new file mode 100644
index 000000000..04c81b333
--- /dev/null
+++ b/src/rocksdb/db/filename_test.cc
@@ -0,0 +1,241 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "file/filename.h"
+
+#include "db/dbformat.h"
+#include "port/port.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class FileNameTest : public testing::Test {};
+
+TEST_F(FileNameTest, Parse) {
+ Slice db;
+ FileType type;
+ uint64_t number;
+
+ char kDefautInfoLogDir = 1;
+ char kDifferentInfoLogDir = 2;
+ char kNoCheckLogDir = 4;
+ char kAllMode = kDefautInfoLogDir | kDifferentInfoLogDir | kNoCheckLogDir;
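+ // Each entry in `cases` below carries a bitmask of the info-log directory
+ // modes under which the file name is expected to parse; kAllMode covers all
+ // three modes.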
+
+ // Successful parses
+ static struct {
+ const char* fname;
+ uint64_t number;
+ FileType type;
+ char mode;
+ } cases[] = {
+ {"100.log", 100, kWalFile, kAllMode},
+ {"0.log", 0, kWalFile, kAllMode},
+ {"0.sst", 0, kTableFile, kAllMode},
+ {"CURRENT", 0, kCurrentFile, kAllMode},
+ {"LOCK", 0, kDBLockFile, kAllMode},
+ {"MANIFEST-2", 2, kDescriptorFile, kAllMode},
+ {"MANIFEST-7", 7, kDescriptorFile, kAllMode},
+ {"METADB-2", 2, kMetaDatabase, kAllMode},
+ {"METADB-7", 7, kMetaDatabase, kAllMode},
+ {"LOG", 0, kInfoLogFile, kDefautInfoLogDir},
+ {"LOG.old", 0, kInfoLogFile, kDefautInfoLogDir},
+ {"LOG.old.6688", 6688, kInfoLogFile, kDefautInfoLogDir},
+ {"rocksdb_dir_LOG", 0, kInfoLogFile, kDifferentInfoLogDir},
+ {"rocksdb_dir_LOG.old", 0, kInfoLogFile, kDifferentInfoLogDir},
+ {"rocksdb_dir_LOG.old.6688", 6688, kInfoLogFile, kDifferentInfoLogDir},
+ {"18446744073709551615.log", 18446744073709551615ull, kWalFile, kAllMode},
+ };
+ for (char mode : {kDifferentInfoLogDir, kDefautInfoLogDir, kNoCheckLogDir}) {
+ for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+ InfoLogPrefix info_log_prefix(mode != kDefautInfoLogDir, "/rocksdb/dir");
+ if (cases[i].mode & mode) {
+ std::string f = cases[i].fname;
+ if (mode == kNoCheckLogDir) {
+ ASSERT_TRUE(ParseFileName(f, &number, &type)) << f;
+ } else {
+ ASSERT_TRUE(ParseFileName(f, &number, info_log_prefix.prefix, &type))
+ << f;
+ }
+ ASSERT_EQ(cases[i].type, type) << f;
+ ASSERT_EQ(cases[i].number, number) << f;
+ }
+ }
+ }
+
+ // Errors
+ static const char* errors[] = {"",
+ "foo",
+ "foo-dx-100.log",
+ ".log",
+ "",
+ "manifest",
+ "CURREN",
+ "CURRENTX",
+ "MANIFES",
+ "MANIFEST",
+ "MANIFEST-",
+ "XMANIFEST-3",
+ "MANIFEST-3x",
+ "META",
+ "METADB",
+ "METADB-",
+ "XMETADB-3",
+ "METADB-3x",
+ "LOC",
+ "LOCKx",
+ "LO",
+ "LOGx",
+ "18446744073709551616.log",
+ "184467440737095516150.log",
+ "100",
+ "100.",
+ "100.lop"};
+ for (unsigned int i = 0; i < sizeof(errors) / sizeof(errors[0]); i++) {
+ std::string f = errors[i];
+ ASSERT_TRUE(!ParseFileName(f, &number, &type)) << f;
+ };
+}
+
+TEST_F(FileNameTest, InfoLogFileName) {
+ std::string dbname = ("/data/rocksdb");
+ std::string db_absolute_path;
+ ASSERT_OK(Env::Default()->GetAbsolutePath(dbname, &db_absolute_path));
+
+ ASSERT_EQ("/data/rocksdb/LOG", InfoLogFileName(dbname, db_absolute_path, ""));
+ ASSERT_EQ("/data/rocksdb/LOG.old.666",
+ OldInfoLogFileName(dbname, 666u, db_absolute_path, ""));
+
+ ASSERT_EQ("/data/rocksdb_log/data_rocksdb_LOG",
+ InfoLogFileName(dbname, db_absolute_path, "/data/rocksdb_log"));
+ ASSERT_EQ(
+ "/data/rocksdb_log/data_rocksdb_LOG.old.666",
+ OldInfoLogFileName(dbname, 666u, db_absolute_path, "/data/rocksdb_log"));
+}
+
+TEST_F(FileNameTest, Construction) {
+ uint64_t number;
+ FileType type;
+ std::string fname;
+
+ fname = CurrentFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0U, number);
+ ASSERT_EQ(kCurrentFile, type);
+
+ fname = LockFileName("foo");
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(0U, number);
+ ASSERT_EQ(kDBLockFile, type);
+
+ fname = LogFileName("foo", 192);
+ ASSERT_EQ("foo/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(192U, number);
+ ASSERT_EQ(kWalFile, type);
+
+ fname = TableFileName({DbPath("bar", 0)}, 200, 0);
+ std::string fname1 =
+ TableFileName({DbPath("foo", 0), DbPath("bar", 0)}, 200, 1);
+ ASSERT_EQ(fname, fname1);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(200U, number);
+ ASSERT_EQ(kTableFile, type);
+
+ fname = DescriptorFileName("bar", 100);
+ ASSERT_EQ("bar/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(100U, number);
+ ASSERT_EQ(kDescriptorFile, type);
+
+ fname = TempFileName("tmp", 999);
+ ASSERT_EQ("tmp/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(999U, number);
+ ASSERT_EQ(kTempFile, type);
+
+ fname = MetaDatabaseName("met", 100);
+ ASSERT_EQ("met/", std::string(fname.data(), 4));
+ ASSERT_TRUE(ParseFileName(fname.c_str() + 4, &number, &type));
+ ASSERT_EQ(100U, number);
+ ASSERT_EQ(kMetaDatabase, type);
+}
+
+TEST_F(FileNameTest, NormalizePath) {
+ // No leading slash
+ const std::string sep = std::string(1, kFilePathSeparator);
+
+ std::string expected = "FOLDER" + sep + "filename.ext";
+ std::string given = "FOLDER" + sep + "filename.ext";
+
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // Two chars /a
+
+ expected = sep + "a";
+ given = expected;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // Two chars a/
+ expected = "a" + sep;
+ given = expected;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // Server only
+ expected = sep + sep + "a";
+ given = expected;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // Two slashes after character
+ expected = "a" + sep;
+ given = "a" + sep + sep;
+
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // slash only /
+ expected = sep;
+ given = expected;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // UNC only //
+ expected = sep;
+ given = sep + sep;
+
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // 3 slashes only
+ expected = sep + sep;
+ given = sep + sep + sep;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // 3 slashes //
+ expected = sep + sep + "a" + sep;
+ given = sep + sep + sep + "a" + sep;
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // 2 separators in the middle
+ expected = "a" + sep + "b";
+ given = "a" + sep + sep + "b";
+ ASSERT_EQ(expected, NormalizePath(given));
+
+ // UNC with duplicate slashes
+ expected = sep + sep + "SERVER" + sep + "a" + sep + "b" + sep + "c";
+ given = sep + sep + "SERVER" + sep + "a" + sep + sep + "b" + sep + "c";
+ ASSERT_EQ(expected, NormalizePath(given));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_job.cc b/src/rocksdb/db/flush_job.cc
new file mode 100644
index 000000000..645e42f44
--- /dev/null
+++ b/src/rocksdb/db/flush_job.cc
@@ -0,0 +1,1094 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/flush_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <vector>
+
+#include "db/builder.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/event_helpers.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/memtable_list.h"
+#include "db/merge_context.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "logging/event_logger.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/iostats_context_imp.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/statistics.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "table/merging_iterator.h"
+#include "table/table_builder.h"
+#include "table/two_level_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const char* GetFlushReasonString(FlushReason flush_reason) {
+ switch (flush_reason) {
+ case FlushReason::kOthers:
+ return "Other Reasons";
+ case FlushReason::kGetLiveFiles:
+ return "Get Live Files";
+ case FlushReason::kShutDown:
+ return "Shut down";
+ case FlushReason::kExternalFileIngestion:
+ return "External File Ingestion";
+ case FlushReason::kManualCompaction:
+ return "Manual Compaction";
+ case FlushReason::kWriteBufferManager:
+ return "Write Buffer Manager";
+ case FlushReason::kWriteBufferFull:
+ return "Write Buffer Full";
+ case FlushReason::kTest:
+ return "Test";
+ case FlushReason::kDeleteFiles:
+ return "Delete Files";
+ case FlushReason::kAutoCompaction:
+ return "Auto Compaction";
+ case FlushReason::kManualFlush:
+ return "Manual Flush";
+ case FlushReason::kErrorRecovery:
+ return "Error Recovery";
+ case FlushReason::kWalFull:
+ return "WAL Full";
+ default:
+ return "Invalid";
+ }
+}
+
+FlushJob::FlushJob(
+ const std::string& dbname, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
+ const FileOptions& file_options, VersionSet* versions,
+ InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, JobContext* job_context,
+ LogBuffer* log_buffer, FSDirectory* db_directory,
+ FSDirectory* output_file_directory, CompressionType output_compression,
+ Statistics* stats, EventLogger* event_logger, bool measure_io_stats,
+ const bool sync_output_directory, const bool write_manifest,
+ Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+ const SeqnoToTimeMapping& seqno_time_mapping, const std::string& db_id,
+ const std::string& db_session_id, std::string full_history_ts_low,
+ BlobFileCompletionCallback* blob_callback)
+ : dbname_(dbname),
+ db_id_(db_id),
+ db_session_id_(db_session_id),
+ cfd_(cfd),
+ db_options_(db_options),
+ mutable_cf_options_(mutable_cf_options),
+ max_memtable_id_(max_memtable_id),
+ file_options_(file_options),
+ versions_(versions),
+ db_mutex_(db_mutex),
+ shutting_down_(shutting_down),
+ existing_snapshots_(std::move(existing_snapshots)),
+ earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
+ snapshot_checker_(snapshot_checker),
+ job_context_(job_context),
+ log_buffer_(log_buffer),
+ db_directory_(db_directory),
+ output_file_directory_(output_file_directory),
+ output_compression_(output_compression),
+ stats_(stats),
+ event_logger_(event_logger),
+ measure_io_stats_(measure_io_stats),
+ sync_output_directory_(sync_output_directory),
+ write_manifest_(write_manifest),
+ edit_(nullptr),
+ base_(nullptr),
+ pick_memtable_called(false),
+ thread_pri_(thread_pri),
+ io_tracer_(io_tracer),
+ clock_(db_options_.clock),
+ full_history_ts_low_(std::move(full_history_ts_low)),
+ blob_callback_(blob_callback),
+ db_impl_seqno_time_mapping_(seqno_time_mapping) {
+ // Update the thread status to indicate flush.
+ ReportStartedFlush();
+ TEST_SYNC_POINT("FlushJob::FlushJob()");
+}
+
+FlushJob::~FlushJob() { ThreadStatusUtil::ResetThreadStatus(); }
+
+void FlushJob::ReportStartedFlush() {
+ ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env,
+ db_options_.enable_thread_tracking);
+ ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);
+ ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
+ job_context_->job_id);
+ IOSTATS_RESET(bytes_written);
+}
+
+void FlushJob::ReportFlushInputSize(const autovector<MemTable*>& mems) {
+ uint64_t input_size = 0;
+ for (auto* mem : mems) {
+ input_size += mem->ApproximateMemoryUsage();
+ }
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_MEMTABLES, input_size);
+}
+
+void FlushJob::RecordFlushIOStats() {
+ RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
+ ThreadStatusUtil::IncreaseThreadOperationProperty(
+ ThreadStatus::FLUSH_BYTES_WRITTEN, IOSTATS(bytes_written));
+ IOSTATS_RESET(bytes_written);
+}
+void FlushJob::PickMemTable() {
+ db_mutex_->AssertHeld();
+ assert(!pick_memtable_called);
+ pick_memtable_called = true;
+
+ // Maximum "NextLogNumber" of the memtables to flush.
+ // When the mempurge feature is turned off, this variable is redundant:
+ // the memtables are implicitly sorted by increasing creation time, so
+ // mems_->back()->GetNextLogNumber() already equals max_next_log_number.
+ // When mempurge is on, however, the memtables are no longer sorted by
+ // creation time, so mems_->back()->GetNextLogNumber() is not necessarily
+ // equal to max_next_log_number and this variable becomes necessary.
+ uint64_t max_next_log_number = 0;
+
+ // Save the contents of the earliest memtable as a new Table
+ cfd_->imm()->PickMemtablesToFlush(max_memtable_id_, &mems_,
+ &max_next_log_number);
+ if (mems_.empty()) {
+ return;
+ }
+
+ ReportFlushInputSize(mems_);
+
+ // The entries in mems_ are (implicitly) sorted in ascending order by their
+ // creation time. We will use the first memtable's `edit` to keep the meta
+ // info for this flush.
+ MemTable* m = mems_[0];
+ edit_ = m->GetEdits();
+ edit_->SetPrevLogNumber(0);
+ // SetLogNumber(log_num) indicates logs with number smaller than log_num
+ // will no longer be picked up for recovery.
+ edit_->SetLogNumber(max_next_log_number);
+ edit_->SetColumnFamily(cfd_->GetID());
+
+ // path 0 for level 0 file.
+ meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
+
+ base_ = cfd_->current();
+ base_->Ref(); // it is likely that we do not need this reference
+}
+
+Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta,
+ bool* switched_to_mempurge) {
+ TEST_SYNC_POINT("FlushJob::Start");
+ db_mutex_->AssertHeld();
+ assert(pick_memtable_called);
+ // The mempurge threshold can be changed dynamically; it is saved to a local
+ // variable so that a single, consistent value is used throughout this
+ // FlushJob::Run call.
+ double mempurge_threshold =
+ mutable_cf_options_.experimental_mempurge_threshold;
+
+ AutoThreadOperationStageUpdater stage_run(ThreadStatus::STAGE_FLUSH_RUN);
+ if (mems_.empty()) {
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] Nothing in memtable to flush",
+ cfd_->GetName().c_str());
+ return Status::OK();
+ }
+
+ // I/O measurement variables
+ PerfLevel prev_perf_level = PerfLevel::kEnableTime;
+ uint64_t prev_write_nanos = 0;
+ uint64_t prev_fsync_nanos = 0;
+ uint64_t prev_range_sync_nanos = 0;
+ uint64_t prev_prepare_write_nanos = 0;
+ uint64_t prev_cpu_write_nanos = 0;
+ uint64_t prev_cpu_read_nanos = 0;
+ if (measure_io_stats_) {
+ prev_perf_level = GetPerfLevel();
+ SetPerfLevel(PerfLevel::kEnableTime);
+ prev_write_nanos = IOSTATS(write_nanos);
+ prev_fsync_nanos = IOSTATS(fsync_nanos);
+ prev_range_sync_nanos = IOSTATS(range_sync_nanos);
+ prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
+ prev_cpu_write_nanos = IOSTATS(cpu_write_nanos);
+ prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
+ }
+ Status mempurge_s = Status::NotFound("No MemPurge.");
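+ // MemPurge is attempted only when the threshold is strictly positive, the
+ // flush was triggered by a full write buffer, there is at least one memtable
+ // to purge, MemPurgeDecider() approves, and atomic flush is disabled.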
+ if ((mempurge_threshold > 0.0) &&
+ (cfd_->GetFlushReason() == FlushReason::kWriteBufferFull) &&
+ (!mems_.empty()) && MemPurgeDecider(mempurge_threshold) &&
+ !(db_options_.atomic_flush)) {
+ cfd_->SetMempurgeUsed();
+ mempurge_s = MemPurge();
+ if (!mempurge_s.ok()) {
+ // Mempurge is typically aborted when the output
+ // bytes cannot be contained onto a single output memtable.
+ if (mempurge_s.IsAborted()) {
+ ROCKS_LOG_INFO(db_options_.info_log, "Mempurge process aborted: %s\n",
+ mempurge_s.ToString().c_str());
+ } else {
+ // However the mempurge process can also fail for
+ // other reasons (eg: new_mem->Add() fails).
+ ROCKS_LOG_WARN(db_options_.info_log, "Mempurge process failed: %s\n",
+ mempurge_s.ToString().c_str());
+ }
+ } else {
+ if (switched_to_mempurge) {
+ *switched_to_mempurge = true;
+ } else {
+ // The mempurge process was successful, but no switch_to_mempurge
+ // pointer provided so no way to propagate the state of flush job.
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Mempurge process succeeded"
+ "but no 'switched_to_mempurge' ptr provided.\n");
+ }
+ }
+ }
+ Status s;
+ if (mempurge_s.ok()) {
+ base_->Unref();
+ s = Status::OK();
+ } else {
+ // This will release and re-acquire the mutex.
+ s = WriteLevel0Table();
+ }
+
+ if (s.ok() && cfd_->IsDropped()) {
+ s = Status::ColumnFamilyDropped("Column family dropped during compaction");
+ }
+ if ((s.ok() || s.IsColumnFamilyDropped()) &&
+ shutting_down_->load(std::memory_order_acquire)) {
+ s = Status::ShutdownInProgress("Database shutdown");
+ }
+
+ if (!s.ok()) {
+ cfd_->imm()->RollbackMemtableFlush(mems_, meta_.fd.GetNumber());
+ } else if (write_manifest_) {
+ TEST_SYNC_POINT("FlushJob::InstallResults");
+ // Replace immutable memtable with the generated Table
+ s = cfd_->imm()->TryInstallMemtableFlushResults(
+ cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_,
+ meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_,
+ log_buffer_, &committed_flush_jobs_info_,
+ !(mempurge_s.ok()) /* write_edit : true if no mempurge happened (or if aborted),
+ but 'false' if mempurge successful: no new min log number
+ or new level 0 file path to write to manifest. */);
+ }
+
+ if (s.ok() && file_meta != nullptr) {
+ *file_meta = meta_;
+ }
+ RecordFlushIOStats();
+
+ // When measure_io_stats_ is true, the default 512 bytes is not enough.
+ auto stream = event_logger_->LogToBuffer(log_buffer_, 1024);
+ stream << "job" << job_context_->job_id << "event"
+ << "flush_finished";
+ stream << "output_compression"
+ << CompressionTypeToString(output_compression_);
+ stream << "lsm_state";
+ stream.StartArray();
+ auto vstorage = cfd_->current()->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ stream << vstorage->NumLevelFiles(level);
+ }
+ stream.EndArray();
+
+ const auto& blob_files = vstorage->GetBlobFiles();
+ if (!blob_files.empty()) {
+ assert(blob_files.front());
+ stream << "blob_file_head" << blob_files.front()->GetBlobFileNumber();
+
+ assert(blob_files.back());
+ stream << "blob_file_tail" << blob_files.back()->GetBlobFileNumber();
+ }
+
+ stream << "immutable_memtables" << cfd_->imm()->NumNotFlushed();
+
+ if (measure_io_stats_) {
+ if (prev_perf_level != PerfLevel::kEnableTime) {
+ SetPerfLevel(prev_perf_level);
+ }
+ stream << "file_write_nanos" << (IOSTATS(write_nanos) - prev_write_nanos);
+ stream << "file_range_sync_nanos"
+ << (IOSTATS(range_sync_nanos) - prev_range_sync_nanos);
+ stream << "file_fsync_nanos" << (IOSTATS(fsync_nanos) - prev_fsync_nanos);
+ stream << "file_prepare_write_nanos"
+ << (IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos);
+ stream << "file_cpu_write_nanos"
+ << (IOSTATS(cpu_write_nanos) - prev_cpu_write_nanos);
+ stream << "file_cpu_read_nanos"
+ << (IOSTATS(cpu_read_nanos) - prev_cpu_read_nanos);
+ }
+
+ return s;
+}
+
+void FlushJob::Cancel() {
+ db_mutex_->AssertHeld();
+ assert(base_ != nullptr);
+ base_->Unref();
+}
+
+Status FlushJob::MemPurge() {
+ Status s;
+ db_mutex_->AssertHeld();
+ db_mutex_->Unlock();
+ assert(!mems_.empty());
+
+ // Measure purging time.
+ const uint64_t start_micros = clock_->NowMicros();
+ const uint64_t start_cpu_micros = clock_->CPUMicros();
+
+ MemTable* new_mem = nullptr;
+ // For performance/log investigation purposes:
+ // look at how much useful payload we harvest in the new_mem.
+ // This value is then printed to the DB log.
+ double new_mem_capacity = 0.0;
+
+ // Create two iterators, one for the memtable data (contains
+ // info from puts + deletes), and one for the memtable
+ // Range Tombstones (from DeleteRanges).
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ std::vector<InternalIterator*> memtables;
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ for (MemTable* m : mems_) {
+ memtables.push_back(m->NewIterator(ro, &arena));
+ auto* range_del_iter = m->NewRangeTombstoneIterator(
+ ro, kMaxSequenceNumber, true /* immutable_memtable */);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+ }
+
+ assert(!memtables.empty());
+ SequenceNumber first_seqno = kMaxSequenceNumber;
+ SequenceNumber earliest_seqno = kMaxSequenceNumber;
+ // Pick first and earliest seqno as min of all first_seqno
+ // and earliest_seqno of the mempurged memtables.
+ for (const auto& mem : mems_) {
+ first_seqno = mem->GetFirstSequenceNumber() < first_seqno
+ ? mem->GetFirstSequenceNumber()
+ : first_seqno;
+ earliest_seqno = mem->GetEarliestSequenceNumber() < earliest_seqno
+ ? mem->GetEarliestSequenceNumber()
+ : earliest_seqno;
+ }
+
+ ScopedArenaIterator iter(
+ NewMergingIterator(&(cfd_->internal_comparator()), memtables.data(),
+ static_cast<int>(memtables.size()), &arena));
+
+ auto* ioptions = cfd_->ioptions();
+
+ // Place iterator at the First (meaning most recent) key node.
+ iter->SeekToFirst();
+
+ const std::string* const full_history_ts_low = &(cfd_->GetFullHistoryTsLow());
+ std::unique_ptr<CompactionRangeDelAggregator> range_del_agg(
+ new CompactionRangeDelAggregator(&(cfd_->internal_comparator()),
+ existing_snapshots_,
+ full_history_ts_low));
+ for (auto& rd_iter : range_del_iters) {
+ range_del_agg->AddTombstones(std::move(rd_iter));
+ }
+
+ // If there is valid data in the memtable,
+ // or at least range tombstones, copy over the info
+ // to the new memtable.
+ if (iter->Valid() || !range_del_agg->IsEmpty()) {
+ // MaxSize is the size of a memtable.
+ size_t maxSize = mutable_cf_options_.write_buffer_size;
+ std::unique_ptr<CompactionFilter> compaction_filter;
+ if (ioptions->compaction_filter_factory != nullptr &&
+ ioptions->compaction_filter_factory->ShouldFilterTableFileCreation(
+ TableFileCreationReason::kFlush)) {
+ CompactionFilter::Context ctx;
+ ctx.is_full_compaction = false;
+ ctx.is_manual_compaction = false;
+ ctx.column_family_id = cfd_->GetID();
+ ctx.reason = TableFileCreationReason::kFlush;
+ compaction_filter =
+ ioptions->compaction_filter_factory->CreateCompactionFilter(ctx);
+ if (compaction_filter != nullptr &&
+ !compaction_filter->IgnoreSnapshots()) {
+ s = Status::NotSupported(
+ "CompactionFilter::IgnoreSnapshots() = false is not supported "
+ "anymore.");
+ return s;
+ }
+ }
+
+ new_mem = new MemTable((cfd_->internal_comparator()), *(cfd_->ioptions()),
+ mutable_cf_options_, cfd_->write_buffer_mgr(),
+ earliest_seqno, cfd_->GetID());
+ assert(new_mem != nullptr);
+
+ Env* env = db_options_.env;
+ assert(env);
+ MergeHelper merge(
+ env, (cfd_->internal_comparator()).user_comparator(),
+ (ioptions->merge_operator).get(), compaction_filter.get(),
+ ioptions->logger, true /* internal key corruption is not ok */,
+ existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+ snapshot_checker_);
+ assert(job_context_);
+ SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence();
+ const std::atomic<bool> kManualCompactionCanceledFalse{false};
+ CompactionIterator c_iter(
+ iter.get(), (cfd_->internal_comparator()).user_comparator(), &merge,
+ kMaxSequenceNumber, &existing_snapshots_,
+ earliest_write_conflict_snapshot_, job_snapshot_seq, snapshot_checker_,
+ env, ShouldReportDetailedTime(env, ioptions->stats),
+ true /* internal key corruption is not ok */, range_del_agg.get(),
+ nullptr, ioptions->allow_data_in_errors,
+ ioptions->enforce_single_del_contracts,
+ /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
+ /*compaction=*/nullptr, compaction_filter.get(),
+ /*shutting_down=*/nullptr, ioptions->info_log, full_history_ts_low);
+
+ // Set earliest sequence number in the new memtable
+ // to be equal to the earliest sequence number of the
+ // memtable being flushed (See later if there is a need
+ // to update this number!).
+ new_mem->SetEarliestSequenceNumber(earliest_seqno);
+ // Likewise for first seq number.
+ new_mem->SetFirstSequenceNumber(first_seqno);
+ SequenceNumber new_first_seqno = kMaxSequenceNumber;
+
+ c_iter.SeekToFirst();
+
+ // Key transfer
+ for (; c_iter.Valid(); c_iter.Next()) {
+ const ParsedInternalKey ikey = c_iter.ikey();
+ const Slice value = c_iter.value();
+ new_first_seqno =
+ ikey.sequence < new_first_seqno ? ikey.sequence : new_first_seqno;
+
+ // Should we update "OldestKeyTime"? Timestamps appear
+ // to still be an "experimental" feature.
+ s = new_mem->Add(
+ ikey.sequence, ikey.type, ikey.user_key, value,
+ nullptr, // KV protection info set as nullptr since it
+ // should only be useful for the first add to
+ // the original memtable.
+ false, // : allow concurrent_memtable_writes_
+ // Not seen as necessary for now.
+ nullptr, // get_post_process_info(m) must be nullptr
+ // when concurrent_memtable_writes is switched off.
+ nullptr); // hint, only used when concurrent_memtable_writes_
+ // is switched on.
+ if (!s.ok()) {
+ break;
+ }
+
+ // If new_mem has size greater than maxSize,
+        // then roll back to a regular flush operation,
+ // and destroy new_mem.
+ if (new_mem->ApproximateMemoryUsage() > maxSize) {
+ s = Status::Aborted("Mempurge filled more than one memtable.");
+ new_mem_capacity = 1.0;
+ break;
+ }
+ }
+
+ // Check status and propagate
+ // potential error status from c_iter
+ if (!s.ok()) {
+ c_iter.status().PermitUncheckedError();
+ } else if (!c_iter.status().ok()) {
+ s = c_iter.status();
+ }
+
+ // Range tombstone transfer.
+ if (s.ok()) {
+ auto range_del_it = range_del_agg->NewIterator();
+ for (range_del_it->SeekToFirst(); range_del_it->Valid();
+ range_del_it->Next()) {
+ auto tombstone = range_del_it->Tombstone();
+ new_first_seqno =
+ tombstone.seq_ < new_first_seqno ? tombstone.seq_ : new_first_seqno;
+ s = new_mem->Add(
+ tombstone.seq_, // Sequence number
+ kTypeRangeDeletion, // KV type
+ tombstone.start_key_, // Key is start key.
+ tombstone.end_key_, // Value is end key.
+ nullptr, // KV protection info set as nullptr since it
+ // should only be useful for the first add to
+ // the original memtable.
+ false, // : allow concurrent_memtable_writes_
+ // Not seen as necessary for now.
+ nullptr, // get_post_process_info(m) must be nullptr
+ // when concurrent_memtable_writes is switched off.
+ nullptr); // hint, only used when concurrent_memtable_writes_
+ // is switched on.
+
+ if (!s.ok()) {
+ break;
+ }
+
+ // If new_mem has size greater than maxSize,
+          // then roll back to a regular flush operation,
+ // and destroy new_mem.
+ if (new_mem->ApproximateMemoryUsage() > maxSize) {
+ s = Status::Aborted(Slice("Mempurge filled more than one memtable."));
+ new_mem_capacity = 1.0;
+ break;
+ }
+ }
+ }
+
+ // If everything happened smoothly and new_mem contains valid data,
+ // decide if it is flushed to storage or kept in the imm()
+ // memtable list (memory).
+ if (s.ok() && (new_first_seqno != kMaxSequenceNumber)) {
+ // Rectify the first sequence number, which (unlike the earliest seq
+ // number) needs to be present in the new memtable.
+ new_mem->SetFirstSequenceNumber(new_first_seqno);
+
+ // The new_mem is added to the list of immutable memtables
+      // only if it is filled to less than 100% capacity and isn't flagged
+ // as in need of being flushed.
+ if (new_mem->ApproximateMemoryUsage() < maxSize &&
+ !(new_mem->ShouldFlushNow())) {
+ // Construct fragmented memtable range tombstones without mutex
+ new_mem->ConstructFragmentedRangeTombstones();
+ db_mutex_->Lock();
+ uint64_t new_mem_id = mems_[0]->GetID();
+
+ new_mem->SetID(new_mem_id);
+ new_mem->SetNextLogNumber(mems_[0]->GetNextLogNumber());
+
+ // This addition will not trigger another flush, because
+ // we do not call SchedulePendingFlush().
+ cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free);
+ new_mem->Ref();
+#ifndef ROCKSDB_LITE
+ // Piggyback FlushJobInfo on the first flushed memtable.
+ db_mutex_->AssertHeld();
+ meta_.fd.file_size = 0;
+ mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
+#endif // !ROCKSDB_LITE
+ db_mutex_->Unlock();
+ } else {
+ s = Status::Aborted(Slice("Mempurge filled more than one memtable."));
+ new_mem_capacity = 1.0;
+ if (new_mem) {
+ job_context_->memtables_to_free.push_back(new_mem);
+ }
+ }
+ } else {
+ // In this case, the newly allocated new_mem is empty.
+ assert(new_mem != nullptr);
+ job_context_->memtables_to_free.push_back(new_mem);
+ }
+ }
+
+ // Reacquire the mutex for WriteLevel0 function.
+ db_mutex_->Lock();
+
+ // If mempurge successful, don't write input tables to level0,
+ // but write any full output table to level0.
+ if (s.ok()) {
+ TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeSuccessful");
+ } else {
+ TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful");
+ }
+ const uint64_t micros = clock_->NowMicros() - start_micros;
+ const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Mempurge lasted %" PRIu64
+ " microseconds, and %" PRIu64
+ " cpu "
+ "microseconds. Status is %s ok. Perc capacity: %f\n",
+ cfd_->GetName().c_str(), job_context_->job_id, micros,
+ cpu_micros, s.ok() ? "" : "not", new_mem_capacity);
+
+ return s;
+}
+
+bool FlushJob::MemPurgeDecider(double threshold) {
+ // Never trigger mempurge if threshold is not a strictly positive value.
+ if (!(threshold > 0.0)) {
+ return false;
+ }
+ if (threshold > (1.0 * mems_.size())) {
+ return true;
+ }
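+  // Illustrative examples of the two early returns above (numbers are
+  // assumed, not taken from any particular workload): with two immutable
+  // memtables, a threshold of 4.0 satisfies threshold > mems_.size(), so
+  // the decider always opts for mempurge without sampling, while any
+  // threshold <= 0.0 disables mempurge entirely.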
+ // Payload and useful_payload (in bytes).
+ // The useful payload ratio of a given MemTable
+ // is estimated to be useful_payload/payload.
+ uint64_t payload = 0, useful_payload = 0, entry_size = 0;
+
+ // Local variables used repetitively inside the for-loop
+ // when iterating over the sampled entries.
+ Slice key_slice, value_slice;
+ ParsedInternalKey res;
+ SnapshotImpl min_snapshot;
+ std::string vget;
+ Status mget_s, parse_s;
+ MergeContext merge_context;
+ SequenceNumber max_covering_tombstone_seq = 0, sqno = 0,
+ min_seqno_snapshot = 0;
+ bool get_res, can_be_useful_payload, not_in_next_mems;
+
+  // If the estimated useful payload, as a fraction of the write buffer
+  // size, is above the threshold, then flush to storage; else MemPurge.
+ double estimated_useful_payload = 0.0;
+ // Cochran formula for determining sample size.
+ // 95% confidence interval, 7% precision.
+ // n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0
+ double n0 = 196.0;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+
+ // Iterate over each memtable of the set.
+ for (auto mem_iter = std::begin(mems_); mem_iter != std::end(mems_);
+ mem_iter++) {
+ MemTable* mt = *mem_iter;
+
+    // Sample entries from this memtable.
+ uint64_t nentries = mt->num_entries();
+ // Corrected Cochran formula for small populations
+ // (converges to n0 for large populations).
+ uint64_t target_sample_size =
+ static_cast<uint64_t>(ceil(n0 / (1.0 + (n0 / nentries))));
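+    // Worked example of the corrected formula (illustrative numbers): with
+    // nentries = 500, target_sample_size = ceil(196 / (1 + 196/500)) =
+    // ceil(140.8) = 141; with nentries = 50000 it is ceil(195.2) = 196,
+    // so the correction only matters for small memtables.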
+ std::unordered_set<const char*> sentries = {};
+ // Populate sample entries set.
+ mt->UniqueRandomSample(target_sample_size, &sentries);
+
+ // Estimate the garbage ratio by comparing if
+ // each sample corresponds to a valid entry.
+ for (const char* ss : sentries) {
+ key_slice = GetLengthPrefixedSlice(ss);
+ parse_s = ParseInternalKey(key_slice, &res, true /*log_err_key*/);
+ if (!parse_s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Memtable Decider: ParseInternalKey did not parse "
+ "key_slice %s successfully.",
+ key_slice.data());
+ }
+
+ // Size of the entry is "key size (+ value size if KV entry)"
+ entry_size = key_slice.size();
+ if (res.type == kTypeValue) {
+ value_slice =
+ GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+ entry_size += value_slice.size();
+ }
+
+ // Count entry bytes as payload.
+ payload += entry_size;
+
+ LookupKey lkey(res.user_key, kMaxSequenceNumber);
+
+ // Paranoia: zero out these values just in case.
+ max_covering_tombstone_seq = 0;
+ sqno = 0;
+
+ // Pick the oldest existing snapshot that is more recent
+ // than the sequence number of the sampled entry.
+ min_seqno_snapshot = kMaxSequenceNumber;
+ for (SequenceNumber seq_num : existing_snapshots_) {
+ if (seq_num > res.sequence && seq_num < min_seqno_snapshot) {
+ min_seqno_snapshot = seq_num;
+ }
+ }
+ min_snapshot.number_ = min_seqno_snapshot;
+ ro.snapshot =
+ min_seqno_snapshot < kMaxSequenceNumber ? &min_snapshot : nullptr;
+
+ // Estimate if the sample entry is valid or not.
+ get_res = mt->Get(lkey, &vget, /*columns=*/nullptr, /*timestamp=*/nullptr,
+ &mget_s, &merge_context, &max_covering_tombstone_seq,
+ &sqno, ro, true /* immutable_memtable */);
+ if (!get_res) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Memtable Get returned false when Get(sampled entry). "
+ "Yet each sample entry should exist somewhere in the memtable, "
+ "unrelated to whether it has been deleted or not.");
+ }
+
+ // TODO(bjlemaire): evaluate typeMerge.
+ // This is where the sampled entry is estimated to be
+ // garbage or not. Note that this is a garbage *estimation*
+ // because we do not include certain items such as
+      // CompactionFilters triggered at flush, or if the same delete
+ // has been inserted twice or more in the memtable.
+
+ // Evaluate if the entry can be useful payload
+ // Situation #1: entry is a KV entry, was found in the memtable mt
+ // and the sequence numbers match.
+ can_be_useful_payload = (res.type == kTypeValue) && get_res &&
+ mget_s.ok() && (sqno == res.sequence);
+
+ // Situation #2: entry is a delete entry, was found in the memtable mt
+      // (because get_res == true) and no valid KV entry is found.
+ // (note: duplicate delete entries are also taken into
+ // account here, because the sequence number 'sqno'
+ // in memtable->Get(&sqno) operation is set to be equal
+ // to the most recent delete entry as well).
+ can_be_useful_payload |=
+ ((res.type == kTypeDeletion) || (res.type == kTypeSingleDeletion)) &&
+ mget_s.IsNotFound() && get_res && (sqno == res.sequence);
+
+ // If there is a chance that the entry is useful payload
+ // Verify that the entry does not appear in the following memtables
+ // (memtables with greater memtable ID/larger sequence numbers).
+ if (can_be_useful_payload) {
+ not_in_next_mems = true;
+ for (auto next_mem_iter = mem_iter + 1;
+ next_mem_iter != std::end(mems_); next_mem_iter++) {
+ if ((*next_mem_iter)
+ ->Get(lkey, &vget, /*columns=*/nullptr, /*timestamp=*/nullptr,
+ &mget_s, &merge_context, &max_covering_tombstone_seq,
+ &sqno, ro, true /* immutable_memtable */)) {
+ not_in_next_mems = false;
+ break;
+ }
+ }
+ if (not_in_next_mems) {
+ useful_payload += entry_size;
+ }
+ }
+ }
+ if (payload > 0) {
+ // We use the estimated useful payload ratio to
+ // evaluate how many of the memtable bytes are useful bytes.
+ estimated_useful_payload +=
+ (mt->ApproximateMemoryUsage()) * (useful_payload * 1.0 / payload);
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Mempurge sampling [CF %s] - found garbage ratio from "
+ "sampling: %f. Threshold is %f\n",
+ cfd_->GetName().c_str(),
+ (payload - useful_payload) * 1.0 / payload, threshold);
+ } else {
+      ROCKS_LOG_WARN(db_options_.info_log,
+                     "Mempurge sampling: null payload measured, and collected "
+                     "sample size is %zu.\n",
+                     sentries.size());
+ }
+ }
+ // We convert the total number of useful payload bytes
+ // into the proportion of memtable necessary to store all these bytes.
+ // We compare this proportion with the threshold value.
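+  // Worked example (assumed numbers): with write_buffer_size = 64 MB, an
+  // estimated_useful_payload of 16 MB and a threshold of 1.0, the ratio is
+  // 0.25 < 1.0, so the decider returns true and a mempurge is attempted.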
+ return ((estimated_useful_payload / mutable_cf_options_.write_buffer_size) <
+ threshold);
+}
+
+Status FlushJob::WriteLevel0Table() {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_FLUSH_WRITE_L0);
+ db_mutex_->AssertHeld();
+ const uint64_t start_micros = clock_->NowMicros();
+ const uint64_t start_cpu_micros = clock_->CPUMicros();
+ Status s;
+
+ SequenceNumber smallest_seqno = mems_.front()->GetEarliestSequenceNumber();
+ if (!db_impl_seqno_time_mapping_.Empty()) {
+    // Make a local copy, as the seqno_time_mapping from db_impl is not
+    // thread safe and will be used while not holding the db_mutex.
+ seqno_to_time_mapping_ = db_impl_seqno_time_mapping_.Copy(smallest_seqno);
+ }
+
+ std::vector<BlobFileAddition> blob_file_additions;
+
+ {
+ auto write_hint = cfd_->CalculateSSTWriteHint(0);
+ Env::IOPriority io_priority = GetRateLimiterPriorityForWrite();
+ db_mutex_->Unlock();
+ if (log_buffer_) {
+ log_buffer_->FlushBufferToLog();
+ }
+ // memtables and range_del_iters store internal iterators over each data
+ // memtable and its associated range deletion memtable, respectively, at
+ // corresponding indexes.
+ std::vector<InternalIterator*> memtables;
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ uint64_t total_num_entries = 0, total_num_deletes = 0;
+ uint64_t total_data_size = 0;
+ size_t total_memory_usage = 0;
+ // Used for testing:
+ uint64_t mems_size = mems_.size();
+ (void)mems_size; // avoids unused variable error when
+ // TEST_SYNC_POINT_CALLBACK not used.
+ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:num_memtables",
+ &mems_size);
+ assert(job_context_);
+ for (MemTable* m : mems_) {
+ ROCKS_LOG_INFO(
+ db_options_.info_log,
+ "[%s] [JOB %d] Flushing memtable with next log file: %" PRIu64 "\n",
+ cfd_->GetName().c_str(), job_context_->job_id, m->GetNextLogNumber());
+ memtables.push_back(m->NewIterator(ro, &arena));
+ auto* range_del_iter = m->NewRangeTombstoneIterator(
+ ro, kMaxSequenceNumber, true /* immutable_memtable */);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+ total_num_entries += m->num_entries();
+ total_num_deletes += m->num_deletes();
+ total_data_size += m->get_data_size();
+ total_memory_usage += m->ApproximateMemoryUsage();
+ }
+
+ event_logger_->Log() << "job" << job_context_->job_id << "event"
+ << "flush_started"
+ << "num_memtables" << mems_.size() << "num_entries"
+ << total_num_entries << "num_deletes"
+ << total_num_deletes << "total_data_size"
+ << total_data_size << "memory_usage"
+ << total_memory_usage << "flush_reason"
+ << GetFlushReasonString(cfd_->GetFlushReason());
+
+ {
+ ScopedArenaIterator iter(
+ NewMergingIterator(&cfd_->internal_comparator(), memtables.data(),
+ static_cast<int>(memtables.size()), &arena));
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": started",
+ cfd_->GetName().c_str(), job_context_->job_id,
+ meta_.fd.GetNumber());
+
+ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
+ &output_compression_);
+ int64_t _current_time = 0;
+ auto status = clock_->GetCurrentTime(&_current_time);
+ // Safe to proceed even if GetCurrentTime fails. So, log and proceed.
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Failed to get current time to populate creation_time property. "
+ "Status: %s",
+ status.ToString().c_str());
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ uint64_t oldest_key_time = mems_.front()->ApproximateOldestKeyTime();
+
+ // It's not clear whether oldest_key_time is always available. In case
+ // it is not available, use current_time.
+ uint64_t oldest_ancester_time = std::min(current_time, oldest_key_time);
+
+ TEST_SYNC_POINT_CALLBACK(
+ "FlushJob::WriteLevel0Table:oldest_ancester_time",
+ &oldest_ancester_time);
+ meta_.oldest_ancester_time = oldest_ancester_time;
+ meta_.file_creation_time = current_time;
+
+ uint64_t num_input_entries = 0;
+ uint64_t memtable_payload_bytes = 0;
+ uint64_t memtable_garbage_bytes = 0;
+ IOStatus io_s;
+
+ const std::string* const full_history_ts_low =
+ (full_history_ts_low_.empty()) ? nullptr : &full_history_ts_low_;
+ TableBuilderOptions tboptions(
+ *cfd_->ioptions(), mutable_cf_options_, cfd_->internal_comparator(),
+ cfd_->int_tbl_prop_collector_factories(), output_compression_,
+ mutable_cf_options_.compression_opts, cfd_->GetID(), cfd_->GetName(),
+ 0 /* level */, false /* is_bottommost */,
+ TableFileCreationReason::kFlush, oldest_key_time, current_time,
+ db_id_, db_session_id_, 0 /* target_file_size */,
+ meta_.fd.GetNumber());
+ const SequenceNumber job_snapshot_seq =
+ job_context_->GetJobSnapshotSequence();
+ s = BuildTable(
+ dbname_, versions_, db_options_, tboptions, file_options_,
+ cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_,
+ &blob_file_additions, existing_snapshots_,
+ earliest_write_conflict_snapshot_, job_snapshot_seq,
+ snapshot_checker_, mutable_cf_options_.paranoid_file_checks,
+ cfd_->internal_stats(), &io_s, io_tracer_,
+ BlobFileCreationReason::kFlush, seqno_to_time_mapping_, event_logger_,
+ job_context_->job_id, io_priority, &table_properties_, write_hint,
+ full_history_ts_low, blob_callback_, &num_input_entries,
+ &memtable_payload_bytes, &memtable_garbage_bytes);
+ // TODO: Cleanup io_status in BuildTable and table builders
+ assert(!s.ok() || io_s.ok());
+ io_s.PermitUncheckedError();
+ if (num_input_entries != total_num_entries && s.ok()) {
+ std::string msg = "Expected " + std::to_string(total_num_entries) +
+ " entries in memtables, but read " +
+ std::to_string(num_input_entries);
+ ROCKS_LOG_WARN(db_options_.info_log, "[%s] [JOB %d] Level-0 flush %s",
+ cfd_->GetName().c_str(), job_context_->job_id,
+ msg.c_str());
+ if (db_options_.flush_verify_memtable_count) {
+ s = Status::Corruption(msg);
+ }
+ }
+ if (tboptions.reason == TableFileCreationReason::kFlush) {
+ TEST_SYNC_POINT("DBImpl::FlushJob:Flush");
+ RecordTick(stats_, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH,
+ memtable_payload_bytes);
+ RecordTick(stats_, MEMTABLE_GARBAGE_BYTES_AT_FLUSH,
+ memtable_garbage_bytes);
+ }
+ LogFlush(db_options_.info_log);
+ }
+ ROCKS_LOG_BUFFER(log_buffer_,
+ "[%s] [JOB %d] Level-0 flush table #%" PRIu64 ": %" PRIu64
+ " bytes %s"
+ "%s",
+ cfd_->GetName().c_str(), job_context_->job_id,
+ meta_.fd.GetNumber(), meta_.fd.GetFileSize(),
+ s.ToString().c_str(),
+ meta_.marked_for_compaction ? " (needs compaction)" : "");
+
+ if (s.ok() && output_file_directory_ != nullptr && sync_output_directory_) {
+ s = output_file_directory_->FsyncWithDirOptions(
+ IOOptions(), nullptr,
+ DirFsyncOptions(DirFsyncOptions::FsyncReason::kNewFileSynced));
+ }
+ TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table", &mems_);
+ db_mutex_->Lock();
+ }
+ base_->Unref();
+
+ // Note that if file_size is zero, the file has been deleted and
+ // should not be added to the manifest.
+ const bool has_output = meta_.fd.GetFileSize() > 0;
+
+ if (s.ok() && has_output) {
+ TEST_SYNC_POINT("DBImpl::FlushJob:SSTFileCreated");
+ // if we have more than 1 background thread, then we cannot
+ // insert files directly into higher levels because some other
+ // threads could be concurrently producing compacted files for
+ // that key range.
+ // Add file to L0
+ edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
+ meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
+ meta_.fd.smallest_seqno, meta_.fd.largest_seqno,
+ meta_.marked_for_compaction, meta_.temperature,
+ meta_.oldest_blob_file_number, meta_.oldest_ancester_time,
+ meta_.file_creation_time, meta_.file_checksum,
+ meta_.file_checksum_func_name, meta_.unique_id);
+
+ edit_->SetBlobFileAdditions(std::move(blob_file_additions));
+ }
+#ifndef ROCKSDB_LITE
+  // Piggyback FlushJobInfo on the first flushed memtable.
+ mems_[0]->SetFlushJobInfo(GetFlushJobInfo());
+#endif // !ROCKSDB_LITE
+
+ // Note that here we treat flush as level 0 compaction in internal stats
+ InternalStats::CompactionStats stats(CompactionReason::kFlush, 1);
+ const uint64_t micros = clock_->NowMicros() - start_micros;
+ const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros;
+ stats.micros = micros;
+ stats.cpu_micros = cpu_micros;
+
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Flush lasted %" PRIu64
+ " microseconds, and %" PRIu64 " cpu microseconds.\n",
+ cfd_->GetName().c_str(), job_context_->job_id, micros,
+ cpu_micros);
+
+ if (has_output) {
+ stats.bytes_written = meta_.fd.GetFileSize();
+ stats.num_output_files = 1;
+ }
+
+ const auto& blobs = edit_->GetBlobFileAdditions();
+ for (const auto& blob : blobs) {
+ stats.bytes_written_blob += blob.GetTotalBlobBytes();
+ }
+
+ stats.num_output_files_blob = static_cast<int>(blobs.size());
+
+ RecordTimeToHistogram(stats_, FLUSH_TIME, stats.micros);
+ cfd_->internal_stats()->AddCompactionStats(0 /* level */, thread_pri_, stats);
+ cfd_->internal_stats()->AddCFStats(
+ InternalStats::BYTES_FLUSHED,
+ stats.bytes_written + stats.bytes_written_blob);
+ RecordFlushIOStats();
+
+ return s;
+}
+
+Env::IOPriority FlushJob::GetRateLimiterPriorityForWrite() {
+ if (versions_ && versions_->GetColumnFamilySet() &&
+ versions_->GetColumnFamilySet()->write_controller()) {
+ WriteController* write_controller =
+ versions_->GetColumnFamilySet()->write_controller();
+ if (write_controller->IsStopped() || write_controller->NeedsDelay()) {
+ return Env::IO_USER;
+ }
+ }
+
+ return Env::IO_HIGH;
+}
+
+#ifndef ROCKSDB_LITE
+std::unique_ptr<FlushJobInfo> FlushJob::GetFlushJobInfo() const {
+ db_mutex_->AssertHeld();
+ std::unique_ptr<FlushJobInfo> info(new FlushJobInfo{});
+ info->cf_id = cfd_->GetID();
+ info->cf_name = cfd_->GetName();
+
+ const uint64_t file_number = meta_.fd.GetNumber();
+ info->file_path =
+ MakeTableFileName(cfd_->ioptions()->cf_paths[0].path, file_number);
+ info->file_number = file_number;
+ info->oldest_blob_file_number = meta_.oldest_blob_file_number;
+ info->thread_id = db_options_.env->GetThreadID();
+ info->job_id = job_context_->job_id;
+ info->smallest_seqno = meta_.fd.smallest_seqno;
+ info->largest_seqno = meta_.fd.largest_seqno;
+ info->table_properties = table_properties_;
+ info->flush_reason = cfd_->GetFlushReason();
+ info->blob_compression_type = mutable_cf_options_.blob_compression_type;
+
+ // Update BlobFilesInfo.
+ for (const auto& blob_file : edit_->GetBlobFileAdditions()) {
+ BlobFileAdditionInfo blob_file_addition_info(
+ BlobFileName(cfd_->ioptions()->cf_paths.front().path,
+ blob_file.GetBlobFileNumber()) /*blob_file_path*/,
+ blob_file.GetBlobFileNumber(), blob_file.GetTotalBlobCount(),
+ blob_file.GetTotalBlobBytes());
+ info->blob_file_addition_infos.emplace_back(
+ std::move(blob_file_addition_info));
+ }
+ return info;
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job.h b/src/rocksdb/db/flush_job.h
new file mode 100644
index 000000000..60c272aec
--- /dev/null
+++ b/src/rocksdb/db/flush_job.h
@@ -0,0 +1,203 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_completion_callback.h"
+#include "db/column_family.h"
+#include "db/flush_scheduler.h"
+#include "db/internal_stats.h"
+#include "db/job_context.h"
+#include "db/log_writer.h"
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable_list.h"
+#include "db/seqno_to_time_mapping.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "db/write_thread.h"
+#include "logging/event_logger.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/listener.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/transaction_log.h"
+#include "table/scoped_arena_iterator.h"
+#include "util/autovector.h"
+#include "util/stop_watch.h"
+#include "util/thread_local.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class MemTable;
+class SnapshotChecker;
+class TableCache;
+class Version;
+class VersionEdit;
+class VersionSet;
+class Arena;
+
+class FlushJob {
+ public:
+ // TODO(icanadi) make effort to reduce number of parameters here
+ // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive
+ FlushJob(const std::string& dbname, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const MutableCFOptions& mutable_cf_options, uint64_t max_memtable_id,
+ const FileOptions& file_options, VersionSet* versions,
+ InstrumentedMutex* db_mutex, std::atomic<bool>* shutting_down,
+ std::vector<SequenceNumber> existing_snapshots,
+ SequenceNumber earliest_write_conflict_snapshot,
+ SnapshotChecker* snapshot_checker, JobContext* job_context,
+ LogBuffer* log_buffer, FSDirectory* db_directory,
+ FSDirectory* output_file_directory,
+ CompressionType output_compression, Statistics* stats,
+ EventLogger* event_logger, bool measure_io_stats,
+ const bool sync_output_directory, const bool write_manifest,
+ Env::Priority thread_pri, const std::shared_ptr<IOTracer>& io_tracer,
+ const SeqnoToTimeMapping& seq_time_mapping,
+ const std::string& db_id = "", const std::string& db_session_id = "",
+ std::string full_history_ts_low = "",
+ BlobFileCompletionCallback* blob_callback = nullptr);
+
+ ~FlushJob();
+
+  // Requires db_mutex to be held.
+ // Once PickMemTable() is called, either Run() or Cancel() has to be called.
+ void PickMemTable();
+ Status Run(LogsWithPrepTracker* prep_tracker = nullptr,
+ FileMetaData* file_meta = nullptr,
+ bool* switched_to_mempurge = nullptr);
+ void Cancel();
+ const autovector<MemTable*>& GetMemTables() const { return mems_; }
+
+#ifndef ROCKSDB_LITE
+ std::list<std::unique_ptr<FlushJobInfo>>* GetCommittedFlushJobsInfo() {
+ return &committed_flush_jobs_info_;
+ }
+#endif // !ROCKSDB_LITE
+
+ private:
+ friend class FlushJobTest_GetRateLimiterPriorityForWrite_Test;
+
+ void ReportStartedFlush();
+ void ReportFlushInputSize(const autovector<MemTable*>& mems);
+ void RecordFlushIOStats();
+ Status WriteLevel0Table();
+
+  // Memtable Garbage Collection algorithm: a MemPurge takes the list
+  // of immutable memtables and filters out (or "purges") their outdated
+  // bytes. The output (the remaining bytes, or "useful payload") is then
+  // transferred into a new memtable. If this new memtable fills up, the
+  // mempurge is aborted and rerouted to a regular flush process. Otherwise,
+  // depending on the heuristics, the new memtable is placed onto the
+  // immutable memtable list. The addition to the imm list will not trigger
+  // a flush operation. The flush of the imm list will instead be triggered
+  // once the mutable memtable is added to the imm list.
+  // This process is typically intended for workloads with heavy overwrites,
+  // where we want to avoid SSD writes (and reads) as much as possible.
+  // "MemPurge" is an experimental feature still at a very early stage
+  // of development. At the moment it is only compatible with the Get, Put
+  // and Delete operations, as well as Iterators and CompactionFilters.
+  // For this early version, "MemPurge" is enabled by setting the
+  // options.experimental_mempurge_threshold value to > 0.0. When this is
+  // the case, ALL automatic flush operations (kWriteBufferManagerFull) will
+  // first go through the MemPurge process. Therefore, we strongly
+  // recommend users not to enable this feature yet, given that the MemPurge
+  // process has not matured.
+ Status MemPurge();
+ bool MemPurgeDecider(double threshold);
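+  // As a rough usage sketch (illustrative; the option name comes from the
+  // comment above, everything else is assumed), a user would opt in with:
+  //   Options options;
+  //   options.experimental_mempurge_threshold = 1.0;  // > 0.0 enables it
+  //   DB* db = nullptr;
+  //   Status s = DB::Open(options, "/path/to/db", &db);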
+ // The rate limiter priority (io_priority) is determined dynamically here.
+ Env::IOPriority GetRateLimiterPriorityForWrite();
+#ifndef ROCKSDB_LITE
+ std::unique_ptr<FlushJobInfo> GetFlushJobInfo() const;
+#endif // !ROCKSDB_LITE
+
+ const std::string& dbname_;
+ const std::string db_id_;
+ const std::string db_session_id_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ const MutableCFOptions& mutable_cf_options_;
+ // A variable storing the largest memtable id to flush in this
+ // flush job. RocksDB uses this variable to select the memtables to flush in
+ // this job. All memtables in this column family with an ID smaller than or
+ // equal to max_memtable_id_ will be selected for flush.
+ uint64_t max_memtable_id_;
+ const FileOptions file_options_;
+ VersionSet* versions_;
+ InstrumentedMutex* db_mutex_;
+ std::atomic<bool>* shutting_down_;
+ std::vector<SequenceNumber> existing_snapshots_;
+ SequenceNumber earliest_write_conflict_snapshot_;
+ SnapshotChecker* snapshot_checker_;
+ JobContext* job_context_;
+ LogBuffer* log_buffer_;
+ FSDirectory* db_directory_;
+ FSDirectory* output_file_directory_;
+ CompressionType output_compression_;
+ Statistics* stats_;
+ EventLogger* event_logger_;
+ TableProperties table_properties_;
+ bool measure_io_stats_;
+ // True if this flush job should call fsync on the output directory. False
+ // otherwise.
+ // Usually sync_output_directory_ is true. A flush job needs to call sync on
+ // the output directory before committing to the MANIFEST.
+ // However, an individual flush job does not have to call sync on the output
+ // directory if it is part of an atomic flush. After all flush jobs in the
+ // atomic flush succeed, call sync once on each distinct output directory.
+ const bool sync_output_directory_;
+ // True if this flush job should write to MANIFEST after successfully
+ // flushing memtables. False otherwise.
+ // Usually write_manifest_ is true. A flush job commits to the MANIFEST after
+ // flushing the memtables.
+ // However, an individual flush job cannot rashly write to the MANIFEST
+ // immediately after it finishes the flush if it is part of an atomic flush.
+ // In this case, only after all flush jobs succeed in flush can RocksDB
+ // commit to the MANIFEST.
+ const bool write_manifest_;
+ // The current flush job can commit flush result of a concurrent flush job.
+ // We collect FlushJobInfo of all jobs committed by current job and fire
+ // OnFlushCompleted for them.
+ std::list<std::unique_ptr<FlushJobInfo>> committed_flush_jobs_info_;
+
+ // Variables below are set by PickMemTable():
+ FileMetaData meta_;
+ autovector<MemTable*> mems_;
+ VersionEdit* edit_;
+ Version* base_;
+ bool pick_memtable_called;
+ Env::Priority thread_pri_;
+
+ const std::shared_ptr<IOTracer> io_tracer_;
+ SystemClock* clock_;
+
+ const std::string full_history_ts_low_;
+ BlobFileCompletionCallback* blob_callback_;
+
+ // reference to the seqno_time_mapping_ in db_impl.h, not safe to read without
+ // db mutex
+ const SeqnoToTimeMapping& db_impl_seqno_time_mapping_;
+ SeqnoToTimeMapping seqno_to_time_mapping_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_job_test.cc b/src/rocksdb/db/flush_job_test.cc
new file mode 100644
index 000000000..f994b4e9b
--- /dev/null
+++ b/src/rocksdb/db/flush_job_test.cc
@@ -0,0 +1,745 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/flush_job.h"
+
+#include <algorithm>
+#include <array>
+#include <map>
+#include <string>
+
+#include "db/blob/blob_index.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(icanadi) Mock out everything else:
+// 1. VersionSet
+// 2. Memtable
+class FlushJobTestBase : public testing::Test {
+ protected:
+ FlushJobTestBase(std::string dbname, const Comparator* ucmp)
+ : env_(Env::Default()),
+ fs_(env_->GetFileSystem()),
+ dbname_(std::move(dbname)),
+ ucmp_(ucmp),
+ options_(),
+ db_options_(options_),
+ column_family_names_({kDefaultColumnFamilyName, "foo", "bar"}),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ shutting_down_(false),
+ mock_table_factory_(new mock::MockTableFactory()) {}
+
+ virtual ~FlushJobTestBase() {
+ if (getenv("KEEP_DB")) {
+ fprintf(stdout, "db is still in %s\n", dbname_.c_str());
+ } else {
+ // destroy versions_ to release all file handles
+ versions_.reset();
+ EXPECT_OK(DestroyDir(env_, dbname_));
+ }
+ }
+
+ void NewDB() {
+ ASSERT_OK(SetIdentityFile(env_, dbname_));
+ VersionEdit new_db;
+
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ autovector<VersionEdit> new_cfs;
+ SequenceNumber last_seq = 1;
+ uint32_t cf_id = 1;
+ for (size_t i = 1; i != column_family_names_.size(); ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(column_family_names_[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ new_cf.SetComparatorName(ucmp_->Name());
+ new_cf.SetLogNumber(0);
+ new_cf.SetNextFile(2);
+ new_cf.SetLastSequence(last_seq++);
+ new_cfs.emplace_back(new_cf);
+ }
+
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(
+ fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
+ nullptr);
+ ASSERT_OK(s);
+
+ {
+ log::Writer log(std::move(file_writer), 0, false);
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = log.AddRecord(record);
+ ASSERT_OK(s);
+
+ for (const auto& e : new_cfs) {
+ record.clear();
+ e.EncodeTo(&record);
+ s = log.AddRecord(record);
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(s);
+ // Make "CURRENT" file that points to the new manifest file.
+ s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+ }
+
+ void SetUp() override {
+ EXPECT_OK(env_->CreateDirIfMissing(dbname_));
+
+ // TODO(icanadi) Remove this once we mock out VersionSet
+ NewDB();
+
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ db_options_.statistics = CreateDBStatistics();
+
+ cf_options_.comparator = ucmp_;
+
+ std::vector<ColumnFamilyDescriptor> column_families;
+ cf_options_.table_factory = mock_table_factory_;
+ for (const auto& cf_name : column_family_names_) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+
+ versions_.reset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ EXPECT_OK(versions_->Recover(column_families, false));
+ }
+
+ Env* env_;
+ std::shared_ptr<FileSystem> fs_;
+ std::string dbname_;
+ const Comparator* const ucmp_;
+ EnvOptions env_options_;
+ Options options_;
+ ImmutableDBOptions db_options_;
+ const std::vector<std::string> column_family_names_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ ColumnFamilyOptions cf_options_;
+ std::unique_ptr<VersionSet> versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+
+ SeqnoToTimeMapping empty_seqno_to_time_mapping_;
+};
+
+class FlushJobTest : public FlushJobTestBase {
+ public:
+ FlushJobTest()
+ : FlushJobTestBase(test::PerThreadDBPath("flush_job_test"),
+ BytewiseComparator()) {}
+};
+
+TEST_F(FlushJobTest, Empty) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+ FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(),
+ db_options_, *cfd->GetLatestMutableCFOptions(),
+ std::numeric_limits<uint64_t>::max() /* memtable_id */,
+ env_options_, versions_.get(), &mutex_, &shutting_down_,
+ {}, kMaxSequenceNumber, snapshot_checker, &job_context,
+ nullptr, nullptr, nullptr, kNoCompression, nullptr,
+ &event_logger, false, true /* sync_output_directory */,
+ true /* write_manifest */, Env::Priority::USER,
+ nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+ {
+ InstrumentedMutexLock l(&mutex_);
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run());
+ }
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, NonEmpty) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ new_mem->Ref();
+ auto inserted_keys = mock::MakeMockFile();
+ // Test data:
+ // seqno [ 1, 2 ... 8998, 8999, 9000, 9001, 9002 ... 9999 ]
+ // key [ 1001, 1002 ... 9998, 9999, 0, 1, 2 ... 999 ]
+ // range-delete "9995" -> "9999" at seqno 10000
+ // blob references with seqnos 10001..10006
+ for (int i = 1; i < 10000; ++i) {
+ std::string key(std::to_string((i + 1000) % 10000));
+ std::string value("value" + key);
+ ASSERT_OK(new_mem->Add(SequenceNumber(i), kTypeValue, key, value,
+ nullptr /* kv_prot_info */));
+ if ((i + 1000) % 10000 < 9995) {
+ InternalKey internal_key(key, SequenceNumber(i), kTypeValue);
+ inserted_keys.push_back({internal_key.Encode().ToString(), value});
+ }
+ }
+
+ {
+ ASSERT_OK(new_mem->Add(SequenceNumber(10000), kTypeRangeDeletion, "9995",
+ "9999a", nullptr /* kv_prot_info */));
+ InternalKey internal_key("9995", SequenceNumber(10000), kTypeRangeDeletion);
+ inserted_keys.push_back({internal_key.Encode().ToString(), "9999a"});
+ }
+
+ // Note: the first two blob references will not be considered when resolving
+ // the oldest blob file referenced (the first one is inlined TTL, while the
+ // second one is TTL and thus points to a TTL blob file).
+ constexpr std::array<uint64_t, 6> blob_file_numbers{
+ {kInvalidBlobFileNumber, 5, 103, 17, 102, 101}};
+ for (size_t i = 0; i < blob_file_numbers.size(); ++i) {
+ std::string key(std::to_string(i + 10001));
+ std::string blob_index;
+ if (i == 0) {
+ BlobIndex::EncodeInlinedTTL(&blob_index, /* expiration */ 1234567890ULL,
+ "foo");
+ } else if (i == 1) {
+ BlobIndex::EncodeBlobTTL(&blob_index, /* expiration */ 1234567890ULL,
+ blob_file_numbers[i], /* offset */ i << 10,
+ /* size */ i << 20, kNoCompression);
+ } else {
+ BlobIndex::EncodeBlob(&blob_index, blob_file_numbers[i],
+ /* offset */ i << 10, /* size */ i << 20,
+ kNoCompression);
+ }
+
+ const SequenceNumber seq(i + 10001);
+ ASSERT_OK(new_mem->Add(seq, kTypeBlobIndex, key, blob_index,
+ nullptr /* kv_prot_info */));
+
+ InternalKey internal_key(key, seq, kTypeBlobIndex);
+ inserted_keys.push_back({internal_key.Encode().ToString(), blob_index});
+ }
+ mock::SortKVVector(&inserted_keys);
+
+ autovector<MemTable*> to_delete;
+ new_mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(new_mem, &to_delete);
+ for (auto& m : to_delete) {
+ delete m;
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+ FlushJob flush_job(
+ dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+
+ HistogramData hist;
+ FileMetaData file_meta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(nullptr, &file_meta));
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+
+ ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("9999a", file_meta.largest.user_key().ToString());
+ ASSERT_EQ(1, file_meta.fd.smallest_seqno);
+ ASSERT_EQ(10006, file_meta.fd.largest_seqno);
+ ASSERT_EQ(17, file_meta.oldest_blob_file_number);
+ mock_table_factory_->AssertSingleFile(inserted_keys);
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemTablesSingleColumnFamily) {
+ const size_t num_mems = 2;
+ const size_t num_mems_to_flush = 1;
+ const size_t num_keys_per_table = 100;
+ JobContext job_context(0);
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+ std::vector<uint64_t> memtable_ids;
+ std::vector<MemTable*> new_mems;
+ for (size_t i = 0; i != num_mems; ++i) {
+ MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ mem->SetID(i);
+ mem->Ref();
+ new_mems.emplace_back(mem);
+ memtable_ids.push_back(mem->GetID());
+
+ for (size_t j = 0; j < num_keys_per_table; ++j) {
+ std::string key(std::to_string(j + i * num_keys_per_table));
+ std::string value("value" + key);
+ ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue,
+ key, value, nullptr /* kv_prot_info */));
+ }
+ }
+
+ autovector<MemTable*> to_delete;
+ for (auto mem : new_mems) {
+ mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(mem, &to_delete);
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+
+ assert(memtable_ids.size() == num_mems);
+ uint64_t smallest_memtable_id = memtable_ids.front();
+ uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+ FlushJob flush_job(
+ dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+ *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+ HistogramData hist;
+ FileMetaData file_meta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(nullptr /* prep_tracker */, &file_meta));
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+
+ ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("99", file_meta.largest.user_key().ToString());
+ ASSERT_EQ(0, file_meta.fd.smallest_seqno);
+ ASSERT_EQ(SequenceNumber(num_mems_to_flush * num_keys_per_table - 1),
+ file_meta.fd.largest_seqno);
+ ASSERT_EQ(kInvalidBlobFileNumber, file_meta.oldest_blob_file_number);
+
+ for (auto m : to_delete) {
+ delete m;
+ }
+ to_delete.clear();
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, FlushMemtablesMultipleColumnFamilies) {
+ autovector<ColumnFamilyData*> all_cfds;
+ for (auto cfd : *versions_->GetColumnFamilySet()) {
+ all_cfds.push_back(cfd);
+ }
+ const std::vector<size_t> num_memtables = {2, 1, 3};
+ assert(num_memtables.size() == column_family_names_.size());
+ const size_t num_keys_per_memtable = 1000;
+ JobContext job_context(0);
+ std::vector<uint64_t> memtable_ids;
+ std::vector<SequenceNumber> smallest_seqs;
+ std::vector<SequenceNumber> largest_seqs;
+ autovector<MemTable*> to_delete;
+ SequenceNumber curr_seqno = 0;
+ size_t k = 0;
+ for (auto cfd : all_cfds) {
+ smallest_seqs.push_back(curr_seqno);
+ for (size_t i = 0; i != num_memtables[k]; ++i) {
+ MemTable* mem = cfd->ConstructNewMemtable(
+ *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+ mem->SetID(i);
+ mem->Ref();
+
+ for (size_t j = 0; j != num_keys_per_memtable; ++j) {
+ std::string key(std::to_string(j + i * num_keys_per_memtable));
+ std::string value("value" + key);
+ ASSERT_OK(mem->Add(curr_seqno++, kTypeValue, key, value,
+ nullptr /* kv_prot_info */));
+ }
+ mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(mem, &to_delete);
+ }
+ largest_seqs.push_back(curr_seqno - 1);
+ memtable_ids.push_back(num_memtables[k++] - 1);
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+ SnapshotChecker* snapshot_checker = nullptr; // not relevant
+ std::vector<std::unique_ptr<FlushJob>> flush_jobs;
+ k = 0;
+ for (auto cfd : all_cfds) {
+ std::vector<SequenceNumber> snapshot_seqs;
+ flush_jobs.emplace_back(new FlushJob(
+ dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+ memtable_ids[k], env_options_, versions_.get(), &mutex_,
+ &shutting_down_, snapshot_seqs, kMaxSequenceNumber, snapshot_checker,
+ &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ false /* sync_output_directory */, false /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/,
+ empty_seqno_to_time_mapping_));
+ k++;
+ }
+ HistogramData hist;
+ std::vector<FileMetaData> file_metas;
+ // Call reserve to avoid auto-resizing
+ file_metas.reserve(flush_jobs.size());
+ mutex_.Lock();
+ for (auto& job : flush_jobs) {
+ job->PickMemTable();
+ }
+ for (auto& job : flush_jobs) {
+ FileMetaData meta;
+ // Run will release and re-acquire mutex
+    ASSERT_OK(job->Run(nullptr /* prep_tracker */, &meta));
+ file_metas.emplace_back(meta);
+ }
+ autovector<FileMetaData*> file_meta_ptrs;
+ for (auto& meta : file_metas) {
+ file_meta_ptrs.push_back(&meta);
+ }
+ autovector<const autovector<MemTable*>*> mems_list;
+ for (size_t i = 0; i != all_cfds.size(); ++i) {
+ const auto& mems = flush_jobs[i]->GetMemTables();
+ mems_list.push_back(&mems);
+ }
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ for (auto cfd : all_cfds) {
+ mutable_cf_options_list.push_back(cfd->GetLatestMutableCFOptions());
+ }
+ autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
+ committed_flush_jobs_info;
+#ifndef ROCKSDB_LITE
+ for (auto& job : flush_jobs) {
+ committed_flush_jobs_info.push_back(job->GetCommittedFlushJobsInfo());
+ }
+#endif  // !ROCKSDB_LITE
+
+ Status s = InstallMemtableAtomicFlushResults(
+ nullptr /* imm_lists */, all_cfds, mutable_cf_options_list, mems_list,
+ versions_.get(), nullptr /* prep_tracker */, &mutex_, file_meta_ptrs,
+ committed_flush_jobs_info, &job_context.memtables_to_free,
+ nullptr /* db_directory */, nullptr /* log_buffer */);
+ ASSERT_OK(s);
+
+ mutex_.Unlock();
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+ k = 0;
+ for (const auto& file_meta : file_metas) {
+ ASSERT_EQ(std::to_string(0), file_meta.smallest.user_key().ToString());
+ ASSERT_EQ("999", file_meta.largest.user_key()
+ .ToString()); // max key by bytewise comparator
+ ASSERT_EQ(smallest_seqs[k], file_meta.fd.smallest_seqno);
+ ASSERT_EQ(largest_seqs[k], file_meta.fd.largest_seqno);
+ // Verify that imm is empty
+ ASSERT_EQ(std::numeric_limits<uint64_t>::max(),
+ all_cfds[k]->imm()->GetEarliestMemTableID());
+ ASSERT_EQ(0, all_cfds[k]->imm()->GetLatestMemTableID());
+ ++k;
+ }
+
+ for (auto m : to_delete) {
+ delete m;
+ }
+ to_delete.clear();
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, Snapshots) {
+ JobContext job_context(0);
+ auto cfd = versions_->GetColumnFamilySet()->GetDefault();
+ auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+
+ std::set<SequenceNumber> snapshots_set;
+ int keys = 10000;
+ int max_inserts_per_keys = 8;
+
+ Random rnd(301);
+ for (int i = 0; i < keys / 2; ++i) {
+ snapshots_set.insert(rnd.Uniform(keys * (max_inserts_per_keys / 2)) + 1);
+ }
+ // set has already removed the duplicate snapshots
+ std::vector<SequenceNumber> snapshots(snapshots_set.begin(),
+ snapshots_set.end());
+
+ new_mem->Ref();
+ SequenceNumber current_seqno = 0;
+ auto inserted_keys = mock::MakeMockFile();
+ for (int i = 1; i < keys; ++i) {
+ std::string key(std::to_string(i));
+ int insertions = rnd.Uniform(max_inserts_per_keys);
+ for (int j = 0; j < insertions; ++j) {
+ std::string value(rnd.HumanReadableString(10));
+ auto seqno = ++current_seqno;
+ ASSERT_OK(new_mem->Add(SequenceNumber(seqno), kTypeValue, key, value,
+ nullptr /* kv_prot_info */));
+ // a key is visible only if:
+ // 1. it's the last one written (j == insertions - 1)
+ // 2. there's a snapshot pointing at it
+ bool visible = (j == insertions - 1) ||
+ (snapshots_set.find(seqno) != snapshots_set.end());
+ if (visible) {
+ InternalKey internal_key(key, seqno, kTypeValue);
+ inserted_keys.push_back({internal_key.Encode().ToString(), value});
+ }
+ }
+ }
+ mock::SortKVVector(&inserted_keys);
+
+ autovector<MemTable*> to_delete;
+ new_mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(new_mem, &to_delete);
+ for (auto& m : to_delete) {
+ delete m;
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+ FlushJob flush_job(
+ dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+ *cfd->GetLatestMutableCFOptions(),
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run());
+ mutex_.Unlock();
+ mock_table_factory_->AssertSingleFile(inserted_keys);
+ HistogramData hist;
+ db_options_.statistics->histogramData(FLUSH_TIME, &hist);
+ ASSERT_GT(hist.average, 0.0);
+ job_context.Clean();
+}
+
+TEST_F(FlushJobTest, GetRateLimiterPriorityForWrite) {
+  // Prepare a FlushJob that flushes MemTables of a single column family.
+ const size_t num_mems = 2;
+ const size_t num_mems_to_flush = 1;
+ const size_t num_keys_per_table = 100;
+ JobContext job_context(0);
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+ std::vector<uint64_t> memtable_ids;
+ std::vector<MemTable*> new_mems;
+ for (size_t i = 0; i != num_mems; ++i) {
+ MemTable* mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ mem->SetID(i);
+ mem->Ref();
+ new_mems.emplace_back(mem);
+ memtable_ids.push_back(mem->GetID());
+
+ for (size_t j = 0; j < num_keys_per_table; ++j) {
+ std::string key(std::to_string(j + i * num_keys_per_table));
+ std::string value("value" + key);
+ ASSERT_OK(mem->Add(SequenceNumber(j + i * num_keys_per_table), kTypeValue,
+ key, value, nullptr /* kv_prot_info */));
+ }
+ }
+
+ autovector<MemTable*> to_delete;
+ for (auto mem : new_mems) {
+ mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(mem, &to_delete);
+ }
+
+ EventLogger event_logger(db_options_.info_log.get());
+  SnapshotChecker* snapshot_checker = nullptr;  // not relevant
+
+ assert(memtable_ids.size() == num_mems);
+ uint64_t smallest_memtable_id = memtable_ids.front();
+ uint64_t flush_memtable_id = smallest_memtable_id + num_mems_to_flush - 1;
+ FlushJob flush_job(
+ dbname_, versions_->GetColumnFamilySet()->GetDefault(), db_options_,
+ *cfd->GetLatestMutableCFOptions(), flush_memtable_id, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, {}, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_);
+
+ // When the state from WriteController is normal.
+ ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_HIGH);
+
+ WriteController* write_controller =
+ flush_job.versions_->GetColumnFamilySet()->write_controller();
+
+ {
+ // When the state from WriteController is Delayed.
+ std::unique_ptr<WriteControllerToken> delay_token =
+ write_controller->GetDelayToken(1000000);
+ ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER);
+ }
+
+ {
+ // When the state from WriteController is Stopped.
+ std::unique_ptr<WriteControllerToken> stop_token =
+ write_controller->GetStopToken();
+ ASSERT_EQ(flush_job.GetRateLimiterPriorityForWrite(), Env::IO_USER);
+ }
+}
+
+class FlushJobTimestampTest : public FlushJobTestBase {
+ public:
+ FlushJobTimestampTest()
+ : FlushJobTestBase(test::PerThreadDBPath("flush_job_ts_gc_test"),
+ test::BytewiseComparatorWithU64TsWrapper()) {}
+
+ void AddKeyValueToMemtable(MemTable* memtable, std::string key, uint64_t ts,
+ SequenceNumber seq, ValueType value_type,
+ Slice value) {
+ std::string key_str(std::move(key));
+ PutFixed64(&key_str, ts);
+ ASSERT_OK(memtable->Add(seq, value_type, key_str, value,
+ nullptr /* kv_prot_info */));
+ }
+
+ protected:
+ static constexpr uint64_t kStartTs = 10;
+ static constexpr SequenceNumber kStartSeq = 0;
+ SequenceNumber curr_seq_{kStartSeq};
+ std::atomic<uint64_t> curr_ts_{kStartTs};
+};
+
+TEST_F(FlushJobTimestampTest, AllKeysExpired) {
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+ autovector<MemTable*> to_delete;
+
+ {
+ MemTable* new_mem = cfd->ConstructNewMemtable(
+ *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+ new_mem->Ref();
+ for (int i = 0; i < 100; ++i) {
+ uint64_t ts = curr_ts_.fetch_add(1);
+ SequenceNumber seq = (curr_seq_++);
+ AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq,
+ ValueType::kTypeValue, "0_value");
+ }
+ uint64_t ts = curr_ts_.fetch_add(1);
+ SequenceNumber seq = (curr_seq_++);
+ AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq,
+ ValueType::kTypeDeletionWithTimestamp, "");
+ new_mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(new_mem, &to_delete);
+ }
+
+ std::vector<SequenceNumber> snapshots;
+ constexpr SnapshotChecker* const snapshot_checker = nullptr;
+ JobContext job_context(0);
+ EventLogger event_logger(db_options_.info_log.get());
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, std::numeric_limits<uint64_t>::max());
+ FlushJob flush_job(
+ dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_,
+ /*db_id=*/"",
+ /*db_session_id=*/"", full_history_ts_low);
+
+ FileMetaData fmeta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta));
+ mutex_.Unlock();
+
+ {
+ std::string key = test::EncodeInt(0);
+ key.append(test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1));
+ InternalKey ikey(key, curr_seq_ - 1, ValueType::kTypeDeletionWithTimestamp);
+ ASSERT_EQ(ikey.Encode(), fmeta.smallest.Encode());
+ ASSERT_EQ(ikey.Encode(), fmeta.largest.Encode());
+ }
+
+ job_context.Clean();
+ ASSERT_TRUE(to_delete.empty());
+}
+
+TEST_F(FlushJobTimestampTest, NoKeyExpired) {
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetDefault();
+ autovector<MemTable*> to_delete;
+
+ {
+ MemTable* new_mem = cfd->ConstructNewMemtable(
+ *cfd->GetLatestMutableCFOptions(), kMaxSequenceNumber);
+ new_mem->Ref();
+ for (int i = 0; i < 100; ++i) {
+ uint64_t ts = curr_ts_.fetch_add(1);
+ SequenceNumber seq = (curr_seq_++);
+ AddKeyValueToMemtable(new_mem, test::EncodeInt(0), ts, seq,
+ ValueType::kTypeValue, "0_value");
+ }
+ new_mem->ConstructFragmentedRangeTombstones();
+ cfd->imm()->Add(new_mem, &to_delete);
+ }
+
+ std::vector<SequenceNumber> snapshots;
+ SnapshotChecker* const snapshot_checker = nullptr;
+ JobContext job_context(0);
+ EventLogger event_logger(db_options_.info_log.get());
+ std::string full_history_ts_low;
+ PutFixed64(&full_history_ts_low, 0);
+ FlushJob flush_job(
+ dbname_, cfd, db_options_, *cfd->GetLatestMutableCFOptions(),
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, env_options_,
+ versions_.get(), &mutex_, &shutting_down_, snapshots, kMaxSequenceNumber,
+ snapshot_checker, &job_context, nullptr, nullptr, nullptr, kNoCompression,
+ db_options_.statistics.get(), &event_logger, true,
+ true /* sync_output_directory */, true /* write_manifest */,
+ Env::Priority::USER, nullptr /*IOTracer*/, empty_seqno_to_time_mapping_,
+ /*db_id=*/"",
+ /*db_session_id=*/"", full_history_ts_low);
+
+ FileMetaData fmeta;
+ mutex_.Lock();
+ flush_job.PickMemTable();
+ ASSERT_OK(flush_job.Run(/*prep_tracker=*/nullptr, &fmeta));
+ mutex_.Unlock();
+
+ {
+ std::string ukey = test::EncodeInt(0);
+ std::string smallest_key =
+ ukey + test::EncodeInt(curr_ts_.load(std::memory_order_relaxed) - 1);
+ std::string largest_key = ukey + test::EncodeInt(kStartTs);
+ InternalKey smallest(smallest_key, curr_seq_ - 1, ValueType::kTypeValue);
+ InternalKey largest(largest_key, kStartSeq, ValueType::kTypeValue);
+ ASSERT_EQ(smallest.Encode(), fmeta.smallest.Encode());
+ ASSERT_EQ(largest.Encode(), fmeta.largest.Encode());
+ }
+ job_context.Clean();
+ ASSERT_TRUE(to_delete.empty());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/flush_scheduler.cc b/src/rocksdb/db/flush_scheduler.cc
new file mode 100644
index 000000000..6f4d3e1a5
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.cc
@@ -0,0 +1,86 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/flush_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+#ifndef NDEBUG
+ {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ assert(checking_set_.count(cfd) == 0);
+ checking_set_.insert(cfd);
+ }
+#endif // NDEBUG
+ cfd->Ref();
+// Suppress false positive clang analyzer warnings.
+#ifndef __clang_analyzer__
+ Node* node = new Node{cfd, head_.load(std::memory_order_relaxed)};
+ while (!head_.compare_exchange_strong(
+ node->next, node, std::memory_order_relaxed, std::memory_order_relaxed)) {
+ // A failing CAS updates the first parameter, so we are already set up for
+ // a retry. TakeNextColumnFamily won't happen until after another
+ // inter-thread synchronization, so we don't even need release
+ // semantics for this CAS.
+ }
+#endif // __clang_analyzer__
+}
+
+ColumnFamilyData* FlushScheduler::TakeNextColumnFamily() {
+ while (true) {
+ if (head_.load(std::memory_order_relaxed) == nullptr) {
+ return nullptr;
+ }
+
+ // dequeue the head
+ Node* node = head_.load(std::memory_order_relaxed);
+ head_.store(node->next, std::memory_order_relaxed);
+ ColumnFamilyData* cfd = node->column_family;
+ delete node;
+
+#ifndef NDEBUG
+ {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ auto iter = checking_set_.find(cfd);
+ assert(iter != checking_set_.end());
+ checking_set_.erase(iter);
+ }
+#endif // NDEBUG
+
+ if (!cfd->IsDropped()) {
+ // success
+ return cfd;
+ }
+
+ // no longer relevant, retry
+ cfd->UnrefAndTryDelete();
+ }
+}
+
+bool FlushScheduler::Empty() {
+ auto rv = head_.load(std::memory_order_relaxed) == nullptr;
+#ifndef NDEBUG
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ // Empty is allowed to be called concurrently with ScheduleWork. It may only
+ // miss the most recently scheduled flushes.
+ assert((rv == checking_set_.empty()) || rv);
+#endif // NDEBUG
+ return rv;
+}
+
+void FlushScheduler::Clear() {
+ ColumnFamilyData* cfd;
+ while ((cfd = TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ }
+ assert(head_.load(std::memory_order_relaxed) == nullptr);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/flush_scheduler.h b/src/rocksdb/db/flush_scheduler.h
new file mode 100644
index 000000000..eb03f3e11
--- /dev/null
+++ b/src/rocksdb/db/flush_scheduler.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <mutex>
+#include <set>
+
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// FlushScheduler keeps track of all column families whose memtables may be
+// full and require flushing. Unless otherwise noted, all methods on
+// FlushScheduler should be called only with the DB mutex held or from a
+// single-threaded recovery context.
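+//
+// Rough usage sketch (illustrative only; FlushMemTable() stands in for
+// whatever the caller does with the returned column family):
+//
+//   FlushScheduler scheduler;
+//   scheduler.ScheduleWork(cfd);  // takes a reference on cfd
+//   while (ColumnFamilyData* picked = scheduler.TakeNextColumnFamily()) {
+//     FlushMemTable(picked);          // hypothetical caller-side flush
+//     picked->UnrefAndTryDelete();    // caller must release the reference
+//   }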
+class FlushScheduler {
+ public:
+ FlushScheduler() : head_(nullptr) {}
+
+ // May be called from multiple threads at once, but not concurrent with
+ // any other method calls on this instance
+ void ScheduleWork(ColumnFamilyData* cfd);
+
+ // Removes and returns Ref()-ed column family. Client needs to Unref().
+ // Filters column families that have been dropped.
+ ColumnFamilyData* TakeNextColumnFamily();
+
+ // This can be called concurrently with ScheduleWork, but it may miss any
+ // flushes scheduled after the last synchronization. This results in less
+ // precise enforcement of memtable sizes but should not matter much.
+ bool Empty();
+
+ void Clear();
+
+ private:
+ struct Node {
+ ColumnFamilyData* column_family;
+ Node* next;
+ };
+
+ std::atomic<Node*> head_;
+#ifndef NDEBUG
+ std::mutex checking_mutex_;
+ std::set<ColumnFamilyData*> checking_set_;
+#endif // NDEBUG
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/forward_iterator.cc b/src/rocksdb/db/forward_iterator.cc
new file mode 100644
index 000000000..3fbc2cf47
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.cc
@@ -0,0 +1,1062 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include "db/forward_iterator.h"
+
+#include <limits>
+#include <string>
+#include <utility>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_iter.h"
+#include "db/dbformat.h"
+#include "db/job_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Usage:
+// ForwardLevelIterator iter;
+// iter.SetFileIndex(file_index);
+// iter.Seek(target); // or iter.SeekToFirst();
+// iter.Next()
+class ForwardLevelIterator : public InternalIterator {
+ public:
+ ForwardLevelIterator(
+ const ColumnFamilyData* const cfd, const ReadOptions& read_options,
+ const std::vector<FileMetaData*>& files,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ bool allow_unprepared_value)
+ : cfd_(cfd),
+ read_options_(read_options),
+ files_(files),
+ valid_(false),
+ file_index_(std::numeric_limits<uint32_t>::max()),
+ file_iter_(nullptr),
+ pinned_iters_mgr_(nullptr),
+ prefix_extractor_(prefix_extractor),
+ allow_unprepared_value_(allow_unprepared_value) {
+ status_.PermitUncheckedError(); // Allow uninitialized status through
+ }
+
+ ~ForwardLevelIterator() override {
+ // Reset current pointer
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(file_iter_);
+ } else {
+ delete file_iter_;
+ }
+ }
+
+ void SetFileIndex(uint32_t file_index) {
+ assert(file_index < files_.size());
+ status_ = Status::OK();
+ if (file_index != file_index_) {
+ file_index_ = file_index;
+ Reset();
+ }
+ }
+ void Reset() {
+ assert(file_index_ < files_.size());
+
+ // Reset current pointer
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(file_iter_);
+ } else {
+ delete file_iter_;
+ }
+
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ file_iter_ = cfd_->table_cache()->NewIterator(
+ read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
+ *files_[file_index_],
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ prefix_extractor_, /*table_reader_ptr=*/nullptr,
+ /*file_read_hist=*/nullptr, TableReaderCaller::kUserIterator,
+ /*arena=*/nullptr, /*skip_filters=*/false, /*level=*/-1,
+ /*max_file_size_for_l0_meta_pin=*/0,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_);
+ file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ valid_ = false;
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ }
+ }
+ void SeekToLast() override {
+ status_ = Status::NotSupported("ForwardLevelIterator::SeekToLast()");
+ valid_ = false;
+ }
+ void Prev() override {
+ status_ = Status::NotSupported("ForwardLevelIterator::Prev()");
+ valid_ = false;
+ }
+ bool Valid() const override { return valid_; }
+ void SeekToFirst() override {
+ assert(file_iter_ != nullptr);
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+ file_iter_->SeekToFirst();
+ valid_ = file_iter_->Valid();
+ }
+ void Seek(const Slice& internal_key) override {
+ assert(file_iter_ != nullptr);
+
+ // This deviates from the usual convention for InternalIterator::Seek() in
+ // that it doesn't discard pre-existing error status. That's because this
+ // Seek() is only supposed to be called immediately after SetFileIndex()
+ // (which discards pre-existing error status), and SetFileIndex() may set
+ // an error status, which we shouldn't discard.
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+
+ file_iter_->Seek(internal_key);
+ valid_ = file_iter_->Valid();
+ }
+ void SeekForPrev(const Slice& /*internal_key*/) override {
+ status_ = Status::NotSupported("ForwardLevelIterator::SeekForPrev()");
+ valid_ = false;
+ }
+ void Next() override {
+ assert(valid_);
+ file_iter_->Next();
+ for (;;) {
+ valid_ = file_iter_->Valid();
+ if (!file_iter_->status().ok()) {
+ assert(!valid_);
+ return;
+ }
+ if (valid_) {
+ return;
+ }
+ if (file_index_ + 1 >= files_.size()) {
+ valid_ = false;
+ return;
+ }
+ SetFileIndex(file_index_ + 1);
+ if (!status_.ok()) {
+ assert(!valid_);
+ return;
+ }
+ file_iter_->SeekToFirst();
+ }
+ }
+ Slice key() const override {
+ assert(valid_);
+ return file_iter_->key();
+ }
+ Slice value() const override {
+ assert(valid_);
+ return file_iter_->value();
+ }
+ Status status() const override {
+ if (!status_.ok()) {
+ return status_;
+ } else if (file_iter_) {
+ return file_iter_->status();
+ }
+ return Status::OK();
+ }
+ bool PrepareValue() override {
+ assert(valid_);
+ if (file_iter_->PrepareValue()) {
+ return true;
+ }
+
+ assert(!file_iter_->Valid());
+ valid_ = false;
+ return false;
+ }
+ bool IsKeyPinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_->IsKeyPinned();
+ }
+ bool IsValuePinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_->IsValuePinned();
+ }
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ if (file_iter_) {
+ file_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ private:
+ const ColumnFamilyData* const cfd_;
+ const ReadOptions& read_options_;
+ const std::vector<FileMetaData*>& files_;
+
+ bool valid_;
+ uint32_t file_index_;
+ Status status_;
+ InternalIterator* file_iter_;
+ PinnedIteratorsManager* pinned_iters_mgr_;
+ // Kept alive by ForwardIterator::sv_->mutable_cf_options
+ const std::shared_ptr<const SliceTransform>& prefix_extractor_;
+ const bool allow_unprepared_value_;
+};
+
+ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+ ColumnFamilyData* cfd,
+ SuperVersion* current_sv,
+ bool allow_unprepared_value)
+ : db_(db),
+ read_options_(read_options),
+ cfd_(cfd),
+ prefix_extractor_(current_sv->mutable_cf_options.prefix_extractor.get()),
+ user_comparator_(cfd->user_comparator()),
+ allow_unprepared_value_(allow_unprepared_value),
+ immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())),
+ sv_(current_sv),
+ mutable_iter_(nullptr),
+ current_(nullptr),
+ valid_(false),
+ status_(Status::OK()),
+ immutable_status_(Status::OK()),
+ has_iter_trimmed_for_upper_bound_(false),
+ current_over_upper_bound_(false),
+ is_prev_set_(false),
+ is_prev_inclusive_(false),
+ pinned_iters_mgr_(nullptr) {
+ if (sv_) {
+ RebuildIterators(false);
+ }
+
+ // immutable_status_ is a local aggregation of the
+ // status of the immutable Iterators.
+ // We have to PermitUncheckedError in case it is never
+ // used, otherwise it will fail ASSERT_STATUS_CHECKED.
+ immutable_status_.PermitUncheckedError();
+}
+
+ForwardIterator::~ForwardIterator() { Cleanup(true); }
+
+void ForwardIterator::SVCleanup(DBImpl* db, SuperVersion* sv,
+ bool background_purge_on_iterator_cleanup) {
+ if (sv->Unref()) {
+ // Job id == 0 means that this is not our background process, but rather
+ // a user thread.
+ JobContext job_context(0);
+ db->mutex_.Lock();
+ sv->Cleanup();
+ db->FindObsoleteFiles(&job_context, false, true);
+ if (background_purge_on_iterator_cleanup) {
+ db->ScheduleBgLogWriterClose(&job_context);
+ db->AddSuperVersionsToFreeQueue(sv);
+ db->SchedulePurge();
+ }
+ db->mutex_.Unlock();
+ if (!background_purge_on_iterator_cleanup) {
+ delete sv;
+ }
+ if (job_context.HaveSomethingToDelete()) {
+ db->PurgeObsoleteFiles(job_context, background_purge_on_iterator_cleanup);
+ }
+ job_context.Clean();
+ }
+}
+
+namespace {
+struct SVCleanupParams {
+ DBImpl* db;
+ SuperVersion* sv;
+ bool background_purge_on_iterator_cleanup;
+};
+} // anonymous namespace
+
+// Used in PinnedIteratorsManager to release pinned SuperVersion
+void ForwardIterator::DeferredSVCleanup(void* arg) {
+ auto d = reinterpret_cast<SVCleanupParams*>(arg);
+ ForwardIterator::SVCleanup(d->db, d->sv,
+ d->background_purge_on_iterator_cleanup);
+ delete d;
+}
+
+void ForwardIterator::SVCleanup() {
+ if (sv_ == nullptr) {
+ return;
+ }
+ bool background_purge =
+ read_options_.background_purge_on_iterator_cleanup ||
+ db_->immutable_db_options().avoid_unnecessary_blocking_io;
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ // pinned_iters_mgr_ tells us to make sure that all visited key-value slices
+ // are alive until pinned_iters_mgr_->ReleasePinnedData() is called.
+ // The slices may point into some memtables owned by sv_, so we need to keep
+ // sv_ referenced until pinned_iters_mgr_ unpins everything.
+ auto p = new SVCleanupParams{db_, sv_, background_purge};
+ pinned_iters_mgr_->PinPtr(p, &ForwardIterator::DeferredSVCleanup);
+ } else {
+ SVCleanup(db_, sv_, background_purge);
+ }
+}
+
+void ForwardIterator::Cleanup(bool release_sv) {
+ if (mutable_iter_ != nullptr) {
+ DeleteIterator(mutable_iter_, true /* is_arena */);
+ }
+
+ for (auto* m : imm_iters_) {
+ DeleteIterator(m, true /* is_arena */);
+ }
+ imm_iters_.clear();
+
+ for (auto* f : l0_iters_) {
+ DeleteIterator(f);
+ }
+ l0_iters_.clear();
+
+ for (auto* l : level_iters_) {
+ DeleteIterator(l);
+ }
+ level_iters_.clear();
+
+ if (release_sv) {
+ SVCleanup();
+ }
+}
+
+bool ForwardIterator::Valid() const {
+ // See UpdateCurrent().
+ return valid_ ? !current_over_upper_bound_ : false;
+}
+
+void ForwardIterator::SeekToFirst() {
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ RenewIterators();
+ } else if (immutable_status_.IsIncomplete()) {
+ ResetIncompleteIterators();
+ }
+ SeekInternal(Slice(), true, false);
+}
+
+bool ForwardIterator::IsOverUpperBound(const Slice& internal_key) const {
+ return !(read_options_.iterate_upper_bound == nullptr ||
+ cfd_->internal_comparator().user_comparator()->Compare(
+ ExtractUserKey(internal_key),
+ *read_options_.iterate_upper_bound) < 0);
+}
+
+void ForwardIterator::Seek(const Slice& internal_key) {
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else if (sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ RenewIterators();
+ } else if (immutable_status_.IsIncomplete()) {
+ ResetIncompleteIterators();
+ }
+
+ SeekInternal(internal_key, false, false);
+ if (read_options_.async_io) {
+ SeekInternal(internal_key, false, true);
+ }
+}
+
+// With async_io enabled, SeekInternal is called twice; the second call has
+// seek_after_async_io set and only performs the seek part, once the
+// asynchronously requested blocks have been retrieved.
+void ForwardIterator::SeekInternal(const Slice& internal_key,
+ bool seek_to_first,
+ bool seek_after_async_io) {
+ assert(mutable_iter_);
+ // mutable
+ if (!seek_after_async_io) {
+ seek_to_first ? mutable_iter_->SeekToFirst()
+ : mutable_iter_->Seek(internal_key);
+ }
+
+ // immutable
+ // TODO(ljin): NeedToSeekImmutable has a negative impact on performance
+ // if the immutable iterators turn out to need frequent seeks. We probably
+ // want to have an option to turn it off.
+ if (seek_to_first || seek_after_async_io ||
+ NeedToSeekImmutable(internal_key)) {
+ if (!seek_after_async_io) {
+ immutable_status_ = Status::OK();
+ if (has_iter_trimmed_for_upper_bound_ &&
+ (
+ // prev_ is not set yet
+ is_prev_set_ == false ||
+ // We are doing SeekToFirst() and internal_key.size() = 0
+ seek_to_first ||
+ // prev_key_ > internal_key
+ cfd_->internal_comparator().InternalKeyComparator::Compare(
+ prev_key_.GetInternalKey(), internal_key) > 0)) {
+ // Some iterators are trimmed. Need to rebuild.
+ RebuildIterators(true);
+ // Already seeked mutable iter, so seek again
+ seek_to_first ? mutable_iter_->SeekToFirst()
+ : mutable_iter_->Seek(internal_key);
+ }
+ {
+ auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator()));
+ immutable_min_heap_.swap(tmp);
+ }
+ for (size_t i = 0; i < imm_iters_.size(); i++) {
+ auto* m = imm_iters_[i];
+ seek_to_first ? m->SeekToFirst() : m->Seek(internal_key);
+ if (!m->status().ok()) {
+ immutable_status_ = m->status();
+ } else if (m->Valid()) {
+ immutable_min_heap_.push(m);
+ }
+ }
+ }
+
+ Slice target_user_key;
+ if (!seek_to_first) {
+ target_user_key = ExtractUserKey(internal_key);
+ }
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ continue;
+ }
+ if (seek_after_async_io) {
+ if (!l0_iters_[i]->status().IsTryAgain()) {
+ continue;
+ }
+ }
+
+ if (seek_to_first) {
+ l0_iters_[i]->SeekToFirst();
+ } else {
+ // If the target key passes over the largest key, we are sure Next()
+ // won't go over this file.
+ if (seek_after_async_io == false &&
+ user_comparator_->Compare(target_user_key,
+ l0[i]->largest.user_key()) > 0) {
+ if (read_options_.iterate_upper_bound != nullptr) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ }
+ continue;
+ }
+ l0_iters_[i]->Seek(internal_key);
+ }
+
+ if (l0_iters_[i]->status().IsTryAgain()) {
+ assert(!seek_after_async_io);
+ continue;
+ } else if (!l0_iters_[i]->status().ok()) {
+ immutable_status_ = l0_iters_[i]->status();
+ } else if (l0_iters_[i]->Valid() &&
+ !IsOverUpperBound(l0_iters_[i]->key())) {
+ immutable_min_heap_.push(l0_iters_[i]);
+ } else {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ const std::vector<FileMetaData*>& level_files =
+ vstorage->LevelFiles(level);
+ if (level_files.empty()) {
+ continue;
+ }
+ if (level_iters_[level - 1] == nullptr) {
+ continue;
+ }
+
+ if (seek_after_async_io) {
+ if (!level_iters_[level - 1]->status().IsTryAgain()) {
+ continue;
+ }
+ }
+ uint32_t f_idx = 0;
+ if (!seek_to_first && !seek_after_async_io) {
+ f_idx = FindFileInRange(level_files, internal_key, 0,
+ static_cast<uint32_t>(level_files.size()));
+ }
+
+ // Seek
+ if (seek_after_async_io || f_idx < level_files.size()) {
+ if (!seek_after_async_io) {
+ level_iters_[level - 1]->SetFileIndex(f_idx);
+ }
+ seek_to_first ? level_iters_[level - 1]->SeekToFirst()
+ : level_iters_[level - 1]->Seek(internal_key);
+
+ if (level_iters_[level - 1]->status().IsTryAgain()) {
+ assert(!seek_after_async_io);
+ continue;
+ } else if (!level_iters_[level - 1]->status().ok()) {
+ immutable_status_ = level_iters_[level - 1]->status();
+ } else if (level_iters_[level - 1]->Valid() &&
+ !IsOverUpperBound(level_iters_[level - 1]->key())) {
+ immutable_min_heap_.push(level_iters_[level - 1]);
+ } else {
+ // Nothing in this level is interesting. Remove.
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(level_iters_[level - 1]);
+ level_iters_[level - 1] = nullptr;
+ }
+ }
+ }
+
+ if (seek_to_first) {
+ is_prev_set_ = false;
+ } else {
+ prev_key_.SetInternalKey(internal_key);
+ is_prev_set_ = true;
+ is_prev_inclusive_ = true;
+ }
+
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Immutable", this);
+ } else if (current_ && current_ != mutable_iter_) {
+ // current_ is one of immutable iterators, push it back to the heap
+ immutable_min_heap_.push(current_);
+ }
+
+ // For async_io, current_ should only be updated when seek_after_async_io is
+ // true (i.e. in the second call).
+ if (seek_to_first || !read_options_.async_io || seek_after_async_io) {
+ UpdateCurrent();
+ }
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::SeekInternal:Return", this);
+}
+
+void ForwardIterator::Next() {
+ assert(valid_);
+ bool update_prev_key = false;
+
+ if (sv_ == nullptr || sv_->version_number != cfd_->GetSuperVersionNumber()) {
+ std::string current_key = key().ToString();
+ Slice old_key(current_key.data(), current_key.size());
+
+ if (sv_ == nullptr) {
+ RebuildIterators(true);
+ } else {
+ RenewIterators();
+ }
+
+ SeekInternal(old_key, false, false);
+ if (read_options_.async_io) {
+ SeekInternal(old_key, false, true);
+ }
+
+ if (!valid_ || key().compare(old_key) != 0) {
+ return;
+ }
+ } else if (current_ != mutable_iter_) {
+ // It is going to advance immutable iterator
+
+ if (is_prev_set_ && prefix_extractor_) {
+ // advance prev_key_ to current_ only if they share the same prefix
+ update_prev_key =
+ prefix_extractor_->Transform(prev_key_.GetUserKey())
+ .compare(prefix_extractor_->Transform(current_->key())) == 0;
+ } else {
+ update_prev_key = true;
+ }
+
+ if (update_prev_key) {
+ prev_key_.SetInternalKey(current_->key());
+ is_prev_set_ = true;
+ is_prev_inclusive_ = false;
+ }
+ }
+
+ current_->Next();
+ if (current_ != mutable_iter_) {
+ if (!current_->status().ok()) {
+ immutable_status_ = current_->status();
+ } else if ((current_->Valid()) && (!IsOverUpperBound(current_->key()))) {
+ immutable_min_heap_.push(current_);
+ } else {
+ if ((current_->Valid()) && (IsOverUpperBound(current_->key()))) {
+ // remove the current iterator
+ DeleteCurrentIter();
+ current_ = nullptr;
+ }
+ if (update_prev_key) {
+ mutable_iter_->Seek(prev_key_.GetInternalKey());
+ }
+ }
+ }
+ UpdateCurrent();
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::Next:Return", this);
+}
+
+Slice ForwardIterator::key() const {
+ assert(valid_);
+ return current_->key();
+}
+
+Slice ForwardIterator::value() const {
+ assert(valid_);
+ return current_->value();
+}
+
+Status ForwardIterator::status() const {
+ if (!status_.ok()) {
+ return status_;
+ } else if (!mutable_iter_->status().ok()) {
+ return mutable_iter_->status();
+ }
+
+ return immutable_status_;
+}
+
+bool ForwardIterator::PrepareValue() {
+ assert(valid_);
+ if (current_->PrepareValue()) {
+ return true;
+ }
+
+ assert(!current_->Valid());
+ assert(!current_->status().ok());
+ assert(current_ != mutable_iter_); // memtable iterator can't fail
+ assert(immutable_status_.ok());
+
+ valid_ = false;
+ immutable_status_ = current_->status();
+ return false;
+}
+
+Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) {
+ assert(prop != nullptr);
+ if (prop_name == "rocksdb.iterator.super-version-number") {
+ *prop = std::to_string(sv_->version_number);
+ return Status::OK();
+ }
+ return Status::InvalidArgument();
+}
+
+void ForwardIterator::SetPinnedItersMgr(
+ PinnedIteratorsManager* pinned_iters_mgr) {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ UpdateChildrenPinnedItersMgr();
+}
+
+void ForwardIterator::UpdateChildrenPinnedItersMgr() {
+ // Set PinnedIteratorsManager for mutable memtable iterator.
+ if (mutable_iter_) {
+ mutable_iter_->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ // Set PinnedIteratorsManager for immutable memtable iterators.
+ for (InternalIterator* child_iter : imm_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ // Set PinnedIteratorsManager for L0 files iterators.
+ for (InternalIterator* child_iter : l0_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+
+ // Set PinnedIteratorsManager for L1+ levels iterators.
+ for (ForwardLevelIterator* child_iter : level_iters_) {
+ if (child_iter) {
+ child_iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+ }
+}
+
+bool ForwardIterator::IsKeyPinned() const {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ current_->IsKeyPinned();
+}
+
+bool ForwardIterator::IsValuePinned() const {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ current_->IsValuePinned();
+}
+
+void ForwardIterator::RebuildIterators(bool refresh_sv) {
+ // Clean up
+ Cleanup(refresh_sv);
+ if (refresh_sv) {
+ // New
+ sv_ = cfd_->GetReferencedSuperVersion(db_);
+ }
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
+ sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+ if (!read_options_.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ sv_->mem->NewRangeTombstoneIterator(
+ read_options_, sv_->current->version_set()->LastSequence(),
+ false /* immutable_memtable */));
+ range_del_agg.AddTombstones(std::move(range_del_iter));
+ // imm->AddRangeTombstoneIterators() always returns Status::OK().
+ Status temp_s = sv_->imm->AddRangeTombstoneIterators(read_options_, &arena_,
+ &range_del_agg);
+ assert(temp_s.ok());
+ }
+ has_iter_trimmed_for_upper_bound_ = false;
+
+ const auto* vstorage = sv_->current->storage_info();
+ const auto& l0_files = vstorage->LevelFiles(0);
+ l0_iters_.reserve(l0_files.size());
+ for (const auto* l0 : l0_files) {
+ if ((read_options_.iterate_upper_bound != nullptr) &&
+ cfd_->internal_comparator().user_comparator()->Compare(
+ l0->smallest.user_key(), *read_options_.iterate_upper_bound) > 0) {
+ // No need to set has_iter_trimmed_for_upper_bound_: this ForwardIterator
+ // will never be interested in files with smallest key above
+ // iterate_upper_bound, since iterate_upper_bound can't be changed.
+ l0_iters_.push_back(nullptr);
+ continue;
+ }
+ l0_iters_.push_back(cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0,
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ sv_->mutable_cf_options.prefix_extractor,
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_));
+ }
+ BuildLevelIterators(vstorage, sv_);
+ current_ = nullptr;
+ is_prev_set_ = false;
+
+ UpdateChildrenPinnedItersMgr();
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ valid_ = false;
+ }
+}
+
+void ForwardIterator::RenewIterators() {
+ SuperVersion* svnew;
+ assert(sv_);
+ svnew = cfd_->GetReferencedSuperVersion(db_);
+
+ if (mutable_iter_ != nullptr) {
+ DeleteIterator(mutable_iter_, true /* is_arena */);
+ }
+ for (auto* m : imm_iters_) {
+ DeleteIterator(m, true /* is_arena */);
+ }
+ imm_iters_.clear();
+
+ mutable_iter_ = svnew->mem->NewIterator(read_options_, &arena_);
+ svnew->imm->AddIterators(read_options_, &imm_iters_, &arena_);
+ ReadRangeDelAggregator range_del_agg(&cfd_->internal_comparator(),
+ kMaxSequenceNumber /* upper_bound */);
+ if (!read_options_.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ svnew->mem->NewRangeTombstoneIterator(
+ read_options_, sv_->current->version_set()->LastSequence(),
+ false /* immutable_memtable */));
+ range_del_agg.AddTombstones(std::move(range_del_iter));
+ // imm->AddRangeTombstoneIterators() always returns Status::OK().
+ Status temp_s = svnew->imm->AddRangeTombstoneIterators(
+ read_options_, &arena_, &range_del_agg);
+ assert(temp_s.ok());
+ }
+
+ const auto* vstorage = sv_->current->storage_info();
+ const auto& l0_files = vstorage->LevelFiles(0);
+ const auto* vstorage_new = svnew->current->storage_info();
+ const auto& l0_files_new = vstorage_new->LevelFiles(0);
+ size_t iold, inew;
+ bool found;
+ std::vector<InternalIterator*> l0_iters_new;
+ l0_iters_new.reserve(l0_files_new.size());
+
+ for (inew = 0; inew < l0_files_new.size(); inew++) {
+ found = false;
+ for (iold = 0; iold < l0_files.size(); iold++) {
+ if (l0_files[iold] == l0_files_new[inew]) {
+ found = true;
+ break;
+ }
+ }
+ if (found) {
+ if (l0_iters_[iold] == nullptr) {
+ l0_iters_new.push_back(nullptr);
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Null", this);
+ } else {
+ l0_iters_new.push_back(l0_iters_[iold]);
+ l0_iters_[iold] = nullptr;
+ TEST_SYNC_POINT_CALLBACK("ForwardIterator::RenewIterators:Copy", this);
+ }
+ continue;
+ }
+ l0_iters_new.push_back(cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+ *l0_files_new[inew],
+ read_options_.ignore_range_deletions ? nullptr : &range_del_agg,
+ svnew->mutable_cf_options.prefix_extractor,
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ MaxFileSizeForL0MetaPin(svnew->mutable_cf_options),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_));
+ }
+
+ for (auto* f : l0_iters_) {
+ DeleteIterator(f);
+ }
+ l0_iters_.clear();
+ l0_iters_ = l0_iters_new;
+
+ for (auto* l : level_iters_) {
+ DeleteIterator(l);
+ }
+ level_iters_.clear();
+ BuildLevelIterators(vstorage_new, svnew);
+ current_ = nullptr;
+ is_prev_set_ = false;
+ SVCleanup();
+ sv_ = svnew;
+
+ UpdateChildrenPinnedItersMgr();
+ if (!range_del_agg.IsEmpty()) {
+ status_ = Status::NotSupported(
+ "Range tombstones unsupported with ForwardIterator");
+ valid_ = false;
+ }
+}
+
+void ForwardIterator::BuildLevelIterators(const VersionStorageInfo* vstorage,
+ SuperVersion* sv) {
+ level_iters_.reserve(vstorage->num_levels() - 1);
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ const auto& level_files = vstorage->LevelFiles(level);
+ if ((level_files.empty()) ||
+ ((read_options_.iterate_upper_bound != nullptr) &&
+ (user_comparator_->Compare(*read_options_.iterate_upper_bound,
+ level_files[0]->smallest.user_key()) <
+ 0))) {
+ level_iters_.push_back(nullptr);
+ if (!level_files.empty()) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ }
+ } else {
+ level_iters_.push_back(new ForwardLevelIterator(
+ cfd_, read_options_, level_files,
+ sv->mutable_cf_options.prefix_extractor, allow_unprepared_value_));
+ }
+ }
+}
+
+void ForwardIterator::ResetIncompleteIterators() {
+ const auto& l0_files = sv_->current->storage_info()->LevelFiles(0);
+ for (size_t i = 0; i < l0_iters_.size(); ++i) {
+ assert(i < l0_files.size());
+ if (!l0_iters_[i] || !l0_iters_[i]->status().IsIncomplete()) {
+ continue;
+ }
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = cfd_->table_cache()->NewIterator(
+ read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
+ *l0_files[i], /*range_del_agg=*/nullptr,
+ sv_->mutable_cf_options.prefix_extractor,
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kUserIterator, /*arena=*/nullptr,
+ /*skip_filters=*/false, /*level=*/-1,
+ MaxFileSizeForL0MetaPin(sv_->mutable_cf_options),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value_);
+ l0_iters_[i]->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ for (auto* level_iter : level_iters_) {
+ if (level_iter && level_iter->status().IsIncomplete()) {
+ level_iter->Reset();
+ }
+ }
+
+ current_ = nullptr;
+ is_prev_set_ = false;
+}
+
+void ForwardIterator::UpdateCurrent() {
+ if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
+ current_ = nullptr;
+ } else if (immutable_min_heap_.empty()) {
+ current_ = mutable_iter_;
+ } else if (!mutable_iter_->Valid()) {
+ current_ = immutable_min_heap_.top();
+ immutable_min_heap_.pop();
+ } else {
+ current_ = immutable_min_heap_.top();
+ assert(current_ != nullptr);
+ assert(current_->Valid());
+ int cmp = cfd_->internal_comparator().InternalKeyComparator::Compare(
+ mutable_iter_->key(), current_->key());
+ assert(cmp != 0);
+ if (cmp > 0) {
+ immutable_min_heap_.pop();
+ } else {
+ current_ = mutable_iter_;
+ }
+ }
+ valid_ = current_ != nullptr && immutable_status_.ok();
+ if (!status_.ok()) {
+ status_ = Status::OK();
+ }
+
+ // Upper bound doesn't apply to the memtable iterator. We want Valid() to
+ // return false when all iterators are over iterate_upper_bound, but can't
+ // just set valid_ to false, as that would effectively disable the tailing
+ // optimization (Seek() would be called on all immutable iterators regardless
+ // of whether the target key is greater than prev_key_).
+ current_over_upper_bound_ = valid_ && IsOverUpperBound(current_->key());
+}
+
+bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
+ // We maintain the interval (prev_key_, immutable_min_heap_.top()->key())
+ // such that there are no records with keys within that range in
+ // immutable_min_heap_. Since immutable structures (SST files and immutable
+ // memtables) can't change in this version, we don't need to do a seek if
+ // 'target' belongs to that interval (immutable_min_heap_.top() is already
+ // at the correct position).
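+ // For example, if a previous Seek("a") left the heap top at "f", a later
+ // Seek("c") falls inside (a, f) and the immutable iterators can be reused
+ // as-is (keys shown here are simplified user keys).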
+
+ if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) {
+ return true;
+ }
+ Slice prev_key = prev_key_.GetInternalKey();
+ if (prefix_extractor_ && prefix_extractor_->Transform(target).compare(
+ prefix_extractor_->Transform(prev_key)) != 0) {
+ return true;
+ }
+ if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+ prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) {
+ return true;
+ }
+
+ if (immutable_min_heap_.empty() && current_ == mutable_iter_) {
+ // Nothing to seek on.
+ return false;
+ }
+ if (cfd_->internal_comparator().InternalKeyComparator::Compare(
+ target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key()
+ : current_->key()) > 0) {
+ return true;
+ }
+ return false;
+}
+
+void ForwardIterator::DeleteCurrentIter() {
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ continue;
+ }
+ if (l0_iters_[i] == current_) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(l0_iters_[i]);
+ l0_iters_[i] = nullptr;
+ return;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ if (level_iters_[level - 1] == nullptr) {
+ continue;
+ }
+ if (level_iters_[level - 1] == current_) {
+ has_iter_trimmed_for_upper_bound_ = true;
+ DeleteIterator(level_iters_[level - 1]);
+ level_iters_[level - 1] = nullptr;
+ }
+ }
+}
+
+bool ForwardIterator::TEST_CheckDeletedIters(int* pdeleted_iters,
+ int* pnum_iters) {
+ bool retval = false;
+ int deleted_iters = 0;
+ int num_iters = 0;
+
+ const VersionStorageInfo* vstorage = sv_->current->storage_info();
+ const std::vector<FileMetaData*>& l0 = vstorage->LevelFiles(0);
+ for (size_t i = 0; i < l0.size(); ++i) {
+ if (!l0_iters_[i]) {
+ retval = true;
+ deleted_iters++;
+ } else {
+ num_iters++;
+ }
+ }
+
+ for (int32_t level = 1; level < vstorage->num_levels(); ++level) {
+ if ((level_iters_[level - 1] == nullptr) &&
+ (!vstorage->LevelFiles(level).empty())) {
+ retval = true;
+ deleted_iters++;
+ } else if (!vstorage->LevelFiles(level).empty()) {
+ num_iters++;
+ }
+ }
+ if ((!retval) && num_iters <= 1) {
+ retval = true;
+ }
+ if (pdeleted_iters) {
+ *pdeleted_iters = deleted_iters;
+ }
+ if (pnum_iters) {
+ *pnum_iters = num_iters;
+ }
+ return retval;
+}
+
+uint32_t ForwardIterator::FindFileInRange(
+ const std::vector<FileMetaData*>& files, const Slice& internal_key,
+ uint32_t left, uint32_t right) {
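+ // Binary search for the first file whose largest key is >= internal_key.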
+ auto cmp = [&](const FileMetaData* f, const Slice& k) -> bool {
+ return cfd_->internal_comparator().InternalKeyComparator::Compare(
+ f->largest.Encode(), k) < 0;
+ };
+ const auto& b = files.begin();
+ return static_cast<uint32_t>(
+ std::lower_bound(b + left, b + right, internal_key, cmp) - b);
+}
+
+void ForwardIterator::DeleteIterator(InternalIterator* iter, bool is_arena) {
+ if (iter == nullptr) {
+ return;
+ }
+
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(iter, is_arena);
+ } else {
+ if (is_arena) {
+ iter->~InternalIterator();
+ } else {
+ delete iter;
+ }
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator.h b/src/rocksdb/db/forward_iterator.h
new file mode 100644
index 000000000..5a5c6f0f3
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include "rocksdb/comparator.h"
+#ifndef ROCKSDB_LITE
+
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "memory/arena.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBImpl;
+class Env;
+struct SuperVersion;
+class ColumnFamilyData;
+class ForwardLevelIterator;
+class VersionStorageInfo;
+struct FileMetaData;
+
+class MinIterComparator {
+ public:
+ explicit MinIterComparator(const CompareInterface* comparator)
+ : comparator_(comparator) {}
+
+ bool operator()(InternalIterator* a, InternalIterator* b) {
+ return comparator_->Compare(a->key(), b->key()) > 0;
+ }
+
+ private:
+ const CompareInterface* comparator_;
+};
+
+using MinIterHeap =
+ std::priority_queue<InternalIterator*, std::vector<InternalIterator*>,
+ MinIterComparator>;
+
+/**
+ * ForwardIterator is a special type of iterator that only supports Seek()
+ * and Next(). It is expected to perform better than TailingIterator by
+ * removing the encapsulation and making all information accessible within
+ * the iterator. In the current implementation, a snapshot is taken at the
+ * time Seek() is called; subsequent Next() calls do not see values written
+ * after that point.
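+ *
+ * Rough usage sketch (illustrative only; ForwardIterator normally backs
+ * tailing iterators created through the public NewIterator() API):
+ *
+ *   ReadOptions ro;
+ *   ro.tailing = true;
+ *   std::unique_ptr<Iterator> it(db->NewIterator(ro));
+ *   for (it->Seek(start_key); it->Valid(); it->Next()) {
+ *     // consume it->key() and it->value()
+ *   }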
+ */
+class ForwardIterator : public InternalIterator {
+ public:
+ ForwardIterator(DBImpl* db, const ReadOptions& read_options,
+ ColumnFamilyData* cfd, SuperVersion* current_sv = nullptr,
+ bool allow_unprepared_value = false);
+ virtual ~ForwardIterator();
+
+ void SeekForPrev(const Slice& /*target*/) override {
+ status_ = Status::NotSupported("ForwardIterator::SeekForPrev()");
+ valid_ = false;
+ }
+ void SeekToLast() override {
+ status_ = Status::NotSupported("ForwardIterator::SeekToLast()");
+ valid_ = false;
+ }
+ void Prev() override {
+ status_ = Status::NotSupported("ForwardIterator::Prev");
+ valid_ = false;
+ }
+
+ virtual bool Valid() const override;
+ void SeekToFirst() override;
+ virtual void Seek(const Slice& target) override;
+ virtual void Next() override;
+ virtual Slice key() const override;
+ virtual Slice value() const override;
+ virtual Status status() const override;
+ virtual bool PrepareValue() override;
+ virtual Status GetProperty(std::string prop_name, std::string* prop) override;
+ virtual void SetPinnedItersMgr(
+ PinnedIteratorsManager* pinned_iters_mgr) override;
+ virtual bool IsKeyPinned() const override;
+ virtual bool IsValuePinned() const override;
+
+ bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters);
+
+ private:
+ void Cleanup(bool release_sv);
+ // Unreference and, if needed, clean up the current SuperVersion. This is
+ // either done immediately or deferred until this iterator is unpinned by
+ // PinnedIteratorsManager.
+ void SVCleanup();
+ static void SVCleanup(DBImpl* db, SuperVersion* sv,
+ bool background_purge_on_iterator_cleanup);
+ static void DeferredSVCleanup(void* arg);
+
+ void RebuildIterators(bool refresh_sv);
+ void RenewIterators();
+ void BuildLevelIterators(const VersionStorageInfo* vstorage,
+ SuperVersion* sv);
+ void ResetIncompleteIterators();
+ void SeekInternal(const Slice& internal_key, bool seek_to_first,
+ bool seek_after_async_io);
+
+ void UpdateCurrent();
+ bool NeedToSeekImmutable(const Slice& internal_key);
+ void DeleteCurrentIter();
+ uint32_t FindFileInRange(const std::vector<FileMetaData*>& files,
+ const Slice& internal_key, uint32_t left,
+ uint32_t right);
+
+ bool IsOverUpperBound(const Slice& internal_key) const;
+
+ // Set PinnedIteratorsManager for all children Iterators, this function should
+ // be called whenever we update children Iterators or pinned_iters_mgr_.
+ void UpdateChildrenPinnedItersMgr();
+
+ // A helper function that will release iter in the proper manner, or pass it
+ // to pinned_iters_mgr_ to release it later if pinning is enabled.
+ void DeleteIterator(InternalIterator* iter, bool is_arena = false);
+
+ DBImpl* const db_;
+ const ReadOptions read_options_;
+ ColumnFamilyData* const cfd_;
+ const SliceTransform* const prefix_extractor_;
+ const Comparator* user_comparator_;
+ const bool allow_unprepared_value_;
+ MinIterHeap immutable_min_heap_;
+
+ SuperVersion* sv_;
+ InternalIterator* mutable_iter_;
+ std::vector<InternalIterator*> imm_iters_;
+ std::vector<InternalIterator*> l0_iters_;
+ std::vector<ForwardLevelIterator*> level_iters_;
+ InternalIterator* current_;
+ bool valid_;
+
+ // Internal iterator status; set only by one of the unsupported methods.
+ Status status_;
+ // Status of immutable iterators, maintained here to avoid iterating over
+ // all of them in status().
+ Status immutable_status_;
+ // Indicates that at least one of the immutable iterators pointed to a key
+ // larger than iterate_upper_bound and was therefore destroyed. Seek() may
+ // need to rebuild such iterators.
+ bool has_iter_trimmed_for_upper_bound_;
+ // Is current key larger than iterate_upper_bound? If so, makes Valid()
+ // return false.
+ bool current_over_upper_bound_;
+
+ // Left endpoint of the range of keys that immutable iterators currently
+ // cover. When Seek() is called with a key that's within that range, immutable
+ // iterators don't need to be moved; see NeedToSeekImmutable(). This key is
+ // included in the range after a Seek(), but excluded when advancing the
+ // iterator using Next().
+ IterKey prev_key_;
+ bool is_prev_set_;
+ bool is_prev_inclusive_;
+
+ PinnedIteratorsManager* pinned_iters_mgr_;
+ Arena arena_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/forward_iterator_bench.cc b/src/rocksdb/db/forward_iterator_bench.cc
new file mode 100644
index 000000000..325661cef
--- /dev/null
+++ b/src/rocksdb/db/forward_iterator_bench.cc
@@ -0,0 +1,378 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#if !defined(GFLAGS) || defined(ROCKSDB_LITE)
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+ return 1;
+}
+#elif defined(OS_MACOSX) || defined(OS_WIN)
+// Block forward_iterator_bench under MAC and Windows
+int main() { return 0; }
+#else
+#include <semaphore.h>
+
+#include <atomic>
+#include <bitset>
+#include <chrono>
+#include <climits>
+#include <condition_variable>
+#include <limits>
+#include <mutex>
+#include <queue>
+#include <random>
+#include <thread>
+
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "util/gflags_compat.h"
+
+const int MAX_SHARDS = 100000;
+
+DEFINE_int32(writers, 8, "");
+DEFINE_int32(readers, 8, "");
+DEFINE_int64(rate, 100000, "");
+DEFINE_int64(value_size, 300, "");
+DEFINE_int64(shards, 1000, "");
+DEFINE_int64(memtable_size, 500000000, "");
+DEFINE_int64(block_cache_size, 300000000, "");
+DEFINE_int64(block_size, 65536, "");
+DEFINE_double(runtime, 300.0, "");
+DEFINE_bool(cache_only_first, true, "");
+DEFINE_bool(iterate_upper_bound, true, "");
+
+struct Stats {
+ char pad1[128] __attribute__((__unused__));
+ std::atomic<uint64_t> written{0};
+ char pad2[128] __attribute__((__unused__));
+ std::atomic<uint64_t> read{0};
+ std::atomic<uint64_t> cache_misses{0};
+ char pad3[128] __attribute__((__unused__));
+} stats;
+
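+// Shard and sequence number are stored big-endian so that the default
+// bytewise comparator orders keys numerically by (shard, seqno).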
+struct Key {
+ Key() {}
+ Key(uint64_t shard_in, uint64_t seqno_in)
+ : shard_be(htobe64(shard_in)), seqno_be(htobe64(seqno_in)) {}
+
+ uint64_t shard() const { return be64toh(shard_be); }
+ uint64_t seqno() const { return be64toh(seqno_be); }
+
+ private:
+ uint64_t shard_be;
+ uint64_t seqno_be;
+} __attribute__((__packed__));
+
+struct Reader;
+struct Writer;
+
+struct ShardState {
+ char pad1[128] __attribute__((__unused__));
+ std::atomic<uint64_t> last_written{0};
+ Writer* writer;
+ Reader* reader;
+ char pad2[128] __attribute__((__unused__));
+ std::atomic<uint64_t> last_read{0};
+ std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it;
+ std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it_cacheonly;
+ Key upper_bound;
+ ROCKSDB_NAMESPACE::Slice upper_bound_slice;
+ char pad3[128] __attribute__((__unused__));
+};
+
+struct Reader {
+ public:
+ explicit Reader(std::vector<ShardState>* shard_states,
+ ROCKSDB_NAMESPACE::DB* db)
+ : shard_states_(shard_states), db_(db) {
+ sem_init(&sem_, 0, 0);
+ thread_ = port::Thread(&Reader::run, this);
+ }
+
+ void run() {
+ while (1) {
+ sem_wait(&sem_);
+ if (done_.load()) {
+ break;
+ }
+
+ uint64_t shard;
+ {
+ std::lock_guard<std::mutex> guard(queue_mutex_);
+ assert(!shards_pending_queue_.empty());
+ shard = shards_pending_queue_.front();
+ shards_pending_queue_.pop();
+ shards_pending_set_.reset(shard);
+ }
+ readOnceFromShard(shard);
+ }
+ }
+
+ void readOnceFromShard(uint64_t shard) {
+ ShardState& state = (*shard_states_)[shard];
+ if (!state.it) {
+ // Initialize iterators
+ ROCKSDB_NAMESPACE::ReadOptions options;
+ options.tailing = true;
+ if (FLAGS_iterate_upper_bound) {
+ state.upper_bound = Key(shard, std::numeric_limits<uint64_t>::max());
+ state.upper_bound_slice = ROCKSDB_NAMESPACE::Slice(
+ (const char*)&state.upper_bound, sizeof(state.upper_bound));
+ options.iterate_upper_bound = &state.upper_bound_slice;
+ }
+
+ state.it.reset(db_->NewIterator(options));
+
+ if (FLAGS_cache_only_first) {
+ options.read_tier = ROCKSDB_NAMESPACE::ReadTier::kBlockCacheTier;
+ state.it_cacheonly.reset(db_->NewIterator(options));
+ }
+ }
+
+ const uint64_t upto = state.last_written.load();
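+ // Try the block-cache-only iterator first (when enabled); a cache miss
+ // surfaces as an Incomplete status, after which we fall back to the
+ // regular iterator below.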
+ for (ROCKSDB_NAMESPACE::Iterator* it :
+ {state.it_cacheonly.get(), state.it.get()}) {
+ if (it == nullptr) {
+ continue;
+ }
+ if (state.last_read.load() >= upto) {
+ break;
+ }
+ bool need_seek = true;
+ for (uint64_t seq = state.last_read.load() + 1; seq <= upto; ++seq) {
+ if (need_seek) {
+ Key from(shard, state.last_read.load() + 1);
+ it->Seek(ROCKSDB_NAMESPACE::Slice((const char*)&from, sizeof(from)));
+ need_seek = false;
+ } else {
+ it->Next();
+ }
+ if (it->status().IsIncomplete()) {
+ ++::stats.cache_misses;
+ break;
+ }
+ assert(it->Valid());
+ assert(it->key().size() == sizeof(Key));
+ Key key;
+ memcpy(&key, it->key().data(), it->key().size());
+ // fprintf(stderr, "Expecting (%ld, %ld) read (%ld, %ld)\n",
+ // shard, seq, key.shard(), key.seqno());
+ assert(key.shard() == shard);
+ assert(key.seqno() == seq);
+ state.last_read.store(seq);
+ ++::stats.read;
+ }
+ }
+ }
+
+ void onWrite(uint64_t shard) {
+ {
+ std::lock_guard<std::mutex> guard(queue_mutex_);
+ if (!shards_pending_set_.test(shard)) {
+ shards_pending_queue_.push(shard);
+ shards_pending_set_.set(shard);
+ sem_post(&sem_);
+ }
+ }
+ }
+
+ ~Reader() {
+ done_.store(true);
+ sem_post(&sem_);
+ thread_.join();
+ }
+
+ private:
+ char pad1[128] __attribute__((__unused__));
+ std::vector<ShardState>* shard_states_;
+ ROCKSDB_NAMESPACE::DB* db_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ sem_t sem_;
+ std::mutex queue_mutex_;
+ std::bitset<MAX_SHARDS + 1> shards_pending_set_;
+ std::queue<uint64_t> shards_pending_queue_;
+ std::atomic<bool> done_{false};
+ char pad2[128] __attribute__((__unused__));
+};
+
+struct Writer {
+ explicit Writer(std::vector<ShardState>* shard_states,
+ ROCKSDB_NAMESPACE::DB* db)
+ : shard_states_(shard_states), db_(db) {}
+
+ void start() { thread_ = port::Thread(&Writer::run, this); }
+
+ void run() {
+ std::queue<std::chrono::steady_clock::time_point> workq;
+ std::chrono::steady_clock::time_point deadline(
+ std::chrono::steady_clock::now() +
+ std::chrono::nanoseconds((uint64_t)(1000000000 * FLAGS_runtime)));
+ std::vector<uint64_t> my_shards;
+ for (int i = 1; i <= FLAGS_shards; ++i) {
+ if ((*shard_states_)[i].writer == this) {
+ my_shards.push_back(i);
+ }
+ }
+
+ std::mt19937 rng{std::random_device()()};
+ std::uniform_int_distribution<int> shard_dist(
+ 0, static_cast<int>(my_shards.size()) - 1);
+ std::string value(FLAGS_value_size, '*');
+
+ while (1) {
+ auto now = std::chrono::steady_clock::now();
+ if (FLAGS_runtime >= 0 && now >= deadline) {
+ break;
+ }
+ if (workq.empty()) {
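+ // Refill the queue with evenly spaced deadlines so this writer issues
+ // roughly FLAGS_rate / FLAGS_writers puts per second.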
+ for (int i = 0; i < FLAGS_rate; i += FLAGS_writers) {
+ std::chrono::nanoseconds offset(1000000000LL * i / FLAGS_rate);
+ workq.push(now + offset);
+ }
+ }
+ while (!workq.empty() && workq.front() < now) {
+ workq.pop();
+ uint64_t shard = my_shards[shard_dist(rng)];
+ ShardState& state = (*shard_states_)[shard];
+ uint64_t seqno = state.last_written.load() + 1;
+ Key key(shard, seqno);
+ // fprintf(stderr, "Writing (%ld, %ld)\n", shard, seqno);
+ ROCKSDB_NAMESPACE::Status status =
+ db_->Put(ROCKSDB_NAMESPACE::WriteOptions(),
+ ROCKSDB_NAMESPACE::Slice((const char*)&key, sizeof(key)),
+ ROCKSDB_NAMESPACE::Slice(value));
+ assert(status.ok());
+ state.last_written.store(seqno);
+ state.reader->onWrite(shard);
+ ++::stats.written;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ }
+ // fprintf(stderr, "Writer done\n");
+ }
+
+ ~Writer() { thread_.join(); }
+
+ private:
+ char pad1[128] __attribute__((__unused__));
+ std::vector<ShardState>* shard_states_;
+ ROCKSDB_NAMESPACE::DB* db_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ char pad2[128] __attribute__((__unused__));
+};
+
+struct StatsThread {
+ explicit StatsThread(ROCKSDB_NAMESPACE::DB* db)
+ : db_(db), thread_(&StatsThread::run, this) {}
+
+ void run() {
+ auto tstart = std::chrono::steady_clock::now(), tlast = tstart;
+ uint64_t wlast = 0, rlast = 0;
+ while (!done_.load()) {
+ {
+ std::unique_lock<std::mutex> lock(cvm_);
+ cv_.wait_for(lock, std::chrono::seconds(1));
+ }
+ auto now = std::chrono::steady_clock::now();
+ double elapsed =
+ std::chrono::duration_cast<std::chrono::duration<double> >(now -
+ tlast)
+ .count();
+ uint64_t w = ::stats.written.load();
+ uint64_t r = ::stats.read.load();
+ fprintf(stderr,
+ "%s elapsed %4lds | written %10ld | w/s %10.0f | read %10ld | "
+ "r/s %10.0f | cache misses %10ld\n",
+ db_->GetEnv()->TimeToString(time(nullptr)).c_str(),
+ std::chrono::duration_cast<std::chrono::seconds>(now - tstart)
+ .count(),
+ w, (w - wlast) / elapsed, r, (r - rlast) / elapsed,
+ ::stats.cache_misses.load());
+ wlast = w;
+ rlast = r;
+ tlast = now;
+ }
+ }
+
+ ~StatsThread() {
+ {
+ std::lock_guard<std::mutex> guard(cvm_);
+ done_.store(true);
+ }
+ cv_.notify_all();
+ thread_.join();
+ }
+
+ private:
+ ROCKSDB_NAMESPACE::DB* db_;
+ std::mutex cvm_;
+ std::condition_variable cv_;
+ ROCKSDB_NAMESPACE::port::Thread thread_;
+ std::atomic<bool> done_{false};
+};
+
+int main(int argc, char** argv) {
+ GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true);
+
+ std::mt19937 rng{std::random_device()()};
+ ROCKSDB_NAMESPACE::Status status;
+ std::string path =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("forward_iterator_test");
+ fprintf(stderr, "db path is %s\n", path.c_str());
+ ROCKSDB_NAMESPACE::Options options;
+ options.create_if_missing = true;
+ options.compression = ROCKSDB_NAMESPACE::CompressionType::kNoCompression;
+ options.compaction_style =
+ ROCKSDB_NAMESPACE::CompactionStyle::kCompactionStyleNone;
+ options.level0_slowdown_writes_trigger = 99999;
+ options.level0_stop_writes_trigger = 99999;
+ options.use_direct_io_for_flush_and_compaction = true;
+ options.write_buffer_size = FLAGS_memtable_size;
+ ROCKSDB_NAMESPACE::BlockBasedTableOptions table_options;
+ table_options.block_cache =
+ ROCKSDB_NAMESPACE::NewLRUCache(FLAGS_block_cache_size);
+ table_options.block_size = FLAGS_block_size;
+ options.table_factory.reset(
+ ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options));
+
+ status = ROCKSDB_NAMESPACE::DestroyDB(path, options);
+ assert(status.ok());
+ ROCKSDB_NAMESPACE::DB* db_raw;
+ status = ROCKSDB_NAMESPACE::DB::Open(options, path, &db_raw);
+ assert(status.ok());
+ std::unique_ptr<ROCKSDB_NAMESPACE::DB> db(db_raw);
+
+ std::vector<ShardState> shard_states(FLAGS_shards + 1);
+ std::deque<Reader> readers;
+ while (static_cast<int>(readers.size()) < FLAGS_readers) {
+ readers.emplace_back(&shard_states, db_raw);
+ }
+ std::deque<Writer> writers;
+ while (static_cast<int>(writers.size()) < FLAGS_writers) {
+ writers.emplace_back(&shard_states, db_raw);
+ }
+
+ // Each shard gets a random reader and random writer assigned to it
+ for (int i = 1; i <= FLAGS_shards; ++i) {
+ std::uniform_int_distribution<int> reader_dist(0, FLAGS_readers - 1);
+ std::uniform_int_distribution<int> writer_dist(0, FLAGS_writers - 1);
+ shard_states[i].reader = &readers[reader_dist(rng)];
+ shard_states[i].writer = &writers[writer_dist(rng)];
+ }
+
+ StatsThread stats_thread(db_raw);
+ for (Writer& w : writers) {
+ w.start();
+ }
+
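+ // Clearing the deques joins the writer threads first, then the reader
+ // threads; StatsThread keeps reporting until it goes out of scope.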
+ writers.clear();
+ readers.clear();
+}
+#endif // !defined(GFLAGS) || defined(ROCKSDB_LITE)
diff --git a/src/rocksdb/db/history_trimming_iterator.h b/src/rocksdb/db/history_trimming_iterator.h
new file mode 100644
index 000000000..b445ced33
--- /dev/null
+++ b/src/rocksdb/db/history_trimming_iterator.h
@@ -0,0 +1,91 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/slice.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
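+// Wraps an InternalIterator and skips entries whose user-defined timestamp is
+// newer than `ts`, so only versions with timestamp <= ts are exposed.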
+class HistoryTrimmingIterator : public InternalIterator {
+ public:
+ explicit HistoryTrimmingIterator(InternalIterator* input,
+ const Comparator* cmp, const std::string& ts)
+ : input_(input), filter_ts_(ts), cmp_(cmp) {
+ assert(cmp_->timestamp_size() > 0 && !ts.empty());
+ }
+
+ bool filter() const {
+ if (!input_->Valid()) {
+ return true;
+ }
+ Slice current_ts = ExtractTimestampFromKey(key(), cmp_->timestamp_size());
+ return cmp_->CompareTimestamp(current_ts, Slice(filter_ts_)) <= 0;
+ }
+
+ bool Valid() const override { return input_->Valid(); }
+
+ void SeekToFirst() override {
+ input_->SeekToFirst();
+ while (!filter()) {
+ input_->Next();
+ }
+ }
+
+ void SeekToLast() override {
+ input_->SeekToLast();
+ while (!filter()) {
+ input_->Prev();
+ }
+ }
+
+ void Seek(const Slice& target) override {
+ input_->Seek(target);
+ while (!filter()) {
+ input_->Next();
+ }
+ }
+
+ void SeekForPrev(const Slice& target) override {
+ input_->SeekForPrev(target);
+ while (!filter()) {
+ input_->Prev();
+ }
+ }
+
+ void Next() override {
+ do {
+ input_->Next();
+ } while (!filter());
+ }
+
+ void Prev() override {
+ do {
+ input_->Prev();
+ } while (!filter());
+ }
+
+ Slice key() const override { return input_->key(); }
+
+ Slice value() const override { return input_->value(); }
+
+ Status status() const override { return input_->status(); }
+
+ bool IsKeyPinned() const override { return input_->IsKeyPinned(); }
+
+ bool IsValuePinned() const override { return input_->IsValuePinned(); }
+
+ private:
+ InternalIterator* input_;
+ const std::string filter_ts_;
+ const Comparator* const cmp_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/import_column_family_job.cc b/src/rocksdb/db/import_column_family_job.cc
new file mode 100644
index 000000000..34985666a
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.cc
@@ -0,0 +1,312 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/import_column_family_job.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <string>
+#include <vector>
+
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/random_access_file_reader.h"
+#include "logging/logging.h"
+#include "table/merging_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/sst_file_writer_collectors.h"
+#include "table/table_builder.h"
+#include "table/unique_id_impl.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number,
+ SuperVersion* sv) {
+ Status status;
+
+ // Read the information of files we are importing
+ for (const auto& file_metadata : metadata_) {
+ const auto file_path = file_metadata.db_path + "/" + file_metadata.name;
+ IngestedFileInfo file_to_import;
+ status =
+ GetIngestedFileInfo(file_path, next_file_number++, &file_to_import, sv);
+ if (!status.ok()) {
+ return status;
+ }
+ files_to_import_.push_back(file_to_import);
+ }
+
+ auto num_files = files_to_import_.size();
+ if (num_files == 0) {
+ return Status::InvalidArgument("The list of files is empty");
+ } else if (num_files > 1) {
+ // Verify that passed files don't have overlapping ranges in any particular
+ // level.
+ int min_level = 1; // Check for overlaps in Level 1 and above.
+ int max_level = -1;
+ for (const auto& file_metadata : metadata_) {
+ if (file_metadata.level > max_level) {
+ max_level = file_metadata.level;
+ }
+ }
+ for (int level = min_level; level <= max_level; ++level) {
+ autovector<const IngestedFileInfo*> sorted_files;
+ for (size_t i = 0; i < num_files; i++) {
+ if (metadata_[i].level == level) {
+ sorted_files.push_back(&files_to_import_[i]);
+ }
+ }
+
+ std::sort(
+ sorted_files.begin(), sorted_files.end(),
+ [this](const IngestedFileInfo* info1, const IngestedFileInfo* info2) {
+ return cfd_->internal_comparator().Compare(
+ info1->smallest_internal_key,
+ info2->smallest_internal_key) < 0;
+ });
+
+ for (size_t i = 0; i + 1 < sorted_files.size(); i++) {
+ if (cfd_->internal_comparator().Compare(
+ sorted_files[i]->largest_internal_key,
+ sorted_files[i + 1]->smallest_internal_key) >= 0) {
+ return Status::InvalidArgument("Files have overlapping ranges");
+ }
+ }
+ }
+ }
+
+ for (const auto& f : files_to_import_) {
+ if (f.num_entries == 0) {
+ return Status::InvalidArgument("File contain no entries");
+ }
+
+ if (!f.smallest_internal_key.Valid() || !f.largest_internal_key.Valid()) {
+ return Status::Corruption("File has corrupted keys");
+ }
+ }
+
+ // Copy/Move external files into DB
+ auto hardlink_files = import_options_.move_files;
+ for (auto& f : files_to_import_) {
+ const auto path_outside_db = f.external_file_path;
+ const auto path_inside_db = TableFileName(
+ cfd_->ioptions()->cf_paths, f.fd.GetNumber(), f.fd.GetPathId());
+
+ if (hardlink_files) {
+ status =
+ fs_->LinkFile(path_outside_db, path_inside_db, IOOptions(), nullptr);
+ if (status.IsNotSupported()) {
+ // Original file is on a different FS, use copy instead of hard linking
+ hardlink_files = false;
+        ROCKS_LOG_INFO(db_options_.info_log,
+                       "Tried to link file %s but it's not supported: %s",
+                       path_outside_db.c_str(), status.ToString().c_str());
+ }
+ }
+ if (!hardlink_files) {
+ status =
+ CopyFile(fs_.get(), path_outside_db, path_inside_db, 0,
+ db_options_.use_fsync, io_tracer_, Temperature::kUnknown);
+ }
+ if (!status.ok()) {
+ break;
+ }
+ f.copy_file = !hardlink_files;
+ f.internal_file_path = path_inside_db;
+ }
+
+ if (!status.ok()) {
+    // We failed, so remove all the files that we copied into the DB.
+ for (const auto& f : files_to_import_) {
+ if (f.internal_file_path.empty()) {
+ break;
+ }
+ const auto s =
+ fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+
+ return status;
+}
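+
+// Editorial note (not in the upstream source): the overlap check in Prepare()
+// only covers levels >= 1. Within each such level the files are sorted by
+// smallest key, and two neighbours conflict when largest(i) >=
+// smallest(i + 1); e.g. key ranges [a..k] and [k..z] on the same non-zero
+// level are rejected with "Files have overlapping ranges". Level 0 is exempt
+// from this check (the ImportSSTFileWriterFilesWithOverlap test relies on
+// that).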
+
+// REQUIRES: we have become the only writer by entering both write_thread_ and
+// nonmem_write_thread_
+Status ImportColumnFamilyJob::Run() {
+ Status status;
+ edit_.SetColumnFamily(cfd_->GetID());
+
+  // We use the import time as the ancestor time. This is the time the data
+  // is written to the database.
+ int64_t temp_current_time = 0;
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+ uint64_t current_time = kUnknownOldestAncesterTime;
+ if (clock_->GetCurrentTime(&temp_current_time).ok()) {
+ current_time = oldest_ancester_time =
+ static_cast<uint64_t>(temp_current_time);
+ }
+
+ for (size_t i = 0; i < files_to_import_.size(); ++i) {
+ const auto& f = files_to_import_[i];
+ const auto& file_metadata = metadata_[i];
+
+ edit_.AddFile(file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(),
+ f.fd.GetFileSize(), f.smallest_internal_key,
+ f.largest_internal_key, file_metadata.smallest_seqno,
+ file_metadata.largest_seqno, false, file_metadata.temperature,
+ kInvalidBlobFileNumber, oldest_ancester_time, current_time,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ f.unique_id);
+
+ // If incoming sequence number is higher, update local sequence number.
+ if (file_metadata.largest_seqno > versions_->LastSequence()) {
+ versions_->SetLastAllocatedSequence(file_metadata.largest_seqno);
+ versions_->SetLastPublishedSequence(file_metadata.largest_seqno);
+ versions_->SetLastSequence(file_metadata.largest_seqno);
+ }
+ }
+
+ return status;
+}
+
+void ImportColumnFamilyJob::Cleanup(const Status& status) {
+ if (!status.ok()) {
+    // We failed to add the files to the database, so remove all the files we
+    // copied.
+ for (const auto& f : files_to_import_) {
+ const auto s =
+ fs_->DeleteFile(f.internal_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "AddFile() clean up for file %s failed : %s",
+ f.internal_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ } else if (status.ok() && import_options_.move_files) {
+    // The files were moved and added successfully, so remove the original
+    // file links.
+ for (IngestedFileInfo& f : files_to_import_) {
+ const auto s =
+ fs_->DeleteFile(f.external_file_path, IOOptions(), nullptr);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "%s was added to DB successfully but failed to remove original "
+ "file link : %s",
+ f.external_file_path.c_str(), s.ToString().c_str());
+ }
+ }
+ }
+}
+
+Status ImportColumnFamilyJob::GetIngestedFileInfo(
+ const std::string& external_file, uint64_t new_file_number,
+ IngestedFileInfo* file_to_import, SuperVersion* sv) {
+ file_to_import->external_file_path = external_file;
+
+ // Get external file size
+ Status status = fs_->GetFileSize(external_file, IOOptions(),
+ &file_to_import->file_size, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Assign FD with number
+ file_to_import->fd =
+ FileDescriptor(new_file_number, 0, file_to_import->file_size);
+
+ // Create TableReader for external file
+ std::unique_ptr<TableReader> table_reader;
+ std::unique_ptr<FSRandomAccessFile> sst_file;
+ std::unique_ptr<RandomAccessFileReader> sst_file_reader;
+
+ status =
+ fs_->NewRandomAccessFile(external_file, env_options_, &sst_file, nullptr);
+ if (!status.ok()) {
+ return status;
+ }
+ sst_file_reader.reset(new RandomAccessFileReader(
+ std::move(sst_file), external_file, nullptr /*Env*/, io_tracer_));
+
+ status = cfd_->ioptions()->table_factory->NewTableReader(
+ TableReaderOptions(
+ *cfd_->ioptions(), sv->mutable_cf_options.prefix_extractor,
+ env_options_, cfd_->internal_comparator(),
+ /*skip_filters*/ false, /*immortal*/ false,
+ /*force_direct_prefetch*/ false, /*level*/ -1,
+ /*block_cache_tracer*/ nullptr,
+ /*max_file_size_for_l0_meta_pin*/ 0, versions_->DbSessionId(),
+ /*cur_file_num*/ new_file_number),
+ std::move(sst_file_reader), file_to_import->file_size, &table_reader);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Get the external file properties
+ auto props = table_reader->GetTableProperties();
+
+ // Set original_seqno to 0.
+ file_to_import->original_seqno = 0;
+
+ // Get number of entries in table
+ file_to_import->num_entries = props->num_entries;
+
+ ParsedInternalKey key;
+ ReadOptions ro;
+  // While reading the external file we may cache the blocks we read in the
+  // block cache. If we later change the global seqno of this file, the cache
+  // would contain blocks whose keys carry the wrong seqno.
+  // Disable fill_cache so that we read from the file without updating the
+  // block cache.
+ ro.fill_cache = false;
+ std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+ ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+ /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+
+ // Get first (smallest) key from file
+ iter->SeekToFirst();
+ Status pik_status =
+ ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors);
+ if (!pik_status.ok()) {
+ return Status::Corruption("Corrupted Key in external file. ",
+ pik_status.getState());
+ }
+ file_to_import->smallest_internal_key.SetFrom(key);
+
+ // Get last (largest) key from file
+ iter->SeekToLast();
+ pik_status =
+ ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors);
+ if (!pik_status.ok()) {
+ return Status::Corruption("Corrupted Key in external file. ",
+ pik_status.getState());
+ }
+ file_to_import->largest_internal_key.SetFrom(key);
+
+ file_to_import->cf_id = static_cast<uint32_t>(props->column_family_id);
+
+ file_to_import->table_properties = *props;
+
+ auto s = GetSstInternalUniqueId(props->db_id, props->db_session_id,
+ props->orig_file_number,
+ &(file_to_import->unique_id));
+ if (!s.ok()) {
+    ROCKS_LOG_WARN(db_options_.info_log,
+                   "Failed to get SST unique id for file %s",
+                   external_file.c_str());
+ }
+
+ return status;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/import_column_family_job.h b/src/rocksdb/db/import_column_family_job.h
new file mode 100644
index 000000000..57c49c67f
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_job.h
@@ -0,0 +1,82 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/external_sst_file_ingestion_job.h"
+#include "db/snapshot_impl.h"
+#include "options/db_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/metadata.h"
+#include "rocksdb/sst_file_writer.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct EnvOptions;
+class SystemClock;
+
+// Imports a set of sst files as is into a new column family. Logic is similar
+// to ExternalSstFileIngestionJob.
+class ImportColumnFamilyJob {
+ public:
+ ImportColumnFamilyJob(VersionSet* versions, ColumnFamilyData* cfd,
+ const ImmutableDBOptions& db_options,
+ const EnvOptions& env_options,
+ const ImportColumnFamilyOptions& import_options,
+ const std::vector<LiveFileMetaData>& metadata,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : clock_(db_options.clock),
+ versions_(versions),
+ cfd_(cfd),
+ db_options_(db_options),
+ fs_(db_options_.fs, io_tracer),
+ env_options_(env_options),
+ import_options_(import_options),
+ metadata_(metadata),
+ io_tracer_(io_tracer) {}
+
+ // Prepare the job by copying external files into the DB.
+ Status Prepare(uint64_t next_file_number, SuperVersion* sv);
+
+ // Will execute the import job and prepare edit() to be applied.
+ // REQUIRES: Mutex held
+ Status Run();
+
+ // Cleanup after successful/failed job
+ void Cleanup(const Status& status);
+
+ VersionEdit* edit() { return &edit_; }
+
+ const autovector<IngestedFileInfo>& files_to_import() const {
+ return files_to_import_;
+ }
+
+ private:
+ // Open the external file and populate `file_to_import` with all the
+ // external information we need to import this file.
+ Status GetIngestedFileInfo(const std::string& external_file,
+ uint64_t new_file_number,
+ IngestedFileInfo* file_to_import,
+ SuperVersion* sv);
+
+ SystemClock* clock_;
+ VersionSet* versions_;
+ ColumnFamilyData* cfd_;
+ const ImmutableDBOptions& db_options_;
+ const FileSystemPtr fs_;
+ const EnvOptions& env_options_;
+ autovector<IngestedFileInfo> files_to_import_;
+ VersionEdit edit_;
+ const ImportColumnFamilyOptions& import_options_;
+ std::vector<LiveFileMetaData> metadata_;
+ const std::shared_ptr<IOTracer> io_tracer_;
+};
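+
+// Illustrative driving sequence (editorial sketch, not part of the upstream
+// header). The DB-level entry point is DB::CreateColumnFamilyWithImport(),
+// exercised by import_column_family_test.cc; the owner of the VersionSet and
+// the DB mutex is assumed here.
+//
+//   ImportColumnFamilyJob job(versions, cfd, db_options, env_options,
+//                             import_options, metadata, io_tracer);
+//   Status s = job.Prepare(next_file_number, sv);  // copy or hard-link files
+//   if (s.ok()) {
+//     // DB mutex held, single writer
+//     s = job.Run();  // fills job.edit() with the AddFile records
+//     // ... apply job.edit() to the VersionSet ...
+//   }
+//   job.Cleanup(s);  // failure: delete the copied files; success with
+//                    // move_files: remove the original file links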
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/import_column_family_test.cc b/src/rocksdb/db/import_column_family_test.cc
new file mode 100644
index 000000000..2847ea8da
--- /dev/null
+++ b/src/rocksdb/db/import_column_family_test.cc
@@ -0,0 +1,644 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include <functional>
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "rocksdb/sst_file_writer.h"
+#include "test_util/testutil.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ImportColumnFamilyTest : public DBTestBase {
+ public:
+ ImportColumnFamilyTest()
+ : DBTestBase("import_column_family_test", /*env_do_fsync=*/true) {
+ sst_files_dir_ = dbname_ + "/sst_files/";
+ export_files_dir_ = test::PerThreadDBPath(env_, "export");
+ DestroyAndRecreateExternalSSTFilesDir();
+ import_cfh_ = nullptr;
+ import_cfh2_ = nullptr;
+ metadata_ptr_ = nullptr;
+ }
+
+ ~ImportColumnFamilyTest() {
+ if (import_cfh_) {
+ EXPECT_OK(db_->DropColumnFamily(import_cfh_));
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+ import_cfh_ = nullptr;
+ }
+ if (import_cfh2_) {
+ EXPECT_OK(db_->DropColumnFamily(import_cfh2_));
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh2_));
+ import_cfh2_ = nullptr;
+ }
+ if (metadata_ptr_) {
+ delete metadata_ptr_;
+ metadata_ptr_ = nullptr;
+ }
+ EXPECT_OK(DestroyDir(env_, sst_files_dir_));
+ EXPECT_OK(DestroyDir(env_, export_files_dir_));
+ }
+
+ void DestroyAndRecreateExternalSSTFilesDir() {
+ EXPECT_OK(DestroyDir(env_, sst_files_dir_));
+ EXPECT_OK(env_->CreateDir(sst_files_dir_));
+ EXPECT_OK(DestroyDir(env_, export_files_dir_));
+ }
+
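+  // Editorial note: this helper fills in only the metadata fields the import
+  // job consumes -- the file name and db_path used to locate the SST, the
+  // target level, and the sequence number range. The key range is not filled
+  // in; ImportColumnFamilyJob re-derives the smallest/largest keys by opening
+  // and iterating the file itself.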
+ LiveFileMetaData LiveFileMetaDataInit(std::string name, std::string path,
+ int level,
+ SequenceNumber smallest_seqno,
+ SequenceNumber largest_seqno) {
+ LiveFileMetaData metadata;
+ metadata.name = name;
+ metadata.db_path = path;
+ metadata.smallest_seqno = smallest_seqno;
+ metadata.largest_seqno = largest_seqno;
+ metadata.level = level;
+ return metadata;
+ }
+
+ protected:
+ std::string sst_files_dir_;
+ std::string export_files_dir_;
+ ColumnFamilyHandle* import_cfh_;
+ ColumnFamilyHandle* import_cfh2_;
+ ExportImportFilesMetaData* metadata_ptr_;
+};
+
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFiles) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ SstFileWriter sfw_unknown(EnvOptions(), options);
+
+ // cf1.sst
+ const std::string cf1_sst_name = "cf1.sst";
+ const std::string cf1_sst = sst_files_dir_ + cf1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(cf1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // cf_unknown.sst
+ const std::string unknown_sst_name = "cf_unknown.sst";
+ const std::string unknown_sst = sst_files_dir_ + unknown_sst_name;
+ ASSERT_OK(sfw_unknown.Open(unknown_sst));
+ ASSERT_OK(sfw_unknown.Put("K3", "V1"));
+ ASSERT_OK(sfw_unknown.Put("K4", "V2"));
+ ASSERT_OK(sfw_unknown.Finish());
+
+ {
+ // Import sst file corresponding to cf1 onto a new cf and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K1", &value));
+ ASSERT_EQ(value, "V1");
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K2", &value));
+ ASSERT_EQ(value, "V2");
+ ASSERT_OK(db_->DropColumnFamily(import_cfh_));
+ ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+ import_cfh_ = nullptr;
+ }
+
+ {
+ // Import sst file corresponding to unknown cf onto a new cf and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(unknown_sst_name, sst_files_dir_, 0, 20, 29));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "yoyo", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K3", &value));
+ ASSERT_EQ(value, "V1");
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K4", &value));
+ ASSERT_EQ(value, "V2");
+ }
+ EXPECT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+ import_cfh_ = nullptr;
+
+ // verify sst unique id during reopen
+ options.verify_sst_unique_id_in_manifest = true;
+ ReopenWithColumnFamilies({"default", "koko", "yoyo"}, options);
+}
+
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+
+ // file3.sst
+ const std::string file3_sst_name = "file3.sst";
+ const std::string file3_sst = sst_files_dir_ + file3_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file3_sst));
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_val"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file2.sst
+ const std::string file2_sst_name = "file2.sst";
+ const std::string file2_sst = sst_files_dir_ + file2_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file2_sst));
+ for (int i = 0; i < 100; i += 2) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite1"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file1a.sst
+ const std::string file1a_sst_name = "file1a.sst";
+ const std::string file1a_sst = sst_files_dir_ + file1a_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1a_sst));
+ for (int i = 0; i < 52; i += 4) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file1b.sst
+ const std::string file1b_sst_name = "file1b.sst";
+ const std::string file1b_sst = sst_files_dir_ + file1b_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1b_sst));
+ for (int i = 52; i < 100; i += 4) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite2"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file0a.sst
+ const std::string file0a_sst_name = "file0a.sst";
+ const std::string file0a_sst = sst_files_dir_ + file0a_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file0a_sst));
+ for (int i = 0; i < 100; i += 16) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite3"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // file0b.sst
+ const std::string file0b_sst_name = "file0b.sst";
+ const std::string file0b_sst = sst_files_dir_ + file0b_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file0b_sst));
+ for (int i = 0; i < 100; i += 16) {
+ ASSERT_OK(sfw_cf1.Put(Key(i), Key(i) + "_overwrite4"));
+ }
+ ASSERT_OK(sfw_cf1.Finish());
+
+ // Import sst files and verify
+ ExportImportFilesMetaData metadata;
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 3, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 2, 20, 29));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1a_sst_name, sst_files_dir_, 1, 30, 34));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1b_sst_name, sst_files_dir_, 1, 35, 39));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file0a_sst_name, sst_files_dir_, 0, 40, 49));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file0b_sst_name, sst_files_dir_, 0, 50, 59));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(
+ options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value));
+ if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+
+ for (int i = 0; i < 100; i += 5) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite5"));
+ }
+
+ // Flush and check again
+ ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_));
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value));
+ if (i % 5 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite5");
+ } else if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+
+ // Compact and check again.
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr));
+ for (int i = 0; i < 100; i++) {
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value));
+ if (i % 5 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite5");
+ } else if (i % 16 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite4");
+ } else if (i % 4 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite2");
+ } else if (i % 2 == 0) {
+ ASSERT_EQ(value, Key(i) + "_overwrite1");
+ } else {
+ ASSERT_EQ(value, Key(i) + "_val");
+ }
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_val"));
+ }
+ ASSERT_OK(Flush(1));
+
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+
+ // Overwrite the value in the same set of keys.
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite"));
+ }
+
+ // Flush to create L0 file.
+ ASSERT_OK(Flush(1));
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2"));
+ }
+
+  // Flush again to create another L0 file. It should have higher sequence
+  // numbers.
+ ASSERT_OK(Flush(1));
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+
+ ImportColumnFamilyOptions import_options;
+ import_options.move_files = false;
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options,
+ *metadata_ptr_, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+
+ import_options.move_files = true;
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options,
+ *metadata_ptr_, &import_cfh2_));
+ ASSERT_NE(import_cfh2_, nullptr);
+ delete metadata_ptr_;
+ metadata_ptr_ = NULL;
+
+ std::string value1, value2;
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+ ASSERT_EQ(Get(1, Key(i)), value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+
+ // Modify keys in cf1 and verify.
+ for (int i = 0; i < 25; i++) {
+ ASSERT_OK(db_->Delete(WriteOptions(), import_cfh_, Key(i)));
+ }
+ for (int i = 25; i < 50; i++) {
+ ASSERT_OK(
+ db_->Put(WriteOptions(), import_cfh_, Key(i), Key(i) + "_overwrite3"));
+ }
+ for (int i = 0; i < 25; ++i) {
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+ }
+ for (int i = 25; i < 50; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+ ASSERT_EQ(Key(i) + "_overwrite3", value1);
+ }
+ for (int i = 50; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+ ASSERT_EQ(Key(i) + "_overwrite2", value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+
+ // Compact and check again.
+ ASSERT_OK(db_->Flush(FlushOptions(), import_cfh_));
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), import_cfh_, nullptr, nullptr));
+
+ for (int i = 0; i < 25; ++i) {
+ ASSERT_TRUE(
+ db_->Get(ReadOptions(), import_cfh_, Key(i), &value1).IsNotFound());
+ }
+ for (int i = 25; i < 50; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+ ASSERT_EQ(Key(i) + "_overwrite3", value1);
+ }
+ for (int i = 50; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, Key(i), &value1));
+ ASSERT_EQ(Key(i) + "_overwrite2", value1);
+ }
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(db_->Get(ReadOptions(), import_cfh2_, Key(i), &value2));
+ ASSERT_EQ(Get(1, Key(i)), value2);
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ for (int i = 0; i < 100; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_val"));
+ }
+ ASSERT_OK(Flush(1));
+
+ // Compact to create a L1 file.
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+
+ // Overwrite the value in the same set of keys.
+ for (int i = 0; i < 50; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite"));
+ }
+
+ // Flush to create L0 file.
+ ASSERT_OK(Flush(1));
+
+ for (int i = 0; i < 25; ++i) {
+ ASSERT_OK(Put(1, Key(i), Key(i) + "_overwrite2"));
+ }
+
+  // Flush again to create another L0 file. It should have higher sequence
+  // numbers.
+ ASSERT_OK(Flush(1));
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+
+ // Create a new db and import the files.
+ DB* db_copy;
+ ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+ ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ *metadata_ptr_, &cfh));
+ ASSERT_NE(cfh, nullptr);
+
+ for (int i = 0; i < 100; ++i) {
+ std::string value;
+ ASSERT_OK(db_copy->Get(ReadOptions(), cfh, Key(i), &value));
+ ASSERT_EQ(Get(1, Key(i)), value);
+ }
+ ASSERT_OK(db_copy->DropColumnFamily(cfh));
+ ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
+ delete db_copy;
+ ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+}
+
+TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) {
+ // Imports a column family containing a level where two files overlap at their
+ // endpoints. "Overlap" means the largest user key in one file is the same as
+ // the smallest user key in the second file.
+ const int kFileBytes = 128 << 10; // 128KB
+ const int kValueBytes = 1 << 10; // 1KB
+ const int kNumFiles = 4;
+
+ Options options = CurrentOptions();
+ options.disable_auto_compactions = true;
+ options.num_levels = 2;
+ CreateAndReopenWithCF({"koko"}, options);
+
+ Random rnd(301);
+ // Every key is snapshot protected to ensure older versions will not be
+ // dropped during compaction.
+ std::vector<const Snapshot*> snapshots;
+ snapshots.reserve(kFileBytes / kValueBytes * kNumFiles);
+ for (int i = 0; i < kNumFiles; ++i) {
+ for (int j = 0; j < kFileBytes / kValueBytes; ++j) {
+ auto value = rnd.RandomString(kValueBytes);
+ ASSERT_OK(Put(1, "key", value));
+ snapshots.push_back(db_->GetSnapshot());
+ }
+ ASSERT_OK(Flush(1));
+ }
+
+ // Compact to create overlapping L1 files.
+ ASSERT_OK(
+ db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr));
+ ASSERT_GT(NumTableFilesAtLevel(1, 1), 1);
+
+ Checkpoint* checkpoint;
+ ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+ ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+ &metadata_ptr_));
+ ASSERT_NE(metadata_ptr_, nullptr);
+ delete checkpoint;
+
+ // Create a new db and import the files.
+ DB* db_copy;
+ ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+ ASSERT_OK(DB::Open(options, dbname_ + "/db_copy", &db_copy));
+ ColumnFamilyHandle* cfh = nullptr;
+ ASSERT_OK(db_copy->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ *metadata_ptr_, &cfh));
+ ASSERT_NE(cfh, nullptr);
+
+ {
+ std::string value;
+ ASSERT_OK(db_copy->Get(ReadOptions(), cfh, "key", &value));
+ }
+ ASSERT_OK(db_copy->DropColumnFamily(cfh));
+ ASSERT_OK(db_copy->DestroyColumnFamilyHandle(cfh));
+ delete db_copy;
+ ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
+ for (const Snapshot* snapshot : snapshots) {
+ db_->ReleaseSnapshot(snapshot);
+ }
+}
+
+TEST_F(ImportColumnFamilyTest, ImportColumnFamilyNegativeTest) {
+ Options options = CurrentOptions();
+ CreateAndReopenWithCF({"koko"}, options);
+
+ {
+ // Create column family with existing cf name.
+ ExportImportFilesMetaData metadata;
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "koko",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Column family already exists"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Import with no files specified.
+ ExportImportFilesMetaData metadata;
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("The list of files is empty"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+ // Import with overlapping keys in sst files.
+ ExportImportFilesMetaData metadata;
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+ const std::string file2_sst_name = "file2.sst";
+ const std::string file2_sst = sst_files_dir_ + file2_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file2_sst));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Put("K3", "V3"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file2_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Files have overlapping ranges"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+    // Import with a mismatching comparator; this should fail with an
+    // appropriate error.
+ ExportImportFilesMetaData metadata;
+ Options mismatch_options = CurrentOptions();
+ mismatch_options.comparator = ReverseBytewiseComparator();
+ SstFileWriter sfw_cf1(EnvOptions(), mismatch_options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Finish());
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = mismatch_options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "coco",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::InvalidArgument("Comparator name mismatch"));
+ ASSERT_EQ(import_cfh_, nullptr);
+ }
+
+ {
+    // Importing a non-existent sst file should fail with an appropriate
+    // error.
+ ExportImportFilesMetaData metadata;
+ SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+ const std::string file1_sst_name = "file1.sst";
+ const std::string file1_sst = sst_files_dir_ + file1_sst_name;
+ ASSERT_OK(sfw_cf1.Open(file1_sst));
+ ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+ ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+ ASSERT_OK(sfw_cf1.Finish());
+ const std::string file3_sst_name = "file3.sst";
+
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file1_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.files.push_back(
+ LiveFileMetaDataInit(file3_sst_name, sst_files_dir_, 1, 10, 19));
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_EQ(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_),
+ Status::IOError("No such file or directory"));
+ ASSERT_EQ(import_cfh_, nullptr);
+
+    // Test a successful import after a failure with the same CF name. This
+    // ensures that a failed import leaves no side effects on the CF.
+ metadata.files.pop_back();
+ metadata.db_comparator_name = options.comparator->Name();
+
+ ASSERT_OK(db_->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "yoyo",
+ ImportColumnFamilyOptions(),
+ metadata, &import_cfh_));
+ ASSERT_NE(import_cfh_, nullptr);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as External SST File Writer and Import are not supported "
+ "in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/internal_stats.cc b/src/rocksdb/db/internal_stats.cc
new file mode 100644
index 000000000..ac5b81f3e
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.cc
@@ -0,0 +1,2002 @@
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/internal_stats.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstddef>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "cache/cache_entry_stats.h"
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table.h"
+#include "table/block_based/cachable_entry.h"
+#include "util/hash_containers.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+const std::map<LevelStatType, LevelStat> InternalStats::compaction_level_stats =
+ {
+ {LevelStatType::NUM_FILES, LevelStat{"NumFiles", "Files"}},
+ {LevelStatType::COMPACTED_FILES,
+ LevelStat{"CompactedFiles", "CompactedFiles"}},
+ {LevelStatType::SIZE_BYTES, LevelStat{"SizeBytes", "Size"}},
+ {LevelStatType::SCORE, LevelStat{"Score", "Score"}},
+ {LevelStatType::READ_GB, LevelStat{"ReadGB", "Read(GB)"}},
+ {LevelStatType::RN_GB, LevelStat{"RnGB", "Rn(GB)"}},
+ {LevelStatType::RNP1_GB, LevelStat{"Rnp1GB", "Rnp1(GB)"}},
+ {LevelStatType::WRITE_GB, LevelStat{"WriteGB", "Write(GB)"}},
+ {LevelStatType::W_NEW_GB, LevelStat{"WnewGB", "Wnew(GB)"}},
+ {LevelStatType::MOVED_GB, LevelStat{"MovedGB", "Moved(GB)"}},
+ {LevelStatType::WRITE_AMP, LevelStat{"WriteAmp", "W-Amp"}},
+ {LevelStatType::READ_MBPS, LevelStat{"ReadMBps", "Rd(MB/s)"}},
+ {LevelStatType::WRITE_MBPS, LevelStat{"WriteMBps", "Wr(MB/s)"}},
+ {LevelStatType::COMP_SEC, LevelStat{"CompSec", "Comp(sec)"}},
+ {LevelStatType::COMP_CPU_SEC,
+ LevelStat{"CompMergeCPU", "CompMergeCPU(sec)"}},
+ {LevelStatType::COMP_COUNT, LevelStat{"CompCount", "Comp(cnt)"}},
+ {LevelStatType::AVG_SEC, LevelStat{"AvgSec", "Avg(sec)"}},
+ {LevelStatType::KEY_IN, LevelStat{"KeyIn", "KeyIn"}},
+ {LevelStatType::KEY_DROP, LevelStat{"KeyDrop", "KeyDrop"}},
+ {LevelStatType::R_BLOB_GB, LevelStat{"RblobGB", "Rblob(GB)"}},
+ {LevelStatType::W_BLOB_GB, LevelStat{"WblobGB", "Wblob(GB)"}},
+};
+
+const std::map<InternalStats::InternalDBStatsType, DBStatInfo>
+ InternalStats::db_stats_type_to_info = {
+ {InternalStats::kIntStatsWalFileBytes,
+ DBStatInfo{"db.wal_bytes_written"}},
+ {InternalStats::kIntStatsWalFileSynced, DBStatInfo{"db.wal_syncs"}},
+ {InternalStats::kIntStatsBytesWritten,
+ DBStatInfo{"db.user_bytes_written"}},
+ {InternalStats::kIntStatsNumKeysWritten,
+ DBStatInfo{"db.user_keys_written"}},
+ {InternalStats::kIntStatsWriteDoneByOther,
+ DBStatInfo{"db.user_writes_by_other"}},
+ {InternalStats::kIntStatsWriteDoneBySelf,
+ DBStatInfo{"db.user_writes_by_self"}},
+ {InternalStats::kIntStatsWriteWithWal,
+ DBStatInfo{"db.user_writes_with_wal"}},
+ {InternalStats::kIntStatsWriteStallMicros,
+ DBStatInfo{"db.user_write_stall_micros"}},
+};
+
+namespace {
+const double kMB = 1048576.0;
+const double kGB = kMB * 1024;
+const double kMicrosInSec = 1000000.0;
+
+void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name,
+ const std::string& group_by) {
+ int written_size =
+ snprintf(buf, len, "\n** Compaction Stats [%s] **\n", cf_name.c_str());
+ written_size = std::min(written_size, static_cast<int>(len));
+ auto hdr = [](LevelStatType t) {
+ return InternalStats::compaction_level_stats.at(t).header_name.c_str();
+ };
+ int line_size = snprintf(
+ buf + written_size, len - written_size,
+ "%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s "
+ "%s\n",
+ // Note that we skip COMPACTED_FILES and merge it with Files column
+ group_by.c_str(), hdr(LevelStatType::NUM_FILES),
+ hdr(LevelStatType::SIZE_BYTES), hdr(LevelStatType::SCORE),
+ hdr(LevelStatType::READ_GB), hdr(LevelStatType::RN_GB),
+ hdr(LevelStatType::RNP1_GB), hdr(LevelStatType::WRITE_GB),
+ hdr(LevelStatType::W_NEW_GB), hdr(LevelStatType::MOVED_GB),
+ hdr(LevelStatType::WRITE_AMP), hdr(LevelStatType::READ_MBPS),
+ hdr(LevelStatType::WRITE_MBPS), hdr(LevelStatType::COMP_SEC),
+ hdr(LevelStatType::COMP_CPU_SEC), hdr(LevelStatType::COMP_COUNT),
+ hdr(LevelStatType::AVG_SEC), hdr(LevelStatType::KEY_IN),
+ hdr(LevelStatType::KEY_DROP), hdr(LevelStatType::R_BLOB_GB),
+ hdr(LevelStatType::W_BLOB_GB));
+
+ written_size += line_size;
+ written_size = std::min(written_size, static_cast<int>(len));
+ snprintf(buf + written_size, len - written_size, "%s\n",
+ std::string(line_size, '-').c_str());
+}
+
+void PrepareLevelStats(std::map<LevelStatType, double>* level_stats,
+ int num_files, int being_compacted,
+ double total_file_size, double score, double w_amp,
+ const InternalStats::CompactionStats& stats) {
+ const uint64_t bytes_read = stats.bytes_read_non_output_levels +
+ stats.bytes_read_output_level +
+ stats.bytes_read_blob;
+ const uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
+ const int64_t bytes_new = stats.bytes_written - stats.bytes_read_output_level;
+ const double elapsed = (stats.micros + 1) / kMicrosInSec;
+
+ (*level_stats)[LevelStatType::NUM_FILES] = num_files;
+ (*level_stats)[LevelStatType::COMPACTED_FILES] = being_compacted;
+ (*level_stats)[LevelStatType::SIZE_BYTES] = total_file_size;
+ (*level_stats)[LevelStatType::SCORE] = score;
+ (*level_stats)[LevelStatType::READ_GB] = bytes_read / kGB;
+ (*level_stats)[LevelStatType::RN_GB] =
+ stats.bytes_read_non_output_levels / kGB;
+ (*level_stats)[LevelStatType::RNP1_GB] = stats.bytes_read_output_level / kGB;
+ (*level_stats)[LevelStatType::WRITE_GB] = stats.bytes_written / kGB;
+ (*level_stats)[LevelStatType::W_NEW_GB] = bytes_new / kGB;
+ (*level_stats)[LevelStatType::MOVED_GB] = stats.bytes_moved / kGB;
+ (*level_stats)[LevelStatType::WRITE_AMP] = w_amp;
+ (*level_stats)[LevelStatType::READ_MBPS] = bytes_read / kMB / elapsed;
+ (*level_stats)[LevelStatType::WRITE_MBPS] = bytes_written / kMB / elapsed;
+ (*level_stats)[LevelStatType::COMP_SEC] = stats.micros / kMicrosInSec;
+ (*level_stats)[LevelStatType::COMP_CPU_SEC] = stats.cpu_micros / kMicrosInSec;
+ (*level_stats)[LevelStatType::COMP_COUNT] = stats.count;
+ (*level_stats)[LevelStatType::AVG_SEC] =
+ stats.count == 0 ? 0 : stats.micros / kMicrosInSec / stats.count;
+ (*level_stats)[LevelStatType::KEY_IN] =
+ static_cast<double>(stats.num_input_records);
+ (*level_stats)[LevelStatType::KEY_DROP] =
+ static_cast<double>(stats.num_dropped_records);
+ (*level_stats)[LevelStatType::R_BLOB_GB] = stats.bytes_read_blob / kGB;
+ (*level_stats)[LevelStatType::W_BLOB_GB] = stats.bytes_written_blob / kGB;
+}
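+
+// Editorial worked example (not in the upstream source): with
+// bytes_read_non_output_levels = 4 GB, bytes_read_output_level = 6 GB, no blob
+// reads or writes, bytes_written = 9 GB and micros covering ~100 seconds, the
+// row reports Read(GB)=10, Rn(GB)=4, Rnp1(GB)=6, Write(GB)=9, Wnew(GB)=3,
+// Rd(MB/s)~=102.4 and Wr(MB/s)~=92.2. W-Amp is not derived here; it is passed
+// in by the caller.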
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+ const std::map<LevelStatType, double>& stat_value) {
+ snprintf(
+ buf, len,
+ "%4s " /* Level */
+ "%6d/%-3d " /* Files */
+ "%8s " /* Size */
+ "%5.1f " /* Score */
+ "%8.1f " /* Read(GB) */
+ "%7.1f " /* Rn(GB) */
+ "%8.1f " /* Rnp1(GB) */
+ "%9.1f " /* Write(GB) */
+ "%8.1f " /* Wnew(GB) */
+ "%9.1f " /* Moved(GB) */
+ "%5.1f " /* W-Amp */
+ "%8.1f " /* Rd(MB/s) */
+ "%8.1f " /* Wr(MB/s) */
+ "%9.2f " /* Comp(sec) */
+ "%17.2f " /* CompMergeCPU(sec) */
+ "%9d " /* Comp(cnt) */
+ "%8.3f " /* Avg(sec) */
+ "%7s " /* KeyIn */
+ "%6s " /* KeyDrop */
+ "%9.1f " /* Rblob(GB) */
+ "%9.1f\n", /* Wblob(GB) */
+ name.c_str(), static_cast<int>(stat_value.at(LevelStatType::NUM_FILES)),
+ static_cast<int>(stat_value.at(LevelStatType::COMPACTED_FILES)),
+ BytesToHumanString(
+ static_cast<uint64_t>(stat_value.at(LevelStatType::SIZE_BYTES)))
+ .c_str(),
+ stat_value.at(LevelStatType::SCORE),
+ stat_value.at(LevelStatType::READ_GB),
+ stat_value.at(LevelStatType::RN_GB),
+ stat_value.at(LevelStatType::RNP1_GB),
+ stat_value.at(LevelStatType::WRITE_GB),
+ stat_value.at(LevelStatType::W_NEW_GB),
+ stat_value.at(LevelStatType::MOVED_GB),
+ stat_value.at(LevelStatType::WRITE_AMP),
+ stat_value.at(LevelStatType::READ_MBPS),
+ stat_value.at(LevelStatType::WRITE_MBPS),
+ stat_value.at(LevelStatType::COMP_SEC),
+ stat_value.at(LevelStatType::COMP_CPU_SEC),
+ static_cast<int>(stat_value.at(LevelStatType::COMP_COUNT)),
+ stat_value.at(LevelStatType::AVG_SEC),
+ NumberToHumanString(
+ static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_IN)))
+ .c_str(),
+ NumberToHumanString(
+ static_cast<std::int64_t>(stat_value.at(LevelStatType::KEY_DROP)))
+ .c_str(),
+ stat_value.at(LevelStatType::R_BLOB_GB),
+ stat_value.at(LevelStatType::W_BLOB_GB));
+}
+
+void PrintLevelStats(char* buf, size_t len, const std::string& name,
+ int num_files, int being_compacted, double total_file_size,
+ double score, double w_amp,
+ const InternalStats::CompactionStats& stats) {
+ std::map<LevelStatType, double> level_stats;
+ PrepareLevelStats(&level_stats, num_files, being_compacted, total_file_size,
+ score, w_amp, stats);
+ PrintLevelStats(buf, len, name, level_stats);
+}
+
+// Assumes that trailing numbers represent an optional argument. This requires
+// property names to not end with numbers.
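+// Editorial example: GetPropertyNameAndArg("rocksdb.num-files-at-level5")
+// yields {"rocksdb.num-files-at-level", "5"}, letting the handler treat the
+// trailing digits as the level argument.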
+std::pair<Slice, Slice> GetPropertyNameAndArg(const Slice& property) {
+ Slice name = property, arg = property;
+ size_t sfx_len = 0;
+ while (sfx_len < property.size() &&
+ isdigit(property[property.size() - sfx_len - 1])) {
+ ++sfx_len;
+ }
+ name.remove_suffix(sfx_len);
+ arg.remove_prefix(property.size() - sfx_len);
+ return {name, arg};
+}
+} // anonymous namespace
+
+static const std::string rocksdb_prefix = "rocksdb.";
+
+static const std::string num_files_at_level_prefix = "num-files-at-level";
+static const std::string compression_ratio_at_level_prefix =
+ "compression-ratio-at-level";
+static const std::string allstats = "stats";
+static const std::string sstables = "sstables";
+static const std::string cfstats = "cfstats";
+static const std::string cfstats_no_file_histogram =
+ "cfstats-no-file-histogram";
+static const std::string cf_file_histogram = "cf-file-histogram";
+static const std::string dbstats = "dbstats";
+static const std::string levelstats = "levelstats";
+static const std::string block_cache_entry_stats = "block-cache-entry-stats";
+static const std::string fast_block_cache_entry_stats =
+ "fast-block-cache-entry-stats";
+static const std::string num_immutable_mem_table = "num-immutable-mem-table";
+static const std::string num_immutable_mem_table_flushed =
+ "num-immutable-mem-table-flushed";
+static const std::string mem_table_flush_pending = "mem-table-flush-pending";
+static const std::string compaction_pending = "compaction-pending";
+static const std::string background_errors = "background-errors";
+static const std::string cur_size_active_mem_table =
+ "cur-size-active-mem-table";
+static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables";
+static const std::string size_all_mem_tables = "size-all-mem-tables";
+static const std::string num_entries_active_mem_table =
+ "num-entries-active-mem-table";
+static const std::string num_entries_imm_mem_tables =
+ "num-entries-imm-mem-tables";
+static const std::string num_deletes_active_mem_table =
+ "num-deletes-active-mem-table";
+static const std::string num_deletes_imm_mem_tables =
+ "num-deletes-imm-mem-tables";
+static const std::string estimate_num_keys = "estimate-num-keys";
+static const std::string estimate_table_readers_mem =
+ "estimate-table-readers-mem";
+static const std::string is_file_deletions_enabled =
+ "is-file-deletions-enabled";
+static const std::string num_snapshots = "num-snapshots";
+static const std::string oldest_snapshot_time = "oldest-snapshot-time";
+static const std::string oldest_snapshot_sequence = "oldest-snapshot-sequence";
+static const std::string num_live_versions = "num-live-versions";
+static const std::string current_version_number =
+ "current-super-version-number";
+static const std::string estimate_live_data_size = "estimate-live-data-size";
+static const std::string min_log_number_to_keep_str = "min-log-number-to-keep";
+static const std::string min_obsolete_sst_number_to_keep_str =
+ "min-obsolete-sst-number-to-keep";
+static const std::string base_level_str = "base-level";
+static const std::string total_sst_files_size = "total-sst-files-size";
+static const std::string live_sst_files_size = "live-sst-files-size";
+static const std::string live_sst_files_size_at_temperature =
+ "live-sst-files-size-at-temperature";
+static const std::string estimate_pending_comp_bytes =
+ "estimate-pending-compaction-bytes";
+static const std::string aggregated_table_properties =
+ "aggregated-table-properties";
+static const std::string aggregated_table_properties_at_level =
+ aggregated_table_properties + "-at-level";
+static const std::string num_running_compactions = "num-running-compactions";
+static const std::string num_running_flushes = "num-running-flushes";
+static const std::string actual_delayed_write_rate =
+ "actual-delayed-write-rate";
+static const std::string is_write_stopped = "is-write-stopped";
+static const std::string estimate_oldest_key_time = "estimate-oldest-key-time";
+static const std::string block_cache_capacity = "block-cache-capacity";
+static const std::string block_cache_usage = "block-cache-usage";
+static const std::string block_cache_pinned_usage = "block-cache-pinned-usage";
+static const std::string options_statistics = "options-statistics";
+static const std::string num_blob_files = "num-blob-files";
+static const std::string blob_stats = "blob-stats";
+static const std::string total_blob_file_size = "total-blob-file-size";
+static const std::string live_blob_file_size = "live-blob-file-size";
+static const std::string live_blob_file_garbage_size =
+ "live-blob-file-garbage-size";
+static const std::string blob_cache_capacity = "blob-cache-capacity";
+static const std::string blob_cache_usage = "blob-cache-usage";
+static const std::string blob_cache_pinned_usage = "blob-cache-pinned-usage";
+
+const std::string DB::Properties::kNumFilesAtLevelPrefix =
+ rocksdb_prefix + num_files_at_level_prefix;
+const std::string DB::Properties::kCompressionRatioAtLevelPrefix =
+ rocksdb_prefix + compression_ratio_at_level_prefix;
+const std::string DB::Properties::kStats = rocksdb_prefix + allstats;
+const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables;
+const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats;
+const std::string DB::Properties::kCFStatsNoFileHistogram =
+ rocksdb_prefix + cfstats_no_file_histogram;
+const std::string DB::Properties::kCFFileHistogram =
+ rocksdb_prefix + cf_file_histogram;
+const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats;
+const std::string DB::Properties::kLevelStats = rocksdb_prefix + levelstats;
+const std::string DB::Properties::kBlockCacheEntryStats =
+ rocksdb_prefix + block_cache_entry_stats;
+const std::string DB::Properties::kFastBlockCacheEntryStats =
+ rocksdb_prefix + fast_block_cache_entry_stats;
+const std::string DB::Properties::kNumImmutableMemTable =
+ rocksdb_prefix + num_immutable_mem_table;
+const std::string DB::Properties::kNumImmutableMemTableFlushed =
+ rocksdb_prefix + num_immutable_mem_table_flushed;
+const std::string DB::Properties::kMemTableFlushPending =
+ rocksdb_prefix + mem_table_flush_pending;
+const std::string DB::Properties::kCompactionPending =
+ rocksdb_prefix + compaction_pending;
+const std::string DB::Properties::kNumRunningCompactions =
+ rocksdb_prefix + num_running_compactions;
+const std::string DB::Properties::kNumRunningFlushes =
+ rocksdb_prefix + num_running_flushes;
+const std::string DB::Properties::kBackgroundErrors =
+ rocksdb_prefix + background_errors;
+const std::string DB::Properties::kCurSizeActiveMemTable =
+ rocksdb_prefix + cur_size_active_mem_table;
+const std::string DB::Properties::kCurSizeAllMemTables =
+ rocksdb_prefix + cur_size_all_mem_tables;
+const std::string DB::Properties::kSizeAllMemTables =
+ rocksdb_prefix + size_all_mem_tables;
+const std::string DB::Properties::kNumEntriesActiveMemTable =
+ rocksdb_prefix + num_entries_active_mem_table;
+const std::string DB::Properties::kNumEntriesImmMemTables =
+ rocksdb_prefix + num_entries_imm_mem_tables;
+const std::string DB::Properties::kNumDeletesActiveMemTable =
+ rocksdb_prefix + num_deletes_active_mem_table;
+const std::string DB::Properties::kNumDeletesImmMemTables =
+ rocksdb_prefix + num_deletes_imm_mem_tables;
+const std::string DB::Properties::kEstimateNumKeys =
+ rocksdb_prefix + estimate_num_keys;
+const std::string DB::Properties::kEstimateTableReadersMem =
+ rocksdb_prefix + estimate_table_readers_mem;
+const std::string DB::Properties::kIsFileDeletionsEnabled =
+ rocksdb_prefix + is_file_deletions_enabled;
+const std::string DB::Properties::kNumSnapshots =
+ rocksdb_prefix + num_snapshots;
+const std::string DB::Properties::kOldestSnapshotTime =
+ rocksdb_prefix + oldest_snapshot_time;
+const std::string DB::Properties::kOldestSnapshotSequence =
+ rocksdb_prefix + oldest_snapshot_sequence;
+const std::string DB::Properties::kNumLiveVersions =
+ rocksdb_prefix + num_live_versions;
+const std::string DB::Properties::kCurrentSuperVersionNumber =
+ rocksdb_prefix + current_version_number;
+const std::string DB::Properties::kEstimateLiveDataSize =
+ rocksdb_prefix + estimate_live_data_size;
+const std::string DB::Properties::kMinLogNumberToKeep =
+ rocksdb_prefix + min_log_number_to_keep_str;
+const std::string DB::Properties::kMinObsoleteSstNumberToKeep =
+ rocksdb_prefix + min_obsolete_sst_number_to_keep_str;
+const std::string DB::Properties::kTotalSstFilesSize =
+ rocksdb_prefix + total_sst_files_size;
+const std::string DB::Properties::kLiveSstFilesSize =
+ rocksdb_prefix + live_sst_files_size;
+const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level_str;
+const std::string DB::Properties::kEstimatePendingCompactionBytes =
+ rocksdb_prefix + estimate_pending_comp_bytes;
+const std::string DB::Properties::kAggregatedTableProperties =
+ rocksdb_prefix + aggregated_table_properties;
+const std::string DB::Properties::kAggregatedTablePropertiesAtLevel =
+ rocksdb_prefix + aggregated_table_properties_at_level;
+const std::string DB::Properties::kActualDelayedWriteRate =
+ rocksdb_prefix + actual_delayed_write_rate;
+const std::string DB::Properties::kIsWriteStopped =
+ rocksdb_prefix + is_write_stopped;
+const std::string DB::Properties::kEstimateOldestKeyTime =
+ rocksdb_prefix + estimate_oldest_key_time;
+const std::string DB::Properties::kBlockCacheCapacity =
+ rocksdb_prefix + block_cache_capacity;
+const std::string DB::Properties::kBlockCacheUsage =
+ rocksdb_prefix + block_cache_usage;
+const std::string DB::Properties::kBlockCachePinnedUsage =
+ rocksdb_prefix + block_cache_pinned_usage;
+const std::string DB::Properties::kOptionsStatistics =
+ rocksdb_prefix + options_statistics;
+const std::string DB::Properties::kLiveSstFilesSizeAtTemperature =
+ rocksdb_prefix + live_sst_files_size_at_temperature;
+const std::string DB::Properties::kNumBlobFiles =
+ rocksdb_prefix + num_blob_files;
+const std::string DB::Properties::kBlobStats = rocksdb_prefix + blob_stats;
+const std::string DB::Properties::kTotalBlobFileSize =
+ rocksdb_prefix + total_blob_file_size;
+const std::string DB::Properties::kLiveBlobFileSize =
+ rocksdb_prefix + live_blob_file_size;
+const std::string DB::Properties::kLiveBlobFileGarbageSize =
+ rocksdb_prefix + live_blob_file_garbage_size;
+const std::string DB::Properties::kBlobCacheCapacity =
+ rocksdb_prefix + blob_cache_capacity;
+const std::string DB::Properties::kBlobCacheUsage =
+ rocksdb_prefix + blob_cache_usage;
+const std::string DB::Properties::kBlobCachePinnedUsage =
+ rocksdb_prefix + blob_cache_pinned_usage;
+
+const std::string InternalStats::kPeriodicCFStats =
+ DB::Properties::kCFStats + ".periodic";
+const int InternalStats::kMaxNoChangePeriodSinceDump = 8;
+
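+// Editorial note: each entry maps a property name to a DBPropertyInfo whose
+// leading bool is set only for the more expensive properties (the block cache
+// entry stats, estimate-table-readers-mem and estimate-live-data-size below),
+// followed by up to three member-function handlers -- string, int and map --
+// with unused slots left as nullptr. The struct definition itself is not part
+// of this diff, so the interpretation of the bool (computed without holding
+// the DB mutex in upstream RocksDB) is an assumption.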
+const UnorderedMap<std::string, DBPropertyInfo>
+ InternalStats::ppt_name_to_info = {
+ {DB::Properties::kNumFilesAtLevelPrefix,
+ {false, &InternalStats::HandleNumFilesAtLevel, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kCompressionRatioAtLevelPrefix,
+ {false, &InternalStats::HandleCompressionRatioAtLevelPrefix, nullptr,
+ nullptr, nullptr}},
+ {DB::Properties::kLevelStats,
+ {false, &InternalStats::HandleLevelStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kStats,
+ {false, &InternalStats::HandleStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kCFStats,
+ {false, &InternalStats::HandleCFStats, nullptr,
+ &InternalStats::HandleCFMapStats, nullptr}},
+ {InternalStats::kPeriodicCFStats,
+ {false, &InternalStats::HandleCFStatsPeriodic, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kCFStatsNoFileHistogram,
+ {false, &InternalStats::HandleCFStatsNoFileHistogram, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kCFFileHistogram,
+ {false, &InternalStats::HandleCFFileHistogram, nullptr, nullptr,
+ nullptr}},
+ {DB::Properties::kDBStats,
+ {false, &InternalStats::HandleDBStats, nullptr,
+ &InternalStats::HandleDBMapStats, nullptr}},
+ {DB::Properties::kBlockCacheEntryStats,
+ {true, &InternalStats::HandleBlockCacheEntryStats, nullptr,
+ &InternalStats::HandleBlockCacheEntryStatsMap, nullptr}},
+ {DB::Properties::kFastBlockCacheEntryStats,
+ {true, &InternalStats::HandleFastBlockCacheEntryStats, nullptr,
+ &InternalStats::HandleFastBlockCacheEntryStatsMap, nullptr}},
+ {DB::Properties::kSSTables,
+ {false, &InternalStats::HandleSsTables, nullptr, nullptr, nullptr}},
+ {DB::Properties::kAggregatedTableProperties,
+ {false, &InternalStats::HandleAggregatedTableProperties, nullptr,
+ &InternalStats::HandleAggregatedTablePropertiesMap, nullptr}},
+ {DB::Properties::kAggregatedTablePropertiesAtLevel,
+ {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel,
+ nullptr, &InternalStats::HandleAggregatedTablePropertiesAtLevelMap,
+ nullptr}},
+ {DB::Properties::kNumImmutableMemTable,
+ {false, nullptr, &InternalStats::HandleNumImmutableMemTable, nullptr,
+ nullptr}},
+ {DB::Properties::kNumImmutableMemTableFlushed,
+ {false, nullptr, &InternalStats::HandleNumImmutableMemTableFlushed,
+ nullptr, nullptr}},
+ {DB::Properties::kMemTableFlushPending,
+ {false, nullptr, &InternalStats::HandleMemTableFlushPending, nullptr,
+ nullptr}},
+ {DB::Properties::kCompactionPending,
+ {false, nullptr, &InternalStats::HandleCompactionPending, nullptr,
+ nullptr}},
+ {DB::Properties::kBackgroundErrors,
+ {false, nullptr, &InternalStats::HandleBackgroundErrors, nullptr,
+ nullptr}},
+ {DB::Properties::kCurSizeActiveMemTable,
+ {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable, nullptr,
+ nullptr}},
+ {DB::Properties::kCurSizeAllMemTables,
+ {false, nullptr, &InternalStats::HandleCurSizeAllMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kSizeAllMemTables,
+ {false, nullptr, &InternalStats::HandleSizeAllMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kNumEntriesActiveMemTable,
+ {false, nullptr, &InternalStats::HandleNumEntriesActiveMemTable,
+ nullptr, nullptr}},
+ {DB::Properties::kNumEntriesImmMemTables,
+ {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kNumDeletesActiveMemTable,
+ {false, nullptr, &InternalStats::HandleNumDeletesActiveMemTable,
+ nullptr, nullptr}},
+ {DB::Properties::kNumDeletesImmMemTables,
+ {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateNumKeys,
+ {false, nullptr, &InternalStats::HandleEstimateNumKeys, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateTableReadersMem,
+ {true, nullptr, &InternalStats::HandleEstimateTableReadersMem, nullptr,
+ nullptr}},
+ {DB::Properties::kIsFileDeletionsEnabled,
+ {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled, nullptr,
+ nullptr}},
+ {DB::Properties::kNumSnapshots,
+ {false, nullptr, &InternalStats::HandleNumSnapshots, nullptr,
+ nullptr}},
+ {DB::Properties::kOldestSnapshotTime,
+ {false, nullptr, &InternalStats::HandleOldestSnapshotTime, nullptr,
+ nullptr}},
+ {DB::Properties::kOldestSnapshotSequence,
+ {false, nullptr, &InternalStats::HandleOldestSnapshotSequence, nullptr,
+ nullptr}},
+ {DB::Properties::kNumLiveVersions,
+ {false, nullptr, &InternalStats::HandleNumLiveVersions, nullptr,
+ nullptr}},
+ {DB::Properties::kCurrentSuperVersionNumber,
+ {false, nullptr, &InternalStats::HandleCurrentSuperVersionNumber,
+ nullptr, nullptr}},
+ {DB::Properties::kEstimateLiveDataSize,
+ {true, nullptr, &InternalStats::HandleEstimateLiveDataSize, nullptr,
+ nullptr}},
+ {DB::Properties::kMinLogNumberToKeep,
+ {false, nullptr, &InternalStats::HandleMinLogNumberToKeep, nullptr,
+ nullptr}},
+ {DB::Properties::kMinObsoleteSstNumberToKeep,
+ {false, nullptr, &InternalStats::HandleMinObsoleteSstNumberToKeep,
+ nullptr, nullptr}},
+ {DB::Properties::kBaseLevel,
+ {false, nullptr, &InternalStats::HandleBaseLevel, nullptr, nullptr}},
+ {DB::Properties::kTotalSstFilesSize,
+ {false, nullptr, &InternalStats::HandleTotalSstFilesSize, nullptr,
+ nullptr}},
+ {DB::Properties::kLiveSstFilesSize,
+ {false, nullptr, &InternalStats::HandleLiveSstFilesSize, nullptr,
+ nullptr}},
+ {DB::Properties::kLiveSstFilesSizeAtTemperature,
+ {false, &InternalStats::HandleLiveSstFilesSizeAtTemperature, nullptr,
+ nullptr, nullptr}},
+ {DB::Properties::kEstimatePendingCompactionBytes,
+ {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes,
+ nullptr, nullptr}},
+ {DB::Properties::kNumRunningFlushes,
+ {false, nullptr, &InternalStats::HandleNumRunningFlushes, nullptr,
+ nullptr}},
+ {DB::Properties::kNumRunningCompactions,
+ {false, nullptr, &InternalStats::HandleNumRunningCompactions, nullptr,
+ nullptr}},
+ {DB::Properties::kActualDelayedWriteRate,
+ {false, nullptr, &InternalStats::HandleActualDelayedWriteRate, nullptr,
+ nullptr}},
+ {DB::Properties::kIsWriteStopped,
+ {false, nullptr, &InternalStats::HandleIsWriteStopped, nullptr,
+ nullptr}},
+ {DB::Properties::kEstimateOldestKeyTime,
+ {false, nullptr, &InternalStats::HandleEstimateOldestKeyTime, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCacheCapacity,
+ {false, nullptr, &InternalStats::HandleBlockCacheCapacity, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCacheUsage,
+ {false, nullptr, &InternalStats::HandleBlockCacheUsage, nullptr,
+ nullptr}},
+ {DB::Properties::kBlockCachePinnedUsage,
+ {false, nullptr, &InternalStats::HandleBlockCachePinnedUsage, nullptr,
+ nullptr}},
+ {DB::Properties::kOptionsStatistics,
+ {true, nullptr, nullptr, nullptr,
+ &DBImpl::GetPropertyHandleOptionsStatistics}},
+ {DB::Properties::kNumBlobFiles,
+ {false, nullptr, &InternalStats::HandleNumBlobFiles, nullptr,
+ nullptr}},
+ {DB::Properties::kBlobStats,
+ {false, &InternalStats::HandleBlobStats, nullptr, nullptr, nullptr}},
+ {DB::Properties::kTotalBlobFileSize,
+ {false, nullptr, &InternalStats::HandleTotalBlobFileSize, nullptr,
+ nullptr}},
+ {DB::Properties::kLiveBlobFileSize,
+ {false, nullptr, &InternalStats::HandleLiveBlobFileSize, nullptr,
+ nullptr}},
+ {DB::Properties::kLiveBlobFileGarbageSize,
+ {false, nullptr, &InternalStats::HandleLiveBlobFileGarbageSize,
+ nullptr, nullptr}},
+ {DB::Properties::kBlobCacheCapacity,
+ {false, nullptr, &InternalStats::HandleBlobCacheCapacity, nullptr,
+ nullptr}},
+ {DB::Properties::kBlobCacheUsage,
+ {false, nullptr, &InternalStats::HandleBlobCacheUsage, nullptr,
+ nullptr}},
+ {DB::Properties::kBlobCachePinnedUsage,
+ {false, nullptr, &InternalStats::HandleBlobCachePinnedUsage, nullptr,
+ nullptr}},
+};
+
+InternalStats::InternalStats(int num_levels, SystemClock* clock,
+ ColumnFamilyData* cfd)
+ : db_stats_{},
+ cf_stats_value_{},
+ cf_stats_count_{},
+ comp_stats_(num_levels),
+ comp_stats_by_pri_(Env::Priority::TOTAL),
+ file_read_latency_(num_levels),
+ has_cf_change_since_dump_(true),
+ bg_error_count_(0),
+ number_levels_(num_levels),
+ clock_(clock),
+ cfd_(cfd),
+ started_at_(clock->NowMicros()) {
+ Cache* block_cache = GetBlockCacheForStats();
+ if (block_cache) {
+ // Extract or create stats collector. Could fail in rare cases.
+ Status s = CacheEntryStatsCollector<CacheEntryRoleStats>::GetShared(
+ block_cache, clock_, &cache_entry_stats_collector_);
+ if (s.ok()) {
+ assert(cache_entry_stats_collector_);
+ } else {
+ assert(!cache_entry_stats_collector_);
+ }
+ }
+}
+
+void InternalStats::TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats,
+ bool foreground) {
+ CollectCacheEntryStats(foreground);
+ if (cache_entry_stats_collector_) {
+ cache_entry_stats_collector_->GetStats(stats);
+ }
+}
+
+void InternalStats::CollectCacheEntryStats(bool foreground) {
+ // This function is safe to call from any thread because
+ // cache_entry_stats_collector_ field is const after constructor
+ // and ->GetStats does its own synchronization, which also suffices for
+ // cache_entry_stats_.
+
+ if (!cache_entry_stats_collector_) {
+ return; // nothing to do (e.g. no block cache)
+ }
+
+ // For "background" collections, strictly cap the collection time by
+ // expanding effective cache TTL. For foreground, be more aggressive about
+ // getting latest data.
+ int min_interval_seconds = foreground ? 10 : 180;
+ // 1/500 = max of 0.2% of one CPU thread
+ int min_interval_factor = foreground ? 10 : 500;
+ cache_entry_stats_collector_->CollectStats(min_interval_seconds,
+ min_interval_factor);
+}
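+
+// Rough sketch of the throttling arithmetic above (an interpretation,
+// assuming CollectStats() skips a new scan unless at least
+// max(min_interval_seconds, last_scan_duration * min_interval_factor) has
+// elapsed): a background scan that took 2 seconds would not repeat for at
+// least max(180, 2 * 500) = 1000 seconds, i.e. roughly 0.2% of one CPU
+// thread, while a foreground request would wait only max(10, 2 * 10) = 20
+// seconds.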
+
+std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
+InternalStats::CacheEntryRoleStats::GetEntryCallback() {
+ return [&](const Slice& /*key*/, void* /*value*/, size_t charge,
+ Cache::DeleterFn deleter) {
+ auto e = role_map_.find(deleter);
+ size_t role_idx;
+ if (e == role_map_.end()) {
+ role_idx = static_cast<size_t>(CacheEntryRole::kMisc);
+ } else {
+ role_idx = static_cast<size_t>(e->second);
+ }
+ entry_counts[role_idx]++;
+ total_charges[role_idx] += charge;
+ };
+}
+
+void InternalStats::CacheEntryRoleStats::BeginCollection(
+ Cache* cache, SystemClock*, uint64_t start_time_micros) {
+ Clear();
+ last_start_time_micros_ = start_time_micros;
+ ++collection_count;
+ role_map_ = CopyCacheDeleterRoleMap();
+ std::ostringstream str;
+ str << cache->Name() << "@" << static_cast<void*>(cache) << "#"
+ << port::GetProcessID();
+ cache_id = str.str();
+ cache_capacity = cache->GetCapacity();
+ cache_usage = cache->GetUsage();
+ table_size = cache->GetTableAddressCount();
+ occupancy = cache->GetOccupancyCount();
+}
+
+void InternalStats::CacheEntryRoleStats::EndCollection(
+ Cache*, SystemClock*, uint64_t end_time_micros) {
+ last_end_time_micros_ = end_time_micros;
+}
+
+void InternalStats::CacheEntryRoleStats::SkippedCollection() {
+ ++copies_of_last_collection;
+}
+
+uint64_t InternalStats::CacheEntryRoleStats::GetLastDurationMicros() const {
+ if (last_end_time_micros_ > last_start_time_micros_) {
+ return last_end_time_micros_ - last_start_time_micros_;
+ } else {
+ return 0U;
+ }
+}
+
+std::string InternalStats::CacheEntryRoleStats::ToString(
+ SystemClock* clock) const {
+ std::ostringstream str;
+ str << "Block cache " << cache_id
+ << " capacity: " << BytesToHumanString(cache_capacity)
+ << " usage: " << BytesToHumanString(cache_usage)
+ << " table_size: " << table_size << " occupancy: " << occupancy
+ << " collections: " << collection_count
+ << " last_copies: " << copies_of_last_collection
+ << " last_secs: " << (GetLastDurationMicros() / 1000000.0)
+ << " secs_since: "
+ << ((clock->NowMicros() - last_end_time_micros_) / 1000000U) << "\n";
+ str << "Block cache entry stats(count,size,portion):";
+ for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+ if (entry_counts[i] > 0) {
+ str << " " << kCacheEntryRoleToCamelString[i] << "(" << entry_counts[i]
+ << "," << BytesToHumanString(total_charges[i]) << ","
+ << (100.0 * total_charges[i] / cache_capacity) << "%)";
+ }
+ }
+ str << "\n";
+ return str.str();
+}
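+
+// Illustrative shape of the ToString() output (all values hypothetical; only
+// the format strings above are authoritative):
+//   Block cache <name>@<address>#<pid> capacity: 8.0 GB usage: 5.3 GB
+//   table_size: 262144 occupancy: 120000 collections: 3 last_copies: 1
+//   last_secs: 0.12 secs_since: 45
+//   Block cache entry stats(count,size,portion): DataBlock(100000,4.5 GB,56.2%) ...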
+
+void InternalStats::CacheEntryRoleStats::ToMap(
+ std::map<std::string, std::string>* values, SystemClock* clock) const {
+ values->clear();
+ auto& v = *values;
+ v[BlockCacheEntryStatsMapKeys::CacheId()] = cache_id;
+ v[BlockCacheEntryStatsMapKeys::CacheCapacityBytes()] =
+ std::to_string(cache_capacity);
+ v[BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds()] =
+ std::to_string(GetLastDurationMicros() / 1000000.0);
+ v[BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds()] =
+ std::to_string((clock->NowMicros() - last_end_time_micros_) / 1000000U);
+ for (size_t i = 0; i < kNumCacheEntryRoles; ++i) {
+ auto role = static_cast<CacheEntryRole>(i);
+ v[BlockCacheEntryStatsMapKeys::EntryCount(role)] =
+ std::to_string(entry_counts[i]);
+ v[BlockCacheEntryStatsMapKeys::UsedBytes(role)] =
+ std::to_string(total_charges[i]);
+ v[BlockCacheEntryStatsMapKeys::UsedPercent(role)] =
+ std::to_string(100.0 * total_charges[i] / cache_capacity);
+ }
+}
+
+bool InternalStats::HandleBlockCacheEntryStatsInternal(std::string* value,
+ bool fast) {
+ if (!cache_entry_stats_collector_) {
+ return false;
+ }
+ CollectCacheEntryStats(!fast /* foreground */);
+ CacheEntryRoleStats stats;
+ cache_entry_stats_collector_->GetStats(&stats);
+ *value = stats.ToString(clock_);
+ return true;
+}
+
+bool InternalStats::HandleBlockCacheEntryStatsMapInternal(
+ std::map<std::string, std::string>* values, bool fast) {
+ if (!cache_entry_stats_collector_) {
+ return false;
+ }
+ CollectCacheEntryStats(!fast /* foreground */);
+ CacheEntryRoleStats stats;
+ cache_entry_stats_collector_->GetStats(&stats);
+ stats.ToMap(values, clock_);
+ return true;
+}
+
+bool InternalStats::HandleBlockCacheEntryStats(std::string* value,
+ Slice /*suffix*/) {
+ return HandleBlockCacheEntryStatsInternal(value, false /* fast */);
+}
+
+bool InternalStats::HandleBlockCacheEntryStatsMap(
+ std::map<std::string, std::string>* values, Slice /*suffix*/) {
+ return HandleBlockCacheEntryStatsMapInternal(values, false /* fast */);
+}
+
+bool InternalStats::HandleFastBlockCacheEntryStats(std::string* value,
+ Slice /*suffix*/) {
+ return HandleBlockCacheEntryStatsInternal(value, true /* fast */);
+}
+
+bool InternalStats::HandleFastBlockCacheEntryStatsMap(
+ std::map<std::string, std::string>* values, Slice /*suffix*/) {
+ return HandleBlockCacheEntryStatsMapInternal(values, true /* fast */);
+}
+
+bool InternalStats::HandleLiveSstFilesSizeAtTemperature(std::string* value,
+ Slice suffix) {
+ uint64_t temperature;
+ bool ok = ConsumeDecimalNumber(&suffix, &temperature) && suffix.empty();
+ if (!ok) {
+ return false;
+ }
+
+ uint64_t size = 0;
+ const auto* vstorage = cfd_->current()->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); level++) {
+ for (const auto& file_meta : vstorage->LevelFiles(level)) {
+ if (static_cast<uint8_t>(file_meta->temperature) == temperature) {
+ size += file_meta->fd.GetFileSize();
+ }
+ }
+ }
+
+ *value = std::to_string(size);
+ return true;
+}
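+
+// Hypothetical caller-side usage of the suffix parsed above (the numeric
+// suffix is the Temperature enum value cast to an integer, matching the
+// uint8_t cast in the loop; error handling omitted):
+//   std::string size;
+//   db->GetProperty(DB::Properties::kLiveSstFilesSizeAtTemperature +
+//                       std::to_string(static_cast<int>(Temperature::kWarm)),
+//                   &size);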
+
+bool InternalStats::HandleNumBlobFiles(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ assert(value);
+ assert(cfd_);
+
+ const auto* current = cfd_->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ const auto& blob_files = vstorage->GetBlobFiles();
+
+ *value = blob_files.size();
+
+ return true;
+}
+
+bool InternalStats::HandleBlobStats(std::string* value, Slice /*suffix*/) {
+ assert(value);
+ assert(cfd_);
+
+ const auto* current = cfd_->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ const auto blob_st = vstorage->GetBlobStats();
+
+ std::ostringstream oss;
+
+ oss << "Number of blob files: " << vstorage->GetBlobFiles().size()
+ << "\nTotal size of blob files: " << blob_st.total_file_size
+ << "\nTotal size of garbage in blob files: " << blob_st.total_garbage_size
+ << "\nBlob file space amplification: " << blob_st.space_amp << '\n';
+
+ value->append(oss.str());
+
+ return true;
+}
+
+bool InternalStats::HandleTotalBlobFileSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ assert(value);
+ assert(cfd_);
+
+ *value = cfd_->GetTotalBlobFileSize();
+
+ return true;
+}
+
+bool InternalStats::HandleLiveBlobFileSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ assert(value);
+ assert(cfd_);
+
+ const auto* current = cfd_->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ *value = vstorage->GetBlobStats().total_file_size;
+
+ return true;
+}
+
+bool InternalStats::HandleLiveBlobFileGarbageSize(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ assert(value);
+ assert(cfd_);
+
+ const auto* current = cfd_->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ *value = vstorage->GetBlobStats().total_garbage_size;
+
+ return true;
+}
+
+Cache* InternalStats::GetBlobCacheForStats() {
+ return cfd_->ioptions()->blob_cache.get();
+}
+
+bool InternalStats::HandleBlobCacheCapacity(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* blob_cache = GetBlobCacheForStats();
+ if (blob_cache) {
+ *value = static_cast<uint64_t>(blob_cache->GetCapacity());
+ return true;
+ }
+ return false;
+}
+
+bool InternalStats::HandleBlobCacheUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* blob_cache = GetBlobCacheForStats();
+ if (blob_cache) {
+ *value = static_cast<uint64_t>(blob_cache->GetUsage());
+ return true;
+ }
+ return false;
+}
+
+bool InternalStats::HandleBlobCachePinnedUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* blob_cache = GetBlobCacheForStats();
+ if (blob_cache) {
+ *value = static_cast<uint64_t>(blob_cache->GetPinnedUsage());
+ return true;
+ }
+ return false;
+}
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& property) {
+ std::string ppt_name = GetPropertyNameAndArg(property).first.ToString();
+ auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name);
+ if (ppt_info_iter == InternalStats::ppt_name_to_info.end()) {
+ return nullptr;
+ }
+ return &ppt_info_iter->second;
+}
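+
+// Example of the dispatch above, based on the "rocksdb.num-files-at-level5"
+// case documented in internal_stats.h: GetPropertyNameAndArg() is assumed to
+// split the property into the registered name "rocksdb.num-files-at-level"
+// and the argument "5"; the lookup then hits the kNumFilesAtLevelPrefix
+// entry, whose handle_string handler (HandleNumFilesAtLevel) parses the "5".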
+
+bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info,
+ const Slice& property,
+ std::string* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_string != nullptr);
+ Slice arg = GetPropertyNameAndArg(property).second;
+ return (this->*(property_info.handle_string))(value, arg);
+}
+
+bool InternalStats::GetMapProperty(const DBPropertyInfo& property_info,
+ const Slice& property,
+ std::map<std::string, std::string>* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_map != nullptr);
+ Slice arg = GetPropertyNameAndArg(property).second;
+ return (this->*(property_info.handle_map))(value, arg);
+}
+
+bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info,
+ uint64_t* value, DBImpl* db) {
+ assert(value != nullptr);
+ assert(property_info.handle_int != nullptr &&
+ !property_info.need_out_of_mutex);
+ db->mutex_.AssertHeld();
+ return (this->*(property_info.handle_int))(value, db, nullptr /* version */);
+}
+
+bool InternalStats::GetIntPropertyOutOfMutex(
+ const DBPropertyInfo& property_info, Version* version, uint64_t* value) {
+ assert(value != nullptr);
+ assert(property_info.handle_int != nullptr &&
+ property_info.need_out_of_mutex);
+ return (this->*(property_info.handle_int))(value, nullptr /* db */, version);
+}
+
+bool InternalStats::HandleNumFilesAtLevel(std::string* value, Slice suffix) {
+ uint64_t level;
+ const auto* vstorage = cfd_->current()->storage_info();
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || static_cast<int>(level) >= number_levels_) {
+ return false;
+ } else {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%d",
+ vstorage->NumLevelFiles(static_cast<int>(level)));
+ *value = buf;
+ return true;
+ }
+}
+
+bool InternalStats::HandleCompressionRatioAtLevelPrefix(std::string* value,
+ Slice suffix) {
+ uint64_t level;
+ const auto* vstorage = cfd_->current()->storage_info();
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || level >= static_cast<uint64_t>(number_levels_)) {
+ return false;
+ }
+ *value = std::to_string(
+ vstorage->GetEstimatedCompressionRatioAtLevel(static_cast<int>(level)));
+ return true;
+}
+
+bool InternalStats::HandleLevelStats(std::string* value, Slice /*suffix*/) {
+ char buf[1000];
+ const auto* vstorage = cfd_->current()->storage_info();
+ snprintf(buf, sizeof(buf),
+ "Level Files Size(MB)\n"
+ "--------------------\n");
+ value->append(buf);
+
+ for (int level = 0; level < number_levels_; level++) {
+ snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level,
+ vstorage->NumLevelFiles(level),
+ vstorage->NumLevelBytes(level) / kMB);
+ value->append(buf);
+ }
+ return true;
+}
+
+bool InternalStats::HandleStats(std::string* value, Slice suffix) {
+ if (!HandleCFStats(value, suffix)) {
+ return false;
+ }
+ if (!HandleDBStats(value, suffix)) {
+ return false;
+ }
+ return true;
+}
+
+bool InternalStats::HandleCFMapStats(
+ std::map<std::string, std::string>* cf_stats, Slice /*suffix*/) {
+ DumpCFMapStats(cf_stats);
+ return true;
+}
+
+bool InternalStats::HandleCFStats(std::string* value, Slice /*suffix*/) {
+ DumpCFStats(value);
+ return true;
+}
+
+bool InternalStats::HandleCFStatsPeriodic(std::string* value,
+ Slice /*suffix*/) {
+ bool has_change = has_cf_change_since_dump_;
+ if (!has_change) {
+ // If file histogram changes, there is activity in this period too.
+ uint64_t new_histogram_num = 0;
+ for (int level = 0; level < number_levels_; level++) {
+ new_histogram_num += file_read_latency_[level].num();
+ }
+ new_histogram_num += blob_file_read_latency_.num();
+ if (new_histogram_num != last_histogram_num) {
+ has_change = true;
+ last_histogram_num = new_histogram_num;
+ }
+ }
+ if (has_change) {
+ no_cf_change_period_since_dump_ = 0;
+ has_cf_change_since_dump_ = false;
+ } else if (no_cf_change_period_since_dump_++ > 0) {
+ // Not ready to sync
+ if (no_cf_change_period_since_dump_ == kMaxNoChangePeriodSinceDump) {
+ // Next periodic, we need to dump stats even if there is no change.
+ no_cf_change_period_since_dump_ = 0;
+ }
+ return true;
+ }
+
+ DumpCFStatsNoFileHistogram(/*is_periodic=*/true, value);
+ DumpCFFileHistogram(value);
+ return true;
+}
+
+bool InternalStats::HandleCFStatsNoFileHistogram(std::string* value,
+ Slice /*suffix*/) {
+ DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value);
+ return true;
+}
+
+bool InternalStats::HandleCFFileHistogram(std::string* value,
+ Slice /*suffix*/) {
+ DumpCFFileHistogram(value);
+ return true;
+}
+
+bool InternalStats::HandleDBMapStats(
+ std::map<std::string, std::string>* db_stats, Slice /*suffix*/) {
+ DumpDBMapStats(db_stats);
+ return true;
+}
+
+bool InternalStats::HandleDBStats(std::string* value, Slice /*suffix*/) {
+ DumpDBStats(value);
+ return true;
+}
+
+bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) {
+ auto* current = cfd_->current();
+ *value = current->DebugString(true, true);
+ return true;
+}
+
+bool InternalStats::HandleAggregatedTableProperties(std::string* value,
+ Slice /*suffix*/) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+ if (!s.ok()) {
+ return false;
+ }
+ *value = tp->ToString();
+ return true;
+}
+
+static std::map<std::string, std::string> MapUint64ValuesToString(
+ const std::map<std::string, uint64_t>& from) {
+ std::map<std::string, std::string> to;
+ for (const auto& e : from) {
+ to[e.first] = std::to_string(e.second);
+ }
+ return to;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesMap(
+ std::map<std::string, std::string>* values, Slice /*suffix*/) {
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(&tp);
+ if (!s.ok()) {
+ return false;
+ }
+ *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap());
+ return true;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values,
+ Slice suffix) {
+ uint64_t level;
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || static_cast<int>(level) >= number_levels_) {
+ return false;
+ }
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(
+ &tp, static_cast<int>(level));
+ if (!s.ok()) {
+ return false;
+ }
+ *values = tp->ToString();
+ return true;
+}
+
+bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap(
+ std::map<std::string, std::string>* values, Slice suffix) {
+ uint64_t level;
+ bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty();
+ if (!ok || static_cast<int>(level) >= number_levels_) {
+ return false;
+ }
+ std::shared_ptr<const TableProperties> tp;
+ auto s = cfd_->current()->GetAggregatedTableProperties(
+ &tp, static_cast<int>(level));
+ if (!s.ok()) {
+ return false;
+ }
+ *values = MapUint64ValuesToString(tp->GetAggregatablePropertiesAsMap());
+ return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->imm()->NumNotFlushed();
+ return true;
+}
+
+bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->imm()->NumFlushed();
+ return true;
+}
+
+bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = (cfd_->imm()->IsFlushPending() ? 1 : 0);
+ return true;
+}
+
+bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->num_running_flushes();
+ return true;
+}
+
+bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+  // 1 if the system has already determined that at least one compaction is
+  // needed; 0 otherwise.
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0);
+ return true;
+}
+
+bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->num_running_compactions_;
+ return true;
+}
+
+bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Accumulated number of errors in background flushes or compactions.
+ *value = GetBackgroundErrorCount();
+ return true;
+}
+
+bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current size of the active memtable
+ // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+ *value = cfd_->mem()->ApproximateMemoryUsageFast();
+ return true;
+}
+
+bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current size of the active memtable + immutable memtables
+ // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+ *value = cfd_->mem()->ApproximateMemoryUsageFast() +
+ cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage();
+ return true;
+}
+
+bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Using ApproximateMemoryUsageFast to avoid the need for synchronization
+ *value = cfd_->mem()->ApproximateMemoryUsageFast() +
+ cfd_->imm()->ApproximateMemoryUsage();
+ return true;
+}
+
+bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+  // Current number of entries in the active memtable
+ *value = cfd_->mem()->num_entries();
+ return true;
+}
+
+bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Current number of entries in the immutable memtables
+ *value = cfd_->imm()->current()->GetTotalNumEntries();
+ return true;
+}
+
+bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+  // Current number of deletes in the active memtable
+ *value = cfd_->mem()->num_deletes();
+ return true;
+}
+
+bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+  // Current number of deletes in the immutable memtables
+ *value = cfd_->imm()->current()->GetTotalNumDeletes();
+ return true;
+}
+
+bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ // Estimate number of entries in the column family:
+ // Use estimated entries in tables + total entries in memtables.
+ const auto* vstorage = cfd_->current()->storage_info();
+ uint64_t estimate_keys = cfd_->mem()->num_entries() +
+ cfd_->imm()->current()->GetTotalNumEntries() +
+ vstorage->GetEstimatedActiveKeys();
+ uint64_t estimate_deletes =
+ cfd_->mem()->num_deletes() + cfd_->imm()->current()->GetTotalNumDeletes();
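+  // Each delete is itself counted as an entry above and is also expected to
+  // cancel roughly one existing key, hence the 2x subtraction below (a
+  // heuristic, clamped at zero).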
+ *value = estimate_keys > estimate_deletes * 2
+ ? estimate_keys - (estimate_deletes * 2)
+ : 0;
+ return true;
+}
+
+bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->snapshots().count();
+ return true;
+}
+
+bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotTime());
+ return true;
+}
+
+bool InternalStats::HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = static_cast<uint64_t>(db->snapshots().GetOldestSnapshotSequence());
+ return true;
+}
+
+bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetNumLiveVersions();
+ return true;
+}
+
+bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetSuperVersionNumber();
+ return true;
+}
+
+bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->IsFileDeletionsEnabled() ? 1 : 0;
+ return true;
+}
+
+bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = vstorage->base_level();
+ return true;
+}
+
+bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetTotalSstFilesSize();
+ return true;
+}
+
+bool InternalStats::HandleLiveSstFilesSize(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ *value = cfd_->GetLiveSstFilesSize();
+ return true;
+}
+
+bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* /*version*/) {
+ const auto* vstorage = cfd_->current()->storage_info();
+ *value = vstorage->estimated_compaction_needed_bytes();
+ return true;
+}
+
+bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value,
+ DBImpl* /*db*/,
+ Version* version) {
+ *value = (version == nullptr) ? 0 : version->GetMemoryUsageByTableReaders();
+ return true;
+}
+
+bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* /*db*/,
+ Version* version) {
+ const auto* vstorage = version->storage_info();
+ *value = vstorage->EstimateLiveDataSize();
+ return true;
+}
+
+bool InternalStats::HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->MinLogNumberToKeep();
+ return true;
+}
+
+bool InternalStats::HandleMinObsoleteSstNumberToKeep(uint64_t* value,
+ DBImpl* db,
+ Version* /*version*/) {
+ *value = db->MinObsoleteSstNumberToKeep();
+ return true;
+}
+
+bool InternalStats::HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ const WriteController& wc = db->write_controller();
+ if (!wc.NeedsDelay()) {
+ *value = 0;
+ } else {
+ *value = wc.delayed_write_rate();
+ }
+ return true;
+}
+
+bool InternalStats::HandleIsWriteStopped(uint64_t* value, DBImpl* db,
+ Version* /*version*/) {
+ *value = db->write_controller().IsStopped() ? 1 : 0;
+ return true;
+}
+
+bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+  // TODO(yiwu): The property is currently only available for FIFO compaction
+ // with allow_compaction = false. This is because we don't propagate
+ // oldest_key_time on compaction.
+ if (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO ||
+ cfd_->GetCurrentMutableCFOptions()
+ ->compaction_options_fifo.allow_compaction) {
+ return false;
+ }
+
+ TablePropertiesCollection collection;
+ auto s = cfd_->current()->GetPropertiesOfAllTables(&collection);
+ if (!s.ok()) {
+ return false;
+ }
+ *value = std::numeric_limits<uint64_t>::max();
+ for (auto& p : collection) {
+ *value = std::min(*value, p.second->oldest_key_time);
+ if (*value == 0) {
+ break;
+ }
+ }
+ if (*value > 0) {
+ *value = std::min({cfd_->mem()->ApproximateOldestKeyTime(),
+ cfd_->imm()->ApproximateOldestKeyTime(), *value});
+ }
+ return *value > 0 && *value < std::numeric_limits<uint64_t>::max();
+}
+
+Cache* InternalStats::GetBlockCacheForStats() {
+ auto* table_factory = cfd_->ioptions()->table_factory.get();
+ assert(table_factory != nullptr);
+ return table_factory->GetOptions<Cache>(TableFactory::kBlockCacheOpts());
+}
+
+bool InternalStats::HandleBlockCacheCapacity(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache = GetBlockCacheForStats();
+ if (block_cache) {
+ *value = static_cast<uint64_t>(block_cache->GetCapacity());
+ return true;
+ }
+ return false;
+}
+
+bool InternalStats::HandleBlockCacheUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache = GetBlockCacheForStats();
+ if (block_cache) {
+ *value = static_cast<uint64_t>(block_cache->GetUsage());
+ return true;
+ }
+ return false;
+}
+
+bool InternalStats::HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* /*db*/,
+ Version* /*version*/) {
+ Cache* block_cache = GetBlockCacheForStats();
+ if (block_cache) {
+ *value = static_cast<uint64_t>(block_cache->GetPinnedUsage());
+ return true;
+ }
+ return false;
+}
+
+void InternalStats::DumpDBMapStats(
+ std::map<std::string, std::string>* db_stats) {
+ for (int i = 0; i < static_cast<int>(kIntStatsNumMax); ++i) {
+ InternalDBStatsType type = static_cast<InternalDBStatsType>(i);
+ (*db_stats)[db_stats_type_to_info.at(type).property_name] =
+ std::to_string(GetDBStats(type));
+ }
+ double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec;
+ (*db_stats)["db.uptime"] = std::to_string(seconds_up);
+}
+
+void InternalStats::DumpDBStats(std::string* value) {
+ char buf[1000];
+ // DB-level stats, only available from default column family
+ double seconds_up = (clock_->NowMicros() - started_at_) / kMicrosInSec;
+ double interval_seconds_up = seconds_up - db_stats_snapshot_.seconds_up;
+ snprintf(buf, sizeof(buf),
+ "\n** DB Stats **\nUptime(secs): %.1f total, %.1f interval\n",
+ seconds_up, interval_seconds_up);
+ value->append(buf);
+ // Cumulative
+ uint64_t user_bytes_written =
+ GetDBStats(InternalStats::kIntStatsBytesWritten);
+ uint64_t num_keys_written =
+ GetDBStats(InternalStats::kIntStatsNumKeysWritten);
+ uint64_t write_other = GetDBStats(InternalStats::kIntStatsWriteDoneByOther);
+ uint64_t write_self = GetDBStats(InternalStats::kIntStatsWriteDoneBySelf);
+ uint64_t wal_bytes = GetDBStats(InternalStats::kIntStatsWalFileBytes);
+ uint64_t wal_synced = GetDBStats(InternalStats::kIntStatsWalFileSynced);
+ uint64_t write_with_wal = GetDBStats(InternalStats::kIntStatsWriteWithWal);
+ uint64_t write_stall_micros =
+ GetDBStats(InternalStats::kIntStatsWriteStallMicros);
+
+ const int kHumanMicrosLen = 32;
+ char human_micros[kHumanMicrosLen];
+
+ // Data
+ // writes: total number of write requests.
+ // keys: total number of key updates issued by all the write requests
+ // commit groups: number of group commits issued to the DB. Each group can
+ // contain one or more writes.
+  // so keys/writes is the average number of puts per write request (single
+  // put or multi-put), and writes/groups is the average group commit size.
+ //
+ // The format is the same for interval stats.
+ snprintf(buf, sizeof(buf),
+ "Cumulative writes: %s writes, %s keys, %s commit groups, "
+ "%.1f writes per commit group, ingest: %.2f GB, %.2f MB/s\n",
+ NumberToHumanString(write_other + write_self).c_str(),
+ NumberToHumanString(num_keys_written).c_str(),
+ NumberToHumanString(write_self).c_str(),
+ (write_other + write_self) /
+ std::max(1.0, static_cast<double>(write_self)),
+ user_bytes_written / kGB,
+ user_bytes_written / kMB / std::max(seconds_up, 0.001));
+ value->append(buf);
+ // WAL
+ snprintf(buf, sizeof(buf),
+ "Cumulative WAL: %s writes, %s syncs, "
+ "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+ NumberToHumanString(write_with_wal).c_str(),
+ NumberToHumanString(wal_synced).c_str(),
+ write_with_wal / std::max(1.0, static_cast<double>(wal_synced)),
+ wal_bytes / kGB, wal_bytes / kMB / std::max(seconds_up, 0.001));
+ value->append(buf);
+ // Stall
+ AppendHumanMicros(write_stall_micros, human_micros, kHumanMicrosLen, true);
+ snprintf(buf, sizeof(buf), "Cumulative stall: %s, %.1f percent\n",
+ human_micros,
+ // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+ write_stall_micros / 10000.0 / std::max(seconds_up, 0.001));
+ value->append(buf);
+
+ // Interval
+ uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other;
+ uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self;
+ uint64_t interval_num_keys_written =
+ num_keys_written - db_stats_snapshot_.num_keys_written;
+ snprintf(
+ buf, sizeof(buf),
+ "Interval writes: %s writes, %s keys, %s commit groups, "
+ "%.1f writes per commit group, ingest: %.2f MB, %.2f MB/s\n",
+ NumberToHumanString(interval_write_other + interval_write_self).c_str(),
+ NumberToHumanString(interval_num_keys_written).c_str(),
+ NumberToHumanString(interval_write_self).c_str(),
+ static_cast<double>(interval_write_other + interval_write_self) /
+ std::max(1.0, static_cast<double>(interval_write_self)),
+ (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB,
+ (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB /
+          std::max(interval_seconds_up, 0.001));
+  value->append(buf);
+
+ uint64_t interval_write_with_wal =
+ write_with_wal - db_stats_snapshot_.write_with_wal;
+ uint64_t interval_wal_synced = wal_synced - db_stats_snapshot_.wal_synced;
+ uint64_t interval_wal_bytes = wal_bytes - db_stats_snapshot_.wal_bytes;
+
+ snprintf(buf, sizeof(buf),
+ "Interval WAL: %s writes, %s syncs, "
+ "%.2f writes per sync, written: %.2f GB, %.2f MB/s\n",
+ NumberToHumanString(interval_write_with_wal).c_str(),
+ NumberToHumanString(interval_wal_synced).c_str(),
+ interval_write_with_wal /
+ std::max(1.0, static_cast<double>(interval_wal_synced)),
+ interval_wal_bytes / kGB,
+ interval_wal_bytes / kMB / std::max(interval_seconds_up, 0.001));
+ value->append(buf);
+
+ // Stall
+ AppendHumanMicros(write_stall_micros - db_stats_snapshot_.write_stall_micros,
+ human_micros, kHumanMicrosLen, true);
+ snprintf(buf, sizeof(buf), "Interval stall: %s, %.1f percent\n", human_micros,
+ // 10000 = divide by 1M to get secs, then multiply by 100 for pct
+ (write_stall_micros - db_stats_snapshot_.write_stall_micros) /
+ 10000.0 / std::max(interval_seconds_up, 0.001));
+ value->append(buf);
+
+ db_stats_snapshot_.seconds_up = seconds_up;
+ db_stats_snapshot_.ingest_bytes = user_bytes_written;
+ db_stats_snapshot_.write_other = write_other;
+ db_stats_snapshot_.write_self = write_self;
+ db_stats_snapshot_.num_keys_written = num_keys_written;
+ db_stats_snapshot_.wal_bytes = wal_bytes;
+ db_stats_snapshot_.wal_synced = wal_synced;
+ db_stats_snapshot_.write_with_wal = write_with_wal;
+ db_stats_snapshot_.write_stall_micros = write_stall_micros;
+}
+
+/**
+ * Dump compaction level stats to a map from stat name, with a "compaction."
+ * prefix, to the stat's value as a double encoded as a string. The level in
+ * the stat name is represented by a prefix "Lx", where "x" is the level
+ * number; a special level "Sum" represents the sum of a stat across all
+ * levels.
+ * The result also contains IO stall counters, whose keys start with
+ * "io_stalls." and whose values are uint64 counters encoded as strings.
+ */
+void InternalStats::DumpCFMapStats(
+ std::map<std::string, std::string>* cf_stats) {
+ const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+ CompactionStats compaction_stats_sum;
+ std::map<int, std::map<LevelStatType, double>> levels_stats;
+ DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum);
+ for (auto const& level_ent : levels_stats) {
+ auto level_str =
+ level_ent.first == -1 ? "Sum" : "L" + std::to_string(level_ent.first);
+ for (auto const& stat_ent : level_ent.second) {
+ auto stat_type = stat_ent.first;
+ auto key_str =
+ "compaction." + level_str + "." +
+ InternalStats::compaction_level_stats.at(stat_type).property_name;
+ (*cf_stats)[key_str] = std::to_string(stat_ent.second);
+ }
+ }
+
+ DumpCFMapStatsIOStalls(cf_stats);
+}
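+
+// Illustrative keys produced above (the per-level stat names come from
+// compaction_level_stats and are elided here as <stat>):
+//   "compaction.L0.<stat>"  -> per-level value, a double encoded as a string
+//   "compaction.Sum.<stat>" -> the same stat summed across all levels
+//   "io_stalls.total_stop"  -> a uint64 counter encoded as a string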
+
+void InternalStats::DumpCFMapStats(
+ const VersionStorageInfo* vstorage,
+ std::map<int, std::map<LevelStatType, double>>* levels_stats,
+ CompactionStats* compaction_stats_sum) {
+ assert(vstorage);
+
+ int num_levels_to_check =
+ (cfd_->ioptions()->compaction_style != kCompactionStyleFIFO)
+ ? vstorage->num_levels() - 1
+ : 1;
+
+  // Compaction scores are sorted by value. Restore them to level order.
+ std::vector<double> compaction_score(number_levels_, 0);
+ for (int i = 0; i < num_levels_to_check; ++i) {
+ compaction_score[vstorage->CompactionScoreLevel(i)] =
+ vstorage->CompactionScore(i);
+ }
+ // Count # of files being compacted for each level
+ std::vector<int> files_being_compacted(number_levels_, 0);
+ for (int level = 0; level < number_levels_; ++level) {
+ for (auto* f : vstorage->LevelFiles(level)) {
+ if (f->being_compacted) {
+ ++files_being_compacted[level];
+ }
+ }
+ }
+
+ int total_files = 0;
+ int total_files_being_compacted = 0;
+ double total_file_size = 0;
+ uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+ uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+ uint64_t curr_ingest = flush_ingest + add_file_ingest;
+ for (int level = 0; level < number_levels_; level++) {
+ int files = vstorage->NumLevelFiles(level);
+ total_files += files;
+ total_files_being_compacted += files_being_compacted[level];
+ if (comp_stats_[level].micros > 0 || comp_stats_[level].cpu_micros > 0 ||
+ files > 0) {
+ compaction_stats_sum->Add(comp_stats_[level]);
+ total_file_size += vstorage->NumLevelBytes(level);
+ uint64_t input_bytes;
+ if (level == 0) {
+ input_bytes = curr_ingest;
+ } else {
+ input_bytes = comp_stats_[level].bytes_read_non_output_levels +
+ comp_stats_[level].bytes_read_blob;
+ }
+ double w_amp =
+ (input_bytes == 0)
+ ? 0.0
+ : static_cast<double>(comp_stats_[level].bytes_written +
+ comp_stats_[level].bytes_written_blob) /
+ input_bytes;
+ std::map<LevelStatType, double> level_stats;
+ PrepareLevelStats(&level_stats, files, files_being_compacted[level],
+ static_cast<double>(vstorage->NumLevelBytes(level)),
+ compaction_score[level], w_amp, comp_stats_[level]);
+ (*levels_stats)[level] = level_stats;
+ }
+ }
+ // Cumulative summary
+ double w_amp = (0 == curr_ingest)
+ ? 0.0
+ : (compaction_stats_sum->bytes_written +
+ compaction_stats_sum->bytes_written_blob) /
+ static_cast<double>(curr_ingest);
+ // Stats summary across levels
+ std::map<LevelStatType, double> sum_stats;
+ PrepareLevelStats(&sum_stats, total_files, total_files_being_compacted,
+ total_file_size, 0, w_amp, *compaction_stats_sum);
+ (*levels_stats)[-1] = sum_stats; // -1 is for the Sum level
+}
+
+void InternalStats::DumpCFMapStatsByPriority(
+ std::map<int, std::map<LevelStatType, double>>* priorities_stats) {
+ for (size_t priority = 0; priority < comp_stats_by_pri_.size(); priority++) {
+ if (comp_stats_by_pri_[priority].micros > 0) {
+ std::map<LevelStatType, double> priority_stats;
+ PrepareLevelStats(&priority_stats, 0 /* num_files */,
+ 0 /* being_compacted */, 0 /* total_file_size */,
+ 0 /* compaction_score */, 0 /* w_amp */,
+ comp_stats_by_pri_[priority]);
+ (*priorities_stats)[static_cast<int>(priority)] = priority_stats;
+ }
+ }
+}
+
+void InternalStats::DumpCFMapStatsIOStalls(
+ std::map<std::string, std::string>* cf_stats) {
+ (*cf_stats)["io_stalls.level0_slowdown"] =
+ std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.level0_slowdown_with_compaction"] =
+ std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.level0_numfiles"] =
+ std::to_string(cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.level0_numfiles_with_compaction"] =
+ std::to_string(cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.stop_for_pending_compaction_bytes"] =
+ std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.slowdown_for_pending_compaction_bytes"] =
+ std::to_string(cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS]);
+ (*cf_stats)["io_stalls.memtable_compaction"] =
+ std::to_string(cf_stats_count_[MEMTABLE_LIMIT_STOPS]);
+ (*cf_stats)["io_stalls.memtable_slowdown"] =
+ std::to_string(cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS]);
+
+ uint64_t total_stop = cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS];
+
+ uint64_t total_slowdown =
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+
+ (*cf_stats)["io_stalls.total_stop"] = std::to_string(total_stop);
+ (*cf_stats)["io_stalls.total_slowdown"] = std::to_string(total_slowdown);
+}
+
+void InternalStats::DumpCFStats(std::string* value) {
+ DumpCFStatsNoFileHistogram(/*is_periodic=*/false, value);
+ DumpCFFileHistogram(value);
+}
+
+void InternalStats::DumpCFStatsNoFileHistogram(bool is_periodic,
+ std::string* value) {
+ char buf[2000];
+ // Per-ColumnFamily stats
+ PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Level");
+ value->append(buf);
+
+ // Print stats for each level
+ const VersionStorageInfo* vstorage = cfd_->current()->storage_info();
+ std::map<int, std::map<LevelStatType, double>> levels_stats;
+ CompactionStats compaction_stats_sum;
+ DumpCFMapStats(vstorage, &levels_stats, &compaction_stats_sum);
+ for (int l = 0; l < number_levels_; ++l) {
+ if (levels_stats.find(l) != levels_stats.end()) {
+ PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(l),
+ levels_stats[l]);
+ value->append(buf);
+ }
+ }
+
+ // Print sum of level stats
+ PrintLevelStats(buf, sizeof(buf), "Sum", levels_stats[-1]);
+ value->append(buf);
+
+ uint64_t flush_ingest = cf_stats_value_[BYTES_FLUSHED];
+ uint64_t add_file_ingest = cf_stats_value_[BYTES_INGESTED_ADD_FILE];
+ uint64_t ingest_files_addfile = cf_stats_value_[INGESTED_NUM_FILES_TOTAL];
+ uint64_t ingest_l0_files_addfile =
+ cf_stats_value_[INGESTED_LEVEL0_NUM_FILES_TOTAL];
+ uint64_t ingest_keys_addfile = cf_stats_value_[INGESTED_NUM_KEYS_TOTAL];
+ // Cumulative summary
+ uint64_t total_stall_count =
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS] +
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS] +
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS];
+ // Interval summary
+ uint64_t interval_flush_ingest =
+ flush_ingest - cf_stats_snapshot_.ingest_bytes_flush;
+  uint64_t interval_add_file_ingest =
+      add_file_ingest - cf_stats_snapshot_.ingest_bytes_addfile;
+  uint64_t interval_ingest =
+      interval_flush_ingest + interval_add_file_ingest + 1;
+ CompactionStats interval_stats(compaction_stats_sum);
+ interval_stats.Subtract(cf_stats_snapshot_.comp_stats);
+ double w_amp =
+ (interval_stats.bytes_written + interval_stats.bytes_written_blob) /
+ static_cast<double>(interval_ingest);
+ PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, w_amp, interval_stats);
+ value->append(buf);
+
+ PrintLevelStatsHeader(buf, sizeof(buf), cfd_->GetName(), "Priority");
+ value->append(buf);
+ std::map<int, std::map<LevelStatType, double>> priorities_stats;
+ DumpCFMapStatsByPriority(&priorities_stats);
+ for (size_t priority = 0; priority < comp_stats_by_pri_.size(); ++priority) {
+ if (priorities_stats.find(static_cast<int>(priority)) !=
+ priorities_stats.end()) {
+ PrintLevelStats(
+ buf, sizeof(buf),
+ Env::PriorityToString(static_cast<Env::Priority>(priority)),
+ priorities_stats[static_cast<int>(priority)]);
+ value->append(buf);
+ }
+ }
+
+ const auto blob_st = vstorage->GetBlobStats();
+
+ snprintf(buf, sizeof(buf),
+ "\nBlob file count: %" ROCKSDB_PRIszt
+ ", total size: %.1f GB, garbage size: %.1f GB, space amp: %.1f\n\n",
+ vstorage->GetBlobFiles().size(), blob_st.total_file_size / kGB,
+ blob_st.total_garbage_size / kGB, blob_st.space_amp);
+ value->append(buf);
+
+ uint64_t now_micros = clock_->NowMicros();
+ double seconds_up = (now_micros - started_at_) / kMicrosInSec;
+ double interval_seconds_up = seconds_up - cf_stats_snapshot_.seconds_up;
+ snprintf(buf, sizeof(buf), "Uptime(secs): %.1f total, %.1f interval\n",
+ seconds_up, interval_seconds_up);
+ value->append(buf);
+ snprintf(buf, sizeof(buf), "Flush(GB): cumulative %.3f, interval %.3f\n",
+ flush_ingest / kGB, interval_flush_ingest / kGB);
+ value->append(buf);
+ snprintf(buf, sizeof(buf), "AddFile(GB): cumulative %.3f, interval %.3f\n",
+           add_file_ingest / kGB, interval_add_file_ingest / kGB);
+ value->append(buf);
+
+ uint64_t interval_ingest_files_addfile =
+ ingest_files_addfile - cf_stats_snapshot_.ingest_files_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(Total Files): cumulative %" PRIu64 ", interval %" PRIu64
+ "\n",
+ ingest_files_addfile, interval_ingest_files_addfile);
+ value->append(buf);
+
+ uint64_t interval_ingest_l0_files_addfile =
+ ingest_l0_files_addfile - cf_stats_snapshot_.ingest_l0_files_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(L0 Files): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+ ingest_l0_files_addfile, interval_ingest_l0_files_addfile);
+ value->append(buf);
+
+ uint64_t interval_ingest_keys_addfile =
+ ingest_keys_addfile - cf_stats_snapshot_.ingest_keys_addfile;
+ snprintf(buf, sizeof(buf),
+ "AddFile(Keys): cumulative %" PRIu64 ", interval %" PRIu64 "\n",
+ ingest_keys_addfile, interval_ingest_keys_addfile);
+ value->append(buf);
+
+ // Compact
+ uint64_t compact_bytes_read = 0;
+ uint64_t compact_bytes_write = 0;
+ uint64_t compact_micros = 0;
+ for (int level = 0; level < number_levels_; level++) {
+ compact_bytes_read += comp_stats_[level].bytes_read_output_level +
+ comp_stats_[level].bytes_read_non_output_levels +
+ comp_stats_[level].bytes_read_blob;
+ compact_bytes_write += comp_stats_[level].bytes_written +
+ comp_stats_[level].bytes_written_blob;
+ compact_micros += comp_stats_[level].micros;
+ }
+
+ snprintf(buf, sizeof(buf),
+ "Cumulative compaction: %.2f GB write, %.2f MB/s write, "
+ "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+ compact_bytes_write / kGB,
+ compact_bytes_write / kMB / std::max(seconds_up, 0.001),
+ compact_bytes_read / kGB,
+ compact_bytes_read / kMB / std::max(seconds_up, 0.001),
+ compact_micros / kMicrosInSec);
+ value->append(buf);
+
+ // Compaction interval
+ uint64_t interval_compact_bytes_write =
+ compact_bytes_write - cf_stats_snapshot_.compact_bytes_write;
+ uint64_t interval_compact_bytes_read =
+ compact_bytes_read - cf_stats_snapshot_.compact_bytes_read;
+ uint64_t interval_compact_micros =
+ compact_micros - cf_stats_snapshot_.compact_micros;
+
+ snprintf(
+ buf, sizeof(buf),
+ "Interval compaction: %.2f GB write, %.2f MB/s write, "
+ "%.2f GB read, %.2f MB/s read, %.1f seconds\n",
+ interval_compact_bytes_write / kGB,
+ interval_compact_bytes_write / kMB / std::max(interval_seconds_up, 0.001),
+ interval_compact_bytes_read / kGB,
+ interval_compact_bytes_read / kMB / std::max(interval_seconds_up, 0.001),
+ interval_compact_micros / kMicrosInSec);
+ value->append(buf);
+ if (is_periodic) {
+ cf_stats_snapshot_.compact_bytes_write = compact_bytes_write;
+ cf_stats_snapshot_.compact_bytes_read = compact_bytes_read;
+ cf_stats_snapshot_.compact_micros = compact_micros;
+ }
+
+ snprintf(buf, sizeof(buf),
+ "Stalls(count): %" PRIu64
+ " level0_slowdown, "
+ "%" PRIu64
+ " level0_slowdown_with_compaction, "
+ "%" PRIu64
+ " level0_numfiles, "
+ "%" PRIu64
+ " level0_numfiles_with_compaction, "
+ "%" PRIu64
+ " stop for pending_compaction_bytes, "
+ "%" PRIu64
+ " slowdown for pending_compaction_bytes, "
+ "%" PRIu64
+ " memtable_compaction, "
+ "%" PRIu64
+ " memtable_slowdown, "
+ "interval %" PRIu64 " total count\n",
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+ cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS],
+ cf_stats_count_[L0_FILE_COUNT_LIMIT_STOPS],
+ cf_stats_count_[LOCKED_L0_FILE_COUNT_LIMIT_STOPS],
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_STOPS],
+ cf_stats_count_[PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS],
+ cf_stats_count_[MEMTABLE_LIMIT_STOPS],
+ cf_stats_count_[MEMTABLE_LIMIT_SLOWDOWNS],
+ total_stall_count - cf_stats_snapshot_.stall_count);
+ value->append(buf);
+
+ if (is_periodic) {
+ cf_stats_snapshot_.seconds_up = seconds_up;
+ cf_stats_snapshot_.ingest_bytes_flush = flush_ingest;
+ cf_stats_snapshot_.ingest_bytes_addfile = add_file_ingest;
+ cf_stats_snapshot_.ingest_files_addfile = ingest_files_addfile;
+ cf_stats_snapshot_.ingest_l0_files_addfile = ingest_l0_files_addfile;
+ cf_stats_snapshot_.ingest_keys_addfile = ingest_keys_addfile;
+ cf_stats_snapshot_.comp_stats = compaction_stats_sum;
+ cf_stats_snapshot_.stall_count = total_stall_count;
+ }
+
+  // Do not gather cache entry stats during CFStats because the DB mutex is
+  // held. Only dump the last cached collection (rely on the DB periodic
+  // stats dump to keep it updated).
+ if (cache_entry_stats_collector_) {
+ CacheEntryRoleStats stats;
+ // thread safe
+ cache_entry_stats_collector_->GetStats(&stats);
+
+ constexpr uint64_t kDayInMicros = uint64_t{86400} * 1000000U;
+
+ // Skip if stats are extremely old (> 1 day, incl not yet populated)
+ if (now_micros - stats.last_end_time_micros_ < kDayInMicros) {
+ value->append(stats.ToString(clock_));
+ }
+ }
+}
+
+void InternalStats::DumpCFFileHistogram(std::string* value) {
+ assert(value);
+ assert(cfd_);
+
+ std::ostringstream oss;
+ oss << "\n** File Read Latency Histogram By Level [" << cfd_->GetName()
+ << "] **\n";
+
+ for (int level = 0; level < number_levels_; level++) {
+ if (!file_read_latency_[level].Empty()) {
+ oss << "** Level " << level << " read latency histogram (micros):\n"
+ << file_read_latency_[level].ToString() << '\n';
+ }
+ }
+
+ if (!blob_file_read_latency_.Empty()) {
+ oss << "** Blob file read latency histogram (micros):\n"
+ << blob_file_read_latency_.ToString() << '\n';
+ }
+
+ value->append(oss.str());
+}
+
+#else
+
+const DBPropertyInfo* GetPropertyInfo(const Slice& /*property*/) {
+ return nullptr;
+}
+
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/internal_stats.h b/src/rocksdb/db/internal_stats.h
new file mode 100644
index 000000000..b0cd5899b
--- /dev/null
+++ b/src/rocksdb/db/internal_stats.h
@@ -0,0 +1,996 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "cache/cache_entry_roles.h"
+#include "db/version_set.h"
+#include "rocksdb/system_clock.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <class Stats>
+class CacheEntryStatsCollector;
+class DBImpl;
+class MemTableList;
+
+// Config for retrieving a property's value.
+struct DBPropertyInfo {
+ bool need_out_of_mutex;
+
+  // gcc had an internal error when initializing a union of pointer-to-member-
+  // functions, so separate members are used instead; only the handler(s)
+  // applicable to a property are populated with non-nullptr values.
+
+ // @param value Value-result argument for storing the property's string value
+ // @param suffix Argument portion of the property. For example, suffix would
+ // be "5" for the property "rocksdb.num-files-at-level5". So far, only
+ // certain string properties take an argument.
+ bool (InternalStats::*handle_string)(std::string* value, Slice suffix);
+
+ // @param value Value-result argument for storing the property's uint64 value
+ // @param db Many of the int properties rely on DBImpl methods.
+ // @param version Version is needed in case the property is retrieved without
+ // holding db mutex, which is only supported for int properties.
+ bool (InternalStats::*handle_int)(uint64_t* value, DBImpl* db,
+ Version* version);
+
+ // @param props Map of general properties to populate
+ // @param suffix Argument portion of the property. (see handle_string)
+ bool (InternalStats::*handle_map)(std::map<std::string, std::string>* props,
+ Slice suffix);
+
+  // Handles string-type properties that rely on DBImpl methods.
+ // @param value Value-result argument for storing the property's string value
+ bool (DBImpl::*handle_string_dbimpl)(std::string* value);
+};
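+
+// A minimal usage sketch of the property machinery from the public API side
+// (assuming the standard DB::GetProperty / DB::GetIntProperty /
+// DB::GetMapProperty entry points; error handling omitted):
+//
+//   std::string files;
+//   db->GetProperty("rocksdb.num-files-at-level0", &files);            // handle_string
+//   uint64_t num_keys = 0;
+//   db->GetIntProperty(DB::Properties::kEstimateNumKeys, &num_keys);   // handle_int
+//   std::map<std::string, std::string> cf_stats;
+//   db->GetMapProperty(DB::Properties::kCFStats, &cf_stats);           // handle_map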
+
+extern const DBPropertyInfo* GetPropertyInfo(const Slice& property);
+
+#ifndef ROCKSDB_LITE
+#undef SCORE
+enum class LevelStatType {
+ INVALID = 0,
+ NUM_FILES,
+ COMPACTED_FILES,
+ SIZE_BYTES,
+ SCORE,
+ READ_GB,
+ RN_GB,
+ RNP1_GB,
+ WRITE_GB,
+ W_NEW_GB,
+ MOVED_GB,
+ WRITE_AMP,
+ READ_MBPS,
+ WRITE_MBPS,
+ COMP_SEC,
+ COMP_CPU_SEC,
+ COMP_COUNT,
+ AVG_SEC,
+ KEY_IN,
+ KEY_DROP,
+ R_BLOB_GB,
+ W_BLOB_GB,
+ TOTAL // total number of types
+};
+
+struct LevelStat {
+  // This is what will appear as L?.property_name in the flat map returned to
+  // the user
+  std::string property_name;
+  // This is what will be printed as the header in the CLI output
+ std::string header_name;
+};
+
+struct DBStatInfo {
+  // This is what will appear as property_name in the flat map returned to the user
+ std::string property_name;
+};
+
+class InternalStats {
+ public:
+ static const std::map<LevelStatType, LevelStat> compaction_level_stats;
+
+ enum InternalCFStatsType {
+ L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ MEMTABLE_LIMIT_STOPS,
+ MEMTABLE_LIMIT_SLOWDOWNS,
+ L0_FILE_COUNT_LIMIT_STOPS,
+ LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+ PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+ PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+ WRITE_STALLS_ENUM_MAX,
+ BYTES_FLUSHED,
+ BYTES_INGESTED_ADD_FILE,
+ INGESTED_NUM_FILES_TOTAL,
+ INGESTED_LEVEL0_NUM_FILES_TOTAL,
+ INGESTED_NUM_KEYS_TOTAL,
+ INTERNAL_CF_STATS_ENUM_MAX,
+ };
+
+ enum InternalDBStatsType {
+ kIntStatsWalFileBytes,
+ kIntStatsWalFileSynced,
+ kIntStatsBytesWritten,
+ kIntStatsNumKeysWritten,
+ kIntStatsWriteDoneByOther,
+ kIntStatsWriteDoneBySelf,
+ kIntStatsWriteWithWal,
+ kIntStatsWriteStallMicros,
+ kIntStatsNumMax,
+ };
+
+ static const std::map<InternalDBStatsType, DBStatInfo> db_stats_type_to_info;
+
+ InternalStats(int num_levels, SystemClock* clock, ColumnFamilyData* cfd);
+
+ // Per level compaction stats
+ struct CompactionOutputsStats {
+ uint64_t num_output_records = 0;
+ uint64_t bytes_written = 0;
+ uint64_t bytes_written_blob = 0;
+ uint64_t num_output_files = 0;
+ uint64_t num_output_files_blob = 0;
+
+ void Add(const CompactionOutputsStats& stats) {
+ this->num_output_records += stats.num_output_records;
+ this->bytes_written += stats.bytes_written;
+ this->bytes_written_blob += stats.bytes_written_blob;
+ this->num_output_files += stats.num_output_files;
+ this->num_output_files_blob += stats.num_output_files_blob;
+ }
+ };
+
+ // Per level compaction stats. comp_stats_[level] stores the stats for
+ // compactions that produced data for the specified "level".
+ struct CompactionStats {
+ uint64_t micros;
+ uint64_t cpu_micros;
+
+ // The number of bytes read from all non-output levels (table files)
+ uint64_t bytes_read_non_output_levels;
+
+ // The number of bytes read from the compaction output level (table files)
+ uint64_t bytes_read_output_level;
+
+ // The number of bytes read from blob files
+ uint64_t bytes_read_blob;
+
+ // Total number of bytes written to table files during compaction
+ uint64_t bytes_written;
+
+ // Total number of bytes written to blob files during compaction
+ uint64_t bytes_written_blob;
+
+ // Total number of bytes moved to the output level (table files)
+ uint64_t bytes_moved;
+
+ // The number of compaction input files in all non-output levels (table
+ // files)
+ int num_input_files_in_non_output_levels;
+
+ // The number of compaction input files in the output level (table files)
+ int num_input_files_in_output_level;
+
+ // The number of compaction output files (table files)
+ int num_output_files;
+
+ // The number of compaction output files (blob files)
+ int num_output_files_blob;
+
+ // Total incoming entries during compaction between levels N and N+1
+ uint64_t num_input_records;
+
+ // Accumulated difference in the number of entries
+ // (num input entries - num output entries) for compactions between levels N and N+1
+ uint64_t num_dropped_records;
+
+ // Total output entries from compaction
+ uint64_t num_output_records;
+
+ // Number of compactions done
+ int count;
+
+ // Number of compactions done per CompactionReason
+ int counts[static_cast<int>(CompactionReason::kNumOfReasons)]{};
+
+ explicit CompactionStats()
+ : micros(0),
+ cpu_micros(0),
+ bytes_read_non_output_levels(0),
+ bytes_read_output_level(0),
+ bytes_read_blob(0),
+ bytes_written(0),
+ bytes_written_blob(0),
+ bytes_moved(0),
+ num_input_files_in_non_output_levels(0),
+ num_input_files_in_output_level(0),
+ num_output_files(0),
+ num_output_files_blob(0),
+ num_input_records(0),
+ num_dropped_records(0),
+ num_output_records(0),
+ count(0) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ }
+
+ explicit CompactionStats(CompactionReason reason, int c)
+ : micros(0),
+ cpu_micros(0),
+ bytes_read_non_output_levels(0),
+ bytes_read_output_level(0),
+ bytes_read_blob(0),
+ bytes_written(0),
+ bytes_written_blob(0),
+ bytes_moved(0),
+ num_input_files_in_non_output_levels(0),
+ num_input_files_in_output_level(0),
+ num_output_files(0),
+ num_output_files_blob(0),
+ num_input_records(0),
+ num_dropped_records(0),
+ num_output_records(0),
+ count(c) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ int r = static_cast<int>(reason);
+ if (r >= 0 && r < num_of_reasons) {
+ counts[r] = c;
+ } else {
+ count = 0;
+ }
+ }
+
+ CompactionStats(const CompactionStats& c)
+ : micros(c.micros),
+ cpu_micros(c.cpu_micros),
+ bytes_read_non_output_levels(c.bytes_read_non_output_levels),
+ bytes_read_output_level(c.bytes_read_output_level),
+ bytes_read_blob(c.bytes_read_blob),
+ bytes_written(c.bytes_written),
+ bytes_written_blob(c.bytes_written_blob),
+ bytes_moved(c.bytes_moved),
+ num_input_files_in_non_output_levels(
+ c.num_input_files_in_non_output_levels),
+ num_input_files_in_output_level(c.num_input_files_in_output_level),
+ num_output_files(c.num_output_files),
+ num_output_files_blob(c.num_output_files_blob),
+ num_input_records(c.num_input_records),
+ num_dropped_records(c.num_dropped_records),
+ num_output_records(c.num_output_records),
+ count(c.count) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = c.counts[i];
+ }
+ }
+
+ CompactionStats& operator=(const CompactionStats& c) {
+ micros = c.micros;
+ cpu_micros = c.cpu_micros;
+ bytes_read_non_output_levels = c.bytes_read_non_output_levels;
+ bytes_read_output_level = c.bytes_read_output_level;
+ bytes_read_blob = c.bytes_read_blob;
+ bytes_written = c.bytes_written;
+ bytes_written_blob = c.bytes_written_blob;
+ bytes_moved = c.bytes_moved;
+ num_input_files_in_non_output_levels =
+ c.num_input_files_in_non_output_levels;
+ num_input_files_in_output_level = c.num_input_files_in_output_level;
+ num_output_files = c.num_output_files;
+ num_output_files_blob = c.num_output_files_blob;
+ num_input_records = c.num_input_records;
+ num_dropped_records = c.num_dropped_records;
+ num_output_records = c.num_output_records;
+ count = c.count;
+
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = c.counts[i];
+ }
+ return *this;
+ }
+
+ void Clear() {
+ this->micros = 0;
+ this->cpu_micros = 0;
+ this->bytes_read_non_output_levels = 0;
+ this->bytes_read_output_level = 0;
+ this->bytes_read_blob = 0;
+ this->bytes_written = 0;
+ this->bytes_written_blob = 0;
+ this->bytes_moved = 0;
+ this->num_input_files_in_non_output_levels = 0;
+ this->num_input_files_in_output_level = 0;
+ this->num_output_files = 0;
+ this->num_output_files_blob = 0;
+ this->num_input_records = 0;
+ this->num_dropped_records = 0;
+ this->num_output_records = 0;
+ this->count = 0;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ }
+
+ void Add(const CompactionStats& c) {
+ this->micros += c.micros;
+ this->cpu_micros += c.cpu_micros;
+ this->bytes_read_non_output_levels += c.bytes_read_non_output_levels;
+ this->bytes_read_output_level += c.bytes_read_output_level;
+ this->bytes_read_blob += c.bytes_read_blob;
+ this->bytes_written += c.bytes_written;
+ this->bytes_written_blob += c.bytes_written_blob;
+ this->bytes_moved += c.bytes_moved;
+ this->num_input_files_in_non_output_levels +=
+ c.num_input_files_in_non_output_levels;
+ this->num_input_files_in_output_level +=
+ c.num_input_files_in_output_level;
+ this->num_output_files += c.num_output_files;
+ this->num_output_files_blob += c.num_output_files_blob;
+ this->num_input_records += c.num_input_records;
+ this->num_dropped_records += c.num_dropped_records;
+ this->num_output_records += c.num_output_records;
+ this->count += c.count;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] += c.counts[i];
+ }
+ }
+
+ void Add(const CompactionOutputsStats& stats) {
+ this->num_output_files += static_cast<int>(stats.num_output_files);
+ this->num_output_records += stats.num_output_records;
+ this->bytes_written += stats.bytes_written;
+ this->bytes_written_blob += stats.bytes_written_blob;
+ this->num_output_files_blob +=
+ static_cast<int>(stats.num_output_files_blob);
+ }
+
+ void Subtract(const CompactionStats& c) {
+ this->micros -= c.micros;
+ this->cpu_micros -= c.cpu_micros;
+ this->bytes_read_non_output_levels -= c.bytes_read_non_output_levels;
+ this->bytes_read_output_level -= c.bytes_read_output_level;
+ this->bytes_read_blob -= c.bytes_read_blob;
+ this->bytes_written -= c.bytes_written;
+ this->bytes_written_blob -= c.bytes_written_blob;
+ this->bytes_moved -= c.bytes_moved;
+ this->num_input_files_in_non_output_levels -=
+ c.num_input_files_in_non_output_levels;
+ this->num_input_files_in_output_level -=
+ c.num_input_files_in_output_level;
+ this->num_output_files -= c.num_output_files;
+ this->num_output_files_blob -= c.num_output_files_blob;
+ this->num_input_records -= c.num_input_records;
+ this->num_dropped_records -= c.num_dropped_records;
+ this->num_output_records -= c.num_output_records;
+ this->count -= c.count;
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] -= c.counts[i];
+ }
+ }
+
+ void ResetCompactionReason(CompactionReason reason) {
+ int num_of_reasons = static_cast<int>(CompactionReason::kNumOfReasons);
+ assert(count == 1); // only supports updating one compaction reason
+ for (int i = 0; i < num_of_reasons; i++) {
+ counts[i] = 0;
+ }
+ int r = static_cast<int>(reason);
+ assert(r >= 0 && r < num_of_reasons);
+ counts[r] = 1;
+ }
+ };
+
+ // Compaction stats, for per_key_placement compaction, it includes 2 levels
+ // stats: the last level and the penultimate level.
+ struct CompactionStatsFull {
+ // the stats for the target primary output level
+ CompactionStats stats;
+
+ // stats for the penultimate level output, if it exists
+ bool has_penultimate_level_output = false;
+ CompactionStats penultimate_level_stats;
+
+ explicit CompactionStatsFull() : stats(), penultimate_level_stats() {}
+
+ explicit CompactionStatsFull(CompactionReason reason, int c)
+ : stats(reason, c), penultimate_level_stats(reason, c){};
+
+ uint64_t TotalBytesWritten() const {
+ uint64_t bytes_written = stats.bytes_written + stats.bytes_written_blob;
+ if (has_penultimate_level_output) {
+ bytes_written += penultimate_level_stats.bytes_written +
+ penultimate_level_stats.bytes_written_blob;
+ }
+ return bytes_written;
+ }
+
+ uint64_t DroppedRecords() {
+ uint64_t output_records = stats.num_output_records;
+ if (has_penultimate_level_output) {
+ output_records += penultimate_level_stats.num_output_records;
+ }
+ if (stats.num_input_records > output_records) {
+ return stats.num_input_records - output_records;
+ }
+ return 0;
+ }
+
+ void SetMicros(uint64_t val) {
+ stats.micros = val;
+ penultimate_level_stats.micros = val;
+ }
+
+ void AddCpuMicros(uint64_t val) {
+ stats.cpu_micros += val;
+ penultimate_level_stats.cpu_micros += val;
+ }
+ };
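+
+ // Worked example for DroppedRecords() above (illustrative numbers only):
+ // with 100 input records, 70 primary-output records and 20 penultimate-level
+ // output records (has_penultimate_level_output == true), the dropped count
+ // is 100 - (70 + 20) = 10; if outputs exceeded inputs, it would clamp to 0.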
+
+ // For use with CacheEntryStatsCollector
+ struct CacheEntryRoleStats {
+ uint64_t cache_capacity = 0;
+ uint64_t cache_usage = 0;
+ size_t table_size = 0;
+ size_t occupancy = 0;
+ std::string cache_id;
+ std::array<uint64_t, kNumCacheEntryRoles> total_charges;
+ std::array<size_t, kNumCacheEntryRoles> entry_counts;
+ uint32_t collection_count = 0;
+ uint32_t copies_of_last_collection = 0;
+ uint64_t last_start_time_micros_ = 0;
+ uint64_t last_end_time_micros_ = 0;
+
+ void Clear() {
+ // Wipe everything except collection_count
+ uint32_t saved_collection_count = collection_count;
+ *this = CacheEntryRoleStats();
+ collection_count = saved_collection_count;
+ }
+
+ void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros);
+ std::function<void(const Slice&, void*, size_t, Cache::DeleterFn)>
+ GetEntryCallback();
+ void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros);
+ void SkippedCollection();
+
+ std::string ToString(SystemClock* clock) const;
+ void ToMap(std::map<std::string, std::string>* values,
+ SystemClock* clock) const;
+
+ private:
+ UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map_;
+ uint64_t GetLastDurationMicros() const;
+ };
+
+ void Clear() {
+ for (int i = 0; i < kIntStatsNumMax; i++) {
+ db_stats_[i].store(0);
+ }
+ for (int i = 0; i < INTERNAL_CF_STATS_ENUM_MAX; i++) {
+ cf_stats_count_[i] = 0;
+ cf_stats_value_[i] = 0;
+ }
+ for (auto& comp_stat : comp_stats_) {
+ comp_stat.Clear();
+ }
+ per_key_placement_comp_stats_.Clear();
+ for (auto& h : file_read_latency_) {
+ h.Clear();
+ }
+ blob_file_read_latency_.Clear();
+ cf_stats_snapshot_.Clear();
+ db_stats_snapshot_.Clear();
+ bg_error_count_ = 0;
+ started_at_ = clock_->NowMicros();
+ has_cf_change_since_dump_ = true;
+ }
+
+ void AddCompactionStats(int level, Env::Priority thread_pri,
+ const CompactionStats& stats) {
+ comp_stats_[level].Add(stats);
+ comp_stats_by_pri_[thread_pri].Add(stats);
+ }
+
+ void AddCompactionStats(int level, Env::Priority thread_pri,
+ const CompactionStatsFull& comp_stats_full) {
+ AddCompactionStats(level, thread_pri, comp_stats_full.stats);
+ if (comp_stats_full.has_penultimate_level_output) {
+ per_key_placement_comp_stats_.Add(
+ comp_stats_full.penultimate_level_stats);
+ }
+ }
+
+ void IncBytesMoved(int level, uint64_t amount) {
+ comp_stats_[level].bytes_moved += amount;
+ }
+
+ void AddCFStats(InternalCFStatsType type, uint64_t value) {
+ has_cf_change_since_dump_ = true;
+ cf_stats_value_[type] += value;
+ ++cf_stats_count_[type];
+ }
+
+ void AddDBStats(InternalDBStatsType type, uint64_t value,
+ bool concurrent = false) {
+ auto& v = db_stats_[type];
+ if (concurrent) {
+ v.fetch_add(value, std::memory_order_relaxed);
+ } else {
+ v.store(v.load(std::memory_order_relaxed) + value,
+ std::memory_order_relaxed);
+ }
+ }
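+
+ // Illustrative usage of the `concurrent` flag above (the call site is
+ // hypothetical): pass concurrent=true when several threads may bump the
+ // same counter at once, e.g.
+ //   AddDBStats(InternalStats::kIntStatsBytesWritten, n, /*concurrent=*/true);
+ // The default (false) is only safe when updates are already serialized,
+ // since it performs a non-atomic load-then-store of the relaxed atomic.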
+
+ uint64_t GetDBStats(InternalDBStatsType type) {
+ return db_stats_[type].load(std::memory_order_relaxed);
+ }
+
+ HistogramImpl* GetFileReadHist(int level) {
+ return &file_read_latency_[level];
+ }
+
+ HistogramImpl* GetBlobFileReadHist() { return &blob_file_read_latency_; }
+
+ uint64_t GetBackgroundErrorCount() const { return bg_error_count_; }
+
+ uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; }
+
+ bool GetStringProperty(const DBPropertyInfo& property_info,
+ const Slice& property, std::string* value);
+
+ bool GetMapProperty(const DBPropertyInfo& property_info,
+ const Slice& property,
+ std::map<std::string, std::string>* value);
+
+ bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value,
+ DBImpl* db);
+
+ bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info,
+ Version* version, uint64_t* value);
+
+ // Unless there is a recent enough collection of the stats, collect and
+ // save new cache entry stats. If `foreground`, require data to be more
+ // recent to skip re-collection.
+ //
+ // This should only be called while NOT holding the DB mutex.
+ void CollectCacheEntryStats(bool foreground);
+
+ const uint64_t* TEST_GetCFStatsValue() const { return cf_stats_value_; }
+
+ const std::vector<CompactionStats>& TEST_GetCompactionStats() const {
+ return comp_stats_;
+ }
+
+ const CompactionStats& TEST_GetPerKeyPlacementCompactionStats() const {
+ return per_key_placement_comp_stats_;
+ }
+
+ void TEST_GetCacheEntryRoleStats(CacheEntryRoleStats* stats, bool foreground);
+
+ // Store a mapping from the user-facing DB::Properties string to our
+ // DBPropertyInfo struct used internally for retrieving properties.
+ static const UnorderedMap<std::string, DBPropertyInfo> ppt_name_to_info;
+
+ static const std::string kPeriodicCFStats;
+
+ private:
+ void DumpDBMapStats(std::map<std::string, std::string>* db_stats);
+ void DumpDBStats(std::string* value);
+ void DumpCFMapStats(std::map<std::string, std::string>* cf_stats);
+ void DumpCFMapStats(
+ const VersionStorageInfo* vstorage,
+ std::map<int, std::map<LevelStatType, double>>* level_stats,
+ CompactionStats* compaction_stats_sum);
+ void DumpCFMapStatsByPriority(
+ std::map<int, std::map<LevelStatType, double>>* priorities_stats);
+ void DumpCFMapStatsIOStalls(std::map<std::string, std::string>* cf_stats);
+ void DumpCFStats(std::string* value);
+ // if is_periodic = true, it is an internal call by RocksDB periodically to
+ // dump the status.
+ void DumpCFStatsNoFileHistogram(bool is_periodic, std::string* value);
+ // if is_periodic = true, it is an internal call by RocksDB periodically to
+ // dump the status.
+ void DumpCFFileHistogram(std::string* value);
+
+ Cache* GetBlockCacheForStats();
+ Cache* GetBlobCacheForStats();
+
+ // Per-DB stats
+ std::atomic<uint64_t> db_stats_[kIntStatsNumMax];
+ // Per-ColumnFamily stats
+ uint64_t cf_stats_value_[INTERNAL_CF_STATS_ENUM_MAX];
+ uint64_t cf_stats_count_[INTERNAL_CF_STATS_ENUM_MAX];
+ // Initialize/reference the collector in constructor so that we don't need
+ // additional synchronization in InternalStats, relying on synchronization
+ // in CacheEntryStatsCollector::GetStats. This collector is pinned in cache
+ // (through a shared_ptr) so that it does not get immediately ejected from
+ // a full cache, which would force a re-scan on the next GetStats.
+ std::shared_ptr<CacheEntryStatsCollector<CacheEntryRoleStats>>
+ cache_entry_stats_collector_;
+ // Per-ColumnFamily/level compaction stats
+ std::vector<CompactionStats> comp_stats_;
+ std::vector<CompactionStats> comp_stats_by_pri_;
+ CompactionStats per_key_placement_comp_stats_;
+ std::vector<HistogramImpl> file_read_latency_;
+ HistogramImpl blob_file_read_latency_;
+ bool has_cf_change_since_dump_;
+ // Number of consecutive periods with no change since the last time stats
+ // were dumped by a periodic dump.
+ int no_cf_change_period_since_dump_ = 0;
+ uint64_t last_histogram_num = std::numeric_limits<uint64_t>::max();
+ static const int kMaxNoChangePeriodSinceDump;
+
+ // Used to compute per-interval statistics
+ struct CFStatsSnapshot {
+ // ColumnFamily-level stats
+ CompactionStats comp_stats;
+ uint64_t ingest_bytes_flush; // Bytes written to L0 (Flush)
+ uint64_t stall_count; // Stall count
+ // Stats from compaction jobs - bytes written, bytes read, duration.
+ uint64_t compact_bytes_write;
+ uint64_t compact_bytes_read;
+ uint64_t compact_micros;
+ double seconds_up;
+
+ // AddFile specific stats
+ uint64_t ingest_bytes_addfile; // Total Bytes ingested
+ uint64_t ingest_files_addfile; // Total number of files ingested
+ uint64_t ingest_l0_files_addfile; // Total number of files ingested to L0
+ uint64_t ingest_keys_addfile; // Total number of keys ingested
+
+ CFStatsSnapshot()
+ : ingest_bytes_flush(0),
+ stall_count(0),
+ compact_bytes_write(0),
+ compact_bytes_read(0),
+ compact_micros(0),
+ seconds_up(0),
+ ingest_bytes_addfile(0),
+ ingest_files_addfile(0),
+ ingest_l0_files_addfile(0),
+ ingest_keys_addfile(0) {}
+
+ void Clear() {
+ comp_stats.Clear();
+ ingest_bytes_flush = 0;
+ stall_count = 0;
+ compact_bytes_write = 0;
+ compact_bytes_read = 0;
+ compact_micros = 0;
+ seconds_up = 0;
+ ingest_bytes_addfile = 0;
+ ingest_files_addfile = 0;
+ ingest_l0_files_addfile = 0;
+ ingest_keys_addfile = 0;
+ }
+ } cf_stats_snapshot_;
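+
+ // Illustrative note (not from the original header): per-interval values in
+ // the dumped stats are derived as "cumulative - snapshot", e.g. interval
+ // compaction write bytes = the compact_bytes_write accumulated in
+ // comp_stats_ minus cf_stats_snapshot_.compact_bytes_write, after which the
+ // snapshot is presumably refreshed to the current cumulative values.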
+
+ struct DBStatsSnapshot {
+ // DB-level stats
+ uint64_t ingest_bytes; // Bytes written by user
+ uint64_t wal_bytes; // Bytes written to WAL
+ uint64_t wal_synced; // Number of times WAL is synced
+ uint64_t write_with_wal; // Number of writes that request WAL
+ // These count the number of writes processed by the calling thread or
+ // another thread.
+ uint64_t write_other;
+ uint64_t write_self;
+ // Total number of keys written. write_self and write_other measure the
+ // number of write requests written; each write request can contain updates
+ // to multiple keys. num_keys_written is the total number of keys updated by
+ // all those writes.
+ uint64_t num_keys_written;
+ // Total time writes delayed by stalls.
+ uint64_t write_stall_micros;
+ double seconds_up;
+
+ DBStatsSnapshot()
+ : ingest_bytes(0),
+ wal_bytes(0),
+ wal_synced(0),
+ write_with_wal(0),
+ write_other(0),
+ write_self(0),
+ num_keys_written(0),
+ write_stall_micros(0),
+ seconds_up(0) {}
+
+ void Clear() {
+ ingest_bytes = 0;
+ wal_bytes = 0;
+ wal_synced = 0;
+ write_with_wal = 0;
+ write_other = 0;
+ write_self = 0;
+ num_keys_written = 0;
+ write_stall_micros = 0;
+ seconds_up = 0;
+ }
+ } db_stats_snapshot_;
+
+ // Handler functions for getting property values. They use "value" as a value-
+ // result argument, and return true upon successfully setting "value".
+ bool HandleNumFilesAtLevel(std::string* value, Slice suffix);
+ bool HandleCompressionRatioAtLevelPrefix(std::string* value, Slice suffix);
+ bool HandleLevelStats(std::string* value, Slice suffix);
+ bool HandleStats(std::string* value, Slice suffix);
+ bool HandleCFMapStats(std::map<std::string, std::string>* compaction_stats,
+ Slice suffix);
+ bool HandleCFStats(std::string* value, Slice suffix);
+ bool HandleCFStatsNoFileHistogram(std::string* value, Slice suffix);
+ bool HandleCFFileHistogram(std::string* value, Slice suffix);
+ bool HandleCFStatsPeriodic(std::string* value, Slice suffix);
+ bool HandleDBMapStats(std::map<std::string, std::string>* compaction_stats,
+ Slice suffix);
+ bool HandleDBStats(std::string* value, Slice suffix);
+ bool HandleSsTables(std::string* value, Slice suffix);
+ bool HandleAggregatedTableProperties(std::string* value, Slice suffix);
+ bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix);
+ bool HandleAggregatedTablePropertiesMap(
+ std::map<std::string, std::string>* values, Slice suffix);
+ bool HandleAggregatedTablePropertiesAtLevelMap(
+ std::map<std::string, std::string>* values, Slice suffix);
+ bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleMemTableFlushPending(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumRunningFlushes(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCompactionPending(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumRunningCompactions(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateNumKeys(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleNumSnapshots(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleOldestSnapshotSequence(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleNumLiveVersions(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleLiveSstFilesSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleMinLogNumberToKeep(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleMinObsoleteSstNumberToKeep(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleActualDelayedWriteRate(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleIsWriteStopped(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBlockCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlockCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlockCachePinnedUsage(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBlockCacheEntryStatsInternal(std::string* value, bool fast);
+ bool HandleBlockCacheEntryStatsMapInternal(
+ std::map<std::string, std::string>* values, bool fast);
+ bool HandleBlockCacheEntryStats(std::string* value, Slice suffix);
+ bool HandleBlockCacheEntryStatsMap(std::map<std::string, std::string>* values,
+ Slice suffix);
+ bool HandleFastBlockCacheEntryStats(std::string* value, Slice suffix);
+ bool HandleFastBlockCacheEntryStatsMap(
+ std::map<std::string, std::string>* values, Slice suffix);
+ bool HandleLiveSstFilesSizeAtTemperature(std::string* value, Slice suffix);
+ bool HandleNumBlobFiles(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlobStats(std::string* value, Slice suffix);
+ bool HandleTotalBlobFileSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleLiveBlobFileSize(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleLiveBlobFileGarbageSize(uint64_t* value, DBImpl* db,
+ Version* version);
+ bool HandleBlobCacheCapacity(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlobCacheUsage(uint64_t* value, DBImpl* db, Version* version);
+ bool HandleBlobCachePinnedUsage(uint64_t* value, DBImpl* db,
+ Version* version);
+
+ // Total number of background errors encountered. Every time a flush task
+ // or compaction task fails, this counter is incremented. The failure can
+ // have any cause, including file system errors, resource exhaustion, or
+ // input file corruption. A failed retry of the same flush or compaction
+ // also increments the counter.
+ uint64_t bg_error_count_;
+
+ const int number_levels_;
+ SystemClock* clock_;
+ ColumnFamilyData* cfd_;
+ uint64_t started_at_;
+};
+
+#else
+
+class InternalStats {
+ public:
+ enum InternalCFStatsType {
+ L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS,
+ MEMTABLE_LIMIT_STOPS,
+ MEMTABLE_LIMIT_SLOWDOWNS,
+ L0_FILE_COUNT_LIMIT_STOPS,
+ LOCKED_L0_FILE_COUNT_LIMIT_STOPS,
+ PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS,
+ PENDING_COMPACTION_BYTES_LIMIT_STOPS,
+ WRITE_STALLS_ENUM_MAX,
+ BYTES_FLUSHED,
+ BYTES_INGESTED_ADD_FILE,
+ INGESTED_NUM_FILES_TOTAL,
+ INGESTED_LEVEL0_NUM_FILES_TOTAL,
+ INGESTED_NUM_KEYS_TOTAL,
+ INTERNAL_CF_STATS_ENUM_MAX,
+ };
+
+ enum InternalDBStatsType {
+ kIntStatsWalFileBytes,
+ kIntStatsWalFileSynced,
+ kIntStatsBytesWritten,
+ kIntStatsNumKeysWritten,
+ kIntStatsWriteDoneByOther,
+ kIntStatsWriteDoneBySelf,
+ kIntStatsWriteWithWal,
+ kIntStatsWriteStallMicros,
+ kIntStatsNumMax,
+ };
+
+ InternalStats(int /*num_levels*/, SystemClock* /*clock*/,
+ ColumnFamilyData* /*cfd*/) {}
+
+ // Per level compaction stats
+ struct CompactionOutputsStats {
+ uint64_t num_output_records = 0;
+ uint64_t bytes_written = 0;
+ uint64_t bytes_written_blob = 0;
+ uint64_t num_output_files = 0;
+ uint64_t num_output_files_blob = 0;
+
+ void Add(const CompactionOutputsStats& stats) {
+ this->num_output_records += stats.num_output_records;
+ this->bytes_written += stats.bytes_written;
+ this->bytes_written_blob += stats.bytes_written_blob;
+ this->num_output_files += stats.num_output_files;
+ this->num_output_files_blob += stats.num_output_files_blob;
+ }
+ };
+
+ struct CompactionStats {
+ uint64_t micros;
+ uint64_t cpu_micros;
+ uint64_t bytes_read_non_output_levels;
+ uint64_t bytes_read_output_level;
+ uint64_t bytes_read_blob;
+ uint64_t bytes_written;
+ uint64_t bytes_written_blob;
+ uint64_t bytes_moved;
+ int num_input_files_in_non_output_levels;
+ int num_input_files_in_output_level;
+ int num_output_files;
+ int num_output_files_blob;
+ uint64_t num_input_records;
+ uint64_t num_dropped_records;
+ uint64_t num_output_records;
+ int count;
+
+ explicit CompactionStats() {}
+
+ explicit CompactionStats(CompactionReason /*reason*/, int /*c*/) {}
+
+ explicit CompactionStats(const CompactionStats& /*c*/) {}
+
+ void Add(const CompactionStats& /*c*/) {}
+
+ void Add(const CompactionOutputsStats& /*c*/) {}
+
+ void Subtract(const CompactionStats& /*c*/) {}
+ };
+
+ struct CompactionStatsFull {
+ // the stats for the target primary output level (per level stats)
+ CompactionStats stats;
+
+ // stats for output_to_penultimate_level level (per level stats)
+ bool has_penultimate_level_output = false;
+ CompactionStats penultimate_level_stats;
+
+ explicit CompactionStatsFull(){};
+
+ explicit CompactionStatsFull(CompactionReason /*reason*/, int /*c*/){};
+
+ uint64_t TotalBytesWritten() const { return 0; }
+
+ uint64_t DroppedRecords() { return 0; }
+
+ void SetMicros(uint64_t /*val*/){};
+
+ void AddCpuMicros(uint64_t /*val*/){};
+ };
+
+ void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+ const CompactionStats& /*stats*/) {}
+
+ void AddCompactionStats(int /*level*/, Env::Priority /*thread_pri*/,
+ const CompactionStatsFull& /*unmerged_stats*/) {}
+
+ void IncBytesMoved(int /*level*/, uint64_t /*amount*/) {}
+
+ void AddCFStats(InternalCFStatsType /*type*/, uint64_t /*value*/) {}
+
+ void AddDBStats(InternalDBStatsType /*type*/, uint64_t /*value*/,
+ bool /*concurrent */ = false) {}
+
+ HistogramImpl* GetFileReadHist(int /*level*/) { return nullptr; }
+
+ HistogramImpl* GetBlobFileReadHist() { return nullptr; }
+
+ uint64_t GetBackgroundErrorCount() const { return 0; }
+
+ uint64_t BumpAndGetBackgroundErrorCount() { return 0; }
+
+ bool GetStringProperty(const DBPropertyInfo& /*property_info*/,
+ const Slice& /*property*/, std::string* /*value*/) {
+ return false;
+ }
+
+ bool GetMapProperty(const DBPropertyInfo& /*property_info*/,
+ const Slice& /*property*/,
+ std::map<std::string, std::string>* /*value*/) {
+ return false;
+ }
+
+ bool GetIntProperty(const DBPropertyInfo& /*property_info*/,
+ uint64_t* /*value*/, DBImpl* /*db*/) const {
+ return false;
+ }
+
+ bool GetIntPropertyOutOfMutex(const DBPropertyInfo& /*property_info*/,
+ Version* /*version*/,
+ uint64_t* /*value*/) const {
+ return false;
+ }
+};
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/job_context.h b/src/rocksdb/db/job_context.h
new file mode 100644
index 000000000..352c58e82
--- /dev/null
+++ b/src/rocksdb/db/job_context.h
@@ -0,0 +1,238 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/log_writer.h"
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+struct SuperVersion;
+
+struct SuperVersionContext {
+ struct WriteStallNotification {
+ WriteStallInfo write_stall_info;
+ const ImmutableOptions* immutable_options;
+ };
+
+ autovector<SuperVersion*> superversions_to_free;
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ autovector<WriteStallNotification> write_stall_notifications;
+#endif
+ std::unique_ptr<SuperVersion>
+ new_superversion; // if nullptr no new superversion
+
+ explicit SuperVersionContext(bool create_superversion = false)
+ : new_superversion(create_superversion ? new SuperVersion() : nullptr) {}
+
+ explicit SuperVersionContext(SuperVersionContext&& other) noexcept
+ : superversions_to_free(std::move(other.superversions_to_free)),
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ write_stall_notifications(std::move(other.write_stall_notifications)),
+#endif
+ new_superversion(std::move(other.new_superversion)) {
+ }
+ // No copies
+ SuperVersionContext(const SuperVersionContext& other) = delete;
+ void operator=(const SuperVersionContext& other) = delete;
+
+ void NewSuperVersion() {
+ new_superversion = std::unique_ptr<SuperVersion>(new SuperVersion());
+ }
+
+ inline bool HaveSomethingToDelete() const {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ return !superversions_to_free.empty() || !write_stall_notifications.empty();
+#else
+ return !superversions_to_free.empty();
+#endif
+ }
+
+ void PushWriteStallNotification(WriteStallCondition old_cond,
+ WriteStallCondition new_cond,
+ const std::string& name,
+ const ImmutableOptions* ioptions) {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ WriteStallNotification notif;
+ notif.write_stall_info.cf_name = name;
+ notif.write_stall_info.condition.prev = old_cond;
+ notif.write_stall_info.condition.cur = new_cond;
+ notif.immutable_options = ioptions;
+ write_stall_notifications.push_back(notif);
+#else
+ (void)old_cond;
+ (void)new_cond;
+ (void)name;
+ (void)ioptions;
+#endif // !defined(ROCKSDB_LITE) &&
+ // !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ }
+
+ void Clean() {
+#if !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ // notify listeners on changed write stall conditions
+ for (auto& notif : write_stall_notifications) {
+ for (auto& listener : notif.immutable_options->listeners) {
+ listener->OnStallConditionsChanged(notif.write_stall_info);
+ }
+ }
+ write_stall_notifications.clear();
+#endif // !defined(ROCKSDB_LITE) && !defined(ROCKSDB_DISABLE_STALL_NOTIFICATION)
+ // free superversions
+ for (auto s : superversions_to_free) {
+ delete s;
+ }
+ superversions_to_free.clear();
+ }
+
+ ~SuperVersionContext() {
+#ifndef ROCKSDB_DISABLE_STALL_NOTIFICATION
+ assert(write_stall_notifications.empty());
+#endif
+ assert(superversions_to_free.empty());
+ }
+};
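+
+// Illustrative usage sketch for SuperVersionContext (the surrounding code is
+// hypothetical): notifications queued via PushWriteStallNotification() while
+// holding the DB mutex are only delivered to listeners when Clean() is called,
+// which is expected to happen after the mutex has been released, e.g.
+//
+//   SuperVersionContext sv_context(/*create_superversion=*/true);
+//   {
+//     // ... mutate state under the DB mutex, possibly calling
+//     // sv_context.PushWriteStallNotification(old_cond, new_cond, name, opts);
+//   }
+//   sv_context.Clean();  // notify listeners and free superversions, unlocked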
+
+struct JobContext {
+ inline bool HaveSomethingToDelete() const {
+ return !(full_scan_candidate_files.empty() && sst_delete_files.empty() &&
+ blob_delete_files.empty() && log_delete_files.empty() &&
+ manifest_delete_files.empty());
+ }
+
+ inline bool HaveSomethingToClean() const {
+ bool sv_have_sth = false;
+ for (const auto& sv_ctx : superversion_contexts) {
+ if (sv_ctx.HaveSomethingToDelete()) {
+ sv_have_sth = true;
+ break;
+ }
+ }
+ return memtables_to_free.size() > 0 || logs_to_free.size() > 0 ||
+ job_snapshot != nullptr || sv_have_sth;
+ }
+
+ SequenceNumber GetJobSnapshotSequence() const {
+ if (job_snapshot) {
+ assert(job_snapshot->snapshot());
+ return job_snapshot->snapshot()->GetSequenceNumber();
+ }
+ return kMaxSequenceNumber;
+ }
+
+ // Structure to store information for candidate files to delete.
+ struct CandidateFileInfo {
+ std::string file_name;
+ std::string file_path;
+ CandidateFileInfo(std::string name, std::string path)
+ : file_name(std::move(name)), file_path(std::move(path)) {}
+ bool operator==(const CandidateFileInfo& other) const {
+ return file_name == other.file_name && file_path == other.file_path;
+ }
+ };
+
+ // Unique job id
+ int job_id;
+
+ // a list of all files that we'll consider deleting
+ // (every once in a while this is filled up with all files
+ // in the DB directory)
+ // (filled only if we're doing full scan)
+ std::vector<CandidateFileInfo> full_scan_candidate_files;
+
+ // the list of all live sst files that cannot be deleted
+ std::vector<uint64_t> sst_live;
+
+ // the list of sst files that we need to delete
+ std::vector<ObsoleteFileInfo> sst_delete_files;
+
+ // the list of all live blob files that cannot be deleted
+ std::vector<uint64_t> blob_live;
+
+ // the list of blob files that we need to delete
+ std::vector<ObsoleteBlobFileInfo> blob_delete_files;
+
+ // a list of log files that we need to delete
+ std::vector<uint64_t> log_delete_files;
+
+ // a list of log files that we need to preserve during full purge since they
+ // will be reused later
+ std::vector<uint64_t> log_recycle_files;
+
+ // a list of manifest files that we need to delete
+ std::vector<std::string> manifest_delete_files;
+
+ // a list of memtables to be freed
+ autovector<MemTable*> memtables_to_free;
+
+ // contexts for installing superversions for multiple column families
+ std::vector<SuperVersionContext> superversion_contexts;
+
+ autovector<log::Writer*> logs_to_free;
+
+ // the current manifest_file_number, log_number and prev_log_number
+ // that corresponds to the set of files in 'live'.
+ uint64_t manifest_file_number;
+ uint64_t pending_manifest_file_number;
+ uint64_t log_number;
+ uint64_t prev_log_number;
+
+ uint64_t min_pending_output = 0;
+ uint64_t prev_total_log_size = 0;
+ size_t num_alive_log_files = 0;
+ uint64_t size_log_to_delete = 0;
+
+ // Snapshot taken before flush/compaction job.
+ std::unique_ptr<ManagedSnapshot> job_snapshot;
+
+ explicit JobContext(int _job_id, bool create_superversion = false) {
+ job_id = _job_id;
+ manifest_file_number = 0;
+ pending_manifest_file_number = 0;
+ log_number = 0;
+ prev_log_number = 0;
+ superversion_contexts.emplace_back(
+ SuperVersionContext(create_superversion));
+ }
+
+ // For a non-empty JobContext, Clean() has to be called at least once
+ // before destruction (see asserts in ~JobContext()). It should be called
+ // with the DB mutex unlocked. The destructor doesn't call Clean() to avoid
+ // accidentally doing a potentially slow Clean() with the DB mutex locked.
+ // (A usage sketch follows this struct.)
+ void Clean() {
+ // free superversions
+ for (auto& sv_context : superversion_contexts) {
+ sv_context.Clean();
+ }
+ // free pending memtables
+ for (auto m : memtables_to_free) {
+ delete m;
+ }
+ for (auto l : logs_to_free) {
+ delete l;
+ }
+
+ memtables_to_free.clear();
+ logs_to_free.clear();
+ job_snapshot.reset();
+ }
+
+ ~JobContext() {
+ assert(memtables_to_free.size() == 0);
+ assert(logs_to_free.size() == 0);
+ }
+};
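+
+// Usage sketch for JobContext (illustrative only; the surrounding code is
+// hypothetical): the owner gathers obsolete files while holding the DB mutex,
+// then cleans up after releasing it, matching the contract documented above.
+//
+//   JobContext job_context(/*job_id=*/1, /*create_superversion=*/false);
+//   {
+//     // ... under the DB mutex: fill sst_delete_files, log_delete_files, ...
+//   }
+//   // DB mutex released:
+//   if (job_context.HaveSomethingToDelete()) {
+//     // delete the obsolete files collected above
+//   }
+//   job_context.Clean();   // must run before ~JobContext() if non-empty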
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/kv_checksum.h b/src/rocksdb/db/kv_checksum.h
new file mode 100644
index 000000000..bce507fcf
--- /dev/null
+++ b/src/rocksdb/db/kv_checksum.h
@@ -0,0 +1,398 @@
+// Copyright (c) 2020-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file contains classes containing fields to protect individual entries.
+// The classes are named "ProtectionInfo<suffix>", where <suffix> indicates the
+// combination of fields that are covered. Each field has a single letter
+// abbreviation as follows.
+//
+// K = key
+// V = value
+// O = optype aka value type
+// S = seqno
+// C = CF ID
+//
+// Then, for example, a class that protects an entry consisting of key, value,
+// optype, and CF ID (i.e., a `WriteBatch` entry) would be named
+// `ProtectionInfoKVOC`.
+//
+// The `ProtectionInfo.*` classes are templated on the integer type used to hold
+// the XOR of hashes for each field. Only unsigned integer types are supported,
+// and the maximum supported integer width is 64 bits. When the integer type is
+// narrower than the hash values, we lop off the most significant bits to make
+// them fit.
+//
+// The `ProtectionInfo.*` classes are all intended to be non-persistent. We do
+// not currently make the byte order consistent for integer fields before
+// hashing them, so the resulting values are endianness-dependent.
+
+#pragma once
+
+#include <type_traits>
+
+#include "db/dbformat.h"
+#include "rocksdb/types.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+template <typename T>
+class ProtectionInfo;
+template <typename T>
+class ProtectionInfoKVO;
+template <typename T>
+class ProtectionInfoKVOC;
+template <typename T>
+class ProtectionInfoKVOS;
+
+// Aliases for 64-bit protection infos.
+using ProtectionInfo64 = ProtectionInfo<uint64_t>;
+using ProtectionInfoKVO64 = ProtectionInfoKVO<uint64_t>;
+using ProtectionInfoKVOC64 = ProtectionInfoKVOC<uint64_t>;
+using ProtectionInfoKVOS64 = ProtectionInfoKVOS<uint64_t>;
+
+template <typename T>
+class ProtectionInfo {
+ public:
+ ProtectionInfo() = default;
+
+ Status GetStatus() const;
+ ProtectionInfoKVO<T> ProtectKVO(const Slice& key, const Slice& value,
+ ValueType op_type) const;
+ ProtectionInfoKVO<T> ProtectKVO(const SliceParts& key,
+ const SliceParts& value,
+ ValueType op_type) const;
+
+ T GetVal() const { return val_; }
+
+ private:
+ friend class ProtectionInfoKVO<T>;
+ friend class ProtectionInfoKVOS<T>;
+ friend class ProtectionInfoKVOC<T>;
+
+ // Each field is hashed with an independent value so we can catch fields being
+ // swapped. Per the `NPHash64()` docs, using consecutive seeds is a pitfall,
+ // and we should instead vary our seeds by a large odd number. This value by
+ // which we increment (0xD28AAD72F49BD50B) was taken from
+ // `head -c8 /dev/urandom | hexdump`, run repeatedly until it yielded an odd
+ // number. The values are computed manually since the Windows C++ compiler
+ // complains about the overflow when adding constants.
+ static const uint64_t kSeedK = 0;
+ static const uint64_t kSeedV = 0xD28AAD72F49BD50B;
+ static const uint64_t kSeedO = 0xA5155AE5E937AA16;
+ static const uint64_t kSeedS = 0x77A00858DDD37F21;
+ static const uint64_t kSeedC = 0x4A2AB5CBD26F542C;
+
+ ProtectionInfo(T val) : val_(val) {
+ static_assert(sizeof(ProtectionInfo<T>) == sizeof(T), "");
+ }
+
+ void SetVal(T val) { val_ = val; }
+
+ T val_ = 0;
+};
+
+template <typename T>
+class ProtectionInfoKVO {
+ public:
+ ProtectionInfoKVO() = default;
+
+ ProtectionInfo<T> StripKVO(const Slice& key, const Slice& value,
+ ValueType op_type) const;
+ ProtectionInfo<T> StripKVO(const SliceParts& key, const SliceParts& value,
+ ValueType op_type) const;
+
+ ProtectionInfoKVOC<T> ProtectC(ColumnFamilyId column_family_id) const;
+ ProtectionInfoKVOS<T> ProtectS(SequenceNumber sequence_number) const;
+
+ void UpdateK(const Slice& old_key, const Slice& new_key);
+ void UpdateK(const SliceParts& old_key, const SliceParts& new_key);
+ void UpdateV(const Slice& old_value, const Slice& new_value);
+ void UpdateV(const SliceParts& old_value, const SliceParts& new_value);
+ void UpdateO(ValueType old_op_type, ValueType new_op_type);
+
+ T GetVal() const { return info_.GetVal(); }
+
+ private:
+ friend class ProtectionInfo<T>;
+ friend class ProtectionInfoKVOS<T>;
+ friend class ProtectionInfoKVOC<T>;
+
+ explicit ProtectionInfoKVO(T val) : info_(val) {
+ static_assert(sizeof(ProtectionInfoKVO<T>) == sizeof(T), "");
+ }
+
+ void SetVal(T val) { info_.SetVal(val); }
+
+ ProtectionInfo<T> info_;
+};
+
+template <typename T>
+class ProtectionInfoKVOC {
+ public:
+ ProtectionInfoKVOC() = default;
+
+ ProtectionInfoKVO<T> StripC(ColumnFamilyId column_family_id) const;
+
+ void UpdateK(const Slice& old_key, const Slice& new_key) {
+ kvo_.UpdateK(old_key, new_key);
+ }
+ void UpdateK(const SliceParts& old_key, const SliceParts& new_key) {
+ kvo_.UpdateK(old_key, new_key);
+ }
+ void UpdateV(const Slice& old_value, const Slice& new_value) {
+ kvo_.UpdateV(old_value, new_value);
+ }
+ void UpdateV(const SliceParts& old_value, const SliceParts& new_value) {
+ kvo_.UpdateV(old_value, new_value);
+ }
+ void UpdateO(ValueType old_op_type, ValueType new_op_type) {
+ kvo_.UpdateO(old_op_type, new_op_type);
+ }
+ void UpdateC(ColumnFamilyId old_column_family_id,
+ ColumnFamilyId new_column_family_id);
+
+ T GetVal() const { return kvo_.GetVal(); }
+
+ private:
+ friend class ProtectionInfoKVO<T>;
+
+ explicit ProtectionInfoKVOC(T val) : kvo_(val) {
+ static_assert(sizeof(ProtectionInfoKVOC<T>) == sizeof(T), "");
+ }
+
+ void SetVal(T val) { kvo_.SetVal(val); }
+
+ ProtectionInfoKVO<T> kvo_;
+};
+
+template <typename T>
+class ProtectionInfoKVOS {
+ public:
+ ProtectionInfoKVOS() = default;
+
+ ProtectionInfoKVO<T> StripS(SequenceNumber sequence_number) const;
+
+ void UpdateK(const Slice& old_key, const Slice& new_key) {
+ kvo_.UpdateK(old_key, new_key);
+ }
+ void UpdateK(const SliceParts& old_key, const SliceParts& new_key) {
+ kvo_.UpdateK(old_key, new_key);
+ }
+ void UpdateV(const Slice& old_value, const Slice& new_value) {
+ kvo_.UpdateV(old_value, new_value);
+ }
+ void UpdateV(const SliceParts& old_value, const SliceParts& new_value) {
+ kvo_.UpdateV(old_value, new_value);
+ }
+ void UpdateO(ValueType old_op_type, ValueType new_op_type) {
+ kvo_.UpdateO(old_op_type, new_op_type);
+ }
+ void UpdateS(SequenceNumber old_sequence_number,
+ SequenceNumber new_sequence_number);
+
+ T GetVal() const { return kvo_.GetVal(); }
+
+ private:
+ friend class ProtectionInfoKVO<T>;
+
+ explicit ProtectionInfoKVOS(T val) : kvo_(val) {
+ static_assert(sizeof(ProtectionInfoKVOS<T>) == sizeof(T), "");
+ }
+
+ void SetVal(T val) { kvo_.SetVal(val); }
+
+ ProtectionInfoKVO<T> kvo_;
+};
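+
+// Illustrative round-trip example (not part of the original header), assuming
+// Slices `key` and `value`, a ValueType `op`, and a ColumnFamilyId `cf`:
+//
+//   ProtectionInfoKVO64 kvo = ProtectionInfo64().ProtectKVO(key, value, op);
+//   ProtectionInfoKVOC64 kvoc = kvo.ProtectC(cf);            // add CF ID
+//   ProtectionInfoKVO64 kvo2 = kvoc.StripC(cf);              // remove CF ID
+//   Status s = kvo2.StripKVO(key, value, op).GetStatus();    // OK if intact
+//
+// Because each field is folded in with XOR, stripping with the same inputs
+// cancels the hashes and GetStatus() returns OK; any mismatch leaves a
+// non-zero residue and yields a Corruption status.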
+
+template <typename T>
+Status ProtectionInfo<T>::GetStatus() const {
+ if (val_ != 0) {
+ return Status::Corruption("ProtectionInfo mismatch");
+ }
+ return Status::OK();
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const Slice& key,
+ const Slice& value,
+ ValueType op_type) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
+ val =
+ val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
+ val = val ^
+ static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+ sizeof(op_type), ProtectionInfo<T>::kSeedO));
+ return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfo<T>::ProtectKVO(const SliceParts& key,
+ const SliceParts& value,
+ ValueType op_type) const {
+ T val = GetVal();
+ val = val ^
+ static_cast<T>(GetSlicePartsNPHash64(key, ProtectionInfo<T>::kSeedK));
+ val = val ^
+ static_cast<T>(GetSlicePartsNPHash64(value, ProtectionInfo<T>::kSeedV));
+ val = val ^
+ static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+ sizeof(op_type), ProtectionInfo<T>::kSeedO));
+ return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateK(const Slice& old_key, const Slice& new_key) {
+ T val = GetVal();
+ val = val ^
+ static_cast<T>(GetSliceNPHash64(old_key, ProtectionInfo<T>::kSeedK));
+ val = val ^
+ static_cast<T>(GetSliceNPHash64(new_key, ProtectionInfo<T>::kSeedK));
+ SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateK(const SliceParts& old_key,
+ const SliceParts& new_key) {
+ T val = GetVal();
+ val = val ^ static_cast<T>(
+ GetSlicePartsNPHash64(old_key, ProtectionInfo<T>::kSeedK));
+ val = val ^ static_cast<T>(
+ GetSlicePartsNPHash64(new_key, ProtectionInfo<T>::kSeedK));
+ SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateV(const Slice& old_value,
+ const Slice& new_value) {
+ T val = GetVal();
+ val = val ^
+ static_cast<T>(GetSliceNPHash64(old_value, ProtectionInfo<T>::kSeedV));
+ val = val ^
+ static_cast<T>(GetSliceNPHash64(new_value, ProtectionInfo<T>::kSeedV));
+ SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateV(const SliceParts& old_value,
+ const SliceParts& new_value) {
+ T val = GetVal();
+ val = val ^ static_cast<T>(
+ GetSlicePartsNPHash64(old_value, ProtectionInfo<T>::kSeedV));
+ val = val ^ static_cast<T>(
+ GetSlicePartsNPHash64(new_value, ProtectionInfo<T>::kSeedV));
+ SetVal(val);
+}
+
+template <typename T>
+void ProtectionInfoKVO<T>::UpdateO(ValueType old_op_type,
+ ValueType new_op_type) {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&old_op_type),
+ sizeof(old_op_type),
+ ProtectionInfo<T>::kSeedO));
+ val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&new_op_type),
+ sizeof(new_op_type),
+ ProtectionInfo<T>::kSeedO));
+ SetVal(val);
+}
+
+template <typename T>
+ProtectionInfo<T> ProtectionInfoKVO<T>::StripKVO(const Slice& key,
+ const Slice& value,
+ ValueType op_type) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(GetSliceNPHash64(key, ProtectionInfo<T>::kSeedK));
+ val =
+ val ^ static_cast<T>(GetSliceNPHash64(value, ProtectionInfo<T>::kSeedV));
+ val = val ^
+ static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+ sizeof(op_type), ProtectionInfo<T>::kSeedO));
+ return ProtectionInfo<T>(val);
+}
+
+template <typename T>
+ProtectionInfo<T> ProtectionInfoKVO<T>::StripKVO(const SliceParts& key,
+ const SliceParts& value,
+ ValueType op_type) const {
+ T val = GetVal();
+ val = val ^
+ static_cast<T>(GetSlicePartsNPHash64(key, ProtectionInfo<T>::kSeedK));
+ val = val ^
+ static_cast<T>(GetSlicePartsNPHash64(value, ProtectionInfo<T>::kSeedV));
+ val = val ^
+ static_cast<T>(NPHash64(reinterpret_cast<char*>(&op_type),
+ sizeof(op_type), ProtectionInfo<T>::kSeedO));
+ return ProtectionInfo<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVOC<T> ProtectionInfoKVO<T>::ProtectC(
+ ColumnFamilyId column_family_id) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&column_family_id),
+ sizeof(column_family_id), ProtectionInfo<T>::kSeedC));
+ return ProtectionInfoKVOC<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfoKVOC<T>::StripC(
+ ColumnFamilyId column_family_id) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&column_family_id),
+ sizeof(column_family_id), ProtectionInfo<T>::kSeedC));
+ return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVOC<T>::UpdateC(ColumnFamilyId old_column_family_id,
+ ColumnFamilyId new_column_family_id) {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&old_column_family_id),
+ sizeof(old_column_family_id), ProtectionInfo<T>::kSeedC));
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&new_column_family_id),
+ sizeof(new_column_family_id), ProtectionInfo<T>::kSeedC));
+ SetVal(val);
+}
+
+template <typename T>
+ProtectionInfoKVOS<T> ProtectionInfoKVO<T>::ProtectS(
+ SequenceNumber sequence_number) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&sequence_number),
+ sizeof(sequence_number),
+ ProtectionInfo<T>::kSeedS));
+ return ProtectionInfoKVOS<T>(val);
+}
+
+template <typename T>
+ProtectionInfoKVO<T> ProtectionInfoKVOS<T>::StripS(
+ SequenceNumber sequence_number) const {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(reinterpret_cast<char*>(&sequence_number),
+ sizeof(sequence_number),
+ ProtectionInfo<T>::kSeedS));
+ return ProtectionInfoKVO<T>(val);
+}
+
+template <typename T>
+void ProtectionInfoKVOS<T>::UpdateS(SequenceNumber old_sequence_number,
+ SequenceNumber new_sequence_number) {
+ T val = GetVal();
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&old_sequence_number),
+ sizeof(old_sequence_number), ProtectionInfo<T>::kSeedS));
+ val = val ^ static_cast<T>(NPHash64(
+ reinterpret_cast<char*>(&new_sequence_number),
+ sizeof(new_sequence_number), ProtectionInfo<T>::kSeedS));
+ SetVal(val);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/listener_test.cc b/src/rocksdb/db/listener_test.cc
new file mode 100644
index 000000000..160866bb7
--- /dev/null
+++ b/src/rocksdb/db/listener_test.cc
@@ -0,0 +1,1595 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/blob/blob_index.h"
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "monitoring/statistics.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/options.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "rocksdb/table_properties.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/rate_limiter.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class EventListenerTest : public DBTestBase {
+ public:
+ EventListenerTest() : DBTestBase("listener_test", /*env_do_fsync=*/true) {}
+
+ static std::string BlobStr(uint64_t blob_file_number, uint64_t offset,
+ uint64_t size) {
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+ return blob_index;
+ }
+
+ const size_t k110KB = 110 << 10;
+};
+
+struct TestPropertiesCollector
+ : public ROCKSDB_NAMESPACE::TablePropertiesCollector {
+ ROCKSDB_NAMESPACE::Status AddUserKey(
+ const ROCKSDB_NAMESPACE::Slice& /*key*/,
+ const ROCKSDB_NAMESPACE::Slice& /*value*/,
+ ROCKSDB_NAMESPACE::EntryType /*type*/,
+ ROCKSDB_NAMESPACE::SequenceNumber /*seq*/,
+ uint64_t /*file_size*/) override {
+ return Status::OK();
+ }
+ ROCKSDB_NAMESPACE::Status Finish(
+ ROCKSDB_NAMESPACE::UserCollectedProperties* properties) override {
+ properties->insert({"0", "1"});
+ return Status::OK();
+ }
+
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+
+ ROCKSDB_NAMESPACE::UserCollectedProperties GetReadableProperties()
+ const override {
+ ROCKSDB_NAMESPACE::UserCollectedProperties ret;
+ ret["2"] = "3";
+ return ret;
+ }
+};
+
+class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context /*context*/) override {
+ return new TestPropertiesCollector;
+ }
+ const char* Name() const override { return "TestTablePropertiesCollector"; }
+};
+
+class TestCompactionListener : public EventListener {
+ public:
+ explicit TestCompactionListener(EventListenerTest* test) : test_(test) {}
+
+ void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ compacted_dbs_.push_back(db);
+ ASSERT_GT(ci.input_files.size(), 0U);
+ ASSERT_EQ(ci.input_files.size(), ci.input_file_infos.size());
+
+ for (size_t i = 0; i < ci.input_file_infos.size(); ++i) {
+ ASSERT_EQ(ci.input_file_infos[i].level, ci.base_input_level);
+ ASSERT_EQ(ci.input_file_infos[i].file_number,
+ TableFileNameToNumber(ci.input_files[i]));
+ }
+
+ ASSERT_GT(ci.output_files.size(), 0U);
+ ASSERT_EQ(ci.output_files.size(), ci.output_file_infos.size());
+
+ ASSERT_TRUE(test_);
+ ASSERT_EQ(test_->db_, db);
+
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[ci.cf_id],
+ &files_by_level);
+ ASSERT_GT(files_by_level.size(), ci.output_level);
+
+ for (size_t i = 0; i < ci.output_file_infos.size(); ++i) {
+ ASSERT_EQ(ci.output_file_infos[i].level, ci.output_level);
+ ASSERT_EQ(ci.output_file_infos[i].file_number,
+ TableFileNameToNumber(ci.output_files[i]));
+
+ auto it = std::find_if(
+ files_by_level[ci.output_level].begin(),
+ files_by_level[ci.output_level].end(), [&](const FileMetaData& meta) {
+ return meta.fd.GetNumber() == ci.output_file_infos[i].file_number;
+ });
+ ASSERT_NE(it, files_by_level[ci.output_level].end());
+
+ ASSERT_EQ(ci.output_file_infos[i].oldest_blob_file_number,
+ it->oldest_blob_file_number);
+ }
+
+ ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id);
+ ASSERT_GT(ci.thread_id, 0U);
+
+ for (auto fl : {ci.input_files, ci.output_files}) {
+ for (auto fn : fl) {
+ auto it = ci.table_properties.find(fn);
+ ASSERT_NE(it, ci.table_properties.end());
+ auto tp = it->second;
+ ASSERT_TRUE(tp != nullptr);
+ ASSERT_EQ(tp->user_collected_properties.find("0")->second, "1");
+ }
+ }
+ }
+
+ EventListenerTest* test_;
+ std::vector<DB*> compacted_dbs_;
+ std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
+ const int kTestKeySize = 16;
+ const int kTestValueSize = 984;
+ const int kEntrySize = kTestKeySize + kTestValueSize;
+ const int kEntriesPerBuffer = 100;
+ const int kNumL0Files = 4;
+
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.write_buffer_size = kEntrySize * kEntriesPerBuffer;
+ options.compaction_style = kCompactionStyleLevel;
+ options.target_file_size_base = options.write_buffer_size;
+ options.max_bytes_for_level_base = options.target_file_size_base * 2;
+ options.max_bytes_for_level_multiplier = 2;
+ options.compression = kNoCompression;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ options.level0_file_num_compaction_trigger = kNumL0Files;
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+
+ TestCompactionListener* listener = new TestCompactionListener(this);
+ options.listeners.emplace_back(listener);
+ std::vector<std::string> cf_names = {"pikachu", "ilya", "muromec",
+ "dobrynia", "nikitich", "alyosha",
+ "popovich"};
+ CreateAndReopenWithCF(cf_names, options);
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+ BlobStr(123, 0, 1 << 10)));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+ for (int i = 1; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[i],
+ nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ }
+
+ ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size());
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ ASSERT_EQ(listener->compacted_dbs_[i], db_);
+ }
+}
+
+// This simple Listener can only handle one flush at a time.
+class TestFlushListener : public EventListener {
+ public:
+ TestFlushListener(Env* env, EventListenerTest* test)
+ : slowdown_count(0), stop_count(0), db_closed(), env_(env), test_(test) {
+ db_closed = false;
+ }
+
+ virtual ~TestFlushListener() {
+ prev_fc_info_.status.PermitUncheckedError(); // Ignore the status
+ }
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ // remember the info for later checking the FlushJobInfo.
+ prev_fc_info_ = info;
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ ASSERT_GT(info.table_properties.data_size, 0U);
+ ASSERT_GT(info.table_properties.raw_key_size, 0U);
+ ASSERT_GT(info.table_properties.raw_value_size, 0U);
+ ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+ ASSERT_GT(info.table_properties.num_entries, 0U);
+ ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+ ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ // Verify the id of the current thread that created this table
+ // file matches the id of any active flush or compaction thread.
+ uint64_t thread_id = env_->GetThreadID();
+ std::vector<ThreadStatus> thread_list;
+ ASSERT_OK(env_->GetThreadList(&thread_list));
+ bool found_match = false;
+ for (auto thread_status : thread_list) {
+ if (thread_status.operation_type == ThreadStatus::OP_FLUSH ||
+ thread_status.operation_type == ThreadStatus::OP_COMPACTION) {
+ if (thread_id == thread_status.thread_id) {
+ found_match = true;
+ break;
+ }
+ }
+ }
+ ASSERT_TRUE(found_match);
+#endif // ROCKSDB_USING_THREAD_STATUS
+ }
+
+ void OnFlushCompleted(DB* db, const FlushJobInfo& info) override {
+ flushed_dbs_.push_back(db);
+ flushed_column_family_names_.push_back(info.cf_name);
+ if (info.triggered_writes_slowdown) {
+ slowdown_count++;
+ }
+ if (info.triggered_writes_stop) {
+ stop_count++;
+ }
+ // verify whether the previously created file matches the flushed file.
+ ASSERT_EQ(prev_fc_info_.db_name, db->GetName());
+ ASSERT_EQ(prev_fc_info_.cf_name, info.cf_name);
+ ASSERT_EQ(prev_fc_info_.job_id, info.job_id);
+ ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
+ ASSERT_EQ(TableFileNameToNumber(info.file_path), info.file_number);
+
+ // Note: the following chunk relies on the notification pertaining to the
+ // database pointed to by DBTestBase::db_, and is thus bypassed when
+ // that assumption does not hold (see the test case MultiDBMultiListeners
+ // below).
+ ASSERT_TRUE(test_);
+ if (db == test_->db_) {
+ std::vector<std::vector<FileMetaData>> files_by_level;
+ ASSERT_LT(info.cf_id, test_->handles_.size());
+ ASSERT_GE(info.cf_id, 0u);
+ ASSERT_NE(test_->handles_[info.cf_id], nullptr);
+ test_->dbfull()->TEST_GetFilesMetaData(test_->handles_[info.cf_id],
+ &files_by_level);
+
+ ASSERT_FALSE(files_by_level.empty());
+ auto it = std::find_if(files_by_level[0].begin(), files_by_level[0].end(),
+ [&](const FileMetaData& meta) {
+ return meta.fd.GetNumber() == info.file_number;
+ });
+ ASSERT_NE(it, files_by_level[0].end());
+ ASSERT_EQ(info.oldest_blob_file_number, it->oldest_blob_file_number);
+ }
+
+ ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
+ ASSERT_GT(info.thread_id, 0U);
+ ASSERT_EQ(info.table_properties.user_collected_properties.find("0")->second,
+ "1");
+ }
+
+ std::vector<std::string> flushed_column_family_names_;
+ std::vector<DB*> flushed_dbs_;
+ int slowdown_count;
+ int stop_count;
+ bool db_closing;
+ std::atomic_bool db_closed;
+ TableFileCreationInfo prev_fc_info_;
+
+ protected:
+ Env* env_;
+ EventListenerTest* test_;
+};
+
+TEST_F(EventListenerTest, OnSingleDBFlushTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+ std::vector<std::string> cf_names = {"pikachu", "ilya", "muromec",
+ "dobrynia", "nikitich", "alyosha",
+ "popovich"};
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ CreateAndReopenWithCF(cf_names, options);
+
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::PutBlobIndex(&batch, 1, "ditto",
+ BlobStr(456, 0, 1 << 10)));
+ ASSERT_OK(dbfull()->Write(WriteOptions(), &batch));
+
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+ for (int i = 1; i < 8; ++i) {
+ ASSERT_OK(Flush(i));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
+ ASSERT_EQ(listener->flushed_dbs_.size(), i);
+ ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+ }
+
+ // make sure callback functions are called in the right order
+ for (size_t i = 0; i < cf_names.size(); ++i) {
+ ASSERT_EQ(listener->flushed_dbs_[i], db_);
+ ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]);
+ }
+}
+
+TEST_F(EventListenerTest, MultiCF) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.write_buffer_size = k110KB;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ for (auto atomic_flush : {false, true}) {
+ options.atomic_flush = atomic_flush;
+ options.create_if_missing = true;
+ DestroyAndReopen(options);
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ options.listeners.emplace_back(listener);
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ std::vector<std::string> cf_names = {"pikachu", "ilya", "muromec",
+ "dobrynia", "nikitich", "alyosha",
+ "popovich"};
+ CreateAndReopenWithCF(cf_names, options);
+
+ ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
+ ASSERT_OK(Put(2, "ilya", std::string(90000, 'i')));
+ ASSERT_OK(Put(3, "muromec", std::string(90000, 'm')));
+ ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd')));
+ ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n')));
+ ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a')));
+ ASSERT_OK(Put(7, "popovich", std::string(90000, 'p')));
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ for (int i = 1; i < 8; ++i) {
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::NotifyOnFlushCompleted::PostAllOnFlushCompleted",
+ "EventListenerTest.MultiCF:PreVerifyListener"}});
+ ASSERT_OK(Flush(i));
+ TEST_SYNC_POINT("EventListenerTest.MultiCF:PreVerifyListener");
+ ASSERT_EQ(listener->flushed_dbs_.size(), i);
+ ASSERT_EQ(listener->flushed_column_family_names_.size(), i);
+ // make sure callback functions are called in the right order
+ if (i == 7) {
+ for (size_t j = 0; j < cf_names.size(); j++) {
+ ASSERT_EQ(listener->flushed_dbs_[j], db_);
+ ASSERT_EQ(listener->flushed_column_family_names_[j], cf_names[j]);
+ }
+ }
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ Close();
+ }
+}
+
+TEST_F(EventListenerTest, MultiDBMultiListeners) {
+ Options options;
+ options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+ std::vector<TestFlushListener*> listeners;
+ const int kNumDBs = 5;
+ const int kNumListeners = 10;
+ for (int i = 0; i < kNumListeners; ++i) {
+ listeners.emplace_back(new TestFlushListener(options.env, this));
+ }
+
+ std::vector<std::string> cf_names = {"pikachu", "ilya", "muromec",
+ "dobrynia", "nikitich", "alyosha",
+ "popovich"};
+
+ options.create_if_missing = true;
+ for (int i = 0; i < kNumListeners; ++i) {
+ options.listeners.emplace_back(listeners[i]);
+ }
+ DBOptions db_opts(options);
+ ColumnFamilyOptions cf_opts(options);
+
+ std::vector<DB*> dbs;
+ std::vector<std::vector<ColumnFamilyHandle*>> vec_handles;
+
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_OK(DestroyDB(dbname_ + std::to_string(d), options));
+ DB* db;
+ std::vector<ColumnFamilyHandle*> handles;
+ ASSERT_OK(DB::Open(options, dbname_ + std::to_string(d), &db));
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ ColumnFamilyHandle* handle;
+ ASSERT_OK(db->CreateColumnFamily(cf_opts, cf_names[c], &handle));
+ handles.push_back(handle);
+ }
+
+ vec_handles.push_back(std::move(handles));
+ dbs.push_back(db);
+ }
+
+ for (int d = 0; d < kNumDBs; ++d) {
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ ASSERT_OK(dbs[d]->Put(WriteOptions(), vec_handles[d][c], cf_names[c],
+ cf_names[c]));
+ }
+ }
+
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c]));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForFlushMemTable());
+ }
+ }
+
+ for (int d = 0; d < kNumDBs; ++d) {
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(dbs[d])->TEST_WaitForBackgroundWork());
+ }
+
+ for (auto* listener : listeners) {
+ int pos = 0;
+ for (size_t c = 0; c < cf_names.size(); ++c) {
+ for (int d = 0; d < kNumDBs; ++d) {
+ ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]);
+ ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]);
+ pos++;
+ }
+ }
+ }
+
+ for (auto handles : vec_handles) {
+ for (auto h : handles) {
+ delete h;
+ }
+ handles.clear();
+ }
+ vec_handles.clear();
+
+ for (auto db : dbs) {
+ delete db;
+ }
+}
+
+TEST_F(EventListenerTest, DisableBGCompaction) {
+ Options options;
+ options.env = CurrentOptions().env;
+#ifdef ROCKSDB_USING_THREAD_STATUS
+ options.enable_thread_tracking = true;
+#endif // ROCKSDB_USING_THREAD_STATUS
+ TestFlushListener* listener = new TestFlushListener(options.env, this);
+ const int kCompactionTrigger = 1;
+ const int kSlowdownTrigger = 5;
+ const int kStopTrigger = 100;
+ options.level0_file_num_compaction_trigger = kCompactionTrigger;
+ options.level0_slowdown_writes_trigger = kSlowdownTrigger;
+ options.level0_stop_writes_trigger = kStopTrigger;
+ options.max_write_buffer_number = 10;
+ options.listeners.emplace_back(listener);
+ // BG compaction is disabled. The number of L0 files will simply keep
+ // increasing in this test.
+ options.compaction_style = kCompactionStyleNone;
+ options.compression = kNoCompression;
+ options.write_buffer_size = 100000; // Small write buffer
+ options.table_properties_collector_factories.push_back(
+ std::make_shared<TestPropertiesCollectorFactory>());
+
+ CreateAndReopenWithCF({"pikachu"}, options);
+ ColumnFamilyMetaData cf_meta;
+ db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+
+ // Keep writing until enough L0 files accumulate to trigger write slowdowns.
+ for (int i = 0; static_cast<int>(cf_meta.file_count) < kSlowdownTrigger * 10;
+ ++i) {
+ ASSERT_OK(
+ Put(1, std::to_string(i), std::string(10000, 'x'), WriteOptions()));
+ FlushOptions fo;
+ fo.allow_write_stall = true;
+ ASSERT_OK(db_->Flush(fo, handles_[1]));
+ db_->GetColumnFamilyMetaData(handles_[1], &cf_meta);
+ }
+ // Ensure background work is fully finished including listener callbacks
+ // before accessing listener state.
+ ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork());
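+ // Illustrative arithmetic (explanatory note, not part of the original test):
+ // the loop above flushes until roughly kSlowdownTrigger * 10 (= 50) L0 files
+ // exist, and every flush performed once the L0 count reaches kSlowdownTrigger
+ // (= 5) is expected to report triggered_writes_slowdown, hence at least
+ // kSlowdownTrigger * 9 (= 45) slowdown notifications below.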
+ ASSERT_GE(listener->slowdown_count, kSlowdownTrigger * 9);
+}
+
+class TestCompactionReasonListener : public EventListener {
+ public:
+ void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override {
+ std::lock_guard<std::mutex> lock(mutex_);
+ compaction_reasons_.push_back(ci.compaction_reason);
+ }
+
+ std::vector<CompactionReason> compaction_reasons_;
+ std::mutex mutex_;
+};
+
+TEST_F(EventListenerTest, CompactionReasonLevel) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_style = kCompactionStyleLevel;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Write 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(listener->compaction_reasons_.size(), 1);
+ ASSERT_EQ(listener->compaction_reasons_[0],
+ CompactionReason::kLevelL0FilesNum);
+
+ DestroyAndReopen(options);
+
+ // Write 3 non-overlapping files in L0
+ for (int k = 1; k <= 30; k++) {
+ ASSERT_OK(Put(Key(k), Key(k)));
+ if (k % 10 == 0) {
+ ASSERT_OK(Flush());
+ }
+ }
+
+ // Do a trivial move from L0 -> L1
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ options.max_bytes_for_level_base = 1;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_GT(listener->compaction_reasons_.size(), 1);
+
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kLevelMaxLevelSize);
+ }
+
+ options.disable_auto_compactions = true;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ ASSERT_OK(Put("key", "value"));
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+ }
+}
+
+TEST_F(EventListenerTest, CompactionReasonUniversal) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.compaction_style = kCompactionStyleUniversal;
+
+ Random rnd(301);
+
+ options.level0_file_num_compaction_trigger = 8;
+ options.compaction_options_universal.max_size_amplification_percent = 100000;
+ options.compaction_options_universal.size_ratio = 100000;
+ DestroyAndReopen(options);
+ listener->compaction_reasons_.clear();
+
+ // Write 8 files in L0
+ for (int i = 0; i < 8; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeRatio);
+ }
+
+ options.level0_file_num_compaction_trigger = 8;
+ options.compaction_options_universal.max_size_amplification_percent = 1;
+ options.compaction_options_universal.size_ratio = 100000;
+
+ DestroyAndReopen(options);
+ listener->compaction_reasons_.clear();
+
+ // Write 8 files in L0
+ for (int i = 0; i < 8; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kUniversalSizeAmplification);
+ }
+
+ options.disable_auto_compactions = true;
+ Close();
+ listener->compaction_reasons_.clear();
+ Reopen(options);
+
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kManualCompaction);
+ }
+}
+
+TEST_F(EventListenerTest, CompactionReasonFIFO) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(
+ DBTestBase::kNumKeysByGenerateNewRandomFile));
+
+ TestCompactionReasonListener* listener = new TestCompactionReasonListener();
+ options.listeners.emplace_back(listener);
+
+ options.level0_file_num_compaction_trigger = 4;
+ options.compaction_style = kCompactionStyleFIFO;
+ options.compaction_options_fifo.max_table_files_size = 1;
+
+ DestroyAndReopen(options);
+ Random rnd(301);
+
+ // Write 4 files in L0
+ for (int i = 0; i < 4; i++) {
+ GenerateNewRandomFile(&rnd);
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_GT(listener->compaction_reasons_.size(), 0);
+ for (auto compaction_reason : listener->compaction_reasons_) {
+ ASSERT_EQ(compaction_reason, CompactionReason::kFIFOMaxSize);
+ }
+}
+
+class TableFileCreationListener : public EventListener {
+ public:
+ class TestEnv : public EnvWrapper {
+ public:
+ explicit TestEnv(Env* t) : EnvWrapper(t) {}
+ static const char* kClassName() { return "TestEnv"; }
+ const char* Name() const override { return kClassName(); }
+
+ void SetStatus(Status s) { status_ = s; }
+
+ Status NewWritableFile(const std::string& fname,
+ std::unique_ptr<WritableFile>* result,
+ const EnvOptions& options) override {
+ if (fname.size() > 4 && fname.substr(fname.size() - 4) == ".sst") {
+ if (!status_.ok()) {
+ return status_;
+ }
+ }
+ return target()->NewWritableFile(fname, result, options);
+ }
+
+ private:
+ Status status_;
+ };
+
+ TableFileCreationListener() {
+ for (int i = 0; i < 2; i++) {
+ started_[i] = finished_[i] = failure_[i] = 0;
+ }
+ }
+
+ int Index(TableFileCreationReason reason) {
+ int idx;
+ switch (reason) {
+ case TableFileCreationReason::kFlush:
+ idx = 0;
+ break;
+ case TableFileCreationReason::kCompaction:
+ idx = 1;
+ break;
+ default:
+ idx = -1;
+ }
+ return idx;
+ }
+
+ void CheckAndResetCounters(int flush_started, int flush_finished,
+ int flush_failure, int compaction_started,
+ int compaction_finished, int compaction_failure) {
+ ASSERT_EQ(started_[0], flush_started);
+ ASSERT_EQ(finished_[0], flush_finished);
+ ASSERT_EQ(failure_[0], flush_failure);
+ ASSERT_EQ(started_[1], compaction_started);
+ ASSERT_EQ(finished_[1], compaction_finished);
+ ASSERT_EQ(failure_[1], compaction_failure);
+ for (int i = 0; i < 2; i++) {
+ started_[i] = finished_[i] = failure_[i] = 0;
+ }
+ }
+
+ void OnTableFileCreationStarted(
+ const TableFileCreationBriefInfo& info) override {
+ int idx = Index(info.reason);
+ if (idx >= 0) {
+ started_[idx]++;
+ }
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ }
+
+ void OnTableFileCreated(const TableFileCreationInfo& info) override {
+ int idx = Index(info.reason);
+ if (idx >= 0) {
+ finished_[idx]++;
+ }
+ ASSERT_GT(info.db_name.size(), 0U);
+ ASSERT_GT(info.cf_name.size(), 0U);
+ ASSERT_GT(info.file_path.size(), 0U);
+ ASSERT_GT(info.job_id, 0);
+ ASSERT_EQ(info.file_checksum, kUnknownFileChecksum);
+ ASSERT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+ if (info.status.ok()) {
+ if (info.table_properties.num_range_deletions == 0U) {
+ ASSERT_GT(info.table_properties.data_size, 0U);
+ ASSERT_GT(info.table_properties.raw_key_size, 0U);
+ ASSERT_GT(info.table_properties.raw_value_size, 0U);
+ ASSERT_GT(info.table_properties.num_data_blocks, 0U);
+ ASSERT_GT(info.table_properties.num_entries, 0U);
+ }
+ } else {
+ if (idx >= 0) {
+ failure_[idx]++;
+ last_failure_ = info.status;
+ }
+ }
+ }
+
+ int started_[2];
+ int finished_[2];
+ int failure_[2];
+ Status last_failure_;
+};
+
+TEST_F(EventListenerTest, TableFileCreationListenersTest) {
+ auto listener = std::make_shared<TableFileCreationListener>();
+ Options options;
+ std::unique_ptr<TableFileCreationListener::TestEnv> test_env(
+ new TableFileCreationListener::TestEnv(CurrentOptions().env));
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ options.env = test_env.get();
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("foo", "aaa"));
+ ASSERT_OK(Put("bar", "bbb"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+ ASSERT_OK(Put("foo", "aaa1"));
+ ASSERT_OK(Put("bar", "bbb1"));
+ test_env->SetStatus(Status::NotSupported("not supported"));
+ ASSERT_NOK(Flush());
+ listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0);
+ ASSERT_TRUE(listener->last_failure_.IsNotSupported());
+ test_env->SetStatus(Status::OK());
+
+ Reopen(options);
+ ASSERT_OK(Put("foo", "aaa2"));
+ ASSERT_OK(Put("bar", "bbb2"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ listener->CheckAndResetCounters(1, 1, 0, 0, 0, 0);
+
+ const Slice kRangeStart = "a";
+ const Slice kRangeEnd = "z";
+ ASSERT_OK(
+ dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ listener->CheckAndResetCounters(0, 0, 0, 1, 1, 0);
+
+ ASSERT_OK(Put("foo", "aaa3"));
+ ASSERT_OK(Put("bar", "bbb3"));
+ ASSERT_OK(Flush());
+ test_env->SetStatus(Status::NotSupported("not supported"));
+ ASSERT_NOK(
+ dbfull()->CompactRange(CompactRangeOptions(), &kRangeStart, &kRangeEnd));
+ ASSERT_NOK(dbfull()->TEST_WaitForCompact());
+ listener->CheckAndResetCounters(1, 1, 0, 1, 1, 1);
+ ASSERT_TRUE(listener->last_failure_.IsNotSupported());
+
+ // Reset
+ test_env->SetStatus(Status::OK());
+ DestroyAndReopen(options);
+
+ // Verify that an empty table file that is immediately deleted gives Aborted
+ // status to listener.
+ ASSERT_OK(Put("baz", "z"));
+ ASSERT_OK(SingleDelete("baz"));
+ ASSERT_OK(Flush());
+ listener->CheckAndResetCounters(1, 1, 1, 0, 0, 0);
+ ASSERT_TRUE(listener->last_failure_.IsAborted());
+
+ // Also in compaction
+ ASSERT_OK(Put("baz", "z"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+ kRangeStart, kRangeEnd));
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ listener->CheckAndResetCounters(2, 2, 0, 1, 1, 1);
+ ASSERT_TRUE(listener->last_failure_.IsAborted());
+
+ Close(); // Avoid UAF on listener
+}
+
+class MemTableSealedListener : public EventListener {
+ private:
+ SequenceNumber latest_seq_number_;
+
+ public:
+ MemTableSealedListener() {}
+ void OnMemTableSealed(const MemTableInfo& info) override {
+ latest_seq_number_ = info.first_seqno;
+ }
+
+ void OnFlushCompleted(DB* /*db*/,
+ const FlushJobInfo& flush_job_info) override {
+ ASSERT_LE(flush_job_info.smallest_seqno, latest_seq_number_);
+ }
+};
+
+TEST_F(EventListenerTest, MemTableSealedListenerTest) {
+ auto listener = std::make_shared<MemTableSealedListener>();
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ DestroyAndReopen(options);
+
+ for (unsigned int i = 0; i < 10; i++) {
+ std::string tag = std::to_string(i);
+ ASSERT_OK(Put("foo" + tag, "aaa"));
+ ASSERT_OK(Put("bar" + tag, "bbb"));
+
+ ASSERT_OK(Flush());
+ }
+}
+
+class ColumnFamilyHandleDeletionStartedListener : public EventListener {
+ private:
+ std::vector<std::string> cfs_;
+ int counter;
+
+ public:
+ explicit ColumnFamilyHandleDeletionStartedListener(
+ const std::vector<std::string>& cfs)
+ : cfs_(cfs), counter(0) {
+ cfs_.insert(cfs_.begin(), kDefaultColumnFamilyName);
+ }
+ void OnColumnFamilyHandleDeletionStarted(
+ ColumnFamilyHandle* handle) override {
+ ASSERT_EQ(cfs_[handle->GetID()], handle->GetName());
+ counter++;
+ }
+ int getCounter() { return counter; }
+};
+
+TEST_F(EventListenerTest, ColumnFamilyHandleDeletionStartedListenerTest) {
+ std::vector<std::string> cfs{"pikachu", "eevee", "Mewtwo"};
+ auto listener =
+ std::make_shared<ColumnFamilyHandleDeletionStartedListener>(cfs);
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ options.listeners.push_back(listener);
+ CreateAndReopenWithCF(cfs, options);
+ ASSERT_EQ(handles_.size(), 4);
+ delete handles_[3];
+ delete handles_[2];
+ delete handles_[1];
+ handles_.resize(1);
+ ASSERT_EQ(listener->getCounter(), 3);
+}
+
+class BackgroundErrorListener : public EventListener {
+ private:
+ SpecialEnv* env_;
+ int counter_;
+
+ public:
+ BackgroundErrorListener(SpecialEnv* env) : env_(env), counter_(0) {}
+
+ void OnBackgroundError(BackgroundErrorReason /*reason*/,
+ Status* bg_error) override {
+ if (counter_ == 0) {
+ // suppress the first error and disable write-dropping such that a retry
+ // can succeed.
+ *bg_error = Status::OK();
+ env_->drop_writes_.store(false, std::memory_order_release);
+ env_->SetMockSleep(false);
+ }
+ ++counter_;
+ }
+
+ int counter() { return counter_; }
+};
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedFlushTest) {
+ auto listener = std::make_shared<BackgroundErrorListener>(env_);
+ Options options;
+ options.create_if_missing = true;
+ options.env = env_;
+ options.listeners.push_back(listener);
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(1));
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ // the usual TEST_WaitForFlushMemTable() doesn't work for failed flushes, so
+ // forge a custom one for the failed flush case.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BGWorkFlush:done",
+ "EventListenerTest:BackgroundErrorListenerFailedFlushTest:1"}});
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->SetMockSleep();
+
+ ASSERT_OK(Put("key0", "val"));
+ ASSERT_OK(Put("key1", "val"));
+ TEST_SYNC_POINT("EventListenerTest:BackgroundErrorListenerFailedFlushTest:1");
+ ASSERT_EQ(1, listener->counter());
+ ASSERT_OK(Put("key2", "val"));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(1, NumTableFilesAtLevel(0));
+}
+
+TEST_F(EventListenerTest, BackgroundErrorListenerFailedCompactionTest) {
+ auto listener = std::make_shared<BackgroundErrorListener>(env_);
+ Options options;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.env = env_;
+ options.level0_file_num_compaction_trigger = 2;
+ options.listeners.push_back(listener);
+ options.memtable_factory.reset(test::NewSpecialSkipListFactory(2));
+ options.paranoid_checks = true;
+ DestroyAndReopen(options);
+
+ // third iteration triggers the second memtable's flush
+ for (int i = 0; i < 3; ++i) {
+ ASSERT_OK(Put("key0", "val"));
+ if (i > 0) {
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ }
+ ASSERT_OK(Put("key1", "val"));
+ }
+ ASSERT_EQ(2, NumTableFilesAtLevel(0));
+
+ env_->drop_writes_.store(true, std::memory_order_release);
+ env_->SetMockSleep();
+ ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}}));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_EQ(1, listener->counter());
+
+ // trigger flush so compaction is triggered again; this time it succeeds
+ // The previous failed compaction may get retried automatically, so we may
+ // be left with 0 or 1 files in level 1, depending on when the retry gets
+ // scheduled
+ ASSERT_OK(Put("key0", "val"));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ ASSERT_LE(1, NumTableFilesAtLevel(0));
+}
+
+class TestFileOperationListener : public EventListener {
+ public:
+ TestFileOperationListener() {
+ file_reads_.store(0);
+ file_reads_success_.store(0);
+ file_writes_.store(0);
+ file_writes_success_.store(0);
+ file_flushes_.store(0);
+ file_flushes_success_.store(0);
+ file_closes_.store(0);
+ file_closes_success_.store(0);
+ file_syncs_.store(0);
+ file_syncs_success_.store(0);
+ file_truncates_.store(0);
+ file_truncates_success_.store(0);
+ file_seq_reads_.store(0);
+ blob_file_reads_.store(0);
+ blob_file_writes_.store(0);
+ blob_file_flushes_.store(0);
+ blob_file_closes_.store(0);
+ blob_file_syncs_.store(0);
+ blob_file_truncates_.store(0);
+ }
+
+ void OnFileReadFinish(const FileOperationInfo& info) override {
+ ++file_reads_;
+ if (info.status.ok()) {
+ ++file_reads_success_;
+ }
+ if (info.path.find("MANIFEST") != std::string::npos) {
+ ++file_seq_reads_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_reads_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileWriteFinish(const FileOperationInfo& info) override {
+ ++file_writes_;
+ if (info.status.ok()) {
+ ++file_writes_success_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_writes_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileFlushFinish(const FileOperationInfo& info) override {
+ ++file_flushes_;
+ if (info.status.ok()) {
+ ++file_flushes_success_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_flushes_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileCloseFinish(const FileOperationInfo& info) override {
+ ++file_closes_;
+ if (info.status.ok()) {
+ ++file_closes_success_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_closes_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileSyncFinish(const FileOperationInfo& info) override {
+ ++file_syncs_;
+ if (info.status.ok()) {
+ ++file_syncs_success_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_syncs_;
+ }
+ ReportDuration(info);
+ }
+
+ void OnFileTruncateFinish(const FileOperationInfo& info) override {
+ ++file_truncates_;
+ if (info.status.ok()) {
+ ++file_truncates_success_;
+ }
+ if (EndsWith(info.path, ".blob")) {
+ ++blob_file_truncates_;
+ }
+ ReportDuration(info);
+ }
+
+ bool ShouldBeNotifiedOnFileIO() override { return true; }
+
+ std::atomic<size_t> file_reads_;
+ std::atomic<size_t> file_reads_success_;
+ std::atomic<size_t> file_writes_;
+ std::atomic<size_t> file_writes_success_;
+ std::atomic<size_t> file_flushes_;
+ std::atomic<size_t> file_flushes_success_;
+ std::atomic<size_t> file_closes_;
+ std::atomic<size_t> file_closes_success_;
+ std::atomic<size_t> file_syncs_;
+ std::atomic<size_t> file_syncs_success_;
+ std::atomic<size_t> file_truncates_;
+ std::atomic<size_t> file_truncates_success_;
+ std::atomic<size_t> file_seq_reads_;
+ std::atomic<size_t> blob_file_reads_;
+ std::atomic<size_t> blob_file_writes_;
+ std::atomic<size_t> blob_file_flushes_;
+ std::atomic<size_t> blob_file_closes_;
+ std::atomic<size_t> blob_file_syncs_;
+ std::atomic<size_t> blob_file_truncates_;
+
+ private:
+ void ReportDuration(const FileOperationInfo& info) const {
+ ASSERT_GT(info.duration.count(), 0);
+ }
+};
+
+TEST_F(EventListenerTest, OnFileOperationTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+
+ TestFileOperationListener* listener = new TestFileOperationListener();
+ options.listeners.emplace_back(listener);
+
+ options.use_direct_io_for_flush_and_compaction = false;
+ Status s = TryReopen(options);
+ if (s.IsInvalidArgument()) {
+ options.use_direct_io_for_flush_and_compaction = false;
+ } else {
+ ASSERT_OK(s);
+ }
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "aaa"));
+ ASSERT_OK(dbfull()->Flush(FlushOptions()));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_GE(listener->file_writes_.load(),
+ listener->file_writes_success_.load());
+ ASSERT_GT(listener->file_writes_.load(), 0);
+ ASSERT_GE(listener->file_flushes_.load(),
+ listener->file_flushes_success_.load());
+ ASSERT_GT(listener->file_flushes_.load(), 0);
+ Close();
+
+ Reopen(options);
+ ASSERT_GE(listener->file_reads_.load(), listener->file_reads_success_.load());
+ ASSERT_GT(listener->file_reads_.load(), 0);
+ ASSERT_GE(listener->file_closes_.load(),
+ listener->file_closes_success_.load());
+ ASSERT_GT(listener->file_closes_.load(), 0);
+ ASSERT_GE(listener->file_syncs_.load(), listener->file_syncs_success_.load());
+ ASSERT_GT(listener->file_syncs_.load(), 0);
+ if (true == options.use_direct_io_for_flush_and_compaction) {
+ ASSERT_GE(listener->file_truncates_.load(),
+ listener->file_truncates_success_.load());
+ ASSERT_GT(listener->file_truncates_.load(), 0);
+ }
+}
+
+TEST_F(EventListenerTest, OnBlobFileOperationTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+ TestFileOperationListener* listener = new TestFileOperationListener();
+ options.listeners.emplace_back(listener);
+ options.disable_auto_compactions = true;
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("Key1", "blob_value1"));
+ ASSERT_OK(Put("Key2", "blob_value2"));
+ ASSERT_OK(Put("Key3", "blob_value3"));
+ ASSERT_OK(Put("Key4", "blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key3", "new_blob_value3"));
+ ASSERT_OK(Put("Key4", "new_blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key5", "blob_value5"));
+ ASSERT_OK(Put("Key6", "blob_value6"));
+ ASSERT_OK(Flush());
+
+ ASSERT_GT(listener->blob_file_writes_.load(), 0U);
+ ASSERT_GT(listener->blob_file_flushes_.load(), 0U);
+ Close();
+
+ Reopen(options);
+ ASSERT_GT(listener->blob_file_closes_.load(), 0U);
+ ASSERT_GT(listener->blob_file_syncs_.load(), 0U);
+ if (true == options.use_direct_io_for_flush_and_compaction) {
+ ASSERT_GT(listener->blob_file_truncates_.load(), 0U);
+ }
+}
+
+TEST_F(EventListenerTest, ReadManifestAndWALOnRecovery) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.create_if_missing = true;
+
+ TestFileOperationListener* listener = new TestFileOperationListener();
+ options.listeners.emplace_back(listener);
+
+ options.use_direct_io_for_flush_and_compaction = false;
+ Status s = TryReopen(options);
+ if (s.IsInvalidArgument()) {
+ options.use_direct_io_for_flush_and_compaction = false;
+ } else {
+ ASSERT_OK(s);
+ }
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("foo", "aaa"));
+ Close();
+
+ size_t seq_reads = listener->file_seq_reads_.load();
+ Reopen(options);
+ ASSERT_GT(listener->file_seq_reads_.load(), seq_reads);
+}
+
+class BlobDBJobLevelEventListenerTest : public EventListener {
+ public:
+ explicit BlobDBJobLevelEventListenerTest(EventListenerTest* test)
+ : test_(test), call_count_(0) {}
+
+ const VersionStorageInfo* GetVersionStorageInfo() const {
+ VersionSet* const versions = test_->dbfull()->GetVersionSet();
+ assert(versions);
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ EXPECT_NE(cfd, nullptr);
+
+ Version* const current = cfd->current();
+ EXPECT_NE(current, nullptr);
+
+ const VersionStorageInfo* const storage_info = current->storage_info();
+ EXPECT_NE(storage_info, nullptr);
+
+ return storage_info;
+ }
+
+ void CheckBlobFileAdditions(
+ const std::vector<BlobFileAdditionInfo>& blob_file_addition_infos) const {
+ const auto* vstorage = GetVersionStorageInfo();
+
+ EXPECT_FALSE(blob_file_addition_infos.empty());
+
+ for (const auto& blob_file_addition_info : blob_file_addition_infos) {
+ const auto meta = vstorage->GetBlobFileMetaData(
+ blob_file_addition_info.blob_file_number);
+
+ EXPECT_NE(meta, nullptr);
+ EXPECT_EQ(meta->GetBlobFileNumber(),
+ blob_file_addition_info.blob_file_number);
+ EXPECT_EQ(meta->GetTotalBlobBytes(),
+ blob_file_addition_info.total_blob_bytes);
+ EXPECT_EQ(meta->GetTotalBlobCount(),
+ blob_file_addition_info.total_blob_count);
+ EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty());
+ }
+ }
+
+ std::vector<std::string> GetFlushedFiles() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ std::vector<std::string> result;
+ for (const auto& fname : flushed_files_) {
+ result.push_back(fname);
+ }
+ return result;
+ }
+
+ void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override {
+ call_count_++;
+
+ {
+ std::lock_guard<std::mutex> lock(mutex_);
+ flushed_files_.push_back(info.file_path);
+ }
+
+ EXPECT_EQ(info.blob_compression_type, kNoCompression);
+
+ CheckBlobFileAdditions(info.blob_file_addition_infos);
+ }
+
+ void OnCompactionCompleted(DB* /*db*/,
+ const CompactionJobInfo& info) override {
+ call_count_++;
+
+ EXPECT_EQ(info.blob_compression_type, kNoCompression);
+
+ CheckBlobFileAdditions(info.blob_file_addition_infos);
+
+ EXPECT_FALSE(info.blob_file_garbage_infos.empty());
+
+ for (const auto& blob_file_garbage_info : info.blob_file_garbage_infos) {
+ EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U);
+ EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U);
+ EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U);
+ EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty());
+ }
+ }
+
+ EventListenerTest* test_;
+ uint32_t call_count_;
+
+ private:
+ std::vector<std::string> flushed_files_;
+ std::mutex mutex_;
+};
+
+// Test OnFlushCompleted EventListener called for blob files
+TEST_F(EventListenerTest, BlobDBOnFlushCompleted) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+
+ options.min_blob_size = 0;
+ BlobDBJobLevelEventListenerTest* blob_event_listener =
+ new BlobDBJobLevelEventListenerTest(this);
+ options.listeners.emplace_back(blob_event_listener);
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("Key1", "blob_value1"));
+ ASSERT_OK(Put("Key2", "blob_value2"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key3", "blob_value3"));
+ ASSERT_OK(Flush());
+
+ ASSERT_EQ(Get("Key1"), "blob_value1");
+ ASSERT_EQ(Get("Key2"), "blob_value2");
+ ASSERT_EQ(Get("Key3"), "blob_value3");
+
+ ASSERT_GT(blob_event_listener->call_count_, 0U);
+}
+
+// Test OnCompactionCompleted EventListener called for blob files
+TEST_F(EventListenerTest, BlobDBOnCompactionCompleted) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.min_blob_size = 0;
+ BlobDBJobLevelEventListenerTest* blob_event_listener =
+ new BlobDBJobLevelEventListenerTest(this);
+ options.listeners.emplace_back(blob_event_listener);
+
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("Key1", "blob_value1"));
+ ASSERT_OK(Put("Key2", "blob_value2"));
+ ASSERT_OK(Put("Key3", "blob_value3"));
+ ASSERT_OK(Put("Key4", "blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key3", "new_blob_value3"));
+ ASSERT_OK(Put("Key4", "new_blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key5", "blob_value5"));
+ ASSERT_OK(Put("Key6", "blob_value6"));
+ ASSERT_OK(Flush());
+
+ blob_event_listener->call_count_ = 0;
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ // On compaction, because of blob_garbage_collection_age_cutoff, it will
+ // delete the oldest blob file and create a new blob file during compaction.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
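+ // Illustrative note (not part of the original test): with
+ // blob_garbage_collection_age_cutoff = 0.5 and the three blob files written
+ // above, roughly the oldest half of the blob files (here, the oldest one) is
+ // eligible for garbage collection, so this compaction relocates that file's
+ // surviving blobs into a newly written blob file.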
+
+ // Make sure OnCompactionCompleted is called.
+ ASSERT_GT(blob_event_listener->call_count_, 0U);
+}
+
+// Test CompactFiles calls OnCompactionCompleted EventListener for blob files
+// and populate the blob files info.
+TEST_F(EventListenerTest, BlobDBCompactFiles) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+
+ BlobDBJobLevelEventListenerTest* blob_event_listener =
+ new BlobDBJobLevelEventListenerTest(this);
+ options.listeners.emplace_back(blob_event_listener);
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("Key1", "blob_value1"));
+ ASSERT_OK(Put("Key2", "blob_value2"));
+ ASSERT_OK(Put("Key3", "blob_value3"));
+ ASSERT_OK(Put("Key4", "blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key3", "new_blob_value3"));
+ ASSERT_OK(Put("Key4", "new_blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key5", "blob_value5"));
+ ASSERT_OK(Put("Key6", "blob_value6"));
+ ASSERT_OK(Flush());
+
+ std::vector<std::string> output_file_names;
+ CompactionJobInfo compaction_job_info;
+
+ // On compaction, because of blob_garbage_collection_age_cutoff, it will
+ // delete the oldest blob file and create a new blob file during compaction,
+ // which will be reported in output_file_names.
+ ASSERT_OK(dbfull()->CompactFiles(
+ CompactionOptions(), blob_event_listener->GetFlushedFiles(), 1, -1,
+ &output_file_names, &compaction_job_info));
+
+ bool is_blob_in_output = false;
+ for (const auto& file : output_file_names) {
+ if (EndsWith(file, ".blob")) {
+ is_blob_in_output = true;
+ }
+ }
+ ASSERT_TRUE(is_blob_in_output);
+
+ for (const auto& blob_file_addition_info :
+ compaction_job_info.blob_file_addition_infos) {
+ EXPECT_GT(blob_file_addition_info.blob_file_number, 0U);
+ EXPECT_GT(blob_file_addition_info.total_blob_bytes, 0U);
+ EXPECT_GT(blob_file_addition_info.total_blob_count, 0U);
+ EXPECT_FALSE(blob_file_addition_info.blob_file_path.empty());
+ }
+
+ for (const auto& blob_file_garbage_info :
+ compaction_job_info.blob_file_garbage_infos) {
+ EXPECT_GT(blob_file_garbage_info.blob_file_number, 0U);
+ EXPECT_GT(blob_file_garbage_info.garbage_blob_count, 0U);
+ EXPECT_GT(blob_file_garbage_info.garbage_blob_bytes, 0U);
+ EXPECT_FALSE(blob_file_garbage_info.blob_file_path.empty());
+ }
+}
+
+class BlobDBFileLevelEventListener : public EventListener {
+ public:
+ void OnBlobFileCreationStarted(
+ const BlobFileCreationBriefInfo& info) override {
+ files_started_++;
+ EXPECT_FALSE(info.db_name.empty());
+ EXPECT_FALSE(info.cf_name.empty());
+ EXPECT_FALSE(info.file_path.empty());
+ EXPECT_GT(info.job_id, 0);
+ }
+
+ void OnBlobFileCreated(const BlobFileCreationInfo& info) override {
+ files_created_++;
+ EXPECT_FALSE(info.db_name.empty());
+ EXPECT_FALSE(info.cf_name.empty());
+ EXPECT_FALSE(info.file_path.empty());
+ EXPECT_GT(info.job_id, 0);
+ EXPECT_GT(info.total_blob_count, 0U);
+ EXPECT_GT(info.total_blob_bytes, 0U);
+ EXPECT_EQ(info.file_checksum, kUnknownFileChecksum);
+ EXPECT_EQ(info.file_checksum_func_name, kUnknownFileChecksumFuncName);
+ EXPECT_TRUE(info.status.ok());
+ }
+
+ void OnBlobFileDeleted(const BlobFileDeletionInfo& info) override {
+ files_deleted_++;
+ EXPECT_FALSE(info.db_name.empty());
+ EXPECT_FALSE(info.file_path.empty());
+ EXPECT_GT(info.job_id, 0);
+ EXPECT_TRUE(info.status.ok());
+ }
+
+ void CheckCounters() {
+ EXPECT_EQ(files_started_, files_created_);
+ EXPECT_GT(files_started_, 0U);
+ EXPECT_GT(files_deleted_, 0U);
+ EXPECT_LT(files_deleted_, files_created_);
+ }
+
+ private:
+ std::atomic<uint32_t> files_started_{};
+ std::atomic<uint32_t> files_created_{};
+ std::atomic<uint32_t> files_deleted_{};
+};
+
+TEST_F(EventListenerTest, BlobDBFileTest) {
+ Options options;
+ options.env = CurrentOptions().env;
+ options.enable_blob_files = true;
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ options.min_blob_size = 0;
+ options.enable_blob_garbage_collection = true;
+ options.blob_garbage_collection_age_cutoff = 0.5;
+
+ BlobDBFileLevelEventListener* blob_event_listener =
+ new BlobDBFileLevelEventListener();
+ options.listeners.emplace_back(blob_event_listener);
+
+ DestroyAndReopen(options);
+
+ ASSERT_OK(Put("Key1", "blob_value1"));
+ ASSERT_OK(Put("Key2", "blob_value2"));
+ ASSERT_OK(Put("Key3", "blob_value3"));
+ ASSERT_OK(Put("Key4", "blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key3", "new_blob_value3"));
+ ASSERT_OK(Put("Key4", "new_blob_value4"));
+ ASSERT_OK(Flush());
+
+ ASSERT_OK(Put("Key5", "blob_value5"));
+ ASSERT_OK(Put("Key6", "blob_value6"));
+ ASSERT_OK(Flush());
+
+ constexpr Slice* begin = nullptr;
+ constexpr Slice* end = nullptr;
+
+ // On compaction, because of blob_garbage_collection_age_cutoff, it will
+ // delete the oldest blob file and create a new blob file during compaction.
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), begin, end));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ blob_event_listener->CheckCounters();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_format.h b/src/rocksdb/db/log_format.h
new file mode 100644
index 000000000..d397372f4
--- /dev/null
+++ b/src/rocksdb/db/log_format.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Log format information shared by reader and writer.
+// See ../doc/log_format.txt for more detail.
+
+#pragma once
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+enum RecordType {
+ // Zero is reserved for preallocated files
+ kZeroType = 0,
+ kFullType = 1,
+
+ // For fragments
+ kFirstType = 2,
+ kMiddleType = 3,
+ kLastType = 4,
+
+ // For recycled log files
+ kRecyclableFullType = 5,
+ kRecyclableFirstType = 6,
+ kRecyclableMiddleType = 7,
+ kRecyclableLastType = 8,
+
+ // Compression Type
+ kSetCompressionType = 9,
+};
+static const int kMaxRecordType = kSetCompressionType;
+
+static const unsigned int kBlockSize = 32768;
+
+// Header is checksum (4 bytes), length (2 bytes), type (1 byte)
+static const int kHeaderSize = 4 + 2 + 1;
+
+// Recyclable header is checksum (4 bytes), length (2 bytes), type (1 byte),
+// log number (4 bytes).
+static const int kRecyclableHeaderSize = 4 + 2 + 1 + 4;
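+
+// Illustrative arithmetic (explanatory note, not used by the implementation):
+// with kBlockSize = 32768 and kHeaderSize = 7, a single fragment can carry at
+// most 32768 - 7 = 32761 bytes of payload. A record that does not fit in the
+// space remaining in the current block is split into kFirstType / kMiddleType /
+// kLastType fragments, each carrying its own header.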
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_reader.cc b/src/rocksdb/db/log_reader.cc
new file mode 100644
index 000000000..a21868776
--- /dev/null
+++ b/src/rocksdb/db/log_reader.cc
@@ -0,0 +1,854 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+
+#include <stdio.h>
+
+#include "file/sequence_file_reader.h"
+#include "port/lang.h"
+#include "rocksdb/env.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Reader::Reporter::~Reporter() {}
+
+Reader::Reader(std::shared_ptr<Logger> info_log,
+ std::unique_ptr<SequentialFileReader>&& _file,
+ Reporter* reporter, bool checksum, uint64_t log_num)
+ : info_log_(info_log),
+ file_(std::move(_file)),
+ reporter_(reporter),
+ checksum_(checksum),
+ backing_store_(new char[kBlockSize]),
+ buffer_(),
+ eof_(false),
+ read_error_(false),
+ eof_offset_(0),
+ last_record_offset_(0),
+ end_of_buffer_offset_(0),
+ log_number_(log_num),
+ recycled_(false),
+ first_record_read_(false),
+ compression_type_(kNoCompression),
+ compression_type_record_read_(false),
+ uncompress_(nullptr),
+ hash_state_(nullptr),
+ uncompress_hash_state_(nullptr) {}
+
+Reader::~Reader() {
+ delete[] backing_store_;
+ if (uncompress_) {
+ delete uncompress_;
+ }
+ if (hash_state_) {
+ XXH3_freeState(hash_state_);
+ }
+ if (uncompress_hash_state_) {
+ XXH3_freeState(uncompress_hash_state_);
+ }
+}
+
+// For kAbsoluteConsistency, on clean shutdown we don't expect any error
+// in the log files. For other modes, we can ignore only incomplete records
+// in the last log file, which are presumably due to a write in progress
+// during restart (or from log recycling).
+//
+// TODO krad: Evaluate if we need to move to a more strict mode where we
+// restrict the inconsistency to only the last log
+bool Reader::ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode,
+ uint64_t* record_checksum) {
+ scratch->clear();
+ record->clear();
+ if (record_checksum != nullptr) {
+ if (hash_state_ == nullptr) {
+ hash_state_ = XXH3_createState();
+ }
+ XXH3_64bits_reset(hash_state_);
+ }
+ if (uncompress_) {
+ uncompress_->Reset();
+ }
+ bool in_fragmented_record = false;
+ // Record offset of the logical record that we're reading
+ // 0 is a dummy value to make compilers happy
+ uint64_t prospective_record_offset = 0;
+
+ Slice fragment;
+ while (true) {
+ uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+ size_t drop_size = 0;
+ const unsigned int record_type =
+ ReadPhysicalRecord(&fragment, &drop_size, record_checksum);
+ switch (record_type) {
+ case kFullType:
+ case kRecyclableFullType:
+ if (in_fragmented_record && !scratch->empty()) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ ReportCorruption(scratch->size(), "partial record without end(1)");
+ }
+ // No need to compute record_checksum since the record
+ // consists of a single fragment and the checksum is computed
+ // in ReadPhysicalRecord() if WAL compression is enabled
+ if (record_checksum != nullptr && uncompress_ == nullptr) {
+ // No need to stream since the record is a single fragment
+ *record_checksum = XXH3_64bits(fragment.data(), fragment.size());
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->clear();
+ *record = fragment;
+ last_record_offset_ = prospective_record_offset;
+ first_record_read_ = true;
+ return true;
+
+ case kFirstType:
+ case kRecyclableFirstType:
+ if (in_fragmented_record && !scratch->empty()) {
+ // Handle bug in earlier versions of log::Writer where
+ // it could emit an empty kFirstType record at the tail end
+ // of a block followed by a kFullType or kFirstType record
+ // at the beginning of the next block.
+ ReportCorruption(scratch->size(), "partial record without end(2)");
+ XXH3_64bits_reset(hash_state_);
+ }
+ if (record_checksum != nullptr) {
+ XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->assign(fragment.data(), fragment.size());
+ in_fragmented_record = true;
+ break;
+
+ case kMiddleType:
+ case kRecyclableMiddleType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(1)");
+ } else {
+ if (record_checksum != nullptr) {
+ XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+ }
+ scratch->append(fragment.data(), fragment.size());
+ }
+ break;
+
+ case kLastType:
+ case kRecyclableLastType:
+ if (!in_fragmented_record) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(2)");
+ } else {
+ if (record_checksum != nullptr) {
+ XXH3_64bits_update(hash_state_, fragment.data(), fragment.size());
+ *record_checksum = XXH3_64bits_digest(hash_state_);
+ }
+ scratch->append(fragment.data(), fragment.size());
+ *record = Slice(*scratch);
+ last_record_offset_ = prospective_record_offset;
+ first_record_read_ = true;
+ return true;
+ }
+ break;
+
+ case kBadHeader:
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+ wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+ // In clean shutdown we don't expect any error in the log files.
+ // In point-in-time recovery an incomplete record at the end could
+ // produce a hole in the recovered data. Report an error here, which
+ // higher layers can choose to ignore when it's provable there is no
+ // hole.
+ ReportCorruption(drop_size, "truncated header");
+ }
+ FALLTHROUGH_INTENDED;
+
+ case kEof:
+ if (in_fragmented_record) {
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+ wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+ // In clean shutdown we don't expect any error in the log files.
+ // In point-in-time recovery an incomplete record at the end could
+ // produce a hole in the recovered data. Report an error here, which
+ // higher layers can choose to ignore when it's provable there is no
+ // hole.
+ ReportCorruption(scratch->size(), "error reading trailing data");
+ }
+ // This can be caused by the writer dying immediately after
+ // writing a physical record but before completing the next; don't
+ // treat it as a corruption, just ignore the entire logical record.
+ scratch->clear();
+ }
+ return false;
+
+ case kOldRecord:
+ if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) {
+ // Treat a record from a previous instance of the log as EOF.
+ if (in_fragmented_record) {
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+ wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+ // In clean shutdown we don't expect any error in the log files.
+ // In point-in-time recovery an incomplete record at the end could
+ // produce a hole in the recovered data. Report an error here,
+ // which higher layers can choose to ignore when it's provable
+ // there is no hole.
+ ReportCorruption(scratch->size(), "error reading trailing data");
+ }
+ // This can be caused by the writer dying immediately after
+ // writing a physical record but before completing the next; don't
+ // treat it as a corruption, just ignore the entire logical record.
+ scratch->clear();
+ }
+ return false;
+ }
+ FALLTHROUGH_INTENDED;
+
+ case kBadRecord:
+ if (in_fragmented_record) {
+ ReportCorruption(scratch->size(), "error in middle of record");
+ in_fragmented_record = false;
+ scratch->clear();
+ }
+ break;
+
+ case kBadRecordLen:
+ if (eof_) {
+ if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency ||
+ wal_recovery_mode == WALRecoveryMode::kPointInTimeRecovery) {
+ // In clean shutdown we don't expect any error in the log files.
+ // In point-in-time recovery an incomplete record at the end could
+ // produce a hole in the recovered data. Report an error here, which
+ // higher layers can choose to ignore when it's provable there is no
+ // hole.
+ ReportCorruption(drop_size, "truncated record body");
+ }
+ return false;
+ }
+ FALLTHROUGH_INTENDED;
+
+ case kBadRecordChecksum:
+ if (recycled_ && wal_recovery_mode ==
+ WALRecoveryMode::kTolerateCorruptedTailRecords) {
+ scratch->clear();
+ return false;
+ }
+ if (record_type == kBadRecordLen) {
+ ReportCorruption(drop_size, "bad record length");
+ } else {
+ ReportCorruption(drop_size, "checksum mismatch");
+ }
+ if (in_fragmented_record) {
+ ReportCorruption(scratch->size(), "error in middle of record");
+ in_fragmented_record = false;
+ scratch->clear();
+ }
+ break;
+
+ case kSetCompressionType: {
+ if (compression_type_record_read_) {
+ ReportCorruption(fragment.size(),
+ "read multiple SetCompressionType records");
+ }
+ if (first_record_read_) {
+ ReportCorruption(fragment.size(),
+ "SetCompressionType not the first record");
+ }
+ prospective_record_offset = physical_record_offset;
+ scratch->clear();
+ last_record_offset_ = prospective_record_offset;
+ CompressionTypeRecord compression_record(kNoCompression);
+ Status s = compression_record.DecodeFrom(&fragment);
+ if (!s.ok()) {
+ ReportCorruption(fragment.size(),
+ "could not decode SetCompressionType record");
+ } else {
+ InitCompression(compression_record);
+ }
+ break;
+ }
+
+ default: {
+ char buf[40];
+ snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
+ ReportCorruption(
+ (fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
+ buf);
+ in_fragmented_record = false;
+ scratch->clear();
+ break;
+ }
+ }
+ }
+ return false;
+}
+
+uint64_t Reader::LastRecordOffset() { return last_record_offset_; }
+
+uint64_t Reader::LastRecordEnd() {
+ return end_of_buffer_offset_ - buffer_.size();
+}
+
+void Reader::UnmarkEOF() {
+ if (read_error_) {
+ return;
+ }
+ eof_ = false;
+ if (eof_offset_ == 0) {
+ return;
+ }
+ UnmarkEOFInternal();
+}
+
+void Reader::UnmarkEOFInternal() {
+ // If the EOF was in the middle of a block (a partial block was read) we have
+ // to read the rest of the block as ReadPhysicalRecord can only read full
+ // blocks and expects the file position indicator to be aligned to the start
+ // of a block.
+ //
+ // consumed_bytes + buffer_size() + remaining == kBlockSize
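+ //
+ // Worked example (illustrative; kBlockSize is 32768 in db/log_format.h):
+ // with eof_offset_ == 100 and buffer_.size() == 40, consumed_bytes == 60
+ // and remaining == 32668, so reading `remaining` more bytes re-aligns the
+ // file position to the next block boundary (60 + 40 + 32668 == 32768).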
+
+ size_t consumed_bytes = eof_offset_ - buffer_.size();
+ size_t remaining = kBlockSize - eof_offset_;
+
+ // backing_store_ is used to concatenate what is left in buffer_ and
+ // the remainder of the block. If buffer_ already uses backing_store_,
+ // we just append the new data.
+ if (buffer_.data() != backing_store_ + consumed_bytes) {
+ // buffer_ does not use backing_store_ for storage;
+ // copy what is left in buffer_ to backing_store_.
+ memmove(backing_store_ + consumed_bytes, buffer_.data(), buffer_.size());
+ }
+
+ Slice read_buffer;
+ // TODO: rate limit log reader with appropriate priority.
+ // TODO: avoid overcharging rate limiter:
+ // Note that the Read here might overcharge SequentialFileReader's internal
+ // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+ // content left until EOF to read.
+ Status status =
+ file_->Read(remaining, &read_buffer, backing_store_ + eof_offset_,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+
+ size_t added = read_buffer.size();
+ end_of_buffer_offset_ += added;
+
+ if (!status.ok()) {
+ if (added > 0) {
+ ReportDrop(added, status);
+ }
+
+ read_error_ = true;
+ return;
+ }
+
+ if (read_buffer.data() != backing_store_ + eof_offset_) {
+ // Read did not write to backing_store_
+ memmove(backing_store_ + eof_offset_, read_buffer.data(),
+ read_buffer.size());
+ }
+
+ buffer_ = Slice(backing_store_ + consumed_bytes,
+ eof_offset_ + added - consumed_bytes);
+
+ if (added < remaining) {
+ eof_ = true;
+ eof_offset_ += added;
+ } else {
+ eof_offset_ = 0;
+ }
+}
+
+void Reader::ReportCorruption(size_t bytes, const char* reason) {
+ ReportDrop(bytes, Status::Corruption(reason));
+}
+
+void Reader::ReportDrop(size_t bytes, const Status& reason) {
+ if (reporter_ != nullptr) {
+ reporter_->Corruption(bytes, reason);
+ }
+}
+
+bool Reader::ReadMore(size_t* drop_size, int* error) {
+ if (!eof_ && !read_error_) {
+ // Last read was a full read, so this is a trailer to skip
+ buffer_.clear();
+ // TODO: rate limit log reader with appropriate priority.
+ // TODO: avoid overcharging rate limiter:
+ // Note that the Read here might overcharge SequentialFileReader's internal
+ // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+ // content left until EOF to read.
+ Status status = file_->Read(kBlockSize, &buffer_, backing_store_,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ TEST_SYNC_POINT_CALLBACK("LogReader::ReadMore:AfterReadFile", &status);
+ end_of_buffer_offset_ += buffer_.size();
+ if (!status.ok()) {
+ buffer_.clear();
+ ReportDrop(kBlockSize, status);
+ read_error_ = true;
+ *error = kEof;
+ return false;
+ } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
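+ // A short read means we reached the (current) end of the file partway
+ // into a block; remember how far into the block we got so that
+ // UnmarkEOFInternal() can re-align to a block boundary if the file grows.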
+ eof_ = true;
+ eof_offset_ = buffer_.size();
+ }
+ return true;
+ } else {
+ // Note that if buffer_ is non-empty, we have a truncated header at the
+ // end of the file, which can be caused by the writer crashing in the
+ // middle of writing the header. Unless explicitly requested, we don't
+ // consider this an error, just report EOF.
+ if (buffer_.size()) {
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ *error = kBadHeader;
+ return false;
+ }
+ buffer_.clear();
+ *error = kEof;
+ return false;
+ }
+}
+
+unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size,
+ uint64_t* fragment_checksum) {
+ while (true) {
+ // We need at least the minimum header size
+ if (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+ // the default value of r is meaningless because ReadMore will overwrite
+ // it if it returns false; in case it returns true, the return value will
+ // not be used anyway
+ int r = kEof;
+ if (!ReadMore(drop_size, &r)) {
+ return r;
+ }
+ continue;
+ }
+
+ // Parse the header
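+ // Header layout, as decoded below: bytes [0, 4) hold the masked CRC32C,
+ // bytes 4-5 the little-endian payload length, byte 6 the record type, and,
+ // for recyclable records only, bytes [7, 11) the log number.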
+ const char* header = buffer_.data();
+ const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+ const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+ const unsigned int type = header[6];
+ const uint32_t length = a | (b << 8);
+ int header_size = kHeaderSize;
+ if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
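+ // end_of_buffer_offset_ - buffer_.size() is the physical offset of this
+ // header; only a recyclable record at the very start of the file marks
+ // the whole file as recycled.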
+ if (end_of_buffer_offset_ - buffer_.size() == 0) {
+ recycled_ = true;
+ }
+ header_size = kRecyclableHeaderSize;
+ // We need enough for the larger header
+ if (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+ int r = kEof;
+ if (!ReadMore(drop_size, &r)) {
+ return r;
+ }
+ continue;
+ }
+ const uint32_t log_num = DecodeFixed32(header + 7);
+ if (log_num != log_number_) {
+ return kOldRecord;
+ }
+ }
+ if (header_size + length > buffer_.size()) {
+ assert(buffer_.size() >= static_cast<size_t>(header_size));
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ // If the end of the read has been reached without seeing
+ // `header_size + length` bytes of payload, report a corruption. The
+ // higher layers can decide how to handle it based on the recovery mode,
+ // whether this occurred at EOF, whether this is the final WAL, etc.
+ return kBadRecordLen;
+ }
+
+ if (type == kZeroType && length == 0) {
+ // Skip zero length record without reporting any drops since
+ // such records are produced by the mmap based writing code in
+ // env_posix.cc that preallocates file regions.
+ // NOTE: this should never happen in DB written by new RocksDB versions,
+ // since we turn off mmap writes to manifest and log files
+ buffer_.clear();
+ return kBadRecord;
+ }
+
+ // Check crc
+ if (checksum_) {
+ uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+ uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+ if (actual_crc != expected_crc) {
+ // Drop the rest of the buffer since "length" itself may have
+ // been corrupted and if we trust it, we could find some
+ // fragment of a real log record that just happens to look
+ // like a valid log record.
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ return kBadRecordChecksum;
+ }
+ }
+
+ buffer_.remove_prefix(header_size + length);
+
+ if (!uncompress_ || type == kSetCompressionType) {
+ *result = Slice(header + header_size, length);
+ return type;
+ } else {
+ // Uncompress compressed records
+ uncompressed_record_.clear();
+ if (fragment_checksum != nullptr) {
+ if (uncompress_hash_state_ == nullptr) {
+ uncompress_hash_state_ = XXH3_createState();
+ }
+ XXH3_64bits_reset(uncompress_hash_state_);
+ }
+
+ size_t uncompressed_size = 0;
+ int remaining = 0;
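+ // Drain the streaming uncompressor: keep calling Uncompress() while input
+ // remains, and also when the output buffer came back completely full
+ // (uncompressed_size == kBlockSize), since more output may still be
+ // pending even after all input has been consumed.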
+ do {
+ remaining = uncompress_->Uncompress(header + header_size, length,
+ uncompressed_buffer_.get(),
+ &uncompressed_size);
+ if (remaining < 0) {
+ buffer_.clear();
+ return kBadRecord;
+ }
+ if (uncompressed_size > 0) {
+ if (fragment_checksum != nullptr) {
+ XXH3_64bits_update(uncompress_hash_state_,
+ uncompressed_buffer_.get(), uncompressed_size);
+ }
+ uncompressed_record_.append(uncompressed_buffer_.get(),
+ uncompressed_size);
+ }
+ } while (remaining > 0 || uncompressed_size == kBlockSize);
+
+ if (fragment_checksum != nullptr) {
+ // We can remove this check by updating hash_state_ directly,
+ // but that requires resetting hash_state_ for full and first types
+ // for edge cases like consecutive first-type records.
+ // Leaving the check as is since it is cleaner; we can revert to the
+ // approach above if it causes a performance impact.
+ *fragment_checksum = XXH3_64bits_digest(uncompress_hash_state_);
+ uint64_t actual_checksum = XXH3_64bits(uncompressed_record_.data(),
+ uncompressed_record_.size());
+ if (*fragment_checksum != actual_checksum) {
+ // uncompressed_record_ contains bad content that does not match
+ // actual decompressed content
+ return kBadRecord;
+ }
+ }
+ *result = Slice(uncompressed_record_);
+ return type;
+ }
+ }
+}
+
+// Initialize uncompress related fields
+void Reader::InitCompression(const CompressionTypeRecord& compression_record) {
+ compression_type_ = compression_record.GetCompressionType();
+ compression_type_record_read_ = true;
+ constexpr uint32_t compression_format_version = 2;
+ uncompress_ = StreamingUncompress::Create(
+ compression_type_, compression_format_version, kBlockSize);
+ assert(uncompress_ != nullptr);
+ uncompressed_buffer_ = std::unique_ptr<char[]>(new char[kBlockSize]);
+ assert(uncompressed_buffer_);
+}
+
+bool FragmentBufferedReader::ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode /*unused*/,
+ uint64_t* /* checksum */) {
+ assert(record != nullptr);
+ assert(scratch != nullptr);
+ record->clear();
+ scratch->clear();
+ if (uncompress_) {
+ uncompress_->Reset();
+ }
+
+ uint64_t prospective_record_offset = 0;
+ uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
+ size_t drop_size = 0;
+ unsigned int fragment_type_or_err = 0; // Initialize to make compiler happy
+ Slice fragment;
+ while (TryReadFragment(&fragment, &drop_size, &fragment_type_or_err)) {
+ switch (fragment_type_or_err) {
+ case kFullType:
+ case kRecyclableFullType:
+ if (in_fragmented_record_ && !fragments_.empty()) {
+ ReportCorruption(fragments_.size(), "partial record without end(1)");
+ }
+ fragments_.clear();
+ *record = fragment;
+ prospective_record_offset = physical_record_offset;
+ last_record_offset_ = prospective_record_offset;
+ first_record_read_ = true;
+ in_fragmented_record_ = false;
+ return true;
+
+ case kFirstType:
+ case kRecyclableFirstType:
+ if (in_fragmented_record_ || !fragments_.empty()) {
+ ReportCorruption(fragments_.size(), "partial record without end(2)");
+ }
+ prospective_record_offset = physical_record_offset;
+ fragments_.assign(fragment.data(), fragment.size());
+ in_fragmented_record_ = true;
+ break;
+
+ case kMiddleType:
+ case kRecyclableMiddleType:
+ if (!in_fragmented_record_) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(1)");
+ } else {
+ fragments_.append(fragment.data(), fragment.size());
+ }
+ break;
+
+ case kLastType:
+ case kRecyclableLastType:
+ if (!in_fragmented_record_) {
+ ReportCorruption(fragment.size(),
+ "missing start of fragmented record(2)");
+ } else {
+ fragments_.append(fragment.data(), fragment.size());
+ scratch->assign(fragments_.data(), fragments_.size());
+ fragments_.clear();
+ *record = Slice(*scratch);
+ last_record_offset_ = prospective_record_offset;
+ first_record_read_ = true;
+ in_fragmented_record_ = false;
+ return true;
+ }
+ break;
+
+ case kBadHeader:
+ case kBadRecord:
+ case kEof:
+ case kOldRecord:
+ if (in_fragmented_record_) {
+ ReportCorruption(fragments_.size(), "error in middle of record");
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ }
+ break;
+
+ case kBadRecordChecksum:
+ if (recycled_) {
+ fragments_.clear();
+ return false;
+ }
+ ReportCorruption(drop_size, "checksum mismatch");
+ if (in_fragmented_record_) {
+ ReportCorruption(fragments_.size(), "error in middle of record");
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ }
+ break;
+
+ case kSetCompressionType: {
+ if (compression_type_record_read_) {
+ ReportCorruption(fragment.size(),
+ "read multiple SetCompressionType records");
+ }
+ if (first_record_read_) {
+ ReportCorruption(fragment.size(),
+ "SetCompressionType not the first record");
+ }
+ fragments_.clear();
+ prospective_record_offset = physical_record_offset;
+ last_record_offset_ = prospective_record_offset;
+ in_fragmented_record_ = false;
+ CompressionTypeRecord compression_record(kNoCompression);
+ Status s = compression_record.DecodeFrom(&fragment);
+ if (!s.ok()) {
+ ReportCorruption(fragment.size(),
+ "could not decode SetCompressionType record");
+ } else {
+ InitCompression(compression_record);
+ }
+ break;
+ }
+
+ default: {
+ char buf[40];
+ snprintf(buf, sizeof(buf), "unknown record type %u",
+ fragment_type_or_err);
+ ReportCorruption(
+ fragment.size() + (in_fragmented_record_ ? fragments_.size() : 0),
+ buf);
+ in_fragmented_record_ = false;
+ fragments_.clear();
+ break;
+ }
+ }
+ }
+ return false;
+}
+
+void FragmentBufferedReader::UnmarkEOF() {
+ if (read_error_) {
+ return;
+ }
+ eof_ = false;
+ UnmarkEOFInternal();
+}
+
+bool FragmentBufferedReader::TryReadMore(size_t* drop_size, int* error) {
+ if (!eof_ && !read_error_) {
+ // Last read was a full read, so this is a trailer to skip
+ buffer_.clear();
+ // TODO: rate limit log reader with appropriate priority.
+ // TODO: avoid overcharging rate limiter:
+ // Note that the Read here might overcharge SequentialFileReader's internal
+ // rate limiter if priority is not IO_TOTAL, e.g., when there is not enough
+ // content left until EOF to read.
+ Status status = file_->Read(kBlockSize, &buffer_, backing_store_,
+ Env::IO_TOTAL /* rate_limiter_priority */);
+ end_of_buffer_offset_ += buffer_.size();
+ if (!status.ok()) {
+ buffer_.clear();
+ ReportDrop(kBlockSize, status);
+ read_error_ = true;
+ *error = kEof;
+ return false;
+ } else if (buffer_.size() < static_cast<size_t>(kBlockSize)) {
+ eof_ = true;
+ eof_offset_ = buffer_.size();
+ TEST_SYNC_POINT_CALLBACK(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF", nullptr);
+ }
+ return true;
+ } else if (!read_error_) {
+ UnmarkEOF();
+ }
+ if (!read_error_) {
+ return true;
+ }
+ *error = kEof;
+ *drop_size = buffer_.size();
+ if (buffer_.size() > 0) {
+ *error = kBadHeader;
+ }
+ buffer_.clear();
+ return false;
+}
+
+// Returns true if the caller should process fragment_type_or_err.
+bool FragmentBufferedReader::TryReadFragment(
+ Slice* fragment, size_t* drop_size, unsigned int* fragment_type_or_err) {
+ assert(fragment != nullptr);
+ assert(drop_size != nullptr);
+ assert(fragment_type_or_err != nullptr);
+
+ while (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+ const char* header = buffer_.data();
+ const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
+ const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
+ const unsigned int type = header[6];
+ const uint32_t length = a | (b << 8);
+ int header_size = kHeaderSize;
+ if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
+ if (end_of_buffer_offset_ - buffer_.size() == 0) {
+ recycled_ = true;
+ }
+ header_size = kRecyclableHeaderSize;
+ while (buffer_.size() < static_cast<size_t>(kRecyclableHeaderSize)) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+ const uint32_t log_num = DecodeFixed32(header + 7);
+ if (log_num != log_number_) {
+ *fragment_type_or_err = kOldRecord;
+ return true;
+ }
+ }
+
+ while (header_size + length > buffer_.size()) {
+ size_t old_size = buffer_.size();
+ int error = kEof;
+ if (!TryReadMore(drop_size, &error)) {
+ *fragment_type_or_err = error;
+ return false;
+ } else if (old_size == buffer_.size()) {
+ return false;
+ }
+ }
+
+ if (type == kZeroType && length == 0) {
+ buffer_.clear();
+ *fragment_type_or_err = kBadRecord;
+ return true;
+ }
+
+ if (checksum_) {
+ uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
+ uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
+ if (actual_crc != expected_crc) {
+ *drop_size = buffer_.size();
+ buffer_.clear();
+ *fragment_type_or_err = kBadRecordChecksum;
+ return true;
+ }
+ }
+
+ buffer_.remove_prefix(header_size + length);
+
+ if (!uncompress_ || type == kSetCompressionType) {
+ *fragment = Slice(header + header_size, length);
+ *fragment_type_or_err = type;
+ return true;
+ } else {
+ // Uncompress compressed records
+ uncompressed_record_.clear();
+ size_t uncompressed_size = 0;
+ int remaining = 0;
+ do {
+ remaining = uncompress_->Uncompress(header + header_size, length,
+ uncompressed_buffer_.get(),
+ &uncompressed_size);
+ if (remaining < 0) {
+ buffer_.clear();
+ *fragment_type_or_err = kBadRecord;
+ return true;
+ }
+ if (uncompressed_size > 0) {
+ uncompressed_record_.append(uncompressed_buffer_.get(),
+ uncompressed_size);
+ }
+ } while (remaining > 0 || uncompressed_size == kBlockSize);
+ *fragment = Slice(std::move(uncompressed_record_));
+ *fragment_type_or_err = type;
+ return true;
+ }
+}
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_reader.h b/src/rocksdb/db/log_reader.h
new file mode 100644
index 000000000..e3be1570e
--- /dev/null
+++ b/src/rocksdb/db/log_reader.h
@@ -0,0 +1,225 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <stdint.h>
+
+#include <memory>
+
+#include "db/log_format.h"
+#include "file/sequence_file_reader.h"
+#include "rocksdb/options.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/compression.h"
+#include "util/xxhash.h"
+
+namespace ROCKSDB_NAMESPACE {
+class Logger;
+
+namespace log {
+
+/**
+ * Reader is a general purpose log stream reader implementation. The actual job
+ * of reading from the device is implemented by the SequentialFile interface.
+ *
+ * Please see Writer for details on the file and record layout.
+ */
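+//
+// A minimal usage sketch (illustrative only, not part of the original
+// source; "MyReporter" and the log number are hypothetical):
+//
+//   std::unique_ptr<SequentialFileReader> file = /* open the WAL */;
+//   MyReporter reporter;  // implements Reader::Reporter
+//   Reader reader(nullptr /* info_log */, std::move(file), &reporter,
+//                 true /* checksum */, 123 /* log_number */);
+//   Slice record;
+//   std::string scratch;
+//   while (reader.ReadRecord(&record, &scratch)) {
+//     // consume `record` before the next ReadRecord() or scratch mutation
+//   }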
+class Reader {
+ public:
+ // Interface for reporting errors.
+ class Reporter {
+ public:
+ virtual ~Reporter();
+
+ // Some corruption was detected. "size" is the approximate number
+ // of bytes dropped due to the corruption.
+ virtual void Corruption(size_t bytes, const Status& status) = 0;
+ };
+
+ // Create a reader that will return log records from "*file".
+ // "*file" must remain live while this Reader is in use.
+ //
+ // If "reporter" is non-nullptr, it is notified whenever some data is
+ // dropped due to a detected corruption. "*reporter" must remain
+ // live while this Reader is in use.
+ //
+ // If "checksum" is true, verify checksums if available.
+ Reader(std::shared_ptr<Logger> info_log,
+ std::unique_ptr<SequentialFileReader>&& file, Reporter* reporter,
+ bool checksum, uint64_t log_num);
+ // No copying allowed
+ Reader(const Reader&) = delete;
+ void operator=(const Reader&) = delete;
+
+ virtual ~Reader();
+
+ // Read the next record into *record. Returns true if read
+ // successfully, false if we hit end of the input. May use
+ // "*scratch" as temporary storage. The contents filled in *record
+ // will only be valid until the next mutating operation on this
+ // reader or the next mutation to *scratch.
+ // If record_checksum is not nullptr, then this function will calculate the
+ // checksum of the record read and set record_checksum to it. The checksum is
+ // calculated from the original buffers that contain the contents of the
+ // record.
+ virtual bool ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords,
+ uint64_t* record_checksum = nullptr);
+
+ // Returns the physical offset of the last record returned by ReadRecord.
+ //
+ // Undefined before the first call to ReadRecord.
+ uint64_t LastRecordOffset();
+
+ // Returns the first physical offset after the last record returned by
+ // ReadRecord, or zero before first call to ReadRecord. This can also be
+ // thought of as the "current" position in processing the file bytes.
+ uint64_t LastRecordEnd();
+
+ // Returns true if the reader has encountered an EOF condition.
+ bool IsEOF() { return eof_; }
+
+ // Returns true if the reader has encountered a read error.
+ bool hasReadError() const { return read_error_; }
+
+ // When we know more data has been written to the file, we can use this
+ // function to force the reader to look again in the file.
+ // Also aligns the file position indicator to the start of the next block
+ // by reading the rest of the data from the EOF position to the end of the
+ // block that was partially read.
+ virtual void UnmarkEOF();
+
+ SequentialFileReader* file() { return file_.get(); }
+
+ Reporter* GetReporter() const { return reporter_; }
+
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ size_t GetReadOffset() const {
+ return static_cast<size_t>(end_of_buffer_offset_);
+ }
+
+ bool IsCompressedAndEmptyFile() {
+ return !first_record_read_ && compression_type_record_read_;
+ }
+
+ protected:
+ std::shared_ptr<Logger> info_log_;
+ const std::unique_ptr<SequentialFileReader> file_;
+ Reporter* const reporter_;
+ bool const checksum_;
+ char* const backing_store_;
+
+ // Internal state variables used for reading records
+ Slice buffer_;
+ bool eof_; // Last Read() indicated EOF by returning < kBlockSize
+ bool read_error_; // Error occurred while reading from file
+
+ // Offset of the file position indicator within the last block when an
+ // EOF was detected.
+ size_t eof_offset_;
+
+ // Offset of the last record returned by ReadRecord.
+ uint64_t last_record_offset_;
+ // Offset of the first location past the end of buffer_.
+ uint64_t end_of_buffer_offset_;
+
+ // which log number this is
+ uint64_t const log_number_;
+
+ // Whether this is a recycled log file
+ bool recycled_;
+
+ // Whether the first record has been read or not.
+ bool first_record_read_;
+ // Type of compression used
+ CompressionType compression_type_;
+ // Track whether the compression type record has been read or not.
+ bool compression_type_record_read_;
+ StreamingUncompress* uncompress_;
+ // Reusable uncompressed output buffer
+ std::unique_ptr<char[]> uncompressed_buffer_;
+ // Reusable uncompressed record
+ std::string uncompressed_record_;
+ // Used for stream hashing fragment content in ReadRecord()
+ XXH3_state_t* hash_state_;
+ // Used for stream hashing uncompressed buffer in ReadPhysicalRecord()
+ XXH3_state_t* uncompress_hash_state_;
+
+ // Extend record types with the following special values
+ enum {
+ kEof = kMaxRecordType + 1,
+ // Returned whenever we find an invalid physical record.
+ // Currently there are two situations in which this happens:
+ // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
+ // * The record is a 0-length record (No drop is reported)
+ kBadRecord = kMaxRecordType + 2,
+ // Returned when we fail to read a valid header.
+ kBadHeader = kMaxRecordType + 3,
+ // Returned when we read an old record from a previous user of the log.
+ kOldRecord = kMaxRecordType + 4,
+ // Returned when we get a bad record length
+ kBadRecordLen = kMaxRecordType + 5,
+ // Returned when we get a bad record checksum
+ kBadRecordChecksum = kMaxRecordType + 6,
+ };
+
+ // Returns the record type, or one of the preceding special values.
+ // If WAL compression is enabled, fragment_checksum is the checksum of the
+ // fragment computed from the original buffer containing the uncompressed
+ // fragment.
+ unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size,
+ uint64_t* fragment_checksum = nullptr);
+
+ // Read some more
+ bool ReadMore(size_t* drop_size, int* error);
+
+ void UnmarkEOFInternal();
+
+ // Reports dropped bytes to the reporter.
+ // buffer_ must be updated to remove the dropped bytes prior to invocation.
+ void ReportCorruption(size_t bytes, const char* reason);
+ void ReportDrop(size_t bytes, const Status& reason);
+
+ void InitCompression(const CompressionTypeRecord& compression_record);
+};
+
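+// FragmentBufferedReader buffers partial fragments across ReadRecord() calls:
+// when it reaches the current end of the file it returns false without
+// reporting a corruption or discarding the data accumulated in fragments_,
+// and it resumes from the same point once more bytes are appended
+// (TryReadMore() re-probes the file via UnmarkEOF()). This makes it suitable
+// for tailing a WAL that is still being written, as exercised by the
+// RetriableLogTest cases in db/log_test.cc.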
+class FragmentBufferedReader : public Reader {
+ public:
+ FragmentBufferedReader(std::shared_ptr<Logger> info_log,
+ std::unique_ptr<SequentialFileReader>&& _file,
+ Reporter* reporter, bool checksum, uint64_t log_num)
+ : Reader(info_log, std::move(_file), reporter, checksum, log_num),
+ fragments_(),
+ in_fragmented_record_(false) {}
+ ~FragmentBufferedReader() override {}
+ bool ReadRecord(Slice* record, std::string* scratch,
+ WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords,
+ uint64_t* record_checksum = nullptr) override;
+ void UnmarkEOF() override;
+
+ private:
+ std::string fragments_;
+ bool in_fragmented_record_;
+
+ bool TryReadFragment(Slice* result, size_t* drop_size,
+ unsigned int* fragment_type_or_err);
+
+ bool TryReadMore(size_t* drop_size, int* error);
+
+ // No copy allowed
+ FragmentBufferedReader(const FragmentBufferedReader&);
+ void operator=(const FragmentBufferedReader&);
+};
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/log_test.cc b/src/rocksdb/db/log_test.cc
new file mode 100644
index 000000000..2a43dc152
--- /dev/null
+++ b/src/rocksdb/db/log_test.cc
@@ -0,0 +1,1062 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/random.h"
+#include "utilities/memory_allocators.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+// Construct a string of the specified length made out of the supplied
+// partial string.
+static std::string BigString(const std::string& partial_string, size_t n) {
+ std::string result;
+ while (result.size() < n) {
+ result.append(partial_string);
+ }
+ result.resize(n);
+ return result;
+}
+
+// Construct a string from a number
+static std::string NumberString(int n) {
+ char buf[50];
+ snprintf(buf, sizeof(buf), "%d.", n);
+ return std::string(buf);
+}
+
+// Return a skewed potentially long string
+static std::string RandomSkewedString(int i, Random* rnd) {
+ return BigString(NumberString(i), rnd->Skewed(17));
+}
+
+// Param type is tuple<int, bool, CompressionType>
+// get<0>(tuple): non-zero if recycling log, zero if regular log
+// get<1>(tuple): true if allow retry after read EOF, false otherwise
+// get<2>(tuple): compression type to use for the WAL
+class LogTest
+ : public ::testing::TestWithParam<std::tuple<int, bool, CompressionType>> {
+ private:
+ class StringSource : public FSSequentialFile {
+ public:
+ Slice& contents_;
+ bool force_error_;
+ size_t force_error_position_;
+ bool force_eof_;
+ size_t force_eof_position_;
+ bool returned_partial_;
+ bool fail_after_read_partial_;
+ explicit StringSource(Slice& contents, bool fail_after_read_partial)
+ : contents_(contents),
+ force_error_(false),
+ force_error_position_(0),
+ force_eof_(false),
+ force_eof_position_(0),
+ returned_partial_(false),
+ fail_after_read_partial_(fail_after_read_partial) {}
+
+ IOStatus Read(size_t n, const IOOptions& /*opts*/, Slice* result,
+ char* scratch, IODebugContext* /*dbg*/) override {
+ if (fail_after_read_partial_) {
+ EXPECT_TRUE(!returned_partial_) << "must not Read() after eof/error";
+ }
+
+ if (force_error_) {
+ if (force_error_position_ >= n) {
+ force_error_position_ -= n;
+ } else {
+ *result = Slice(contents_.data(), force_error_position_);
+ contents_.remove_prefix(force_error_position_);
+ force_error_ = false;
+ returned_partial_ = true;
+ return IOStatus::Corruption("read error");
+ }
+ }
+
+ if (contents_.size() < n) {
+ n = contents_.size();
+ returned_partial_ = true;
+ }
+
+ if (force_eof_) {
+ if (force_eof_position_ >= n) {
+ force_eof_position_ -= n;
+ } else {
+ force_eof_ = false;
+ n = force_eof_position_;
+ returned_partial_ = true;
+ }
+ }
+
+ // By using scratch we ensure that caller has control over the
+ // lifetime of result.data()
+ memcpy(scratch, contents_.data(), n);
+ *result = Slice(scratch, n);
+
+ contents_.remove_prefix(n);
+ return IOStatus::OK();
+ }
+
+ IOStatus Skip(uint64_t n) override {
+ if (n > contents_.size()) {
+ contents_.clear();
+ return IOStatus::NotFound("in-memory file skipped past end");
+ }
+
+ contents_.remove_prefix(n);
+
+ return IOStatus::OK();
+ }
+ };
+
+ class ReportCollector : public Reader::Reporter {
+ public:
+ size_t dropped_bytes_;
+ std::string message_;
+
+ ReportCollector() : dropped_bytes_(0) {}
+ void Corruption(size_t bytes, const Status& status) override {
+ dropped_bytes_ += bytes;
+ message_.append(status.ToString());
+ }
+ };
+
+ std::string& dest_contents() { return sink_->contents_; }
+
+ const std::string& dest_contents() const { return sink_->contents_; }
+
+ void reset_source_contents() { source_->contents_ = dest_contents(); }
+
+ Slice reader_contents_;
+ test::StringSink* sink_;
+ StringSource* source_;
+ ReportCollector report_;
+
+ protected:
+ std::unique_ptr<Writer> writer_;
+ std::unique_ptr<Reader> reader_;
+ bool allow_retry_read_;
+ CompressionType compression_type_;
+
+ public:
+ LogTest()
+ : reader_contents_(),
+ sink_(new test::StringSink(&reader_contents_)),
+ source_(new StringSource(reader_contents_, !std::get<1>(GetParam()))),
+ allow_retry_read_(std::get<1>(GetParam())),
+ compression_type_(std::get<2>(GetParam())) {
+ std::unique_ptr<FSWritableFile> sink_holder(sink_);
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(sink_holder), "" /* don't care */, FileOptions()));
+ Writer* writer =
+ new Writer(std::move(file_writer), 123, std::get<0>(GetParam()), false,
+ compression_type_);
+ writer_.reset(writer);
+ std::unique_ptr<FSSequentialFile> source_holder(source_);
+ std::unique_ptr<SequentialFileReader> file_reader(
+ new SequentialFileReader(std::move(source_holder), "" /* file name */));
+ if (allow_retry_read_) {
+ reader_.reset(new FragmentBufferedReader(nullptr, std::move(file_reader),
+ &report_, true /* checksum */,
+ 123 /* log_number */));
+ } else {
+ reader_.reset(new Reader(nullptr, std::move(file_reader), &report_,
+ true /* checksum */, 123 /* log_number */));
+ }
+ }
+
+ Slice* get_reader_contents() { return &reader_contents_; }
+
+ void Write(const std::string& msg) {
+ ASSERT_OK(writer_->AddRecord(Slice(msg)));
+ }
+
+ size_t WrittenBytes() const { return dest_contents().size(); }
+
+ std::string Read(const WALRecoveryMode wal_recovery_mode =
+ WALRecoveryMode::kTolerateCorruptedTailRecords) {
+ std::string scratch;
+ Slice record;
+ bool ret = false;
+ uint64_t record_checksum;
+ ret = reader_->ReadRecord(&record, &scratch, wal_recovery_mode,
+ &record_checksum);
+ if (ret) {
+ if (!allow_retry_read_) {
+ // allow_retry_read_ means using FragmentBufferedReader which does not
+ // support record checksum yet.
+ uint64_t actual_record_checksum =
+ XXH3_64bits(record.data(), record.size());
+ assert(actual_record_checksum == record_checksum);
+ }
+ return record.ToString();
+ } else {
+ return "EOF";
+ }
+ }
+
+ void IncrementByte(int offset, char delta) {
+ dest_contents()[offset] += delta;
+ }
+
+ void SetByte(int offset, char new_byte) {
+ dest_contents()[offset] = new_byte;
+ }
+
+ void ShrinkSize(int bytes) { sink_->Drop(bytes); }
+
+ void FixChecksum(int header_offset, int len, bool recyclable) {
+ // Compute crc of type/len/data
+ int header_size = recyclable ? kRecyclableHeaderSize : kHeaderSize;
+ uint32_t crc = crc32c::Value(&dest_contents()[header_offset + 6],
+ header_size - 6 + len);
+ crc = crc32c::Mask(crc);
+ EncodeFixed32(&dest_contents()[header_offset], crc);
+ }
+
+ void ForceError(size_t position = 0) {
+ source_->force_error_ = true;
+ source_->force_error_position_ = position;
+ }
+
+ size_t DroppedBytes() const { return report_.dropped_bytes_; }
+
+ std::string ReportMessage() const { return report_.message_; }
+
+ void ForceEOF(size_t position = 0) {
+ source_->force_eof_ = true;
+ source_->force_eof_position_ = position;
+ }
+
+ void UnmarkEOF() {
+ source_->returned_partial_ = false;
+ reader_->UnmarkEOF();
+ }
+
+ bool IsEOF() { return reader_->IsEOF(); }
+
+ // Returns OK iff recorded error message contains "msg"
+ std::string MatchError(const std::string& msg) const {
+ if (report_.message_.find(msg) == std::string::npos) {
+ return report_.message_;
+ } else {
+ return "OK";
+ }
+ }
+};
+
+TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); }
+
+TEST_P(LogTest, ReadWrite) {
+ Write("foo");
+ Write("bar");
+ Write("");
+ Write("xxxx");
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("xxxx", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
+}
+
+TEST_P(LogTest, ManyBlocks) {
+ for (int i = 0; i < 100000; i++) {
+ Write(NumberString(i));
+ }
+ for (int i = 0; i < 100000; i++) {
+ ASSERT_EQ(NumberString(i), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, Fragmentation) {
+ Write("small");
+ Write(BigString("medium", 50000));
+ Write(BigString("large", 100000));
+ ASSERT_EQ("small", Read());
+ ASSERT_EQ(BigString("medium", 50000), Read());
+ ASSERT_EQ(BigString("large", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, MarginalTrailer) {
+ // Make a trailer that is exactly the same length as an empty record.
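+ // With the non-recyclable format (kHeaderSize == 7, kBlockSize == 32768 in
+ // db/log_format.h), n == 32754, so the first Write() fills the block up to
+ // its last 7 bytes: exactly one empty header's worth of trailer.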
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, MarginalTrailer2) {
+ // Make a trailer that is exactly the same length as an empty record.
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, ShortTrailer) {
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
+ Write("");
+ Write("bar");
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, AlignedEof) {
+ int header_size =
+ std::get<0>(GetParam()) ? kRecyclableHeaderSize : kHeaderSize;
+ const int n = kBlockSize - 2 * header_size + 4;
+ Write(BigString("foo", n));
+ ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, RandomRead) {
+ const int N = 500;
+ Random write_rnd(301);
+ for (int i = 0; i < N; i++) {
+ Write(RandomSkewedString(i, &write_rnd));
+ }
+ Random read_rnd(301);
+ for (int i = 0; i < N; i++) {
+ ASSERT_EQ(RandomSkewedString(i, &read_rnd), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+// Tests of all the error paths in log_reader.cc follow:
+
+TEST_P(LogTest, ReadError) {
+ Write("foo");
+ ForceError();
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ((unsigned int)kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST_P(LogTest, BadRecordType) {
+ Write("foo");
+ // Type is stored in header[6]
+ IncrementByte(6, 100);
+ FixChecksum(0, 3, false);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("unknown record type"));
+}
+
+TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) {
+ Write("foo");
+ ShrinkSize(4); // Drop all payload as well as a header byte
+ ASSERT_EQ("EOF", Read());
+ // Truncated last record is ignored, not treated as an error
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(4); // Drop all payload as well as a header byte
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ // Under kAbsoluteConsistency the truncated last record is reported as an
+ // error rather than silently ignored
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
+}
+
+TEST_P(LogTest, BadLength) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ const int kPayloadSize = kBlockSize - header_size;
+ Write(BigString("bar", kPayloadSize));
+ Write("foo");
+ // Least significant size byte is stored in header[4].
+ IncrementByte(4, 1);
+ if (!recyclable_log) {
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ(kBlockSize, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("bad record length"));
+ } else {
+ ASSERT_EQ("EOF", Read());
+ }
+}
+
+TEST_P(LogTest, BadLengthAtEndIsIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+}
+
+TEST_P(LogTest, BadLengthAtEndIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then we should not raise an error when the
+ // record length specified in header is longer than data currently
+ // available. It's possible that the body of the record is not written yet.
+ return;
+ }
+ Write("foo");
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: truncated record body"));
+}
+
+TEST_P(LogTest, ChecksumMismatch) {
+ Write("foooooo");
+ IncrementByte(0, 14);
+ ASSERT_EQ("EOF", Read());
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ ASSERT_EQ(14U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("checksum mismatch"));
+ } else {
+ ASSERT_EQ(0U, DroppedBytes());
+ ASSERT_EQ("", ReportMessage());
+ }
+}
+
+TEST_P(LogTest, UnexpectedMiddleType) {
+ Write("foo");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(6, static_cast<char>(recyclable_log ? kRecyclableMiddleType
+ : kMiddleType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST_P(LogTest, UnexpectedLastType) {
+ Write("foo");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(6,
+ static_cast<char>(recyclable_log ? kRecyclableLastType : kLastType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("missing start"));
+}
+
+TEST_P(LogTest, UnexpectedFullType) {
+ Write("foo");
+ Write("bar");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(
+ 6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST_P(LogTest, UnexpectedFirstType) {
+ Write("foo");
+ Write(BigString("bar", 100000));
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ SetByte(
+ 6, static_cast<char>(recyclable_log ? kRecyclableFirstType : kFirstType));
+ FixChecksum(0, 3, !!recyclable_log);
+ ASSERT_EQ(BigString("bar", 100000), Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("partial record without end"));
+}
+
+TEST_P(LogTest, MissingLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Remove the LAST block, including header.
+ ShrinkSize(14);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST_P(LogTest, MissingLastIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write(BigString("bar", kBlockSize));
+ // Remove the LAST block, including header.
+ ShrinkSize(14);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data"));
+}
+
+TEST_P(LogTest, PartialLastIsIgnored) {
+ Write(BigString("bar", kBlockSize));
+ // Cause a bad record length in the LAST block.
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("", ReportMessage());
+ ASSERT_EQ(0U, DroppedBytes());
+}
+
+TEST_P(LogTest, PartialLastIsNotIgnored) {
+ if (allow_retry_read_) {
+ // If read retry is allowed, then truncated trailing record should not
+ // raise an error.
+ return;
+ }
+ Write(BigString("bar", kBlockSize));
+ // Cause a bad record length in the LAST block.
+ ShrinkSize(1);
+ ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
+ ASSERT_GT(DroppedBytes(), 0U);
+ ASSERT_EQ("OK", MatchError("Corruption: truncated record body"));
+}
+
+TEST_P(LogTest, ErrorJoinsRecords) {
+ // Consider two fragmented records:
+ // first(R1) last(R1) first(R2) last(R2)
+ // where the middle two fragments disappear. We do not want
+ // first(R1),last(R2) to get joined and returned as a valid record.
+
+ // Write records that span two blocks
+ Write(BigString("foo", kBlockSize));
+ Write(BigString("bar", kBlockSize));
+ Write("correct");
+
+ // Wipe the middle block
+ for (unsigned int offset = kBlockSize; offset < 2 * kBlockSize; offset++) {
+ SetByte(offset, 'x');
+ }
+
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ ASSERT_EQ("correct", Read());
+ ASSERT_EQ("EOF", Read());
+ size_t dropped = DroppedBytes();
+ ASSERT_LE(dropped, 2 * kBlockSize + 100);
+ ASSERT_GE(dropped, 2 * kBlockSize);
+ } else {
+ ASSERT_EQ("EOF", Read());
+ }
+}
+
+TEST_P(LogTest, ClearEofSingleBlock) {
+ Write("foo");
+ Write("bar");
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ ForceEOF(3 + header_size + 2);
+ ASSERT_EQ("foo", Read());
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_TRUE(IsEOF());
+ ASSERT_EQ("EOF", Read());
+ Write("xxx");
+ UnmarkEOF();
+ ASSERT_EQ("xxx", Read());
+ ASSERT_TRUE(IsEOF());
+}
+
+TEST_P(LogTest, ClearEofMultiBlock) {
+ size_t num_full_blocks = 5;
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ int header_size = recyclable_log ? kRecyclableHeaderSize : kHeaderSize;
+ size_t n = (kBlockSize - header_size) * num_full_blocks + 25;
+ Write(BigString("foo", n));
+ Write(BigString("bar", n));
+ ForceEOF(n + num_full_blocks * header_size + header_size + 3);
+ ASSERT_EQ(BigString("foo", n), Read());
+ ASSERT_TRUE(IsEOF());
+ UnmarkEOF();
+ ASSERT_EQ(BigString("bar", n), Read());
+ ASSERT_TRUE(IsEOF());
+ Write(BigString("xxx", n));
+ UnmarkEOF();
+ ASSERT_EQ(BigString("xxx", n), Read());
+ ASSERT_TRUE(IsEOF());
+}
+
+TEST_P(LogTest, ClearEofError) {
+ // If an error occurs during Read() in UnmarkEOF(), the records contained
+ // in the buffer should be returned on subsequent calls of ReadRecord()
+ // until no more full records are left, whereafter ReadRecord() should return
+ // false to indicate that it cannot read any further.
+
+ Write("foo");
+ Write("bar");
+ UnmarkEOF();
+ ASSERT_EQ("foo", Read());
+ ASSERT_TRUE(IsEOF());
+ Write("xxx");
+ ForceError(0);
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(LogTest, ClearEofError2) {
+ Write("foo");
+ Write("bar");
+ UnmarkEOF();
+ ASSERT_EQ("foo", Read());
+ Write("xxx");
+ ForceError(3);
+ UnmarkEOF();
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ(3U, DroppedBytes());
+ ASSERT_EQ("OK", MatchError("read error"));
+}
+
+TEST_P(LogTest, Recycle) {
+ bool recyclable_log = (std::get<0>(GetParam()) != 0);
+ if (!recyclable_log) {
+ return; // test is only valid for recycled logs
+ }
+ Write("foo");
+ Write("bar");
+ Write("baz");
+ Write("bif");
+ Write("blitz");
+ while (get_reader_contents()->size() < log::kBlockSize * 2) {
+ Write("xxxxxxxxxxxxxxxx");
+ }
+ std::unique_ptr<FSWritableFile> sink(
+ new test::OverwritingStringSink(get_reader_contents()));
+ std::unique_ptr<WritableFileWriter> dest_holder(new WritableFileWriter(
+ std::move(sink), "" /* don't care */, FileOptions()));
+ Writer recycle_writer(std::move(dest_holder), 123, true);
+ ASSERT_OK(recycle_writer.AddRecord(Slice("foooo")));
+ ASSERT_OK(recycle_writer.AddRecord(Slice("bar")));
+ ASSERT_GE(get_reader_contents()->size(), log::kBlockSize * 2);
+ ASSERT_EQ("foooo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("EOF", Read());
+}
+
+// Do NOT enable compression for this instantiation.
+INSTANTIATE_TEST_CASE_P(
+ Log, LogTest,
+ ::testing::Combine(::testing::Values(0, 1), ::testing::Bool(),
+ ::testing::Values(CompressionType::kNoCompression)));
+
+class RetriableLogTest : public ::testing::TestWithParam<int> {
+ private:
+ class ReportCollector : public Reader::Reporter {
+ public:
+ size_t dropped_bytes_;
+ std::string message_;
+
+ ReportCollector() : dropped_bytes_(0) {}
+ void Corruption(size_t bytes, const Status& status) override {
+ dropped_bytes_ += bytes;
+ message_.append(status.ToString());
+ }
+ };
+
+ Slice contents_;
+ test::StringSink* sink_;
+ std::unique_ptr<Writer> log_writer_;
+ Env* env_;
+ const std::string test_dir_;
+ const std::string log_file_;
+ std::unique_ptr<WritableFileWriter> writer_;
+ std::unique_ptr<SequentialFileReader> reader_;
+ ReportCollector report_;
+ std::unique_ptr<FragmentBufferedReader> log_reader_;
+
+ public:
+ RetriableLogTest()
+ : contents_(),
+ sink_(new test::StringSink(&contents_)),
+ log_writer_(nullptr),
+ env_(Env::Default()),
+ test_dir_(test::PerThreadDBPath("retriable_log_test")),
+ log_file_(test_dir_ + "/log"),
+ writer_(nullptr),
+ reader_(nullptr),
+ log_reader_(nullptr) {
+ std::unique_ptr<FSWritableFile> sink_holder(sink_);
+ std::unique_ptr<WritableFileWriter> wfw(new WritableFileWriter(
+ std::move(sink_holder), "" /* file name */, FileOptions()));
+ log_writer_.reset(new Writer(std::move(wfw), 123, GetParam()));
+ }
+
+ Status SetupTestEnv() {
+ Status s;
+ FileOptions fopts;
+ auto fs = env_->GetFileSystem();
+ s = fs->CreateDirIfMissing(test_dir_, IOOptions(), nullptr);
+ std::unique_ptr<FSWritableFile> writable_file;
+ if (s.ok()) {
+ s = fs->NewWritableFile(log_file_, fopts, &writable_file, nullptr);
+ }
+ if (s.ok()) {
+ writer_.reset(
+ new WritableFileWriter(std::move(writable_file), log_file_, fopts));
+ EXPECT_NE(writer_, nullptr);
+ }
+ std::unique_ptr<FSSequentialFile> seq_file;
+ if (s.ok()) {
+ s = fs->NewSequentialFile(log_file_, fopts, &seq_file, nullptr);
+ }
+ if (s.ok()) {
+ reader_.reset(new SequentialFileReader(std::move(seq_file), log_file_));
+ EXPECT_NE(reader_, nullptr);
+ log_reader_.reset(new FragmentBufferedReader(
+ nullptr, std::move(reader_), &report_, true /* checksum */,
+ 123 /* log_number */));
+ EXPECT_NE(log_reader_, nullptr);
+ }
+ return s;
+ }
+
+ std::string contents() { return sink_->contents_; }
+
+ void Encode(const std::string& msg) {
+ ASSERT_OK(log_writer_->AddRecord(Slice(msg)));
+ }
+
+ void Write(const Slice& data) {
+ ASSERT_OK(writer_->Append(data));
+ ASSERT_OK(writer_->Sync(true));
+ }
+
+ bool TryRead(std::string* result) {
+ assert(result != nullptr);
+ result->clear();
+ std::string scratch;
+ Slice record;
+ bool r = log_reader_->ReadRecord(&record, &scratch);
+ if (r) {
+ result->assign(record.data(), record.size());
+ return true;
+ } else {
+ return false;
+ }
+ }
+};
+
+TEST_P(RetriableLogTest, TailLog_PartialHeader) {
+ ASSERT_OK(SetupTestEnv());
+ std::vector<int> remaining_bytes_in_last_record;
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ bool eof = false;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RetriableLogTest::TailLog:AfterPart1",
+ "RetriableLogTest::TailLog:BeforeReadRecord"},
+ {"FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ "RetriableLogTest::TailLog:BeforePart2"}});
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ [&](void* /*arg*/) { eof = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t delta = header_size - 1;
+ port::Thread log_writer_thread([&]() {
+ size_t old_sz = contents().size();
+ Encode("foo");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1");
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2");
+ Write(Slice(part2));
+ });
+
+ std::string record;
+ port::Thread log_reader_thread([&]() {
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord");
+ while (!TryRead(&record)) {
+ }
+ });
+ log_reader_thread.join();
+ log_writer_thread.join();
+ ASSERT_EQ("foo", record);
+ ASSERT_TRUE(eof);
+}
+
+TEST_P(RetriableLogTest, TailLog_FullHeader) {
+ ASSERT_OK(SetupTestEnv());
+ std::vector<int> remaining_bytes_in_last_record;
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ bool eof = false;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"RetriableLogTest::TailLog:AfterPart1",
+ "RetriableLogTest::TailLog:BeforeReadRecord"},
+ {"FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ "RetriableLogTest::TailLog:BeforePart2"}});
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "FragmentBufferedLogReader::TryReadMore:FirstEOF",
+ [&](void* /*arg*/) { eof = true; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ size_t delta = header_size + 1;
+ port::Thread log_writer_thread([&]() {
+ size_t old_sz = contents().size();
+ Encode("foo");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:AfterPart1");
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforePart2");
+ Write(Slice(part2));
+ ASSERT_TRUE(eof);
+ });
+
+ std::string record;
+ port::Thread log_reader_thread([&]() {
+ TEST_SYNC_POINT("RetriableLogTest::TailLog:BeforeReadRecord");
+ while (!TryRead(&record)) {
+ }
+ });
+ log_reader_thread.join();
+ log_writer_thread.join();
+ ASSERT_EQ("foo", record);
+}
+
+TEST_P(RetriableLogTest, NonBlockingReadFullRecord) {
+ // Clear all sync point callbacks even though this test does not use sync
+ // points itself. This is necessary; otherwise the execution of this test
+ // may hit a sync point with which a callback is registered. The registered
+ // callback may access some dead variable, causing a segfault.
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ ASSERT_OK(SetupTestEnv());
+ size_t header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
+ size_t delta = header_size - 1;
+ size_t old_sz = contents().size();
+ Encode("foo-bar");
+ size_t new_sz = contents().size();
+ std::string part1 = contents().substr(old_sz, delta);
+ std::string part2 =
+ contents().substr(old_sz + delta, new_sz - old_sz - delta);
+ Write(Slice(part1));
+ std::string record;
+ ASSERT_FALSE(TryRead(&record));
+ ASSERT_TRUE(record.empty());
+ Write(Slice(part2));
+ ASSERT_TRUE(TryRead(&record));
+ ASSERT_EQ("foo-bar", record);
+}
+
+INSTANTIATE_TEST_CASE_P(bool, RetriableLogTest, ::testing::Values(0, 2));
+
+class CompressionLogTest : public LogTest {
+ public:
+ Status SetupTestEnv() { return writer_->AddCompressionTypeRecord(); }
+};
+
+TEST_P(CompressionLogTest, Empty) {
+ CompressionType compression_type = std::get<2>(GetParam());
+ if (!StreamingCompressionTypeSupported(compression_type)) {
+ ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+ return;
+ }
+ ASSERT_OK(SetupTestEnv());
+ const bool compression_enabled =
+ std::get<2>(GetParam()) == kNoCompression ? false : true;
+ // If WAL compression is enabled, a record is added for the compression type
+ const int compression_record_size = compression_enabled ? kHeaderSize + 4 : 0;
+ ASSERT_EQ(compression_record_size, WrittenBytes());
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(CompressionLogTest, ReadWrite) {
+ CompressionType compression_type = std::get<2>(GetParam());
+ if (!StreamingCompressionTypeSupported(compression_type)) {
+ ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+ return;
+ }
+ ASSERT_OK(SetupTestEnv());
+ Write("foo");
+ Write("bar");
+ Write("");
+ Write("xxxx");
+ ASSERT_EQ("foo", Read());
+ ASSERT_EQ("bar", Read());
+ ASSERT_EQ("", Read());
+ ASSERT_EQ("xxxx", Read());
+ ASSERT_EQ("EOF", Read());
+ ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
+}
+
+TEST_P(CompressionLogTest, ManyBlocks) {
+ CompressionType compression_type = std::get<2>(GetParam());
+ if (!StreamingCompressionTypeSupported(compression_type)) {
+ ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+ return;
+ }
+ ASSERT_OK(SetupTestEnv());
+ for (int i = 0; i < 100000; i++) {
+ Write(NumberString(i));
+ }
+ for (int i = 0; i < 100000; i++) {
+ ASSERT_EQ(NumberString(i), Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+TEST_P(CompressionLogTest, Fragmentation) {
+ CompressionType compression_type = std::get<2>(GetParam());
+ if (!StreamingCompressionTypeSupported(compression_type)) {
+ ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+ return;
+ }
+ ASSERT_OK(SetupTestEnv());
+ Random rnd(301);
+ const std::vector<std::string> wal_entries = {
+ "small",
+ rnd.RandomBinaryString(3 * kBlockSize / 2), // Spans into block 2
+ rnd.RandomBinaryString(3 * kBlockSize), // Spans into block 5
+ };
+ for (const std::string& wal_entry : wal_entries) {
+ Write(wal_entry);
+ }
+
+ for (const std::string& wal_entry : wal_entries) {
+ ASSERT_EQ(wal_entry, Read());
+ }
+ ASSERT_EQ("EOF", Read());
+}
+
+INSTANTIATE_TEST_CASE_P(
+ Compression, CompressionLogTest,
+ ::testing::Combine(::testing::Values(0, 1), ::testing::Bool(),
+ ::testing::Values(CompressionType::kNoCompression,
+ CompressionType::kZSTD)));
+
+class StreamingCompressionTest
+ : public ::testing::TestWithParam<std::tuple<int, CompressionType>> {};
+
+TEST_P(StreamingCompressionTest, Basic) {
+ size_t input_size = std::get<0>(GetParam());
+ CompressionType compression_type = std::get<1>(GetParam());
+ if (!StreamingCompressionTypeSupported(compression_type)) {
+ ROCKSDB_GTEST_SKIP("Test requires support for compression type");
+ return;
+ }
+ CompressionOptions opts;
+ constexpr uint32_t compression_format_version = 2;
+ StreamingCompress* compress = StreamingCompress::Create(
+ compression_type, opts, compression_format_version, kBlockSize);
+ StreamingUncompress* uncompress = StreamingUncompress::Create(
+ compression_type, compression_format_version, kBlockSize);
+ MemoryAllocator* allocator = new DefaultMemoryAllocator();
+ std::string input_buffer = BigString("abc", input_size);
+ std::vector<std::string> compressed_buffers;
+ size_t remaining;
+ // Call compress till the entire input is consumed
+ do {
+ char* output_buffer = (char*)allocator->Allocate(kBlockSize);
+ size_t output_pos;
+ remaining = compress->Compress(input_buffer.c_str(), input_size,
+ output_buffer, &output_pos);
+ if (output_pos > 0) {
+ std::string compressed_buffer;
+ compressed_buffer.assign(output_buffer, output_pos);
+ compressed_buffers.emplace_back(std::move(compressed_buffer));
+ }
+ allocator->Deallocate((void*)output_buffer);
+ } while (remaining > 0);
+ std::string uncompressed_buffer = "";
+ int ret_val = 0;
+ size_t output_pos;
+ char* uncompressed_output_buffer = (char*)allocator->Allocate(kBlockSize);
+ // Uncompress the fragments and concatenate them.
+ for (int i = 0; i < (int)compressed_buffers.size(); i++) {
+ // Call uncompress till either the entire input is consumed or the output
+ // buffer size is equal to the allocated output buffer size.
+ do {
+ ret_val = uncompress->Uncompress(compressed_buffers[i].c_str(),
+ compressed_buffers[i].size(),
+ uncompressed_output_buffer, &output_pos);
+ if (output_pos > 0) {
+ std::string uncompressed_fragment;
+ uncompressed_fragment.assign(uncompressed_output_buffer, output_pos);
+ uncompressed_buffer += uncompressed_fragment;
+ }
+ } while (ret_val > 0 || output_pos == kBlockSize);
+ }
+ allocator->Deallocate((void*)uncompressed_output_buffer);
+ delete allocator;
+ delete compress;
+ delete uncompress;
+ // The final return value from uncompress() should be 0.
+ ASSERT_EQ(ret_val, 0);
+ ASSERT_EQ(input_buffer, uncompressed_buffer);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ StreamingCompression, StreamingCompressionTest,
+ ::testing::Combine(::testing::Values(10, 100, 1000, kBlockSize,
+ kBlockSize * 2),
+ ::testing::Values(CompressionType::kZSTD)));
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/log_writer.cc b/src/rocksdb/db/log_writer.cc
new file mode 100644
index 000000000..56f58543e
--- /dev/null
+++ b/src/rocksdb/db/log_writer.cc
@@ -0,0 +1,249 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/log_writer.h"
+
+#include <stdint.h>
+
+#include "file/writable_file_writer.h"
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace log {
+
+Writer::Writer(std::unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
+ bool recycle_log_files, bool manual_flush,
+ CompressionType compression_type)
+ : dest_(std::move(dest)),
+ block_offset_(0),
+ log_number_(log_number),
+ recycle_log_files_(recycle_log_files),
+ manual_flush_(manual_flush),
+ compression_type_(compression_type),
+ compress_(nullptr) {
+ for (int i = 0; i <= kMaxRecordType; i++) {
+ char t = static_cast<char>(i);
+ type_crc_[i] = crc32c::Value(&t, 1);
+ }
+}
+
+Writer::~Writer() {
+ if (dest_) {
+ WriteBuffer().PermitUncheckedError();
+ }
+ if (compress_) {
+ delete compress_;
+ }
+}
+
+IOStatus Writer::WriteBuffer() {
+ if (dest_->seen_error()) {
+ return IOStatus::IOError("Seen error. Skip writing buffer.");
+ }
+ return dest_->Flush();
+}
+
+IOStatus Writer::Close() {
+ IOStatus s;
+ if (dest_) {
+ s = dest_->Close();
+ dest_.reset();
+ }
+ return s;
+}
+
+IOStatus Writer::AddRecord(const Slice& slice,
+ Env::IOPriority rate_limiter_priority) {
+ const char* ptr = slice.data();
+ size_t left = slice.size();
+
+ // Header size varies depending on whether we are recycling or not.
+ const int header_size =
+ recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize;
+
+ // Fragment the record if necessary and emit it. Note that if slice
+ // is empty, we still want to iterate once to emit a single
+ // zero-length record
+ IOStatus s;
+ bool begin = true;
+ int compress_remaining = 0;
+ bool compress_start = false;
+ if (compress_) {
+ compress_->Reset();
+ compress_start = true;
+ }
+ do {
+ const int64_t leftover = kBlockSize - block_offset_;
+ assert(leftover >= 0);
+ if (leftover < header_size) {
+ // Switch to a new block
+ if (leftover > 0) {
+ // Fill the trailer (literal below relies on kHeaderSize and
+ // kRecyclableHeaderSize being <= 11)
+ assert(header_size <= 11);
+ s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ static_cast<size_t>(leftover)),
+ 0 /* crc32c_checksum */, rate_limiter_priority);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ block_offset_ = 0;
+ }
+
+ // Invariant: we never leave < header_size bytes in a block.
+ assert(static_cast<int64_t>(kBlockSize - block_offset_) >= header_size);
+
+ const size_t avail = kBlockSize - block_offset_ - header_size;
+
+ // Compress the record if compression is enabled.
+ // Compress() is called at least once (compress_start=true) and after the
+ // previously generated compressed chunk has been written out as one or more
+ // physical records (left=0).
+ if (compress_ && (compress_start || left == 0)) {
+ compress_remaining = compress_->Compress(slice.data(), slice.size(),
+ compressed_buffer_.get(), &left);
+
+ if (compress_remaining < 0) {
+ // Set failure status
+ s = IOStatus::IOError("Unexpected WAL compression error");
+ s.SetDataLoss(true);
+ break;
+ } else if (left == 0) {
+ // Nothing left to compress
+ if (!compress_start) {
+ break;
+ }
+ }
+ compress_start = false;
+ ptr = compressed_buffer_.get();
+ }
+
+ const size_t fragment_length = (left < avail) ? left : avail;
+
+ RecordType type;
+ const bool end = (left == fragment_length && compress_remaining == 0);
+ if (begin && end) {
+ type = recycle_log_files_ ? kRecyclableFullType : kFullType;
+ } else if (begin) {
+ type = recycle_log_files_ ? kRecyclableFirstType : kFirstType;
+ } else if (end) {
+ type = recycle_log_files_ ? kRecyclableLastType : kLastType;
+ } else {
+ type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType;
+ }
+
+ s = EmitPhysicalRecord(type, ptr, fragment_length, rate_limiter_priority);
+ ptr += fragment_length;
+ left -= fragment_length;
+ begin = false;
+ } while (s.ok() && (left > 0 || compress_remaining > 0));
+
+ if (s.ok()) {
+ if (!manual_flush_) {
+ s = dest_->Flush(rate_limiter_priority);
+ }
+ }
+
+ return s;
+}
+
+IOStatus Writer::AddCompressionTypeRecord() {
+ // Should be the first record
+ assert(block_offset_ == 0);
+
+ if (compression_type_ == kNoCompression) {
+ // No need to add a record
+ return IOStatus::OK();
+ }
+
+ CompressionTypeRecord record(compression_type_);
+ std::string encode;
+ record.EncodeTo(&encode);
+ IOStatus s =
+ EmitPhysicalRecord(kSetCompressionType, encode.data(), encode.size());
+ if (s.ok()) {
+ if (!manual_flush_) {
+ s = dest_->Flush();
+ }
+ // Initialize fields required for compression
+ const size_t max_output_buffer_len =
+ kBlockSize - (recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize);
+ CompressionOptions opts;
+ constexpr uint32_t compression_format_version = 2;
+ compress_ = StreamingCompress::Create(compression_type_, opts,
+ compression_format_version,
+ max_output_buffer_len);
+ assert(compress_ != nullptr);
+ compressed_buffer_ =
+ std::unique_ptr<char[]>(new char[max_output_buffer_len]);
+ assert(compressed_buffer_);
+ } else {
+ // Disable compression if the record could not be added.
+ compression_type_ = kNoCompression;
+ }
+ return s;
+}
+
+bool Writer::BufferIsEmpty() { return dest_->BufferIsEmpty(); }
+
+IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n,
+ Env::IOPriority rate_limiter_priority) {
+ assert(n <= 0xffff); // Must fit in two bytes
+
+ size_t header_size;
+ char buf[kRecyclableHeaderSize];
+
+ // Format the header
+ buf[4] = static_cast<char>(n & 0xff);
+ buf[5] = static_cast<char>(n >> 8);
+ buf[6] = static_cast<char>(t);
+
+ uint32_t crc = type_crc_[t];
+ if (t < kRecyclableFullType || t == kSetCompressionType) {
+ // Legacy record format
+ assert(block_offset_ + kHeaderSize + n <= kBlockSize);
+ header_size = kHeaderSize;
+ } else {
+ // Recyclable record format
+ assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize);
+ header_size = kRecyclableHeaderSize;
+
+ // Only encode low 32-bits of the 64-bit log number. This means
+ // we will fail to detect an old record if we recycled a log from
+ // ~4 billion logs ago, but that is effectively impossible, and
+ // even if it were we'd be far more likely to see a false positive
+ // on the 32-bit CRC.
+ EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_));
+ crc = crc32c::Extend(crc, buf + 7, 4);
+ }
+
+ // Compute the crc of the record type and the payload.
+ uint32_t payload_crc = crc32c::Value(ptr, n);
+ crc = crc32c::Crc32cCombine(crc, payload_crc, n);
+ crc = crc32c::Mask(crc); // Adjust for storage
+ TEST_SYNC_POINT_CALLBACK("LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum",
+ &crc);
+ EncodeFixed32(buf, crc);
+
+ // Write the header and the payload
+ IOStatus s = dest_->Append(Slice(buf, header_size), 0 /* crc32c_checksum */,
+ rate_limiter_priority);
+ if (s.ok()) {
+ s = dest_->Append(Slice(ptr, n), payload_crc, rate_limiter_priority);
+ }
+ block_offset_ += header_size + n;
+ return s;
+}
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
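The fragmentation loop in AddRecord() above is easiest to follow with a small arithmetic sketch. The helper below is illustrative only and not part of the RocksDB sources; it assumes kBlockSize is 32768 and the 7-byte legacy / 11-byte recyclable header sizes implied by the record formats, and it ignores WAL compression.

#include <cstddef>

// Counts how many physical records AddRecord() would emit for a payload of
// `payload` bytes, starting at `block_offset` within the current block.
size_t NumPhysicalRecords(size_t payload, size_t block_offset,
                          bool recyclable) {
  const size_t kBlock = 32768;                // kBlockSize (assumed)
  const size_t header = recyclable ? 11 : 7;  // kRecyclableHeaderSize / kHeaderSize
  size_t records = 0;
  size_t offset = block_offset;
  size_t left = payload;
  do {
    if (kBlock - offset < header) {
      offset = 0;  // leftover bytes are zero-padded; switch to a new block
    }
    const size_t avail = kBlock - offset - header;
    const size_t fragment = left < avail ? left : avail;
    offset += header + fragment;
    left -= fragment;
    ++records;  // kFullType, or a kFirstType/kMiddleType.../kLastType chain
  } while (left > 0);
  return records;
}

// For example, NumPhysicalRecords(3 * 32768 / 2, 0, false) == 2: a kFirstType
// fragment filling block 1 and a kLastType fragment spilling into block 2,
// matching the Fragmentation test in log_test.cc above.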
diff --git a/src/rocksdb/db/log_writer.h b/src/rocksdb/db/log_writer.h
new file mode 100644
index 000000000..5d266e434
--- /dev/null
+++ b/src/rocksdb/db/log_writer.h
@@ -0,0 +1,128 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "db/log_format.h"
+#include "rocksdb/compression_type.h"
+#include "rocksdb/env.h"
+#include "rocksdb/io_status.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/compression.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WritableFileWriter;
+
+namespace log {
+
+/**
+ * Writer is a general purpose log stream writer. It provides an append-only
+ * abstraction for writing data. The details of how the data is written are
+ * handled by the WritableFile sub-class implementation.
+ *
+ * File format:
+ *
+ * File is broken down into variable sized records. The format of each record
+ * is described below.
+ * +-----+-------------+--+----+----------+------+-- ... ----+
+ * File | r0 | r1 |P | r2 | r3 | r4 | |
+ * +-----+-------------+--+----+----------+------+-- ... ----+
+ * <--- kBlockSize ------>|<-- kBlockSize ------>|
+ * rn = variable size records
+ * P = Padding
+ *
+ * Data is written out in kBlockSize chunks. If the next record does not fit
+ * into the space left, the leftover space will be padded with \0.
+ *
+ * Legacy record format:
+ *
+ * +---------+-----------+-----------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Payload |
+ * +---------+-----------+-----------+--- ... ---+
+ *
+ * CRC = 32bit hash computed over the record type and payload using CRC32C
+ * Size = Length of the payload data
+ * Type = Type of record
+ * (kZeroType, kFullType, kFirstType, kLastType, kMiddleType)
+ * The type is used to group the fragments of a single logical record that
+ * is larger than kBlockSize
+ * Payload = Byte stream as long as specified by the payload size
+ *
+ * Recyclable record format:
+ *
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ * |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload |
+ * +---------+-----------+-----------+----------------+--- ... ---+
+ *
+ * Same as above, with the addition of
+ * Log number = 32bit log file number, so that we can distinguish between
+ * records written by the most recent log writer vs a previous one.
+ */
+class Writer {
+ public:
+ // Create a writer that will append data to "*dest".
+ // "*dest" must be initially empty.
+ // "*dest" must remain live while this Writer is in use.
+ explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
+ uint64_t log_number, bool recycle_log_files,
+ bool manual_flush = false,
+ CompressionType compressionType = kNoCompression);
+ // No copying allowed
+ Writer(const Writer&) = delete;
+ void operator=(const Writer&) = delete;
+
+ ~Writer();
+
+ IOStatus AddRecord(const Slice& slice,
+ Env::IOPriority rate_limiter_priority = Env::IO_TOTAL);
+ IOStatus AddCompressionTypeRecord();
+
+ WritableFileWriter* file() { return dest_.get(); }
+ const WritableFileWriter* file() const { return dest_.get(); }
+
+ uint64_t get_log_number() const { return log_number_; }
+
+ IOStatus WriteBuffer();
+
+ IOStatus Close();
+
+ bool BufferIsEmpty();
+
+ private:
+ std::unique_ptr<WritableFileWriter> dest_;
+ size_t block_offset_; // Current offset in block
+ uint64_t log_number_;
+ bool recycle_log_files_;
+
+ // crc32c values for all supported record types. These are
+ // pre-computed to reduce the overhead of computing the crc of the
+ // record type stored in the header.
+ uint32_t type_crc_[kMaxRecordType + 1];
+
+ IOStatus EmitPhysicalRecord(
+ RecordType type, const char* ptr, size_t length,
+ Env::IOPriority rate_limiter_priority = Env::IO_TOTAL);
+
+ // If true, it does not flush after each write. Instead it relies on the upper
+ // layer to manually do the flush by calling ::WriteBuffer()
+ bool manual_flush_;
+
+ // Compression Type
+ CompressionType compression_type_;
+ StreamingCompress* compress_;
+ // Reusable compressed output buffer
+ std::unique_ptr<char[]> compressed_buffer_;
+};
+
+} // namespace log
+} // namespace ROCKSDB_NAMESPACE
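A minimal usage sketch for the Writer declared above, not a definitive recipe: it assumes the caller has already wrapped the WAL file in a WritableFileWriter and that ZSTD support is compiled in (otherwise pass kNoCompression and skip the compression record).

#include <memory>

#include "db/log_writer.h"
#include "file/writable_file_writer.h"

ROCKSDB_NAMESPACE::IOStatus AppendTwoRecords(
    std::unique_ptr<ROCKSDB_NAMESPACE::WritableFileWriter> dest,
    uint64_t log_number) {
  using namespace ROCKSDB_NAMESPACE;
  log::Writer writer(std::move(dest), log_number,
                     /*recycle_log_files=*/false, /*manual_flush=*/false,
                     kZSTD);
  // When compression is enabled this must be the very first record,
  // while the writer is still at offset 0 in the first block.
  IOStatus s = writer.AddCompressionTypeRecord();
  if (s.ok()) {
    s = writer.AddRecord(Slice("first record"));
  }
  if (s.ok()) {
    s = writer.AddRecord(Slice("second record"));
  }
  if (s.ok()) {
    s = writer.Close();
  }
  return s;
}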
diff --git a/src/rocksdb/db/logs_with_prep_tracker.cc b/src/rocksdb/db/logs_with_prep_tracker.cc
new file mode 100644
index 000000000..ff98155c4
--- /dev/null
+++ b/src/rocksdb/db/logs_with_prep_tracker.cc
@@ -0,0 +1,67 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/logs_with_prep_tracker.h"
+
+#include "port/likely.h"
+
+namespace ROCKSDB_NAMESPACE {
+void LogsWithPrepTracker::MarkLogAsHavingPrepSectionFlushed(uint64_t log) {
+ assert(log != 0);
+ std::lock_guard<std::mutex> lock(prepared_section_completed_mutex_);
+ auto it = prepared_section_completed_.find(log);
+ if (UNLIKELY(it == prepared_section_completed_.end())) {
+ prepared_section_completed_[log] = 1;
+ } else {
+ it->second += 1;
+ }
+}
+
+void LogsWithPrepTracker::MarkLogAsContainingPrepSection(uint64_t log) {
+ assert(log != 0);
+ std::lock_guard<std::mutex> lock(logs_with_prep_mutex_);
+
+ auto rit = logs_with_prep_.rbegin();
+ bool updated = false;
+ // Most probably the last log is the one that is being marked for
+ // having a prepare section; so search from the end.
+ for (; rit != logs_with_prep_.rend() && rit->log >= log; ++rit) {
+ if (rit->log == log) {
+ rit->cnt++;
+ updated = true;
+ break;
+ }
+ }
+ if (!updated) {
+ // We are either at the start, or at a position with rit->log < log
+ logs_with_prep_.insert(rit.base(), {log, 1});
+ }
+}
+
+uint64_t LogsWithPrepTracker::FindMinLogContainingOutstandingPrep() {
+ std::lock_guard<std::mutex> lock(logs_with_prep_mutex_);
+ auto it = logs_with_prep_.begin();
+ // start with the smallest log
+ for (; it != logs_with_prep_.end();) {
+ auto min_log = it->log;
+ {
+ std::lock_guard<std::mutex> lock2(prepared_section_completed_mutex_);
+ auto completed_it = prepared_section_completed_.find(min_log);
+ if (completed_it == prepared_section_completed_.end() ||
+ completed_it->second < it->cnt) {
+ return min_log;
+ }
+ assert(completed_it != prepared_section_completed_.end() &&
+ completed_it->second == it->cnt);
+ prepared_section_completed_.erase(completed_it);
+ }
+ // Erasing from the beginning of a vector is not efficient, but this
+ // function is not on the fast path.
+ it = logs_with_prep_.erase(it);
+ }
+ // no such log found
+ return 0;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/logs_with_prep_tracker.h b/src/rocksdb/db/logs_with_prep_tracker.h
new file mode 100644
index 000000000..f72f0ca07
--- /dev/null
+++ b/src/rocksdb/db/logs_with_prep_tracker.h
@@ -0,0 +1,62 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// This class is used to track the log files with outstanding prepare entries.
+class LogsWithPrepTracker {
+ public:
+ // Called when a transaction prepared in `log` has been committed or aborted.
+ void MarkLogAsHavingPrepSectionFlushed(uint64_t log);
+ // Called when a transaction is prepared in `log`.
+ void MarkLogAsContainingPrepSection(uint64_t log);
+ // Return the earliest log file with outstanding prepare entries.
+ uint64_t FindMinLogContainingOutstandingPrep();
+ size_t TEST_PreparedSectionCompletedSize() {
+ return prepared_section_completed_.size();
+ }
+ size_t TEST_LogsWithPrepSize() { return logs_with_prep_.size(); }
+
+ private:
+ // REQUIRES: logs_with_prep_mutex_ held
+ //
+ // sorted list of log numbers still containing prepared data.
+ // this is used by FindObsoleteFiles to determine which
+ // flushed logs we must keep around because they still
+ // contain prepared data which has not been committed or rolled back
+ struct LogCnt {
+ uint64_t log; // the log number
+ uint64_t cnt; // number of prepared sections in the log
+ };
+ std::vector<LogCnt> logs_with_prep_;
+ std::mutex logs_with_prep_mutex_;
+
+ // REQUIRES: prepared_section_completed_mutex_ held
+ //
+ // to be used in conjunction with logs_with_prep_.
+ // Once a transaction with data in log L is committed or rolled back,
+ // rather than updating logs_with_prep_ directly we record that in
+ // prepared_section_completed_, which maps log number -> completion count.
+ // This helps avoid contention between a commit thread and the prepare
+ // threads.
+ //
+ // When trying to determine the minimum log still active we first consult
+ // logs_with_prep_. While its smallest entry maps to an equal count in
+ // prepared_section_completed_ we erase the log from both containers.
+ std::unordered_map<uint64_t, uint64_t> prepared_section_completed_;
+ std::mutex prepared_section_completed_mutex_;
+};
+} // namespace ROCKSDB_NAMESPACE
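A minimal sketch of the intended call pattern, using only the interface declared above: a prepare marks its WAL, a commit or rollback flushes that mark, and FindMinLogContainingOutstandingPrep() reports the oldest WAL that must still be retained (0 when none).

#include <cassert>

#include "db/logs_with_prep_tracker.h"

void TrackerExample() {
  ROCKSDB_NAMESPACE::LogsWithPrepTracker tracker;
  tracker.MarkLogAsContainingPrepSection(5);  // txn A prepared in WAL 5
  tracker.MarkLogAsContainingPrepSection(7);  // txn B prepared in WAL 7
  assert(tracker.FindMinLogContainingOutstandingPrep() == 5);

  tracker.MarkLogAsHavingPrepSectionFlushed(5);  // txn A committed
  assert(tracker.FindMinLogContainingOutstandingPrep() == 7);

  tracker.MarkLogAsHavingPrepSectionFlushed(7);  // txn B rolled back
  assert(tracker.FindMinLogContainingOutstandingPrep() == 0);  // nothing left
}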
diff --git a/src/rocksdb/db/lookup_key.h b/src/rocksdb/db/lookup_key.h
new file mode 100644
index 000000000..68851bddd
--- /dev/null
+++ b/src/rocksdb/db/lookup_key.h
@@ -0,0 +1,68 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <string>
+#include <utility>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// A helper class useful for DBImpl::Get()
+class LookupKey {
+ public:
+ // Initialize *this for looking up user_key at a snapshot with
+ // the specified sequence number.
+ LookupKey(const Slice& _user_key, SequenceNumber sequence,
+ const Slice* ts = nullptr);
+
+ ~LookupKey();
+
+ // Return a key suitable for lookup in a MemTable.
+ Slice memtable_key() const {
+ return Slice(start_, static_cast<size_t>(end_ - start_));
+ }
+
+ // Return an internal key (suitable for passing to an internal iterator)
+ Slice internal_key() const {
+ return Slice(kstart_, static_cast<size_t>(end_ - kstart_));
+ }
+
+ // Return the user key.
+ // If user-defined timestamp is enabled, then timestamp is included in the
+ // result.
+ Slice user_key() const {
+ return Slice(kstart_, static_cast<size_t>(end_ - kstart_ - 8));
+ }
+
+ private:
+ // We construct a char array of the form:
+ // klength varint32 <-- start_
+ // userkey char[klength] <-- kstart_
+ // tag uint64
+ // <-- end_
+ // The array is a suitable MemTable key.
+ // The suffix starting with "userkey" can be used as an InternalKey.
+ const char* start_;
+ const char* kstart_;
+ const char* end_;
+ char space_[200]; // Avoid allocation for short keys
+
+ // No copying allowed
+ LookupKey(const LookupKey&);
+ void operator=(const LookupKey&);
+};
+
+inline LookupKey::~LookupKey() {
+ if (start_ != space_) delete[] start_;
+}
+
+} // namespace ROCKSDB_NAMESPACE
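A small sketch of the three views a LookupKey exposes, assuming no user-defined timestamp; the constructor is declared above and defined elsewhere in the db/ sources.

#include <cassert>

#include "db/lookup_key.h"

void LookupKeyViews() {
  using namespace ROCKSDB_NAMESPACE;
  LookupKey lkey(Slice("mykey"), /*sequence=*/100);
  Slice mem_key = lkey.memtable_key();  // varint32 length + user key + 8-byte tag
  Slice ikey = lkey.internal_key();     // user key + 8-byte (sequence, type) tag
  Slice ukey = lkey.user_key();         // just "mykey"
  assert(ukey.size() == 5);
  assert(ikey.size() == ukey.size() + 8);
  // For this short key the varint32 length prefix is a single byte.
  assert(mem_key.size() == ikey.size() + 1);
}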
diff --git a/src/rocksdb/db/malloc_stats.cc b/src/rocksdb/db/malloc_stats.cc
new file mode 100644
index 000000000..52f2e6e0f
--- /dev/null
+++ b/src/rocksdb/db/malloc_stats.cc
@@ -0,0 +1,55 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/malloc_stats.h"
+
+#ifndef ROCKSDB_LITE
+#include <string.h>
+
+#include <memory>
+
+#include "port/jemalloc_helper.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifdef ROCKSDB_JEMALLOC
+
+struct MallocStatus {
+ char* cur;
+ char* end;
+};
+
+static void GetJemallocStatus(void* mstat_arg, const char* status) {
+ MallocStatus* mstat = reinterpret_cast<MallocStatus*>(mstat_arg);
+ size_t status_len = status ? strlen(status) : 0;
+ size_t buf_size = (size_t)(mstat->end - mstat->cur);
+ if (!status_len || status_len > buf_size) {
+ return;
+ }
+
+ snprintf(mstat->cur, buf_size, "%s", status);
+ mstat->cur += status_len;
+}
+void DumpMallocStats(std::string* stats) {
+ if (!HasJemalloc()) {
+ return;
+ }
+ MallocStatus mstat;
+ const unsigned int kMallocStatusLen = 1000000;
+ std::unique_ptr<char[]> buf{new char[kMallocStatusLen + 1]};
+ mstat.cur = buf.get();
+ mstat.end = buf.get() + kMallocStatusLen;
+ malloc_stats_print(GetJemallocStatus, &mstat, "");
+ stats->append(buf.get());
+}
+#else
+void DumpMallocStats(std::string*) {}
+#endif // ROCKSDB_JEMALLOC
+} // namespace ROCKSDB_NAMESPACE
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/malloc_stats.h b/src/rocksdb/db/malloc_stats.h
new file mode 100644
index 000000000..18aff3ad0
--- /dev/null
+++ b/src/rocksdb/db/malloc_stats.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void DumpMallocStats(std::string*);
+
+}
+
+#endif // !ROCKSDB_LITE
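A trivial usage sketch, assuming a non-LITE build: DumpMallocStats() appends a jemalloc report to the given string and is a no-op when RocksDB was not built with jemalloc.

#include <string>

#include "db/malloc_stats.h"

std::string GetAllocatorReport() {
  std::string stats;
  ROCKSDB_NAMESPACE::DumpMallocStats(&stats);  // stays empty without jemalloc
  return stats;
}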
diff --git a/src/rocksdb/db/manual_compaction_test.cc b/src/rocksdb/db/manual_compaction_test.cc
new file mode 100644
index 000000000..b92cb794b
--- /dev/null
+++ b/src/rocksdb/db/manual_compaction_test.cc
@@ -0,0 +1,308 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Test for issue 178: a manual compaction causes deleted data to reappear.
+#include <cstdlib>
+
+#include "port/port.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/testharness.h"
+
+using ROCKSDB_NAMESPACE::CompactionFilter;
+using ROCKSDB_NAMESPACE::CompactionStyle;
+using ROCKSDB_NAMESPACE::CompactRangeOptions;
+using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::DB;
+using ROCKSDB_NAMESPACE::DestroyDB;
+using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::Iterator;
+using ROCKSDB_NAMESPACE::Options;
+using ROCKSDB_NAMESPACE::ReadOptions;
+using ROCKSDB_NAMESPACE::Slice;
+using ROCKSDB_NAMESPACE::WriteBatch;
+using ROCKSDB_NAMESPACE::WriteOptions;
+
+namespace {
+
+// Reasoning: previously the number was 1100000. Since the keys are written to
+// the batch in one write, each write will result in one SST file. We reduced
+// the write_buffer_size to 1K to get basically the same effect with far fewer
+// keys, which results in a shorter test runtime.
+const int kNumKeys = 1100;
+
+std::string Key1(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "my_key_%d", i);
+ return buf;
+}
+
+std::string Key2(int i) { return Key1(i) + "_xxx"; }
+
+class ManualCompactionTest : public testing::Test {
+ public:
+ ManualCompactionTest() {
+ // Get rid of any state from an old run.
+ dbname_ = ROCKSDB_NAMESPACE::test::PerThreadDBPath(
+ "rocksdb_manual_compaction_test");
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ std::string dbname_;
+};
+
+class DestroyAllCompactionFilter : public CompactionFilter {
+ public:
+ DestroyAllCompactionFilter() {}
+
+ bool Filter(int /*level*/, const Slice& /*key*/, const Slice& existing_value,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ return existing_value.ToString() == "destroy";
+ }
+
+ const char* Name() const override { return "DestroyAllCompactionFilter"; }
+};
+
+class LogCompactionFilter : public CompactionFilter {
+ public:
+ const char* Name() const override { return "LogCompactionFilter"; }
+
+ bool Filter(int level, const Slice& key, const Slice& /*existing_value*/,
+ std::string* /*new_value*/,
+ bool* /*value_changed*/) const override {
+ key_level_[key.ToString()] = level;
+ return false;
+ }
+
+ void Reset() { key_level_.clear(); }
+
+ size_t NumKeys() const { return key_level_.size(); }
+
+ int KeyLevel(const Slice& key) {
+ auto it = key_level_.find(key.ToString());
+ if (it == key_level_.end()) {
+ return -1;
+ }
+ return it->second;
+ }
+
+ private:
+ mutable std::map<std::string, int> key_level_;
+};
+
+TEST_F(ManualCompactionTest, CompactTouchesAllKeys) {
+ for (int iter = 0; iter < 2; ++iter) {
+ DB* db;
+ Options options;
+ if (iter == 0) { // level compaction
+ options.num_levels = 3;
+ options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+ } else { // universal compaction
+ options.compaction_style = CompactionStyle::kCompactionStyleUniversal;
+ }
+ options.create_if_missing = true;
+ options.compression = CompressionType::kNoCompression;
+ options.compaction_filter = new DestroyAllCompactionFilter();
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key1"), Slice("destroy")));
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key2"), Slice("destroy")));
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key3"), Slice("value3")));
+ ASSERT_OK(db->Put(WriteOptions(), Slice("key4"), Slice("destroy")));
+
+ Slice key4("key4");
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &key4));
+ Iterator* itr = db->NewIterator(ReadOptions());
+ itr->SeekToFirst();
+ ASSERT_TRUE(itr->Valid());
+ ASSERT_EQ("key3", itr->key().ToString());
+ itr->Next();
+ ASSERT_TRUE(!itr->Valid());
+ delete itr;
+
+ delete options.compaction_filter;
+ delete db;
+ ASSERT_OK(DestroyDB(dbname_, options));
+ }
+}
+
+TEST_F(ManualCompactionTest, Test) {
+ // Open database. Disable compression since it affects the creation
+ // of layers and the code below is trying to test against a very
+ // specific scenario.
+ DB* db;
+ Options db_options;
+ db_options.write_buffer_size = 1024;
+ db_options.create_if_missing = true;
+ db_options.compression = CompressionType::kNoCompression;
+ ASSERT_OK(DB::Open(db_options, dbname_, &db));
+
+ // create first key range
+ WriteBatch batch;
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(Key1(i), "value for range 1 key"));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+
+ // create second key range
+ batch.Clear();
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Put(Key2(i), "value for range 2 key"));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+
+ // delete second key range
+ batch.Clear();
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(batch.Delete(Key2(i)));
+ }
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+
+ // compact database
+ std::string start_key = Key1(0);
+ std::string end_key = Key1(kNumKeys - 1);
+ Slice least(start_key.data(), start_key.size());
+ Slice greatest(end_key.data(), end_key.size());
+
+ // commenting out the line below causes the example to work correctly
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &least, &greatest));
+
+ // count the keys
+ Iterator* iter = db->NewIterator(ReadOptions());
+ int num_keys = 0;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ num_keys++;
+ }
+ delete iter;
+ ASSERT_EQ(kNumKeys, num_keys) << "Bad number of keys";
+
+ // close database
+ delete db;
+ ASSERT_OK(DestroyDB(dbname_, Options()));
+}
+
+TEST_F(ManualCompactionTest, SkipLevel) {
+ DB* db;
+ Options options;
+ options.num_levels = 3;
+ // Initially, flushed L0 files won't exceed 100.
+ options.level0_file_num_compaction_trigger = 100;
+ options.compaction_style = CompactionStyle::kCompactionStyleLevel;
+ options.create_if_missing = true;
+ options.compression = CompressionType::kNoCompression;
+ LogCompactionFilter* filter = new LogCompactionFilter();
+ options.compaction_filter = filter;
+ ASSERT_OK(DB::Open(options, dbname_, &db));
+
+ WriteOptions wo;
+ FlushOptions fo;
+ ASSERT_OK(db->Put(wo, "1", ""));
+ ASSERT_OK(db->Flush(fo));
+ ASSERT_OK(db->Put(wo, "2", ""));
+ ASSERT_OK(db->Flush(fo));
+ ASSERT_OK(db->Put(wo, "4", ""));
+ ASSERT_OK(db->Put(wo, "8", ""));
+ ASSERT_OK(db->Flush(fo));
+
+ {
+ // L0: 1, 2, [4, 8]
+ // no file has keys in range [5, 7]
+ Slice start("5");
+ Slice end("7");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_EQ(0, filter->NumKeys());
+ }
+
+ {
+ // L0: 1, 2, [4, 8]
+ // [3, 7] overlaps with 4 in L0
+ Slice start("3");
+ Slice end("7");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_EQ(2, filter->NumKeys());
+ ASSERT_EQ(0, filter->KeyLevel("4"));
+ ASSERT_EQ(0, filter->KeyLevel("8"));
+ }
+
+ {
+ // L0: 1, 2
+ // L1: [4, 8]
+ // no file has keys in range (-inf, 0]
+ Slice end("0");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, &end));
+ ASSERT_EQ(0, filter->NumKeys());
+ }
+
+ {
+ // L0: 1, 2
+ // L1: [4, 8]
+ // no file has keys in range [9, inf)
+ Slice start("9");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr));
+ ASSERT_EQ(0, filter->NumKeys());
+ }
+
+ {
+ // L0: 1, 2
+ // L1: [4, 8]
+ // [2, 2] overlaps with 2 in L0
+ Slice start("2");
+ Slice end("2");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_EQ(1, filter->NumKeys());
+ ASSERT_EQ(0, filter->KeyLevel("2"));
+ }
+
+ {
+ // L0: 1
+ // L1: 2, [4, 8]
+ // [2, 5] overlaps with 2 and [4, 8) in L1, skip L0
+ Slice start("2");
+ Slice end("5");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, &end));
+ ASSERT_EQ(3, filter->NumKeys());
+ ASSERT_EQ(1, filter->KeyLevel("2"));
+ ASSERT_EQ(1, filter->KeyLevel("4"));
+ ASSERT_EQ(1, filter->KeyLevel("8"));
+ }
+
+ {
+ // L0: 1
+ // L1: [2, 4, 8]
+ // [0, inf) overlaps all files
+ Slice start("0");
+ filter->Reset();
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), &start, nullptr));
+ ASSERT_EQ(4, filter->NumKeys());
+ // 1 is first compacted to L1 and then further compacted into [2, 4, 8],
+ // so finally the logged level for 1 is L1.
+ ASSERT_EQ(1, filter->KeyLevel("1"));
+ ASSERT_EQ(1, filter->KeyLevel("2"));
+ ASSERT_EQ(1, filter->KeyLevel("4"));
+ ASSERT_EQ(1, filter->KeyLevel("8"));
+ }
+
+ delete filter;
+ delete db;
+ ASSERT_OK(DestroyDB(dbname_, options));
+}
+
+} // anonymous namespace
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/memtable.cc b/src/rocksdb/db/memtable.cc
new file mode 100644
index 000000000..45b139e80
--- /dev/null
+++ b/src/rocksdb/db/memtable.cc
@@ -0,0 +1,1675 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/memtable.h"
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <memory>
+
+#include "db/dbformat.h"
+#include "db/kv_checksum.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "db/wide/wide_column_serialization.h"
+#include "logging/logging.h"
+#include "memory/arena.h"
+#include "memory/memory_usage.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "port/port.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/merging_iterator.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ImmutableMemTableOptions::ImmutableMemTableOptions(
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options)
+ : arena_block_size(mutable_cf_options.arena_block_size),
+ memtable_prefix_bloom_bits(
+ static_cast<uint32_t>(
+ static_cast<double>(mutable_cf_options.write_buffer_size) *
+ mutable_cf_options.memtable_prefix_bloom_size_ratio) *
+ 8u),
+ memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size),
+ memtable_whole_key_filtering(
+ mutable_cf_options.memtable_whole_key_filtering),
+ inplace_update_support(ioptions.inplace_update_support),
+ inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks),
+ inplace_callback(ioptions.inplace_callback),
+ max_successive_merges(mutable_cf_options.max_successive_merges),
+ statistics(ioptions.stats),
+ merge_operator(ioptions.merge_operator.get()),
+ info_log(ioptions.logger),
+ allow_data_in_errors(ioptions.allow_data_in_errors),
+ protection_bytes_per_key(
+ mutable_cf_options.memtable_protection_bytes_per_key) {}
+
+MemTable::MemTable(const InternalKeyComparator& cmp,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ WriteBufferManager* write_buffer_manager,
+ SequenceNumber latest_seq, uint32_t column_family_id)
+ : comparator_(cmp),
+ moptions_(ioptions, mutable_cf_options),
+ refs_(0),
+ kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)),
+ mem_tracker_(write_buffer_manager),
+ arena_(moptions_.arena_block_size,
+ (write_buffer_manager != nullptr &&
+ (write_buffer_manager->enabled() ||
+ write_buffer_manager->cost_to_cache()))
+ ? &mem_tracker_
+ : nullptr,
+ mutable_cf_options.memtable_huge_page_size),
+ table_(ioptions.memtable_factory->CreateMemTableRep(
+ comparator_, &arena_, mutable_cf_options.prefix_extractor.get(),
+ ioptions.logger, column_family_id)),
+ range_del_table_(SkipListFactory().CreateMemTableRep(
+ comparator_, &arena_, nullptr /* transform */, ioptions.logger,
+ column_family_id)),
+ is_range_del_table_empty_(true),
+ data_size_(0),
+ num_entries_(0),
+ num_deletes_(0),
+ write_buffer_size_(mutable_cf_options.write_buffer_size),
+ flush_in_progress_(false),
+ flush_completed_(false),
+ file_number_(0),
+ first_seqno_(0),
+ earliest_seqno_(latest_seq),
+ creation_seq_(latest_seq),
+ mem_next_logfile_number_(0),
+ min_prep_log_referenced_(0),
+ locks_(moptions_.inplace_update_support
+ ? moptions_.inplace_update_num_locks
+ : 0),
+ prefix_extractor_(mutable_cf_options.prefix_extractor.get()),
+ flush_state_(FLUSH_NOT_REQUESTED),
+ clock_(ioptions.clock),
+ insert_with_hint_prefix_extractor_(
+ ioptions.memtable_insert_with_hint_prefix_extractor.get()),
+ oldest_key_time_(std::numeric_limits<uint64_t>::max()),
+ atomic_flush_seqno_(kMaxSequenceNumber),
+ approximate_memory_usage_(0) {
+ UpdateFlushState();
+ // something went wrong if we need to flush before inserting anything
+ assert(!ShouldScheduleFlush());
+
+ // use bloom_filter_ for both whole key and prefix bloom filter
+ if ((prefix_extractor_ || moptions_.memtable_whole_key_filtering) &&
+ moptions_.memtable_prefix_bloom_bits > 0) {
+ bloom_filter_.reset(
+ new DynamicBloom(&arena_, moptions_.memtable_prefix_bloom_bits,
+ 6 /* hard coded 6 probes */,
+ moptions_.memtable_huge_page_size, ioptions.logger));
+ }
+ // Initialize cached_range_tombstone_ here since it could
+ // be read before it is constructed in MemTable::Add(), which could also lead
+ // to a data race on the global mutex table backing atomic shared_ptr.
+ auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
+ size_t size = cached_range_tombstone_.Size();
+ for (size_t i = 0; i < size; ++i) {
+ std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
+ cached_range_tombstone_.AccessAtCore(i);
+ auto new_local_cache_ref = std::make_shared<
+ const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+ std::atomic_store_explicit(
+ local_cache_ref_ptr,
+ std::shared_ptr<FragmentedRangeTombstoneListCache>(new_local_cache_ref,
+ new_cache.get()),
+ std::memory_order_relaxed);
+ }
+}
+
+MemTable::~MemTable() {
+ mem_tracker_.FreeMem();
+ assert(refs_ == 0);
+}
+
+size_t MemTable::ApproximateMemoryUsage() {
+ autovector<size_t> usages = {
+ arena_.ApproximateMemoryUsage(), table_->ApproximateMemoryUsage(),
+ range_del_table_->ApproximateMemoryUsage(),
+ ROCKSDB_NAMESPACE::ApproximateMemoryUsage(insert_hints_)};
+ size_t total_usage = 0;
+ for (size_t usage : usages) {
+ // If usage + total_usage would exceed the maximum size_t value, return that
+ // maximum. The comparison below is written this way to avoid numeric overflow.
+ if (usage >= std::numeric_limits<size_t>::max() - total_usage) {
+ return std::numeric_limits<size_t>::max();
+ }
+ total_usage += usage;
+ }
+ approximate_memory_usage_.store(total_usage, std::memory_order_relaxed);
+ // otherwise, return the actual usage
+ return total_usage;
+}
+
+bool MemTable::ShouldFlushNow() {
+ size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
+ // Often we cannot allocate arena blocks that exactly match the buffer size.
+ // Thus we have to decide whether to over-allocate or
+ // under-allocate.
+ // This constant variable can be interpreted as: if we still have more than
+ // "kAllowOverAllocationRatio * kArenaBlockSize" space left, we'd try to over
+ // allocate one more block.
+ const double kAllowOverAllocationRatio = 0.6;
+
+ // If the arena still has room for a new block allocation, we can safely say
+ // it shouldn't flush.
+ auto allocated_memory = table_->ApproximateMemoryUsage() +
+ range_del_table_->ApproximateMemoryUsage() +
+ arena_.MemoryAllocatedBytes();
+
+ approximate_memory_usage_.store(allocated_memory, std::memory_order_relaxed);
+
+ // if we can still allocate one more block without exceeding the
+ // over-allocation ratio, then we should not flush.
+ if (allocated_memory + kArenaBlockSize <
+ write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+ return false;
+ }
+
+ // If the user keeps adding entries so that the total exceeds
+ // write_buffer_size, we need to flush earlier even though we still have much
+ // available memory left.
+ if (allocated_memory >
+ write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
+ return true;
+ }
+
+ // In this code path, Arena has already allocated its "last block", which
+ // means the total allocated memory size is either:
+ //  (1) "moderately" over-allocated (by no more than `0.6 * arena
+ //  block size`). Or,
+ // (2) the allocated memory is less than write buffer size, but we'll stop
+ // here since if we allocate a new arena block, we'll over allocate too much
+ // more (half of the arena block size) memory.
+ //
+ // In either case, to avoid over-allocating, the last block will stop
+ // allocation when its usage reaches a certain ratio, which we carefully
+ // choose "0.75
+ // full" as the stop condition because it addresses the following issue with
+ // great simplicity: What if the next inserted entry's size is
+ // bigger than AllocatedAndUnused()?
+ //
+ // The answer is: if the entry size is also bigger than 0.25 *
+ // kArenaBlockSize, a dedicated block will be allocated for it; otherwise
+ // arena will anyway skip the AllocatedAndUnused() and allocate a new, empty
+ // and regular block. In either case, we *overly* over-allocated.
+ //
+ // Therefore, setting the last block to be at most "0.75 full" avoids both
+ // cases.
+ //
+ // NOTE: the average fraction of wasted space with this approach can be
+ // estimated as "arena block size * 0.25 / write buffer size". Users who
+ // specify a small write buffer size and/or a big arena block size may suffer.
+ return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
+}
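A worked example of the thresholds above, with assumed sizes rather than RocksDB defaults: a 64 MB write buffer and 8 MB arena blocks, so the slack term kAllowOverAllocationRatio * kArenaBlockSize is 4.8 MB.

constexpr double kMB = 1024.0 * 1024.0;
constexpr double kWriteBuffer = 64 * kMB;     // assumed write_buffer_size
constexpr double kArenaBlock = 8 * kMB;       // assumed kArenaBlockSize
constexpr double kSlack = 0.6 * kArenaBlock;  // kAllowOverAllocationRatio term

// 55 MB allocated: another block still fits under the budget -> no flush.
static_assert(55 * kMB + kArenaBlock < kWriteBuffer + kSlack, "");
// 70 MB allocated: clearly over budget -> flush now.
static_assert(70 * kMB > kWriteBuffer + kSlack, "");
// Anything in between falls through to the "last block is 3/4 full" check,
// i.e. flush once AllocatedAndUnused() < kArenaBlock / 4 (2 MB here).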
+
+void MemTable::UpdateFlushState() {
+ auto state = flush_state_.load(std::memory_order_relaxed);
+ if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
+ // ignore CAS failure, because that means somebody else requested
+ // a flush
+ flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed);
+ }
+}
+
+void MemTable::UpdateOldestKeyTime() {
+ uint64_t oldest_key_time = oldest_key_time_.load(std::memory_order_relaxed);
+ if (oldest_key_time == std::numeric_limits<uint64_t>::max()) {
+ int64_t current_time = 0;
+ auto s = clock_->GetCurrentTime(&current_time);
+ if (s.ok()) {
+ assert(current_time >= 0);
+ // If fail, the timestamp is already set.
+ oldest_key_time_.compare_exchange_strong(
+ oldest_key_time, static_cast<uint64_t>(current_time),
+ std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+ }
+}
+
+Status MemTable::VerifyEntryChecksum(const char* entry,
+ size_t protection_bytes_per_key,
+ bool allow_data_in_errors) {
+ if (protection_bytes_per_key == 0) {
+ return Status::OK();
+ }
+ uint32_t key_length;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (key_ptr == nullptr) {
+ return Status::Corruption("Unable to parse internal key length");
+ }
+ if (key_length < 8) {
+ return Status::Corruption("Memtable entry internal key length too short.");
+ }
+ Slice user_key = Slice(key_ptr, key_length - 8);
+
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ SequenceNumber seq;
+ UnPackSequenceAndType(tag, &seq, &type);
+
+ uint32_t value_length = 0;
+ const char* value_ptr = GetVarint32Ptr(
+ key_ptr + key_length, key_ptr + key_length + 5, &value_length);
+ if (value_ptr == nullptr) {
+ return Status::Corruption("Unable to parse internal key value");
+ }
+ Slice value = Slice(value_ptr, value_length);
+
+ const char* checksum_ptr = value_ptr + value_length;
+ uint64_t expected = ProtectionInfo64()
+ .ProtectKVO(user_key, value, type)
+ .ProtectS(seq)
+ .GetVal();
+ bool match = true;
+ switch (protection_bytes_per_key) {
+ case 1:
+ match = static_cast<uint8_t>(checksum_ptr[0]) ==
+ static_cast<uint8_t>(expected);
+ break;
+ case 2:
+ match = DecodeFixed16(checksum_ptr) == static_cast<uint16_t>(expected);
+ break;
+ case 4:
+ match = DecodeFixed32(checksum_ptr) == static_cast<uint32_t>(expected);
+ break;
+ case 8:
+ match = DecodeFixed64(checksum_ptr) == expected;
+ break;
+ default:
+ assert(false);
+ }
+ if (!match) {
+ std::string msg(
+ "Corrupted memtable entry, per key-value checksum verification "
+ "failed.");
+ if (allow_data_in_errors) {
+ msg.append("Unrecognized value type: " +
+ std::to_string(static_cast<int>(type)) + ". ");
+ msg.append("User key: " + user_key.ToString(/*hex=*/true) + ". ");
+ msg.append("seq: " + std::to_string(seq) + ".");
+ }
+ return Status::Corruption(msg.c_str());
+ }
+ return Status::OK();
+}
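A sketch of the entry layout that Add() writes and VerifyEntryChecksum() parses, reusing only helpers already pulled in by this file (util/coding.h, db/dbformat.h, db/kv_checksum.h); BuildEntry is an illustrative name, not part of the sources.

std::string BuildEntry(const Slice& user_key, const Slice& value,
                       SequenceNumber seq) {
  std::string entry;
  PutVarint32(&entry, static_cast<uint32_t>(user_key.size()) + 8);  // key_size
  entry.append(user_key.data(), user_key.size());                   // user key
  PutFixed64(&entry, PackSequenceAndType(seq, kTypeValue));         // 8-byte tag
  PutVarint32(&entry, static_cast<uint32_t>(value.size()));         // value_size
  entry.append(value.data(), value.size());                         // value
  // Eight protection bytes covering (key, value, type) and the sequence.
  PutFixed64(&entry, ProtectionInfo64()
                         .ProtectKVO(user_key, value, kTypeValue)
                         .ProtectS(seq)
                         .GetVal());
  return entry;
}

// With protection_bytes_per_key == 8, VerifyEntryChecksum(entry.data(), 8)
// is expected to return OK, while flipping any payload byte should yield a
// Corruption status.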
+
+int MemTable::KeyComparator::operator()(const char* prefix_len_key1,
+ const char* prefix_len_key2) const {
+ // Internal keys are encoded as length-prefixed strings.
+ Slice k1 = GetLengthPrefixedSlice(prefix_len_key1);
+ Slice k2 = GetLengthPrefixedSlice(prefix_len_key2);
+ return comparator.CompareKeySeq(k1, k2);
+}
+
+int MemTable::KeyComparator::operator()(
+ const char* prefix_len_key, const KeyComparator::DecodedType& key) const {
+ // Internal keys are encoded as length-prefixed strings.
+ Slice a = GetLengthPrefixedSlice(prefix_len_key);
+ return comparator.CompareKeySeq(a, key);
+}
+
+void MemTableRep::InsertConcurrently(KeyHandle /*handle*/) {
+#ifndef ROCKSDB_LITE
+ throw std::runtime_error("concurrent insert not supported");
+#else
+ abort();
+#endif
+}
+
+Slice MemTableRep::UserKey(const char* key) const {
+ Slice slice = GetLengthPrefixedSlice(key);
+ return Slice(slice.data(), slice.size() - 8);
+}
+
+KeyHandle MemTableRep::Allocate(const size_t len, char** buf) {
+ *buf = allocator_->Allocate(len);
+ return static_cast<KeyHandle>(*buf);
+}
+
+// Encode a suitable internal key target for "target" and return it.
+// Uses *scratch as scratch space, and the returned pointer will point
+// into this scratch space.
+const char* EncodeKey(std::string* scratch, const Slice& target) {
+ scratch->clear();
+ PutVarint32(scratch, static_cast<uint32_t>(target.size()));
+ scratch->append(target.data(), target.size());
+ return scratch->data();
+}
+
+class MemTableIterator : public InternalIterator {
+ public:
+ MemTableIterator(const MemTable& mem, const ReadOptions& read_options,
+ Arena* arena, bool use_range_del_table = false)
+ : bloom_(nullptr),
+ prefix_extractor_(mem.prefix_extractor_),
+ comparator_(mem.comparator_),
+ valid_(false),
+ arena_mode_(arena != nullptr),
+ value_pinned_(
+ !mem.GetImmutableMemTableOptions()->inplace_update_support),
+ protection_bytes_per_key_(mem.moptions_.protection_bytes_per_key),
+ status_(Status::OK()),
+ logger_(mem.moptions_.info_log) {
+ if (use_range_del_table) {
+ iter_ = mem.range_del_table_->GetIterator(arena);
+ } else if (prefix_extractor_ != nullptr && !read_options.total_order_seek &&
+ !read_options.auto_prefix_mode) {
+ // Auto prefix mode is not implemented in memtable yet.
+ bloom_ = mem.bloom_filter_.get();
+ iter_ = mem.table_->GetDynamicPrefixIterator(arena);
+ } else {
+ iter_ = mem.table_->GetIterator(arena);
+ }
+ status_.PermitUncheckedError();
+ }
+ // No copying allowed
+ MemTableIterator(const MemTableIterator&) = delete;
+ void operator=(const MemTableIterator&) = delete;
+
+ ~MemTableIterator() override {
+#ifndef NDEBUG
+ // Assert that the MemTableIterator is never deleted while
+ // Pinning is Enabled.
+ assert(!pinned_iters_mgr_ || !pinned_iters_mgr_->PinningEnabled());
+#endif
+ if (arena_mode_) {
+ iter_->~Iterator();
+ } else {
+ delete iter_;
+ }
+ }
+
+#ifndef NDEBUG
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ }
+ PinnedIteratorsManager* pinned_iters_mgr_ = nullptr;
+#endif
+
+ bool Valid() const override { return valid_ && status_.ok(); }
+ void Seek(const Slice& k) override {
+ PERF_TIMER_GUARD(seek_on_memtable_time);
+ PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+ if (bloom_) {
+ // iterator should only use prefix bloom filter
+ auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size();
+ Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz));
+ if (prefix_extractor_->InDomain(user_k_without_ts)) {
+ if (!bloom_->MayContain(
+ prefix_extractor_->Transform(user_k_without_ts))) {
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ valid_ = false;
+ return;
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ }
+ }
+ iter_->Seek(k, nullptr);
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ }
+ void SeekForPrev(const Slice& k) override {
+ PERF_TIMER_GUARD(seek_on_memtable_time);
+ PERF_COUNTER_ADD(seek_on_memtable_count, 1);
+ if (bloom_) {
+ auto ts_sz = comparator_.comparator.user_comparator()->timestamp_size();
+ Slice user_k_without_ts(ExtractUserKeyAndStripTimestamp(k, ts_sz));
+ if (prefix_extractor_->InDomain(user_k_without_ts)) {
+ if (!bloom_->MayContain(
+ prefix_extractor_->Transform(user_k_without_ts))) {
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ valid_ = false;
+ return;
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ }
+ }
+ iter_->Seek(k, nullptr);
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ if (!Valid() && status().ok()) {
+ SeekToLast();
+ }
+ while (Valid() && comparator_.comparator.Compare(k, key()) < 0) {
+ Prev();
+ }
+ }
+ void SeekToFirst() override {
+ iter_->SeekToFirst();
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ }
+ void SeekToLast() override {
+ iter_->SeekToLast();
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ }
+ void Next() override {
+ PERF_COUNTER_ADD(next_on_memtable_count, 1);
+ assert(Valid());
+ iter_->Next();
+ TEST_SYNC_POINT_CALLBACK("MemTableIterator::Next:0", iter_);
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ }
+ bool NextAndGetResult(IterateResult* result) override {
+ Next();
+ bool is_valid = Valid();
+ if (is_valid) {
+ result->key = key();
+ result->bound_check_result = IterBoundCheck::kUnknown;
+ result->value_prepared = true;
+ }
+ return is_valid;
+ }
+ void Prev() override {
+ PERF_COUNTER_ADD(prev_on_memtable_count, 1);
+ assert(Valid());
+ iter_->Prev();
+ valid_ = iter_->Valid();
+ VerifyEntryChecksum();
+ }
+ Slice key() const override {
+ assert(Valid());
+ return GetLengthPrefixedSlice(iter_->key());
+ }
+ Slice value() const override {
+ assert(Valid());
+ Slice key_slice = GetLengthPrefixedSlice(iter_->key());
+ return GetLengthPrefixedSlice(key_slice.data() + key_slice.size());
+ }
+
+ Status status() const override { return status_; }
+
+ bool IsKeyPinned() const override {
+ // memtable data is always pinned
+ return true;
+ }
+
+ bool IsValuePinned() const override {
+ // memtable value is always pinned, except if we allow inplace update.
+ return value_pinned_;
+ }
+
+ private:
+ DynamicBloom* bloom_;
+ const SliceTransform* const prefix_extractor_;
+ const MemTable::KeyComparator comparator_;
+ MemTableRep::Iterator* iter_;
+ bool valid_;
+ bool arena_mode_;
+ bool value_pinned_;
+ size_t protection_bytes_per_key_;
+ Status status_;
+ Logger* logger_;
+
+ void VerifyEntryChecksum() {
+ if (protection_bytes_per_key_ > 0 && Valid()) {
+ status_ = MemTable::VerifyEntryChecksum(iter_->key(),
+ protection_bytes_per_key_);
+ if (!status_.ok()) {
+ ROCKS_LOG_ERROR(logger_, "In MemtableIterator: %s", status_.getState());
+ }
+ }
+ }
+};
+
+InternalIterator* MemTable::NewIterator(const ReadOptions& read_options,
+ Arena* arena) {
+ assert(arena != nullptr);
+ auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
+ return new (mem) MemTableIterator(*this, read_options, arena);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIterator(
+ const ReadOptions& read_options, SequenceNumber read_seq,
+ bool immutable_memtable) {
+ if (read_options.ignore_range_deletions ||
+ is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+ return nullptr;
+ }
+ return NewRangeTombstoneIteratorInternal(read_options, read_seq,
+ immutable_memtable);
+}
+
+FragmentedRangeTombstoneIterator* MemTable::NewRangeTombstoneIteratorInternal(
+ const ReadOptions& read_options, SequenceNumber read_seq,
+ bool immutable_memtable) {
+ if (immutable_memtable) {
+ // Note that caller should already have verified that
+ // !is_range_del_table_empty_
+ assert(IsFragmentedRangeTombstonesConstructed());
+ return new FragmentedRangeTombstoneIterator(
+ fragmented_range_tombstone_list_.get(), comparator_.comparator,
+ read_seq, read_options.timestamp);
+ }
+
+ // takes current cache
+ std::shared_ptr<FragmentedRangeTombstoneListCache> cache =
+ std::atomic_load_explicit(cached_range_tombstone_.Access(),
+ std::memory_order_relaxed);
+ // construct fragmented tombstone list if necessary
+ if (!cache->initialized.load(std::memory_order_acquire)) {
+ cache->reader_mutex.lock();
+ if (!cache->tombstones) {
+ auto* unfragmented_iter =
+ new MemTableIterator(*this, read_options, nullptr /* arena */,
+ true /* use_range_del_table */);
+ cache->tombstones.reset(new FragmentedRangeTombstoneList(
+ std::unique_ptr<InternalIterator>(unfragmented_iter),
+ comparator_.comparator));
+ cache->initialized.store(true, std::memory_order_release);
+ }
+ cache->reader_mutex.unlock();
+ }
+
+ auto* fragmented_iter = new FragmentedRangeTombstoneIterator(
+ cache, comparator_.comparator, read_seq, read_options.timestamp);
+ return fragmented_iter;
+}
+
+void MemTable::ConstructFragmentedRangeTombstones() {
+ assert(!IsFragmentedRangeTombstonesConstructed(false));
+ // There should be no concurrent Construction
+ if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) {
+ auto* unfragmented_iter =
+ new MemTableIterator(*this, ReadOptions(), nullptr /* arena */,
+ true /* use_range_del_table */);
+
+ fragmented_range_tombstone_list_ =
+ std::make_unique<FragmentedRangeTombstoneList>(
+ std::unique_ptr<InternalIterator>(unfragmented_iter),
+ comparator_.comparator);
+ }
+}
+
+port::RWMutex* MemTable::GetLock(const Slice& key) {
+ return &locks_[GetSliceRangedNPHash(key, locks_.size())];
+}
+
+MemTable::MemTableStats MemTable::ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey) {
+ uint64_t entry_count = table_->ApproximateNumEntries(start_ikey, end_ikey);
+ entry_count += range_del_table_->ApproximateNumEntries(start_ikey, end_ikey);
+ if (entry_count == 0) {
+ return {0, 0};
+ }
+ uint64_t n = num_entries_.load(std::memory_order_relaxed);
+ if (n == 0) {
+ return {0, 0};
+ }
+ if (entry_count > n) {
+ // (range_del_)table_->ApproximateNumEntries() is just an estimate so it can
+ // be larger than actual entries we have. Cap it to entries we have to limit
+ // the inaccuracy.
+ entry_count = n;
+ }
+ uint64_t data_size = data_size_.load(std::memory_order_relaxed);
+ return {entry_count * (data_size / n), entry_count};
+}
+
+Status MemTable::VerifyEncodedEntry(Slice encoded,
+ const ProtectionInfoKVOS64& kv_prot_info) {
+ uint32_t ikey_len = 0;
+ if (!GetVarint32(&encoded, &ikey_len)) {
+ return Status::Corruption("Unable to parse internal key length");
+ }
+ size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+ if (ikey_len < 8 + ts_sz) {
+ return Status::Corruption("Internal key length too short");
+ }
+ if (ikey_len > encoded.size()) {
+ return Status::Corruption("Internal key length too long");
+ }
+ uint32_t value_len = 0;
+ const size_t user_key_len = ikey_len - 8;
+ Slice key(encoded.data(), user_key_len);
+ encoded.remove_prefix(user_key_len);
+
+ uint64_t packed = DecodeFixed64(encoded.data());
+ ValueType value_type = kMaxValue;
+ SequenceNumber sequence_number = kMaxSequenceNumber;
+ UnPackSequenceAndType(packed, &sequence_number, &value_type);
+ encoded.remove_prefix(8);
+
+ if (!GetVarint32(&encoded, &value_len)) {
+ return Status::Corruption("Unable to parse value length");
+ }
+ if (value_len < encoded.size()) {
+ return Status::Corruption("Value length too short");
+ }
+ if (value_len > encoded.size()) {
+ return Status::Corruption("Value length too long");
+ }
+ Slice value(encoded.data(), value_len);
+
+ return kv_prot_info.StripS(sequence_number)
+ .StripKVO(key, value, value_type)
+ .GetStatus();
+}
+
+void MemTable::UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
+ const Slice& key, const Slice& value,
+ ValueType type, SequenceNumber s,
+ char* checksum_ptr) {
+ if (moptions_.protection_bytes_per_key == 0) {
+ return;
+ }
+
+ uint64_t checksum = 0;
+ if (kv_prot_info == nullptr) {
+ checksum =
+ ProtectionInfo64().ProtectKVO(key, value, type).ProtectS(s).GetVal();
+ } else {
+ checksum = kv_prot_info->GetVal();
+ }
+ switch (moptions_.protection_bytes_per_key) {
+ case 1:
+ checksum_ptr[0] = static_cast<uint8_t>(checksum);
+ break;
+ case 2:
+ EncodeFixed16(checksum_ptr, static_cast<uint16_t>(checksum));
+ break;
+ case 4:
+ EncodeFixed32(checksum_ptr, static_cast<uint32_t>(checksum));
+ break;
+ case 8:
+ EncodeFixed64(checksum_ptr, checksum);
+ break;
+ default:
+ assert(false);
+ }
+}
+
+Status MemTable::Add(SequenceNumber s, ValueType type,
+ const Slice& key, /* user key */
+ const Slice& value,
+ const ProtectionInfoKVOS64* kv_prot_info,
+ bool allow_concurrent,
+ MemTablePostProcessInfo* post_process_info, void** hint) {
+ // Format of an entry is concatenation of:
+ // key_size : varint32 of internal_key.size()
+ // key bytes : char[internal_key.size()]
+ // value_size : varint32 of value.size()
+ // value bytes : char[value.size()]
+ // checksum : char[moptions_.protection_bytes_per_key]
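+ //
+ // Illustrative example (not from the original comment): with
+ // protection_bytes_per_key == 0, user key "abc", value "v", sequence
+ // number 100 and type kTypeValue, the encoded entry would be
+ //   varint32(11) | "abc" | fixed64((100 << 8) | kTypeValue) | varint32(1) | "v"
+ // for an encoded_len of 1 + 11 + 1 + 1 = 14 bytes.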
+ uint32_t key_size = static_cast<uint32_t>(key.size());
+ uint32_t val_size = static_cast<uint32_t>(value.size());
+ uint32_t internal_key_size = key_size + 8;
+ const uint32_t encoded_len = VarintLength(internal_key_size) +
+ internal_key_size + VarintLength(val_size) +
+ val_size + moptions_.protection_bytes_per_key;
+ char* buf = nullptr;
+ std::unique_ptr<MemTableRep>& table =
+ type == kTypeRangeDeletion ? range_del_table_ : table_;
+ KeyHandle handle = table->Allocate(encoded_len, &buf);
+
+ char* p = EncodeVarint32(buf, internal_key_size);
+ memcpy(p, key.data(), key_size);
+ Slice key_slice(p, key_size);
+ p += key_size;
+ uint64_t packed = PackSequenceAndType(s, type);
+ EncodeFixed64(p, packed);
+ p += 8;
+ p = EncodeVarint32(p, val_size);
+ memcpy(p, value.data(), val_size);
+ assert((unsigned)(p + val_size - buf + moptions_.protection_bytes_per_key) ==
+ (unsigned)encoded_len);
+
+ UpdateEntryChecksum(kv_prot_info, key, value, type, s,
+ buf + encoded_len - moptions_.protection_bytes_per_key);
+ Slice encoded(buf, encoded_len - moptions_.protection_bytes_per_key);
+ if (kv_prot_info != nullptr) {
+ TEST_SYNC_POINT_CALLBACK("MemTable::Add:Encoded", &encoded);
+ Status status = VerifyEncodedEntry(encoded, *kv_prot_info);
+ if (!status.ok()) {
+ return status;
+ }
+ }
+
+ size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+ Slice key_without_ts = StripTimestampFromUserKey(key, ts_sz);
+
+ if (!allow_concurrent) {
+ // Extract prefix for insert with hint.
+ if (insert_with_hint_prefix_extractor_ != nullptr &&
+ insert_with_hint_prefix_extractor_->InDomain(key_slice)) {
+ Slice prefix = insert_with_hint_prefix_extractor_->Transform(key_slice);
+ bool res = table->InsertKeyWithHint(handle, &insert_hints_[prefix]);
+ if (UNLIKELY(!res)) {
+ return Status::TryAgain("key+seq exists");
+ }
+ } else {
+ bool res = table->InsertKey(handle);
+ if (UNLIKELY(!res)) {
+ return Status::TryAgain("key+seq exists");
+ }
+ }
+
+ // This is a bit ugly, but it avoids locked read-modify-write instructions
+ // when incrementing an atomic.
+ num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
+ std::memory_order_relaxed);
+ data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
+ std::memory_order_relaxed);
+ if (type == kTypeDeletion || type == kTypeSingleDeletion ||
+ type == kTypeDeletionWithTimestamp) {
+ num_deletes_.store(num_deletes_.load(std::memory_order_relaxed) + 1,
+ std::memory_order_relaxed);
+ }
+
+ if (bloom_filter_ && prefix_extractor_ &&
+ prefix_extractor_->InDomain(key_without_ts)) {
+ bloom_filter_->Add(prefix_extractor_->Transform(key_without_ts));
+ }
+ if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+ bloom_filter_->Add(key_without_ts);
+ }
+
+ // The first sequence number inserted into the memtable
+ assert(first_seqno_ == 0 || s >= first_seqno_);
+ if (first_seqno_ == 0) {
+ first_seqno_.store(s, std::memory_order_relaxed);
+
+ if (earliest_seqno_ == kMaxSequenceNumber) {
+ earliest_seqno_.store(GetFirstSequenceNumber(),
+ std::memory_order_relaxed);
+ }
+ assert(first_seqno_.load() >= earliest_seqno_.load());
+ }
+ assert(post_process_info == nullptr);
+ UpdateFlushState();
+ } else {
+ bool res = (hint == nullptr)
+ ? table->InsertKeyConcurrently(handle)
+ : table->InsertKeyWithHintConcurrently(handle, hint);
+ if (UNLIKELY(!res)) {
+ return Status::TryAgain("key+seq exists");
+ }
+
+ assert(post_process_info != nullptr);
+ post_process_info->num_entries++;
+ post_process_info->data_size += encoded_len;
+ if (type == kTypeDeletion) {
+ post_process_info->num_deletes++;
+ }
+
+ if (bloom_filter_ && prefix_extractor_ &&
+ prefix_extractor_->InDomain(key_without_ts)) {
+ bloom_filter_->AddConcurrently(
+ prefix_extractor_->Transform(key_without_ts));
+ }
+ if (bloom_filter_ && moptions_.memtable_whole_key_filtering) {
+ bloom_filter_->AddConcurrently(key_without_ts);
+ }
+
+ // atomically update first_seqno_ and earliest_seqno_.
+ uint64_t cur_seq_num = first_seqno_.load(std::memory_order_relaxed);
+ while ((cur_seq_num == 0 || s < cur_seq_num) &&
+ !first_seqno_.compare_exchange_weak(cur_seq_num, s)) {
+ }
+ uint64_t cur_earliest_seqno =
+ earliest_seqno_.load(std::memory_order_relaxed);
+ while (
+ (cur_earliest_seqno == kMaxSequenceNumber || s < cur_earliest_seqno) &&
+ !earliest_seqno_.compare_exchange_weak(cur_earliest_seqno, s)) {
+ }
+ }
+ if (type == kTypeRangeDeletion) {
+ auto new_cache = std::make_shared<FragmentedRangeTombstoneListCache>();
+ size_t size = cached_range_tombstone_.Size();
+ if (allow_concurrent) {
+ range_del_mutex_.lock();
+ }
+ for (size_t i = 0; i < size; ++i) {
+ std::shared_ptr<FragmentedRangeTombstoneListCache>* local_cache_ref_ptr =
+ cached_range_tombstone_.AccessAtCore(i);
+ auto new_local_cache_ref = std::make_shared<
+ const std::shared_ptr<FragmentedRangeTombstoneListCache>>(new_cache);
+ // It is okay for a reader to load the old cache during invalidation, as
+ // the new sequence number has not been published yet.
+ // Each core holds a shared_ptr to a shared_ptr to the cached fragmented
+ // range tombstones, so the reference count is maintained locally per core
+ // via the per-core shared_ptr.
+ std::atomic_store_explicit(
+ local_cache_ref_ptr,
+ std::shared_ptr<FragmentedRangeTombstoneListCache>(
+ new_local_cache_ref, new_cache.get()),
+ std::memory_order_relaxed);
+ }
+ if (allow_concurrent) {
+ range_del_mutex_.unlock();
+ }
+ is_range_del_table_empty_.store(false, std::memory_order_relaxed);
+ }
+ UpdateOldestKeyTime();
+
+ TEST_SYNC_POINT_CALLBACK("MemTable::Add:BeforeReturn:Encoded", &encoded);
+ return Status::OK();
+}
+
+// Callback from MemTable::Get()
+namespace {
+
+struct Saver {
+ Status* status;
+ const LookupKey* key;
+ bool* found_final_value; // Is value set correctly? Used by KeyMayExist
+ bool* merge_in_progress;
+ std::string* value;
+ PinnableWideColumns* columns;
+ SequenceNumber seq;
+ std::string* timestamp;
+ const MergeOperator* merge_operator;
+ // The merge operations encountered so far.
+ MergeContext* merge_context;
+ SequenceNumber max_covering_tombstone_seq;
+ MemTable* mem;
+ Logger* logger;
+ Statistics* statistics;
+ bool inplace_update_support;
+ bool do_merge;
+ SystemClock* clock;
+
+ ReadCallback* callback_;
+ bool* is_blob_index;
+ bool allow_data_in_errors;
+ size_t protection_bytes_per_key;
+ bool CheckCallback(SequenceNumber _seq) {
+ if (callback_) {
+ return callback_->IsVisible(_seq);
+ }
+ return true;
+ }
+};
+} // anonymous namespace
+
+static bool SaveValue(void* arg, const char* entry) {
+ TEST_SYNC_POINT_CALLBACK("Memtable::SaveValue:Begin:entry", &entry);
+ Saver* s = reinterpret_cast<Saver*>(arg);
+ assert(s != nullptr);
+ assert(!s->value || !s->columns);
+
+ if (s->protection_bytes_per_key > 0) {
+ *(s->status) = MemTable::VerifyEntryChecksum(
+ entry, s->protection_bytes_per_key, s->allow_data_in_errors);
+ if (!s->status->ok()) {
+ ROCKS_LOG_ERROR(s->logger, "In SaveValue: %s", s->status->getState());
+ // Memtable entry corrupted
+ return false;
+ }
+ }
+
+ MergeContext* merge_context = s->merge_context;
+ SequenceNumber max_covering_tombstone_seq = s->max_covering_tombstone_seq;
+ const MergeOperator* merge_operator = s->merge_operator;
+
+ assert(merge_context != nullptr);
+
+ // Refer to comments under MemTable::Add() for entry format.
+ // Check that it belongs to same user key.
+ uint32_t key_length = 0;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ assert(key_length >= 8);
+ Slice user_key_slice = Slice(key_ptr, key_length - 8);
+ const Comparator* user_comparator =
+ s->mem->GetInternalKeyComparator().user_comparator();
+ size_t ts_sz = user_comparator->timestamp_size();
+ if (ts_sz && s->timestamp && max_covering_tombstone_seq > 0) {
+ // timestamp should already be set to range tombstone timestamp
+ assert(s->timestamp->size() == ts_sz);
+ }
+ if (user_comparator->EqualWithoutTimestamp(user_key_slice,
+ s->key->user_key())) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ SequenceNumber seq;
+ UnPackSequenceAndType(tag, &seq, &type);
+ // If the value is not in the snapshot, skip it
+ if (!s->CheckCallback(seq)) {
+ return true; // to continue to the next seq
+ }
+
+ if (s->seq == kMaxSequenceNumber) {
+ s->seq = seq;
+ if (s->seq > max_covering_tombstone_seq) {
+ if (ts_sz && s->timestamp != nullptr) {
+ // `timestamp` was set to range tombstone's timestamp before
+ // `SaveValue` is ever called. This key has a higher sequence number
+ // than range tombstone, and is the key with the highest seqno across
+ // all keys with this user_key, so we update timestamp here.
+ Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz);
+ s->timestamp->assign(ts.data(), ts_sz);
+ }
+ } else {
+ s->seq = max_covering_tombstone_seq;
+ }
+ }
+
+ if (ts_sz > 0 && s->timestamp != nullptr) {
+ if (!s->timestamp->empty()) {
+ assert(ts_sz == s->timestamp->size());
+ }
+ // TODO optimize for smaller size ts
+ const std::string kMaxTs(ts_sz, '\xff');
+ if (s->timestamp->empty() ||
+ user_comparator->CompareTimestamp(*(s->timestamp), kMaxTs) == 0) {
+ Slice ts = ExtractTimestampFromUserKey(user_key_slice, ts_sz);
+ s->timestamp->assign(ts.data(), ts_sz);
+ }
+ }
+
+ if ((type == kTypeValue || type == kTypeMerge || type == kTypeBlobIndex ||
+ type == kTypeWideColumnEntity || type == kTypeDeletion ||
+ type == kTypeSingleDeletion || type == kTypeDeletionWithTimestamp) &&
+ max_covering_tombstone_seq > seq) {
+ type = kTypeRangeDeletion;
+ }
+ switch (type) {
+ case kTypeBlobIndex: {
+ if (!s->do_merge) {
+ *(s->status) = Status::NotSupported(
+ "GetMergeOperands not supported by stacked BlobDB");
+ *(s->found_final_value) = true;
+ return false;
+ }
+
+ if (*(s->merge_in_progress)) {
+ *(s->status) = Status::NotSupported(
+ "Merge operator not supported by stacked BlobDB");
+ *(s->found_final_value) = true;
+ return false;
+ }
+
+ if (s->is_blob_index == nullptr) {
+ ROCKS_LOG_ERROR(s->logger, "Encountered unexpected blob index.");
+ *(s->status) = Status::NotSupported(
+ "Encountered unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB.");
+ *(s->found_final_value) = true;
+ return false;
+ }
+
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadLock();
+ }
+
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+
+ *(s->status) = Status::OK();
+
+ if (s->value) {
+ s->value->assign(v.data(), v.size());
+ } else if (s->columns) {
+ s->columns->SetPlainValue(v);
+ }
+
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadUnlock();
+ }
+
+ *(s->found_final_value) = true;
+ *(s->is_blob_index) = true;
+
+ return false;
+ }
+ case kTypeValue: {
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadLock();
+ }
+
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+
+ *(s->status) = Status::OK();
+
+ if (!s->do_merge) {
+ // Preserve the value with the goal of returning it as part of
+ // raw merge operands to the user
+ // TODO(yanqin) update MergeContext so that timestamp information
+ // can also be retained.
+
+ merge_context->PushOperand(
+ v, s->inplace_update_support == false /* operand_pinned */);
+ } else if (*(s->merge_in_progress)) {
+ assert(s->do_merge);
+
+ if (s->value || s->columns) {
+ std::string result;
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), &v,
+ merge_context->GetOperands(), &result, s->logger, s->statistics,
+ s->clock, /* result_operand */ nullptr,
+ /* update_num_ops_stats */ true);
+
+ if (s->status->ok()) {
+ if (s->value) {
+ *(s->value) = std::move(result);
+ } else {
+ assert(s->columns);
+ s->columns->SetPlainValue(result);
+ }
+ }
+ }
+ } else if (s->value) {
+ s->value->assign(v.data(), v.size());
+ } else if (s->columns) {
+ s->columns->SetPlainValue(v);
+ }
+
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadUnlock();
+ }
+
+ *(s->found_final_value) = true;
+
+ if (s->is_blob_index != nullptr) {
+ *(s->is_blob_index) = false;
+ }
+
+ return false;
+ }
+ case kTypeWideColumnEntity: {
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadLock();
+ }
+
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+
+ *(s->status) = Status::OK();
+
+ if (!s->do_merge) {
+ // Preserve the value with the goal of returning it as part of
+ // raw merge operands to the user
+
+ Slice value_of_default;
+ *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn(
+ v, value_of_default);
+
+ if (s->status->ok()) {
+ merge_context->PushOperand(
+ value_of_default,
+ s->inplace_update_support == false /* operand_pinned */);
+ }
+ } else if (*(s->merge_in_progress)) {
+ assert(s->do_merge);
+
+ if (s->value) {
+ Slice value_of_default;
+ *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn(
+ v, value_of_default);
+ if (s->status->ok()) {
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), &value_of_default,
+ merge_context->GetOperands(), s->value, s->logger,
+ s->statistics, s->clock, /* result_operand */ nullptr,
+ /* update_num_ops_stats */ true);
+ }
+ } else if (s->columns) {
+ std::string result;
+ *(s->status) = MergeHelper::TimedFullMergeWithEntity(
+ merge_operator, s->key->user_key(), v,
+ merge_context->GetOperands(), &result, s->logger, s->statistics,
+ s->clock, /* update_num_ops_stats */ true);
+
+ if (s->status->ok()) {
+ *(s->status) = s->columns->SetWideColumnValue(result);
+ }
+ }
+ } else if (s->value) {
+ Slice value_of_default;
+ *(s->status) = WideColumnSerialization::GetValueOfDefaultColumn(
+ v, value_of_default);
+ if (s->status->ok()) {
+ s->value->assign(value_of_default.data(), value_of_default.size());
+ }
+ } else if (s->columns) {
+ *(s->status) = s->columns->SetWideColumnValue(v);
+ }
+
+ if (s->inplace_update_support) {
+ s->mem->GetLock(s->key->user_key())->ReadUnlock();
+ }
+
+ *(s->found_final_value) = true;
+
+ if (s->is_blob_index != nullptr) {
+ *(s->is_blob_index) = false;
+ }
+
+ return false;
+ }
+ case kTypeDeletion:
+ case kTypeDeletionWithTimestamp:
+ case kTypeSingleDeletion:
+ case kTypeRangeDeletion: {
+ if (*(s->merge_in_progress)) {
+ if (s->value || s->columns) {
+ std::string result;
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), nullptr,
+ merge_context->GetOperands(), &result, s->logger, s->statistics,
+ s->clock, /* result_operand */ nullptr,
+ /* update_num_ops_stats */ true);
+
+ if (s->status->ok()) {
+ if (s->value) {
+ *(s->value) = std::move(result);
+ } else {
+ assert(s->columns);
+ s->columns->SetPlainValue(result);
+ }
+ }
+ }
+ } else {
+ *(s->status) = Status::NotFound();
+ }
+ *(s->found_final_value) = true;
+ return false;
+ }
+ case kTypeMerge: {
+ if (!merge_operator) {
+ *(s->status) = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ // Normally we continue the loop (return true) when we see a merge
+ // operand. But in case of an error, we should stop the loop
+ // immediately and pretend we have found the value to stop further
+ // seek. Otherwise, the later call will override this error status.
+ *(s->found_final_value) = true;
+ return false;
+ }
+ Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
+ *(s->merge_in_progress) = true;
+ merge_context->PushOperand(
+ v, s->inplace_update_support == false /* operand_pinned */);
+ if (s->do_merge && merge_operator->ShouldMerge(
+ merge_context->GetOperandsDirectionBackward())) {
+ if (s->value || s->columns) {
+ std::string result;
+ *(s->status) = MergeHelper::TimedFullMerge(
+ merge_operator, s->key->user_key(), nullptr,
+ merge_context->GetOperands(), &result, s->logger, s->statistics,
+ s->clock, /* result_operand */ nullptr,
+ /* update_num_ops_stats */ true);
+
+ if (s->status->ok()) {
+ if (s->value) {
+ *(s->value) = std::move(result);
+ } else {
+ assert(s->columns);
+ s->columns->SetPlainValue(result);
+ }
+ }
+ }
+
+ *(s->found_final_value) = true;
+ return false;
+ }
+ return true;
+ }
+ default: {
+ std::string msg("Corrupted value not expected.");
+ if (s->allow_data_in_errors) {
+ msg.append("Unrecognized value type: " +
+ std::to_string(static_cast<int>(type)) + ". ");
+ msg.append("User key: " + user_key_slice.ToString(/*hex=*/true) +
+ ". ");
+ msg.append("seq: " + std::to_string(seq) + ".");
+ }
+ *(s->status) = Status::Corruption(msg.c_str());
+ return false;
+ }
+ }
+ }
+
+ // s->status could be Corruption, MergeInProgress, or NotFound.
+ return false;
+}
+
+bool MemTable::Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp,
+ Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ bool immutable_memtable, ReadCallback* callback,
+ bool* is_blob_index, bool do_merge) {
+ // The sequence number is updated synchronously in version_set.h
+ if (IsEmpty()) {
+ // Avoid recording stats, for speed.
+ return false;
+ }
+ PERF_TIMER_GUARD(get_from_memtable_time);
+
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ NewRangeTombstoneIterator(read_opts,
+ GetInternalKeySeqno(key.internal_key()),
+ immutable_memtable));
+ if (range_del_iter != nullptr) {
+ SequenceNumber covering_seq =
+ range_del_iter->MaxCoveringTombstoneSeqnum(key.user_key());
+ if (covering_seq > *max_covering_tombstone_seq) {
+ *max_covering_tombstone_seq = covering_seq;
+ if (timestamp) {
+ // Will be overwritten in SaveValue() if there is a point key with
+ // a higher seqno.
+ timestamp->assign(range_del_iter->timestamp().data(),
+ range_del_iter->timestamp().size());
+ }
+ }
+ }
+
+ bool found_final_value = false;
+ bool merge_in_progress = s->IsMergeInProgress();
+ bool may_contain = true;
+ size_t ts_sz = GetInternalKeyComparator().user_comparator()->timestamp_size();
+ Slice user_key_without_ts = StripTimestampFromUserKey(key.user_key(), ts_sz);
+ bool bloom_checked = false;
+ if (bloom_filter_) {
+ // when both memtable_whole_key_filtering and prefix_extractor_ are set,
+ // only do whole key filtering for Get() to save CPU
+ if (moptions_.memtable_whole_key_filtering) {
+ may_contain = bloom_filter_->MayContain(user_key_without_ts);
+ bloom_checked = true;
+ } else {
+ assert(prefix_extractor_);
+ if (prefix_extractor_->InDomain(user_key_without_ts)) {
+ may_contain = bloom_filter_->MayContain(
+ prefix_extractor_->Transform(user_key_without_ts));
+ bloom_checked = true;
+ }
+ }
+ }
+
+ if (bloom_filter_ && !may_contain) {
+ // The memtable Bloom filter said the key does not exist; skip the lookup.
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ *seq = kMaxSequenceNumber;
+ } else {
+ if (bloom_checked) {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback,
+ is_blob_index, value, columns, timestamp, s, merge_context,
+ seq, &found_final_value, &merge_in_progress);
+ }
+
+ // No change to value, since we have not yet found a Put/Delete
+ // Propagate corruption error
+ if (!found_final_value && merge_in_progress && !s->IsCorruption()) {
+ *s = Status::MergeInProgress();
+ }
+ PERF_COUNTER_ADD(get_from_memtable_count, 1);
+ return found_final_value;
+}
+
+void MemTable::GetFromTable(const LookupKey& key,
+ SequenceNumber max_covering_tombstone_seq,
+ bool do_merge, ReadCallback* callback,
+ bool* is_blob_index, std::string* value,
+ PinnableWideColumns* columns,
+ std::string* timestamp, Status* s,
+ MergeContext* merge_context, SequenceNumber* seq,
+ bool* found_final_value, bool* merge_in_progress) {
+ Saver saver;
+ saver.status = s;
+ saver.found_final_value = found_final_value;
+ saver.merge_in_progress = merge_in_progress;
+ saver.key = &key;
+ saver.value = value;
+ saver.columns = columns;
+ saver.timestamp = timestamp;
+ saver.seq = kMaxSequenceNumber;
+ saver.mem = this;
+ saver.merge_context = merge_context;
+ saver.max_covering_tombstone_seq = max_covering_tombstone_seq;
+ saver.merge_operator = moptions_.merge_operator;
+ saver.logger = moptions_.info_log;
+ saver.inplace_update_support = moptions_.inplace_update_support;
+ saver.statistics = moptions_.statistics;
+ saver.clock = clock_;
+ saver.callback_ = callback;
+ saver.is_blob_index = is_blob_index;
+ saver.do_merge = do_merge;
+ saver.allow_data_in_errors = moptions_.allow_data_in_errors;
+ saver.protection_bytes_per_key = moptions_.protection_bytes_per_key;
+ table_->Get(key, &saver, SaveValue);
+ *seq = saver.seq;
+}
+
+void MemTable::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool immutable_memtable) {
+ // The sequence number is updated synchronously in version_set.h
+ if (IsEmpty()) {
+ // Avoid recording stats, for speed.
+ return;
+ }
+ PERF_TIMER_GUARD(get_from_memtable_time);
+
+ // For now, memtable Bloom filter is effectively disabled if there are any
+ // range tombstones. This is the simplest way to ensure range tombstones are
+ // handled. TODO: allow Bloom checks where max_covering_tombstone_seq==0
+ bool no_range_del = read_options.ignore_range_deletions ||
+ is_range_del_table_empty_.load(std::memory_order_relaxed);
+ MultiGetRange temp_range(*range, range->begin(), range->end());
+ if (bloom_filter_ && no_range_del) {
+ bool whole_key =
+ !prefix_extractor_ || moptions_.memtable_whole_key_filtering;
+ std::array<Slice, MultiGetContext::MAX_BATCH_SIZE> bloom_keys;
+ std::array<bool, MultiGetContext::MAX_BATCH_SIZE> may_match;
+ std::array<size_t, MultiGetContext::MAX_BATCH_SIZE> range_indexes;
+ int num_keys = 0;
+ for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+ if (whole_key) {
+ bloom_keys[num_keys] = iter->ukey_without_ts;
+ range_indexes[num_keys++] = iter.index();
+ } else if (prefix_extractor_->InDomain(iter->ukey_without_ts)) {
+ bloom_keys[num_keys] =
+ prefix_extractor_->Transform(iter->ukey_without_ts);
+ range_indexes[num_keys++] = iter.index();
+ }
+ }
+ bloom_filter_->MayContain(num_keys, &bloom_keys[0], &may_match[0]);
+ for (int i = 0; i < num_keys; ++i) {
+ if (!may_match[i]) {
+ temp_range.SkipIndex(range_indexes[i]);
+ PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+ } else {
+ PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+ }
+ }
+ }
+ for (auto iter = temp_range.begin(); iter != temp_range.end(); ++iter) {
+ bool found_final_value{false};
+ bool merge_in_progress = iter->s->IsMergeInProgress();
+ if (!no_range_del) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ NewRangeTombstoneIteratorInternal(
+ read_options, GetInternalKeySeqno(iter->lkey->internal_key()),
+ immutable_memtable));
+ SequenceNumber covering_seq =
+ range_del_iter->MaxCoveringTombstoneSeqnum(iter->lkey->user_key());
+ if (covering_seq > iter->max_covering_tombstone_seq) {
+ iter->max_covering_tombstone_seq = covering_seq;
+ if (iter->timestamp) {
+ // Will be overwritten in SaveValue() if there is a point key with
+ // a higher seqno.
+ iter->timestamp->assign(range_del_iter->timestamp().data(),
+ range_del_iter->timestamp().size());
+ }
+ }
+ }
+ SequenceNumber dummy_seq;
+ GetFromTable(*(iter->lkey), iter->max_covering_tombstone_seq, true,
+ callback, &iter->is_blob_index, iter->value->GetSelf(),
+ /*columns=*/nullptr, iter->timestamp, iter->s,
+ &(iter->merge_context), &dummy_seq, &found_final_value,
+ &merge_in_progress);
+
+ if (!found_final_value && merge_in_progress) {
+ *(iter->s) = Status::MergeInProgress();
+ }
+
+ if (found_final_value) {
+ iter->value->PinSelf();
+ range->AddValueSize(iter->value->size());
+ range->MarkKeyDone(iter);
+ RecordTick(moptions_.statistics, MEMTABLE_HIT);
+ if (range->GetValueSize() > read_options.value_size_soft_limit) {
+ // Set all remaining keys in range to Abort
+ for (auto range_iter = range->begin(); range_iter != range->end();
+ ++range_iter) {
+ range->MarkKeyDone(range_iter);
+ *(range_iter->s) = Status::Aborted();
+ }
+ break;
+ }
+ }
+ }
+ PERF_COUNTER_ADD(get_from_memtable_count, 1);
+}
+
+Status MemTable::Update(SequenceNumber seq, ValueType value_type,
+ const Slice& key, const Slice& value,
+ const ProtectionInfoKVOS64* kv_prot_info) {
+ LookupKey lkey(key, seq);
+ Slice mem_key = lkey.memtable_key();
+
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(lkey.internal_key(), mem_key.data());
+
+ if (iter->Valid()) {
+ // Refer to comments under MemTable::Add() for entry format.
+ // Check that it belongs to same user key. We do not check the
+ // sequence number since the Seek() call above should have skipped
+ // all entries with overly large sequence numbers.
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (comparator_.comparator.user_comparator()->Equal(
+ Slice(key_ptr, key_length - 8), lkey.user_key())) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ SequenceNumber existing_seq;
+ UnPackSequenceAndType(tag, &existing_seq, &type);
+ assert(existing_seq != seq);
+ if (type == value_type) {
+ Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+ uint32_t new_size = static_cast<uint32_t>(value.size());
+
+ // Update value, if new value size <= previous value size
+ if (new_size <= prev_size) {
+ char* p =
+ EncodeVarint32(const_cast<char*>(key_ptr) + key_length, new_size);
+ WriteLock wl(GetLock(lkey.user_key()));
+ memcpy(p, value.data(), value.size());
+ assert((unsigned)((p + value.size()) - entry) ==
+ (unsigned)(VarintLength(key_length) + key_length +
+ VarintLength(value.size()) + value.size()));
+ RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+ if (kv_prot_info != nullptr) {
+ ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+ // `seq` is swallowed and `existing_seq` prevails.
+ updated_kv_prot_info.UpdateS(seq, existing_seq);
+ UpdateEntryChecksum(&updated_kv_prot_info, key, value, type,
+ existing_seq, p + value.size());
+ Slice encoded(entry, p + value.size() - entry);
+ return VerifyEncodedEntry(encoded, updated_kv_prot_info);
+ } else {
+ UpdateEntryChecksum(nullptr, key, value, type, existing_seq,
+ p + value.size());
+ }
+ return Status::OK();
+ }
+ }
+ }
+ }
+
+ // The latest value is not value_type or key doesn't exist
+ return Add(seq, value_type, key, value, kv_prot_info);
+}
+
+Status MemTable::UpdateCallback(SequenceNumber seq, const Slice& key,
+ const Slice& delta,
+ const ProtectionInfoKVOS64* kv_prot_info) {
+ LookupKey lkey(key, seq);
+ Slice memkey = lkey.memtable_key();
+
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(lkey.internal_key(), memkey.data());
+
+ if (iter->Valid()) {
+ // Refer to comments under MemTable::Add() for entry format.
+ // Check that it belongs to same user key. We do not check the
+ // sequence number since the Seek() call above should have skipped
+ // all entries with overly large sequence numbers.
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (comparator_.comparator.user_comparator()->Equal(
+ Slice(key_ptr, key_length - 8), lkey.user_key())) {
+ // Correct user key
+ const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8);
+ ValueType type;
+ uint64_t existing_seq;
+ UnPackSequenceAndType(tag, &existing_seq, &type);
+ if (type == kTypeValue) {
+ Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length);
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+
+ char* prev_buffer = const_cast<char*>(prev_value.data());
+ uint32_t new_prev_size = prev_size;
+
+ std::string str_value;
+ WriteLock wl(GetLock(lkey.user_key()));
+ auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
+ delta, &str_value);
+ if (status == UpdateStatus::UPDATED_INPLACE) {
+ // Value already updated by callback.
+ assert(new_prev_size <= prev_size);
+ if (new_prev_size < prev_size) {
+ // overwrite the new prev_size
+ char* p = EncodeVarint32(const_cast<char*>(key_ptr) + key_length,
+ new_prev_size);
+ if (VarintLength(new_prev_size) < VarintLength(prev_size)) {
+ // shift the value buffer as well.
+ memcpy(p, prev_buffer, new_prev_size);
+ prev_buffer = p;
+ }
+ }
+ RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED);
+ UpdateFlushState();
+ Slice new_value(prev_buffer, new_prev_size);
+ if (kv_prot_info != nullptr) {
+ ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+ // `seq` is swallowed and `existing_seq` prevails.
+ updated_kv_prot_info.UpdateS(seq, existing_seq);
+ updated_kv_prot_info.UpdateV(delta, new_value);
+ Slice encoded(entry, prev_buffer + new_prev_size - entry);
+ UpdateEntryChecksum(&updated_kv_prot_info, key, new_value, type,
+ existing_seq, prev_buffer + new_prev_size);
+ return VerifyEncodedEntry(encoded, updated_kv_prot_info);
+ } else {
+ UpdateEntryChecksum(nullptr, key, new_value, type, existing_seq,
+ prev_buffer + new_prev_size);
+ }
+ return Status::OK();
+ } else if (status == UpdateStatus::UPDATED) {
+ Status s;
+ if (kv_prot_info != nullptr) {
+ ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+ updated_kv_prot_info.UpdateV(delta, str_value);
+ s = Add(seq, kTypeValue, key, Slice(str_value),
+ &updated_kv_prot_info);
+ } else {
+ s = Add(seq, kTypeValue, key, Slice(str_value),
+ nullptr /* kv_prot_info */);
+ }
+ RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN);
+ UpdateFlushState();
+ return s;
+ } else if (status == UpdateStatus::UPDATE_FAILED) {
+ // `UPDATE_FAILED` is named incorrectly: it indicates that no update
+ // happened, not that a failure occurred.
+ UpdateFlushState();
+ return Status::OK();
+ }
+ }
+ }
+ }
+ // The latest value is not `kTypeValue` or key doesn't exist
+ return Status::NotFound();
+}
+
+size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
+ Slice memkey = key.memtable_key();
+
+ // A totally ordered iterator is costly for some memtable representations
+ // (prefix-aware reps). By passing in the user key, we allow efficient
+ // iterator creation. The iterator only needs to be ordered within the same
+ // user key.
+ std::unique_ptr<MemTableRep::Iterator> iter(
+ table_->GetDynamicPrefixIterator());
+ iter->Seek(key.internal_key(), memkey.data());
+
+ size_t num_successive_merges = 0;
+
+ for (; iter->Valid(); iter->Next()) {
+ const char* entry = iter->key();
+ uint32_t key_length = 0;
+ const char* iter_key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length);
+ if (!comparator_.comparator.user_comparator()->Equal(
+ Slice(iter_key_ptr, key_length - 8), key.user_key())) {
+ break;
+ }
+
+ const uint64_t tag = DecodeFixed64(iter_key_ptr + key_length - 8);
+ ValueType type;
+ uint64_t unused;
+ UnPackSequenceAndType(tag, &unused, &type);
+ if (type != kTypeMerge) {
+ break;
+ }
+
+ ++num_successive_merges;
+ }
+
+ return num_successive_merges;
+}
+
+void MemTableRep::Get(const LookupKey& k, void* callback_args,
+ bool (*callback_func)(void* arg, const char* entry)) {
+ auto iter = GetDynamicPrefixIterator();
+ for (iter->Seek(k.internal_key(), k.memtable_key().data());
+ iter->Valid() && callback_func(callback_args, iter->key());
+ iter->Next()) {
+ }
+}
+
+void MemTable::RefLogContainingPrepSection(uint64_t log) {
+ assert(log > 0);
+ auto cur = min_prep_log_referenced_.load();
+ while ((log < cur || cur == 0) &&
+ !min_prep_log_referenced_.compare_exchange_strong(cur, log)) {
+ cur = min_prep_log_referenced_.load();
+ }
+}
+
+uint64_t MemTable::GetMinLogContainingPrepSection() {
+ return min_prep_log_referenced_.load();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable.h b/src/rocksdb/db/memtable.h
new file mode 100644
index 000000000..6db2721e4
--- /dev/null
+++ b/src/rocksdb/db/memtable.h
@@ -0,0 +1,664 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/kv_checksum.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/read_callback.h"
+#include "db/version_edit.h"
+#include "memory/allocator.h"
+#include "memory/concurrent_arena.h"
+#include "monitoring/instrumented_mutex.h"
+#include "options/cf_options.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "table/multiget_context.h"
+#include "util/dynamic_bloom.h"
+#include "util/hash.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FlushJobInfo;
+class Mutex;
+class MemTableIterator;
+class MergeContext;
+class SystemClock;
+
+struct ImmutableMemTableOptions {
+ explicit ImmutableMemTableOptions(const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options);
+ size_t arena_block_size;
+ uint32_t memtable_prefix_bloom_bits;
+ size_t memtable_huge_page_size;
+ bool memtable_whole_key_filtering;
+ bool inplace_update_support;
+ size_t inplace_update_num_locks;
+ UpdateStatus (*inplace_callback)(char* existing_value,
+ uint32_t* existing_value_size,
+ Slice delta_value,
+ std::string* merged_value);
+ size_t max_successive_merges;
+ Statistics* statistics;
+ MergeOperator* merge_operator;
+ Logger* info_log;
+ bool allow_data_in_errors;
+ uint32_t protection_bytes_per_key;
+};
+
+// Batched counters to be updated when inserting keys in one write batch.
+// In post-processing of the write batch, these can be applied together.
+// Only used in the concurrent memtable insert case.
+struct MemTablePostProcessInfo {
+ uint64_t data_size = 0;
+ uint64_t num_entries = 0;
+ uint64_t num_deletes = 0;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// Note: Many of the methods in this class have comments indicating that
+// external synchronization is required as these methods are not thread-safe.
+// It is up to higher layers of code to decide how to prevent concurrent
+// invocation of these methods. This is usually done by acquiring either
+// the db mutex or the single writer thread.
+//
+// Some of these methods are documented to only require external
+// synchronization if this memtable is immutable. Calling MarkImmutable() is
+// not sufficient to guarantee immutability. It is up to higher layers of
+// code to determine if this MemTable can still be modified by other threads.
+// Eg: The Superversion stores a pointer to the current MemTable (that can
+// be modified) and a separate list of the MemTables that can no longer be
+// written to (aka the 'immutable memtables').
+class MemTable {
+ public:
+ struct KeyComparator : public MemTableRep::KeyComparator {
+ const InternalKeyComparator comparator;
+ explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) {}
+ virtual int operator()(const char* prefix_len_key1,
+ const char* prefix_len_key2) const override;
+ virtual int operator()(const char* prefix_len_key,
+ const DecodedType& key) const override;
+ };
+
+ // MemTables are reference counted. The initial reference count
+ // is zero and the caller must call Ref() at least once.
+ //
+ // earliest_seq should be the current SequenceNumber in the db such that any
+ // key inserted into this memtable will have an equal or larger seq number.
+ // (When a db is first created, the earliest sequence number will be 0).
+ // If the earliest sequence number is not known, kMaxSequenceNumber may be
+ // used, but this may prevent some transactions from succeeding until the
+ // first key is inserted into the memtable.
+ explicit MemTable(const InternalKeyComparator& comparator,
+ const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ WriteBufferManager* write_buffer_manager,
+ SequenceNumber earliest_seq, uint32_t column_family_id);
+ // No copying allowed
+ MemTable(const MemTable&) = delete;
+ MemTable& operator=(const MemTable&) = delete;
+
+ // Do not delete this MemTable unless Unref() indicates it is not in use.
+ ~MemTable();
+
+ // Increase reference count.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void Ref() { ++refs_; }
+
+ // Drop reference count.
+ // If the refcount goes to zero return this memtable, otherwise return null.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ MemTable* Unref() {
+ --refs_;
+ assert(refs_ >= 0);
+ if (refs_ <= 0) {
+ return this;
+ }
+ return nullptr;
+ }
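+
+ // Illustrative lifecycle sketch (not part of the original documentation),
+ // assuming the caller holds the required external synchronization and
+ // supplies the constructor arguments shown:
+ //   MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options,
+ //                                write_buffer_manager, earliest_seq,
+ //                                column_family_id);
+ //   mem->Ref();                          // initial refcount is zero
+ //   ...                                  // use the memtable
+ //   if (MemTable* to_delete = mem->Unref()) {
+ //     delete to_delete;                  // refcount dropped to zero
+ //   }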
+
+ // Returns an estimate of the number of bytes of data in use by this
+ // data structure.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ size_t ApproximateMemoryUsage();
+
+ // As a cheap version of `ApproximateMemoryUsage()`, this function doesn't
+ // require external synchronization. The value may be less accurate, though.
+ size_t ApproximateMemoryUsageFast() const {
+ return approximate_memory_usage_.load(std::memory_order_relaxed);
+ }
+
+ // used by MemTableListVersion::MemoryAllocatedBytesExcludingLast
+ size_t MemoryAllocatedBytes() const {
+ return table_->ApproximateMemoryUsage() +
+ range_del_table_->ApproximateMemoryUsage() +
+ arena_.MemoryAllocatedBytes();
+ }
+
+ // Fills 'entries' with a set of unique random memtable entries of
+ // approximately 'target_sample_size' size.
+ //
+ // Note: the entries are stored in the unordered_set as length-prefixed keys,
+ // hence their representation in the set as "const char*".
+ // Note2: the size of the output set 'entries' is not enforced to be strictly
+ // equal to 'target_sample_size'. Its final size might be slightly
+ // greater or slightly less than 'target_sample_size'.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ // REQUIRES: SkipList memtable representation. This function is not
+ // implemented for any other type of memtable representation (vectorrep,
+ // hashskiplist,...).
+ void UniqueRandomSample(const uint64_t& target_sample_size,
+ std::unordered_set<const char*>* entries) {
+ // TODO(bjlemaire): at the moment, only supported by skiplistrep.
+ // Extend it to all other memtable representations.
+ table_->UniqueRandomSample(num_entries(), target_sample_size, entries);
+ }
+
+ // This method heuristically determines if the memtable should continue to
+ // host more data.
+ bool ShouldScheduleFlush() const {
+ return flush_state_.load(std::memory_order_relaxed) == FLUSH_REQUESTED;
+ }
+
+ // Returns true if a flush should be scheduled and the caller should
+ // be the one to schedule it
+ bool MarkFlushScheduled() {
+ auto before = FLUSH_REQUESTED;
+ return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
+ std::memory_order_relaxed,
+ std::memory_order_relaxed);
+ }
+
+ // Return an iterator that yields the contents of the memtable.
+ //
+ // The caller must ensure that the underlying MemTable remains live
+ // while the returned iterator is live. The keys returned by this
+ // iterator are internal keys encoded by AppendInternalKey in the
+ // db/dbformat.{h,cc} module.
+ //
+ // By default, it returns an iterator for prefix seek if prefix_extractor
+ // is configured in Options.
+ // arena: If not null, the arena needs to be used to allocate the Iterator.
+ // Calling ~Iterator of the iterator will destroy all the states but
+ // those allocated in arena.
+ InternalIterator* NewIterator(const ReadOptions& read_options, Arena* arena);
+
+ // Returns an iterator that yields the range tombstones of the memtable.
+ // The caller must ensure that the underlying MemTable remains live
+ // while the returned iterator is live.
+ // @param immutable_memtable Whether this memtable is an immutable memtable.
+ // This information is not stored in memtable itself, so it needs to be
+ // specified by the caller. This flag is used internally to decide whether a
+ // cached fragmented range tombstone list can be returned. This cached version
+ // is constructed when a memtable becomes immutable. Setting the flag to false
+ // will always yield correct result, but may incur performance penalty as it
+ // always creates a new fragmented range tombstone list.
+ FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
+ const ReadOptions& read_options, SequenceNumber read_seq,
+ bool immutable_memtable);
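+
+ // Illustrative usage sketch (not part of the original documentation),
+ // mirroring how the read path consumes this iterator; `mem`, `read_opts`,
+ // `snapshot_seq` and `user_key` are assumed to be provided by the caller:
+ //   std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ //       mem->NewRangeTombstoneIterator(read_opts, snapshot_seq,
+ //                                      /*immutable_memtable=*/false));
+ //   if (range_del_iter != nullptr) {
+ //     SequenceNumber covering_seq =
+ //         range_del_iter->MaxCoveringTombstoneSeqnum(user_key);
+ //     // covering_seq > 0 means a range tombstone covers `user_key`.
+ //   }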
+
+ Status VerifyEncodedEntry(Slice encoded,
+ const ProtectionInfoKVOS64& kv_prot_info);
+
+ // Add an entry into memtable that maps key to value at the
+ // specified sequence number and with the specified type.
+ // Typically value will be empty if type==kTypeDeletion.
+ //
+ // REQUIRES: if allow_concurrent = false, external synchronization to prevent
+ // simultaneous operations on the same MemTable.
+ //
+ // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+ // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
+ // The next attempt should try a larger value for `seq`.
+ Status Add(SequenceNumber seq, ValueType type, const Slice& key,
+ const Slice& value, const ProtectionInfoKVOS64* kv_prot_info,
+ bool allow_concurrent = false,
+ MemTablePostProcessInfo* post_process_info = nullptr,
+ void** hint = nullptr);
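+
+ // Illustrative insertion sketch (not part of the original documentation)
+ // for the single-writer case (allow_concurrent == false); `mem`, `seq`,
+ // `user_key` and `value` are assumed to come from the write path:
+ //   Status s = mem->Add(seq, kTypeValue, user_key, value,
+ //                       /*kv_prot_info=*/nullptr);
+ //   if (s.IsTryAgain()) {
+ //     // (seq, key) already exists; retry with a larger seq.
+ //   }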
+
+ // Used to get the value associated with key, or the Merge Operands
+ // associated with key.
+ // If do_merge = true, the default behavior, which is to get the value for
+ // key, is executed. The expected behavior is described right below.
+ // If the memtable contains a value for key, store it in *value and return
+ // true.
+ // If the memtable contains a deletion for key, store a NotFound() error
+ // in *status and return true.
+ // If the memtable contains a Merge operation as the most recent entry for a
+ // key, and the merge process does not stop (i.e., it does not reach a value
+ // or a delete), prepend the current merge operand to *operands, store
+ // MergeInProgress in s, and return false.
+ // Else, return false.
+ // If any operation was found, its most recent sequence number
+ // will be stored in *seq on success (regardless of whether true/false is
+ // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
+ // On success, *s may be set to OK, NotFound, or MergeInProgress. Any other
+ // status returned indicates a corruption or other unexpected error.
+ // If do_merge = false then any Merge Operands encountered for key are simply
+ // stored in merge_context.operands_list and never actually merged to get a
+ // final value. The raw Merge Operands are eventually returned to the user.
+ // @param immutable_memtable Whether this memtable is immutable. Used
+ // internally by NewRangeTombstoneIterator(). See comment above
+ // NewRangeTombstoneIterator() for more detail.
+ bool Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, bool immutable_memtable,
+ ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+ bool do_merge = true);
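+
+ // Illustrative point-lookup sketch (not part of the original
+ // documentation); `mem`, `user_key` and `snapshot_seq` are assumed to be
+ // provided by the caller:
+ //   LookupKey lkey(user_key, snapshot_seq);
+ //   std::string value, timestamp;
+ //   Status s;
+ //   MergeContext merge_context;
+ //   SequenceNumber max_covering_tombstone_seq = 0;
+ //   SequenceNumber seq;
+ //   bool found = mem->Get(lkey, &value, /*columns=*/nullptr, &timestamp, &s,
+ //                         &merge_context, &max_covering_tombstone_seq, &seq,
+ //                         ReadOptions(), /*immutable_memtable=*/false);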
+
+ bool Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts, bool immutable_memtable,
+ ReadCallback* callback = nullptr, bool* is_blob_index = nullptr,
+ bool do_merge = true) {
+ SequenceNumber seq;
+ return Get(key, value, columns, timestamp, s, merge_context,
+ max_covering_tombstone_seq, &seq, read_opts, immutable_memtable,
+ callback, is_blob_index, do_merge);
+ }
+
+ // @param immutable_memtable Whether this memtable is immutable. Used
+ // internally by NewRangeTombstoneIterator(). See comment above
+ // NewRangeTombstoneIterator() for more detail.
+ void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback, bool immutable_memtable);
+
+ // If `key` exists in current memtable with type value_type and the existing
+ // value is at least as large as the new value, updates it in-place. Otherwise
+ // adds the new value to the memtable out-of-place.
+ //
+ // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+ // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
+ // The next attempt should try a larger value for `seq`.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ Status Update(SequenceNumber seq, ValueType value_type, const Slice& key,
+ const Slice& value, const ProtectionInfoKVOS64* kv_prot_info);
+
+ // If `key` exists in current memtable with type `kTypeValue` and the existing
+ // value is at least as large as the new value, updates it in-place. Otherwise
+ // if `key` exists in current memtable with type `kTypeValue`, adds the new
+ // value to the memtable out-of-place.
+ //
+ // Returns `Status::NotFound` if `key` does not exist in current memtable or
+ // the latest version of `key` does not have `kTypeValue`.
+ //
+ // Returns `Status::TryAgain` if the `seq`, `key` combination already exists
+ // in the memtable and `MemTableRepFactory::CanHandleDuplicatedKey()` is true.
+ // The next attempt should try a larger value for `seq`.
+ //
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ Status UpdateCallback(SequenceNumber seq, const Slice& key,
+ const Slice& delta,
+ const ProtectionInfoKVOS64* kv_prot_info);
+
+ // Returns the number of successive merge entries starting from the newest
+ // entry for the key up to the last non-merge entry or last entry for the
+ // key in the memtable.
+ size_t CountSuccessiveMergeEntries(const LookupKey& key);
+
+ // Update counters and flush status after inserting a whole write batch
+ // Used in concurrent memtable inserts.
+ void BatchPostProcess(const MemTablePostProcessInfo& update_counters) {
+ num_entries_.fetch_add(update_counters.num_entries,
+ std::memory_order_relaxed);
+ data_size_.fetch_add(update_counters.data_size, std::memory_order_relaxed);
+ if (update_counters.num_deletes != 0) {
+ num_deletes_.fetch_add(update_counters.num_deletes,
+ std::memory_order_relaxed);
+ }
+ UpdateFlushState();
+ }
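+
+ // Illustrative concurrent-insert sketch (not part of the original
+ // documentation): each writer thread passes its own MemTablePostProcessInfo
+ // to Add() with allow_concurrent == true, then publishes the batched
+ // counters once; `mem`, `seq`, `user_key` and `value` are assumed to come
+ // from the write path:
+ //   MemTablePostProcessInfo info;
+ //   Status s = mem->Add(seq, kTypeValue, user_key, value,
+ //                       /*kv_prot_info=*/nullptr, /*allow_concurrent=*/true,
+ //                       &info);
+ //   ...                    // more Add() calls from the same write batch
+ //   mem->BatchPostProcess(info);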
+
+ // Get total number of entries in the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ uint64_t num_entries() const {
+ return num_entries_.load(std::memory_order_relaxed);
+ }
+
+ // Get total number of deletes in the mem table.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ uint64_t num_deletes() const {
+ return num_deletes_.load(std::memory_order_relaxed);
+ }
+
+ uint64_t get_data_size() const {
+ return data_size_.load(std::memory_order_relaxed);
+ }
+
+ // Dynamically change the memtable's capacity. If set below the current usage,
+ // the next key added will trigger a flush. Can only increase size when
+ // memtable prefix bloom is disabled, since we can't easily allocate more
+ // space.
+ void UpdateWriteBufferSize(size_t new_write_buffer_size) {
+ if (bloom_filter_ == nullptr ||
+ new_write_buffer_size < write_buffer_size_) {
+ write_buffer_size_.store(new_write_buffer_size,
+ std::memory_order_relaxed);
+ }
+ }
+
+ // Returns the edits area that is needed for flushing the memtable
+ VersionEdit* GetEdits() { return &edit_; }
+
+ // Returns true if no entry has been inserted into the memtable.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ bool IsEmpty() const { return first_seqno_ == 0; }
+
+ // Returns the sequence number of the first element that was inserted
+ // into the memtable.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ SequenceNumber GetFirstSequenceNumber() {
+ return first_seqno_.load(std::memory_order_relaxed);
+ }
+
+ // Sets the sequence number of the first element that was inserted
+ // into the memtable.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable (unless this Memtable is immutable).
+ void SetFirstSequenceNumber(SequenceNumber first_seqno) {
+ return first_seqno_.store(first_seqno, std::memory_order_relaxed);
+ }
+
+ // Returns the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into this
+ // memtable. It can then be assumed that any write with a larger(or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ //
+ // If the earliest sequence number could not be determined,
+ // kMaxSequenceNumber will be returned.
+ SequenceNumber GetEarliestSequenceNumber() {
+ return earliest_seqno_.load(std::memory_order_relaxed);
+ }
+
+ // Sets the sequence number that is guaranteed to be smaller than or equal
+ // to the sequence number of any key that could be inserted into this
+ // memtable. It can then be assumed that any write with a larger(or equal)
+ // sequence number will be present in this memtable or a later memtable.
+ // Used only for MemPurge operation
+ void SetEarliestSequenceNumber(SequenceNumber earliest_seqno) {
+ return earliest_seqno_.store(earliest_seqno, std::memory_order_relaxed);
+ }
+
+ // DB's latest sequence ID when the memtable is created. This number
+ // may be updated to a more recent one before any key is inserted.
+ SequenceNumber GetCreationSeq() const { return creation_seq_; }
+
+ void SetCreationSeq(SequenceNumber sn) { creation_seq_ = sn; }
+
+ // Returns the next active logfile number when this memtable is about to
+ // be flushed to storage
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ uint64_t GetNextLogNumber() { return mem_next_logfile_number_; }
+
+ // Sets the next active logfile number when this memtable is about to
+ // be flushed to storage
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; }
+
+ // If this memtable contains data from a committed two-phase transaction,
+ // we must take note of the log which contains that data so we know when
+ // to release that log.
+ void RefLogContainingPrepSection(uint64_t log);
+ uint64_t GetMinLogContainingPrepSection();
+
+ // Notify the underlying storage that no more items will be added.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ // After MarkImmutable() is called, you should not attempt to
+ // write anything to this MemTable (i.e., do not call Add() or Update()).
+ void MarkImmutable() {
+ table_->MarkReadOnly();
+ mem_tracker_.DoneAllocating();
+ }
+
+ // Notify the underlying storage that all data it contained has been
+ // persisted.
+ // REQUIRES: external synchronization to prevent simultaneous
+ // operations on the same MemTable.
+ void MarkFlushed() { table_->MarkFlushed(); }
+
+ // return true if the current MemTableRep supports merge operator.
+ bool IsMergeOperatorSupported() const {
+ return table_->IsMergeOperatorSupported();
+ }
+
+ // return true if the current MemTableRep supports snapshots.
+ // Inplace update prevents snapshots.
+ bool IsSnapshotSupported() const {
+ return table_->IsSnapshotSupported() && !moptions_.inplace_update_support;
+ }
+
+ struct MemTableStats {
+ uint64_t size;
+ uint64_t count;
+ };
+
+ MemTableStats ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey);
+
+ // Get the lock associated for the key
+ port::RWMutex* GetLock(const Slice& key);
+
+ const InternalKeyComparator& GetInternalKeyComparator() const {
+ return comparator_.comparator;
+ }
+
+ const ImmutableMemTableOptions* GetImmutableMemTableOptions() const {
+ return &moptions_;
+ }
+
+ uint64_t ApproximateOldestKeyTime() const {
+ return oldest_key_time_.load(std::memory_order_relaxed);
+ }
+
+ // REQUIRES: db_mutex held.
+ void SetID(uint64_t id) { id_ = id; }
+
+ uint64_t GetID() const { return id_; }
+
+ void SetFlushCompleted(bool completed) { flush_completed_ = completed; }
+
+ uint64_t GetFileNumber() const { return file_number_; }
+
+ void SetFileNumber(uint64_t file_num) { file_number_ = file_num; }
+
+ void SetFlushInProgress(bool in_progress) {
+ flush_in_progress_ = in_progress;
+ }
+
+#ifndef ROCKSDB_LITE
+ void SetFlushJobInfo(std::unique_ptr<FlushJobInfo>&& info) {
+ flush_job_info_ = std::move(info);
+ }
+
+ std::unique_ptr<FlushJobInfo> ReleaseFlushJobInfo() {
+ return std::move(flush_job_info_);
+ }
+#endif // !ROCKSDB_LITE
+
+ // Returns a heuristic flush decision
+ bool ShouldFlushNow();
+
+ void ConstructFragmentedRangeTombstones();
+
+ // Returns whether a fragmented range tombstone list is already constructed
+ // for this memtable. It should be constructed right before a memtable is
+ // added to an immutable memtable list. Note that if a memtable does not have
+ // any range tombstone, then no range tombstone list will ever be constructed.
+ // @param allow_empty Specifies whether a memtable with no range tombstone is
+ // considered to have its fragmented range tombstone list constructed.
+ bool IsFragmentedRangeTombstonesConstructed(bool allow_empty = true) const {
+ if (allow_empty) {
+ return fragmented_range_tombstone_list_.get() != nullptr ||
+ is_range_del_table_empty_;
+ } else {
+ return fragmented_range_tombstone_list_.get() != nullptr;
+ }
+ }
+
+ // Returns Corruption status if verification fails.
+ static Status VerifyEntryChecksum(const char* entry,
+ size_t protection_bytes_per_key,
+ bool allow_data_in_errors = false);
+
+ private:
+ enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
+
+ friend class MemTableIterator;
+ friend class MemTableBackwardIterator;
+ friend class MemTableList;
+
+ KeyComparator comparator_;
+ const ImmutableMemTableOptions moptions_;
+ int refs_;
+ const size_t kArenaBlockSize;
+ AllocTracker mem_tracker_;
+ ConcurrentArena arena_;
+ std::unique_ptr<MemTableRep> table_;
+ std::unique_ptr<MemTableRep> range_del_table_;
+ std::atomic_bool is_range_del_table_empty_;
+
+ // Total data size of all data inserted
+ std::atomic<uint64_t> data_size_;
+ std::atomic<uint64_t> num_entries_;
+ std::atomic<uint64_t> num_deletes_;
+
+ // Dynamically changeable memtable option
+ std::atomic<size_t> write_buffer_size_;
+
+ // These are used to manage memtable flushes to storage
+ bool flush_in_progress_; // started the flush
+ bool flush_completed_; // finished the flush
+ uint64_t file_number_; // filled up after flush is complete
+
+ // The updates to be applied to the transaction log when this
+ // memtable is flushed to storage.
+ VersionEdit edit_;
+
+ // The sequence number of the kv that was inserted first
+ std::atomic<SequenceNumber> first_seqno_;
+
+ // The db sequence number at the time of creation or kMaxSequenceNumber
+ // if not set.
+ std::atomic<SequenceNumber> earliest_seqno_;
+
+ SequenceNumber creation_seq_;
+
+ // The log files earlier than this number can be deleted.
+ uint64_t mem_next_logfile_number_;
+
+ // the earliest log containing a prepared section
+ // which has been inserted into this memtable.
+ std::atomic<uint64_t> min_prep_log_referenced_;
+
+ // rw locks for inplace updates
+ std::vector<port::RWMutex> locks_;
+
+ const SliceTransform* const prefix_extractor_;
+ std::unique_ptr<DynamicBloom> bloom_filter_;
+
+ std::atomic<FlushStateEnum> flush_state_;
+
+ SystemClock* clock_;
+
+ // Extract sequential insert prefixes.
+ const SliceTransform* insert_with_hint_prefix_extractor_;
+
+ // Insert hints for each prefix.
+ UnorderedMapH<Slice, void*, SliceHasher> insert_hints_;
+
+ // Timestamp of oldest key
+ std::atomic<uint64_t> oldest_key_time_;
+
+ // Memtable id to track flush.
+ uint64_t id_ = 0;
+
+ // Sequence number of the atomic flush that is responsible for this memtable.
+ // The sequence number of atomic flush is a seq, such that no writes with
+ // sequence numbers greater than or equal to seq are flushed, while all
+ // writes with sequence number smaller than seq are flushed.
+ SequenceNumber atomic_flush_seqno_;
+
+ // keep track of memory usage in table_, arena_, and range_del_table_.
+ // Gets refreshed inside `ApproximateMemoryUsage()` or `ShouldFlushNow`
+ std::atomic<uint64_t> approximate_memory_usage_;
+
+#ifndef ROCKSDB_LITE
+ // Flush job info of the current memtable.
+ std::unique_ptr<FlushJobInfo> flush_job_info_;
+#endif // !ROCKSDB_LITE
+
+ // Updates flush_state_ using ShouldFlushNow()
+ void UpdateFlushState();
+
+ void UpdateOldestKeyTime();
+
+ void GetFromTable(const LookupKey& key,
+ SequenceNumber max_covering_tombstone_seq, bool do_merge,
+ ReadCallback* callback, bool* is_blob_index,
+ std::string* value, PinnableWideColumns* columns,
+ std::string* timestamp, Status* s,
+ MergeContext* merge_context, SequenceNumber* seq,
+ bool* found_final_value, bool* merge_in_progress);
+
+ // Always returns non-null and assumes certain pre-checks (e.g.,
+ // is_range_del_table_empty_) are done. This is only valid during the lifetime
+ // of the underlying memtable.
+ // read_seq and read_options.timestamp will be used as the upper bound
+ // for range tombstones.
+ FragmentedRangeTombstoneIterator* NewRangeTombstoneIteratorInternal(
+ const ReadOptions& read_options, SequenceNumber read_seq,
+ bool immutable_memtable);
+
+ // The fragmented range tombstones of this memtable.
+ // This is constructed when this memtable becomes immutable
+ // if !is_range_del_table_empty_.
+ std::unique_ptr<FragmentedRangeTombstoneList>
+ fragmented_range_tombstone_list_;
+
+ // makes sure there is a single range tombstone writer to invalidate cache
+ std::mutex range_del_mutex_;
+ CoreLocalArray<std::shared_ptr<FragmentedRangeTombstoneListCache>>
+ cached_range_tombstone_;
+
+ void UpdateEntryChecksum(const ProtectionInfoKVOS64* kv_prot_info,
+ const Slice& key, const Slice& value, ValueType type,
+ SequenceNumber s, char* checksum_ptr);
+};
+
+extern const char* EncodeKey(std::string* scratch, const Slice& target);
+
+} // namespace ROCKSDB_NAMESPACE
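
The class above moves each memtable through a fixed lifecycle: writable while mutable, read-only after MarkImmutable(), then flush_in_progress_, flush_completed_ and file_number_ are set while it is flushed, and MarkFlushed() once the data is persisted. The following is a minimal standalone sketch of that lifecycle; ToyMemTable, MemState and the state names are invented stand-ins, not the real MemTable API.

// Sketch only: ToyMemTable and MemState are invented stand-ins, not RocksDB types.
#include <cassert>
#include <cstdint>
#include <iostream>

enum class MemState { kMutable, kImmutable, kFlushInProgress, kFlushed };

struct ToyMemTable {
  MemState state = MemState::kMutable;
  uint64_t file_number = 0;  // assigned once the flush result is committed

  void MarkImmutable() {  // no more Add()/Update() after this
    assert(state == MemState::kMutable);
    state = MemState::kImmutable;
  }
  void PickForFlush() {  // corresponds to flush_in_progress_ = true
    assert(state == MemState::kImmutable);
    state = MemState::kFlushInProgress;
  }
  void MarkFlushed(uint64_t file_num) {  // flush_completed_ + file_number_
    assert(state == MemState::kFlushInProgress);
    file_number = file_num;
    state = MemState::kFlushed;
  }
};

int main() {
  ToyMemTable m;
  m.MarkImmutable();
  m.PickForFlush();
  m.MarkFlushed(42);
  std::cout << "flushed to table #" << m.file_number << "\n";
  return 0;
}
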
diff --git a/src/rocksdb/db/memtable_list.cc b/src/rocksdb/db/memtable_list.cc
new file mode 100644
index 000000000..1545003ad
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.cc
@@ -0,0 +1,991 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/memtable_list.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <limits>
+#include <queue>
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/memtable.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_set.h"
+#include "logging/log_buffer.h"
+#include "logging/logging.h"
+#include "monitoring/thread_status_util.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/iterator.h"
+#include "table/merging_iterator.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class InternalKeyComparator;
+class Mutex;
+class VersionSet;
+
+void MemTableListVersion::AddMemTable(MemTable* m) {
+ memlist_.push_front(m);
+ *parent_memtable_list_memory_usage_ += m->ApproximateMemoryUsage();
+}
+
+void MemTableListVersion::UnrefMemTable(autovector<MemTable*>* to_delete,
+ MemTable* m) {
+ if (m->Unref()) {
+ to_delete->push_back(m);
+ assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage());
+ *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage();
+ }
+}
+
+MemTableListVersion::MemTableListVersion(
+ size_t* parent_memtable_list_memory_usage, const MemTableListVersion& old)
+ : max_write_buffer_number_to_maintain_(
+ old.max_write_buffer_number_to_maintain_),
+ max_write_buffer_size_to_maintain_(
+ old.max_write_buffer_size_to_maintain_),
+ parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {
+ memlist_ = old.memlist_;
+ for (auto& m : memlist_) {
+ m->Ref();
+ }
+
+ memlist_history_ = old.memlist_history_;
+ for (auto& m : memlist_history_) {
+ m->Ref();
+ }
+}
+
+MemTableListVersion::MemTableListVersion(
+ size_t* parent_memtable_list_memory_usage,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain)
+ : max_write_buffer_number_to_maintain_(max_write_buffer_number_to_maintain),
+ max_write_buffer_size_to_maintain_(max_write_buffer_size_to_maintain),
+ parent_memtable_list_memory_usage_(parent_memtable_list_memory_usage) {}
+
+void MemTableListVersion::Ref() { ++refs_; }
+
+// called by superversion::clean()
+void MemTableListVersion::Unref(autovector<MemTable*>* to_delete) {
+ assert(refs_ >= 1);
+ --refs_;
+ if (refs_ == 0) {
+ // A null to_delete means the caller is confident that refs_ will not
+ // drop to zero here.
+ assert(to_delete != nullptr);
+ for (const auto& m : memlist_) {
+ UnrefMemTable(to_delete, m);
+ }
+ for (const auto& m : memlist_history_) {
+ UnrefMemTable(to_delete, m);
+ }
+ delete this;
+ }
+}
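
Ref()/Unref() never delete a memtable inline: a table whose refcount reaches zero is pushed onto to_delete so the caller can free it once the DB mutex is released. A small self-contained sketch of that pattern follows; ToyMem and UnrefIntoToDelete are invented stand-ins for MemTable and UnrefMemTable(), and std::vector replaces autovector.

// Sketch only: ToyMem stands in for MemTable, std::vector for autovector.
#include <iostream>
#include <list>
#include <vector>

struct ToyMem {
  int refs = 0;
  void Ref() { ++refs; }
  bool Unref() { return --refs == 0; }  // true => caller owns the deletion
};

// Mirrors UnrefMemTable(): never delete inline, hand the table back to the
// caller so it can be freed after the mutex is released.
void UnrefIntoToDelete(std::vector<ToyMem*>* to_delete, ToyMem* m) {
  if (m->Unref()) {
    to_delete->push_back(m);
  }
}

int main() {
  std::list<ToyMem*> memlist{new ToyMem, new ToyMem};
  for (auto* m : memlist) m->Ref();

  std::vector<ToyMem*> to_delete;
  for (auto* m : memlist) UnrefIntoToDelete(&to_delete, m);
  // ...the DB mutex would be released here in the real code...
  for (auto* m : to_delete) delete m;
  std::cout << "freed " << to_delete.size() << " memtables\n";
  return 0;
}
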
+
+int MemTableList::NumNotFlushed() const {
+ int size = static_cast<int>(current_->memlist_.size());
+ assert(num_flush_not_started_ <= size);
+ return size;
+}
+
+int MemTableList::NumFlushed() const {
+ return static_cast<int>(current_->memlist_history_.size());
+}
+
+// Search all the memtables starting from the most recent one.
+// Return the most recent value found, if any.
+// Operands stores the list of merge operations to apply, so far.
+bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns,
+ std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ ReadCallback* callback, bool* is_blob_index) {
+ return GetFromList(&memlist_, key, value, columns, timestamp, s,
+ merge_context, max_covering_tombstone_seq, seq, read_opts,
+ callback, is_blob_index);
+}
+
+void MemTableListVersion::MultiGet(const ReadOptions& read_options,
+ MultiGetRange* range,
+ ReadCallback* callback) {
+ for (auto memtable : memlist_) {
+ memtable->MultiGet(read_options, range, callback,
+ true /* immutable_memtable */);
+ if (range->empty()) {
+ return;
+ }
+ }
+}
+
+bool MemTableListVersion::GetMergeOperands(
+ const LookupKey& key, Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, const ReadOptions& read_opts) {
+ for (MemTable* memtable : memlist_) {
+ bool done = memtable->Get(
+ key, /*value=*/nullptr, /*columns=*/nullptr, /*timestamp=*/nullptr, s,
+ merge_context, max_covering_tombstone_seq, read_opts,
+ true /* immutable_memtable */, nullptr, nullptr, false);
+ if (done) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool MemTableListVersion::GetFromHistory(
+ const LookupKey& key, std::string* value, PinnableWideColumns* columns,
+ std::string* timestamp, Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, bool* is_blob_index) {
+ return GetFromList(&memlist_history_, key, value, columns, timestamp, s,
+ merge_context, max_covering_tombstone_seq, seq, read_opts,
+ nullptr /*read_callback*/, is_blob_index);
+}
+
+bool MemTableListVersion::GetFromList(
+ std::list<MemTable*>* list, const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* s,
+ MergeContext* merge_context, SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts, ReadCallback* callback,
+ bool* is_blob_index) {
+ *seq = kMaxSequenceNumber;
+
+ for (auto& memtable : *list) {
+ assert(memtable->IsFragmentedRangeTombstonesConstructed());
+ SequenceNumber current_seq = kMaxSequenceNumber;
+
+ bool done =
+ memtable->Get(key, value, columns, timestamp, s, merge_context,
+ max_covering_tombstone_seq, &current_seq, read_opts,
+ true /* immutable_memtable */, callback, is_blob_index);
+ if (*seq == kMaxSequenceNumber) {
+ // Store the most recent sequence number of any operation on this key.
+ // Since we only care about the most recent change, we only need to
+ // return the first operation found when searching memtables in
+ // reverse-chronological order.
+ // current_seq would be equal to kMaxSequenceNumber if the value was to be
+ // skipped. This allows seq to be assigned again when the next value is
+ // read.
+ *seq = current_seq;
+ }
+
+ if (done) {
+ assert(*seq != kMaxSequenceNumber || s->IsNotFound());
+ return true;
+ }
+ if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) {
+ return false;
+ }
+ }
+ return false;
+}
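
GetFromList() scans the memtables from newest to oldest, stops at the first table that knows about the key, and records that operation's sequence number in *seq. Below is a simplified, runnable sketch of the same search order; ToyTable (a std::map keyed by user key) and GetFromToyList are invented stand-ins and ignore merges, tombstones and callbacks.

// Sketch only: ToyTable is a std::map standing in for an immutable memtable.
#include <cstdint>
#include <iostream>
#include <list>
#include <map>
#include <optional>
#include <string>
#include <utility>

constexpr uint64_t kMaxSeq = UINT64_MAX;

using ToyTable = std::map<std::string, std::pair<uint64_t, std::string>>;

// Same search order as GetFromList(): newest table first, first hit wins,
// and *seq captures the sequence number of the most recent operation seen.
std::optional<std::string> GetFromToyList(
    const std::list<ToyTable>& newest_first, const std::string& key,
    uint64_t* seq) {
  *seq = kMaxSeq;
  for (const auto& table : newest_first) {
    auto it = table.find(key);
    if (it != table.end()) {
      if (*seq == kMaxSeq) *seq = it->second.first;
      return it->second.second;
    }
  }
  return std::nullopt;
}

int main() {
  std::list<ToyTable> mems;               // front == newest
  mems.push_back({{"k", {7, "old"}}});    // older memtable
  mems.push_front({{"k", {12, "new"}}});  // newer memtable
  uint64_t seq = 0;
  auto v = GetFromToyList(mems, "k", &seq);
  std::cout << *v << " @ seq " << seq << "\n";  // prints "new @ seq 12"
  return 0;
}
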
+
+Status MemTableListVersion::AddRangeTombstoneIterators(
+ const ReadOptions& read_opts, Arena* /*arena*/,
+ RangeDelAggregator* range_del_agg) {
+ assert(range_del_agg != nullptr);
+ // Except for snapshot read, using kMaxSequenceNumber is OK because these
+ // are immutable memtables.
+ SequenceNumber read_seq = read_opts.snapshot != nullptr
+ ? read_opts.snapshot->GetSequenceNumber()
+ : kMaxSequenceNumber;
+ for (auto& m : memlist_) {
+ assert(m->IsFragmentedRangeTombstonesConstructed());
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ m->NewRangeTombstoneIterator(read_opts, read_seq,
+ true /* immutable_memtable */));
+ range_del_agg->AddTombstones(std::move(range_del_iter));
+ }
+ return Status::OK();
+}
+
+void MemTableListVersion::AddIterators(
+ const ReadOptions& options, std::vector<InternalIterator*>* iterator_list,
+ Arena* arena) {
+ for (auto& m : memlist_) {
+ iterator_list->push_back(m->NewIterator(options, arena));
+ }
+}
+
+void MemTableListVersion::AddIterators(const ReadOptions& options,
+ MergeIteratorBuilder* merge_iter_builder,
+ bool add_range_tombstone_iter) {
+ for (auto& m : memlist_) {
+ auto mem_iter = m->NewIterator(options, merge_iter_builder->GetArena());
+ if (!add_range_tombstone_iter || options.ignore_range_deletions) {
+ merge_iter_builder->AddIterator(mem_iter);
+ } else {
+ // Except for snapshot read, using kMaxSequenceNumber is OK because these
+ // are immutable memtables.
+ SequenceNumber read_seq = options.snapshot != nullptr
+ ? options.snapshot->GetSequenceNumber()
+ : kMaxSequenceNumber;
+ TruncatedRangeDelIterator* mem_tombstone_iter = nullptr;
+ auto range_del_iter = m->NewRangeTombstoneIterator(
+ options, read_seq, true /* immutable_memtable */);
+ if (range_del_iter == nullptr || range_del_iter->empty()) {
+ delete range_del_iter;
+ } else {
+ mem_tombstone_iter = new TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(range_del_iter),
+ &m->GetInternalKeyComparator(), nullptr /* smallest */,
+ nullptr /* largest */);
+ }
+ merge_iter_builder->AddPointAndTombstoneIterator(mem_iter,
+ mem_tombstone_iter);
+ }
+ }
+}
+
+uint64_t MemTableListVersion::GetTotalNumEntries() const {
+ uint64_t total_num = 0;
+ for (auto& m : memlist_) {
+ total_num += m->num_entries();
+ }
+ return total_num;
+}
+
+MemTable::MemTableStats MemTableListVersion::ApproximateStats(
+ const Slice& start_ikey, const Slice& end_ikey) {
+ MemTable::MemTableStats total_stats = {0, 0};
+ for (auto& m : memlist_) {
+ auto mStats = m->ApproximateStats(start_ikey, end_ikey);
+ total_stats.size += mStats.size;
+ total_stats.count += mStats.count;
+ }
+ return total_stats;
+}
+
+uint64_t MemTableListVersion::GetTotalNumDeletes() const {
+ uint64_t total_num = 0;
+ for (auto& m : memlist_) {
+ total_num += m->num_deletes();
+ }
+ return total_num;
+}
+
+SequenceNumber MemTableListVersion::GetEarliestSequenceNumber(
+ bool include_history) const {
+ if (include_history && !memlist_history_.empty()) {
+ return memlist_history_.back()->GetEarliestSequenceNumber();
+ } else if (!memlist_.empty()) {
+ return memlist_.back()->GetEarliestSequenceNumber();
+ } else {
+ return kMaxSequenceNumber;
+ }
+}
+
+SequenceNumber MemTableListVersion::GetFirstSequenceNumber() const {
+ SequenceNumber min_first_seqno = kMaxSequenceNumber;
+ // The first memtable in the list might not be the oldest one with mempurge
+ for (const auto& m : memlist_) {
+ min_first_seqno = std::min(m->GetFirstSequenceNumber(), min_first_seqno);
+ }
+ return min_first_seqno;
+}
+
+// caller is responsible for referencing m
+void MemTableListVersion::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+ assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
+ AddMemTable(m);
+ // m->MemoryAllocatedBytes() is added in MemoryAllocatedBytesExcludingLast
+ TrimHistory(to_delete, 0);
+}
+
+// Removes m from list of memtables not flushed. Caller should NOT Unref m.
+void MemTableListVersion::Remove(MemTable* m,
+ autovector<MemTable*>* to_delete) {
+ assert(refs_ == 1); // only when refs_ == 1 is MemTableListVersion mutable
+ memlist_.remove(m);
+
+ m->MarkFlushed();
+ if (max_write_buffer_size_to_maintain_ > 0 ||
+ max_write_buffer_number_to_maintain_ > 0) {
+ memlist_history_.push_front(m);
+ // Unable to get size of mutable memtable at this point, pass 0 to
+ // TrimHistory as a best effort.
+ TrimHistory(to_delete, 0);
+ } else {
+ UnrefMemTable(to_delete, m);
+ }
+}
+
+// return the total memory usage assuming the oldest flushed memtable is dropped
+size_t MemTableListVersion::MemoryAllocatedBytesExcludingLast() const {
+ size_t total_memtable_size = 0;
+ for (auto& memtable : memlist_) {
+ total_memtable_size += memtable->MemoryAllocatedBytes();
+ }
+ for (auto& memtable : memlist_history_) {
+ total_memtable_size += memtable->MemoryAllocatedBytes();
+ }
+ if (!memlist_history_.empty()) {
+ total_memtable_size -= memlist_history_.back()->MemoryAllocatedBytes();
+ }
+ return total_memtable_size;
+}
+
+bool MemTableListVersion::MemtableLimitExceeded(size_t usage) {
+ if (max_write_buffer_size_to_maintain_ > 0) {
+ // calculate the total memory usage after dropping the oldest flushed
+ // memtable, compare with max_write_buffer_size_to_maintain_ to decide
+ // whether to trim history
+ return MemoryAllocatedBytesExcludingLast() + usage >=
+ static_cast<size_t>(max_write_buffer_size_to_maintain_);
+ } else if (max_write_buffer_number_to_maintain_ > 0) {
+ return memlist_.size() + memlist_history_.size() >
+ static_cast<size_t>(max_write_buffer_number_to_maintain_);
+ } else {
+ return false;
+ }
+}
+
+// Make sure we don't use up too much space in history
+bool MemTableListVersion::TrimHistory(autovector<MemTable*>* to_delete,
+ size_t usage) {
+ bool ret = false;
+ while (MemtableLimitExceeded(usage) && !memlist_history_.empty()) {
+ MemTable* x = memlist_history_.back();
+ memlist_history_.pop_back();
+
+ UnrefMemTable(to_delete, x);
+ ret = true;
+ }
+ return ret;
+}
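
TrimHistory() keeps dropping the oldest flushed memtable while MemtableLimitExceeded() reports that the memory budget is still exceeded. The sketch below condenses the size-based branch of that policy; ToyHistory and its byte figures are invented, and unlike the real check it only counts the history memtables.

// Sketch only: ToyHistory and its byte counts are invented; the real check in
// MemtableLimitExceeded() also includes the unflushed memtables.
#include <cstddef>
#include <deque>
#include <iostream>

struct ToyHistory {
  std::deque<size_t> history_bytes;  // per-memtable sizes, back == oldest
  size_t budget_bytes;

  size_t TotalExcludingOldest() const {
    size_t total = 0;
    for (size_t b : history_bytes) total += b;
    if (!history_bytes.empty()) total -= history_bytes.back();
    return total;
  }

  // `usage` plays the role of the mutable memtable's current size.
  size_t Trim(size_t usage) {
    size_t dropped = 0;
    while (!history_bytes.empty() &&
           TotalExcludingOldest() + usage >= budget_bytes) {
      history_bytes.pop_back();  // drop the oldest flushed memtable
      ++dropped;
    }
    return dropped;
  }
};

int main() {
  ToyHistory h{{4 << 20, 4 << 20, 4 << 20}, 8u << 20};
  std::cout << "dropped " << h.Trim(/*usage=*/2 << 20)
            << " flushed memtable(s) from history\n";
  return 0;
}
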
+
+// Returns true if there is at least one memtable on which flush has
+// not yet started.
+bool MemTableList::IsFlushPending() const {
+ if ((flush_requested_ && num_flush_not_started_ > 0) ||
+ (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
+ assert(imm_flush_needed.load(std::memory_order_relaxed));
+ return true;
+ }
+ return false;
+}
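
The predicate above reduces to two conditions: an explicit flush request, or enough not-yet-started immutable memtables to reach min_write_buffer_number_to_merge. A standalone restatement of just that logic (the assert on imm_flush_needed is omitted):

// Sketch only: a restatement of the predicate with plain parameters.
#include <iostream>

bool IsFlushPending(bool flush_requested, int num_flush_not_started,
                    int min_write_buffer_number_to_merge) {
  return (flush_requested && num_flush_not_started > 0) ||
         (num_flush_not_started >= min_write_buffer_number_to_merge);
}

int main() {
  std::cout << IsFlushPending(false, 1, 2) << "\n";  // 0: not enough memtables
  std::cout << IsFlushPending(true, 1, 2) << "\n";   // 1: explicit request
  std::cout << IsFlushPending(false, 2, 2) << "\n";  // 1: threshold reached
  return 0;
}
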
+
+bool MemTableList::IsFlushPendingOrRunning() const {
+ if (current_->memlist_.size() - num_flush_not_started_ > 0) {
+ // Flush is already running on at least one memtable
+ return true;
+ }
+ return IsFlushPending();
+}
+
+// Returns the memtables that need to be flushed.
+void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id,
+ autovector<MemTable*>* ret,
+ uint64_t* max_next_log_number) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_PICK_MEMTABLES_TO_FLUSH);
+ const auto& memlist = current_->memlist_;
+ bool atomic_flush = false;
+
+ // Note: every time MemTableList::Add(mem) is called, it adds the new mem
+ // at the FRONT of the memlist (memlist.push_front(mem)). Therefore, by
+ // iterating through the memlist starting at the end, the vector<MemTable*>
+ // ret is filled with memtables already sorted in increasing MemTable ID.
+ // However, when the mempurge feature is activated, new memtables with older
+ // IDs will be added to the memlist.
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* m = *it;
+ if (!atomic_flush && m->atomic_flush_seqno_ != kMaxSequenceNumber) {
+ atomic_flush = true;
+ }
+ if (m->GetID() > max_memtable_id) {
+ break;
+ }
+ if (!m->flush_in_progress_) {
+ assert(!m->flush_completed_);
+ num_flush_not_started_--;
+ if (num_flush_not_started_ == 0) {
+ imm_flush_needed.store(false, std::memory_order_release);
+ }
+ m->flush_in_progress_ = true; // flushing will start very soon
+ if (max_next_log_number) {
+ *max_next_log_number =
+ std::max(m->GetNextLogNumber(), *max_next_log_number);
+ }
+ ret->push_back(m);
+ } else if (!ret->empty()) {
+ // This `break` is necessary to prevent picking non-consecutive memtables
+ // in case `memlist` has one or more entries with
+ // `flush_in_progress_ == true` sandwiched between entries with
+ // `flush_in_progress_ == false`. This could happen after parallel flushes
+ // are picked and the one flushing older memtables is rolled back.
+ break;
+ }
+ }
+ if (!atomic_flush || num_flush_not_started_ == 0) {
+ flush_requested_ = false; // start-flush request is complete
+ }
+}
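
The picking loop walks from the oldest memtable, respects max_memtable_id, and breaks as soon as the selected run would become non-consecutive. A toy version of that loop follows; ToyMem is invented, and the atomic-flush and log-number bookkeeping of the real function is left out.

// Sketch only: ToyMem is invented; atomic-flush and log-number bookkeeping
// from the real loop are omitted.
#include <cstdint>
#include <iostream>
#include <list>
#include <vector>

struct ToyMem {
  uint64_t id;
  bool flush_in_progress;
};

// Walk from the oldest memtable (back of the list), respect the ID limit, and
// never let the picked run become non-consecutive.
std::vector<ToyMem*> Pick(std::list<ToyMem>& newest_first,
                          uint64_t max_memtable_id) {
  std::vector<ToyMem*> picked;
  for (auto it = newest_first.rbegin(); it != newest_first.rend(); ++it) {
    ToyMem& m = *it;
    if (m.id > max_memtable_id) break;
    if (!m.flush_in_progress) {
      m.flush_in_progress = true;
      picked.push_back(&m);
    } else if (!picked.empty()) {
      break;  // keep the picked run consecutive
    }
  }
  return picked;
}

int main() {
  // front == newest; memtable #2 is already being flushed.
  std::list<ToyMem> mems{{3, false}, {2, true}, {1, false}};
  auto picked = Pick(mems, /*max_memtable_id=*/3);
  std::cout << "picked " << picked.size() << " memtable(s), first id "
            << picked.front()->id << "\n";  // picks only #1
  return 0;
}
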
+
+void MemTableList::RollbackMemtableFlush(const autovector<MemTable*>& mems,
+ uint64_t /*file_number*/) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_ROLLBACK);
+ assert(!mems.empty());
+
+ // If the flush was not successful, then just reset state.
+ // Maybe a succeeding attempt to flush will be successful.
+ for (MemTable* m : mems) {
+ assert(m->flush_in_progress_);
+ assert(m->file_number_ == 0);
+
+ m->flush_in_progress_ = false;
+ m->flush_completed_ = false;
+ m->edit_.Clear();
+ num_flush_not_started_++;
+ }
+ imm_flush_needed.store(true, std::memory_order_release);
+}
+
+// Try to record a successful flush in the manifest file. It might just return
+// Status::OK, letting a concurrent flush do the actual recording.
+Status MemTableList::TryInstallMemtableFlushResults(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& mems, LogsWithPrepTracker* prep_tracker,
+ VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer,
+ std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
+ bool write_edits) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+ mu->AssertHeld();
+
+ // Flush was successful
+ // Record the status on the memtable object. Either this call or a call by a
+ // concurrent flush thread will read the status and write it to manifest.
+ for (size_t i = 0; i < mems.size(); ++i) {
+ // All the edits are associated with the first memtable of this batch.
+ assert(i == 0 || mems[i]->GetEdits()->NumEntries() == 0);
+
+ mems[i]->flush_completed_ = true;
+ mems[i]->file_number_ = file_number;
+ }
+
+ // if some other thread is already committing, then return
+ Status s;
+ if (commit_in_progress_) {
+ TEST_SYNC_POINT("MemTableList::TryInstallMemtableFlushResults:InProgress");
+ return s;
+ }
+
+ // Only a single thread can be executing this piece of code
+ commit_in_progress_ = true;
+
+ // Retry until all completed flushes are committed. New flushes can finish
+ // while the current thread is writing to the manifest, during which the
+ // mutex is released.
+ while (s.ok()) {
+ auto& memlist = current_->memlist_;
+ // The back is the oldest; if flush_completed_ is not set on it, this thread
+ // was assigned a more recent memtable. The memtables' flushes must be
+ // recorded in the manifest in order. The concurrent flush thread that was
+ // assigned to flush the oldest memtable will later wake up and do all the
+ // pending manifest writes, in order.
+ if (memlist.empty() || !memlist.back()->flush_completed_) {
+ break;
+ }
+ // scan all memtables from the earliest, and commit those
+ // (in that order) that have finished flushing. Memtables
+ // are always committed in the order that they were created.
+ uint64_t batch_file_number = 0;
+ size_t batch_count = 0;
+ autovector<VersionEdit*> edit_list;
+ autovector<MemTable*> memtables_to_flush;
+ // Enumerate from the last (earliest) element to see how many batches have
+ // finished.
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* m = *it;
+ if (!m->flush_completed_) {
+ break;
+ }
+ if (it == memlist.rbegin() || batch_file_number != m->file_number_) {
+ batch_file_number = m->file_number_;
+ if (m->edit_.GetBlobFileAdditions().empty()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64 " started",
+ cfd->GetName().c_str(), m->file_number_);
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ " (+%zu blob files) started",
+ cfd->GetName().c_str(), m->file_number_,
+ m->edit_.GetBlobFileAdditions().size());
+ }
+
+ edit_list.push_back(&m->edit_);
+ memtables_to_flush.push_back(m);
+#ifndef ROCKSDB_LITE
+ std::unique_ptr<FlushJobInfo> info = m->ReleaseFlushJobInfo();
+ if (info != nullptr) {
+ committed_flush_jobs_info->push_back(std::move(info));
+ }
+#else
+ (void)committed_flush_jobs_info;
+#endif // !ROCKSDB_LITE
+ }
+ batch_count++;
+ }
+
+ // TODO(myabandeh): Not sure how batch_count could be 0 here.
+ if (batch_count > 0) {
+ uint64_t min_wal_number_to_keep = 0;
+ assert(edit_list.size() > 0);
+ if (vset->db_options()->allow_2pc) {
+ // Note that if mempurge is successful, the edit_list will not be
+ // applicable (it contains the new min_log number to keep and the level-0
+ // file path of the SST file created during a normal flush, both of which
+ // are irrelevant after a successful mempurge operation).
+ min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
+ vset, *cfd, edit_list, memtables_to_flush, prep_tracker);
+
+ // We piggyback the information of earliest log file to keep in the
+ // manifest entry for the last file flushed.
+ } else {
+ min_wal_number_to_keep =
+ PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list);
+ }
+
+ VersionEdit wal_deletion;
+ wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep);
+ if (vset->db_options()->track_and_verify_wals_in_manifest) {
+ if (min_wal_number_to_keep >
+ vset->GetWalSet().GetMinWalNumberToKeep()) {
+ wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "MemTableList::TryInstallMemtableFlushResults:"
+ "AfterComputeMinWalToKeep",
+ nullptr);
+ }
+ edit_list.push_back(&wal_deletion);
+
+ const auto manifest_write_cb = [this, cfd, batch_count, log_buffer,
+ to_delete, mu](const Status& status) {
+ RemoveMemTablesOrRestoreFlags(status, cfd, batch_count, log_buffer,
+ to_delete, mu);
+ };
+ if (write_edits) {
+ // this can release and reacquire the mutex.
+ s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu,
+ db_directory, /*new_descriptor_log=*/false,
+ /*column_family_options=*/nullptr,
+ manifest_write_cb);
+ } else {
+ // If write_edits is false (e.g., after a successful mempurge),
+ // then remove old memtables, wake up manifest write queue threads,
+ // and don't commit anything to the manifest file.
+ RemoveMemTablesOrRestoreFlags(s, cfd, batch_count, log_buffer,
+ to_delete, mu);
+ // Note: cfd->SetLogNumber is only called when a VersionEdit
+ // is written to MANIFEST. When mempurge is successful, we skip
+ // this step, so cfd->GetLogNumber always points to the
+ // earliest log with unflushed data.
+ // Notify the new head of the manifest write queue and
+ // wake up all the waiting writers.
+ // TODO(bjlemaire): explain the full reason WakeUpWaitingManifestWriters
+ // is needed, or investigate further.
+ vset->WakeUpWaitingManifestWriters();
+ }
+ }
+ }
+ commit_in_progress_ = false;
+ return s;
+}
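
The heart of the commit loop is counting how many memtables at the oldest end have completed their flush and how many distinct L0 files (hence VersionEdits) the batch covers, while commit_in_progress_ keeps other threads out. A reduced sketch of that counting step; ToyMem is an invented stand-in, and the real loop also collects the edits and FlushJobInfo per file.

// Sketch only: ToyMem is invented; the real loop also collects the
// VersionEdits and FlushJobInfo for each new L0 file.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <list>

struct ToyMem {
  uint64_t file_number;
  bool flush_completed;
};

// Count how many memtables at the oldest end are ready to commit, and how
// many distinct L0 files (one edit per file) that batch covers.
void CountCommittableBatch(const std::list<ToyMem>& newest_first,
                           size_t* batch_count, size_t* num_edits) {
  *batch_count = 0;
  *num_edits = 0;
  uint64_t last_file = 0;
  for (auto it = newest_first.rbegin(); it != newest_first.rend(); ++it) {
    if (!it->flush_completed) break;  // commits must stay in FIFO order
    if (*batch_count == 0 || it->file_number != last_file) {
      last_file = it->file_number;
      ++(*num_edits);
    }
    ++(*batch_count);
  }
}

int main() {
  // The two oldest memtables were flushed together into table #9; the newest
  // one has not finished flushing yet.
  std::list<ToyMem> mems{{0, false}, {9, true}, {9, true}};
  size_t batch = 0, edits = 0;
  CountCommittableBatch(mems, &batch, &edits);
  std::cout << batch << " memtable(s) ready, " << edits << " edit(s)\n";
  return 0;
}
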
+
+// New memtables are inserted at the front of the list.
+void MemTableList::Add(MemTable* m, autovector<MemTable*>* to_delete) {
+ assert(static_cast<int>(current_->memlist_.size()) >= num_flush_not_started_);
+ InstallNewVersion();
+ // This method is used to move a mutable memtable into the immutable list.
+ // Since the mutable memtable is already refcounted by the DBImpl, and we
+ // don't unref it when moving it to the immutable list, we don't have to
+ // ref the memtable here: we simply take over the reference from the DBImpl.
+ current_->Add(m, to_delete);
+ m->MarkImmutable();
+ num_flush_not_started_++;
+ if (num_flush_not_started_ == 1) {
+ imm_flush_needed.store(true, std::memory_order_release);
+ }
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+}
+
+bool MemTableList::TrimHistory(autovector<MemTable*>* to_delete, size_t usage) {
+ InstallNewVersion();
+ bool ret = current_->TrimHistory(to_delete, usage);
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+ return ret;
+}
+
+// Returns an estimate of the number of bytes of data in use.
+size_t MemTableList::ApproximateUnflushedMemTablesMemoryUsage() {
+ size_t total_size = 0;
+ for (auto& memtable : current_->memlist_) {
+ total_size += memtable->ApproximateMemoryUsage();
+ }
+ return total_size;
+}
+
+size_t MemTableList::ApproximateMemoryUsage() { return current_memory_usage_; }
+
+size_t MemTableList::MemoryAllocatedBytesExcludingLast() const {
+ const size_t usage = current_memory_allocted_bytes_excluding_last_.load(
+ std::memory_order_relaxed);
+ return usage;
+}
+
+bool MemTableList::HasHistory() const {
+ const bool has_history = current_has_history_.load(std::memory_order_relaxed);
+ return has_history;
+}
+
+void MemTableList::UpdateCachedValuesFromMemTableListVersion() {
+ const size_t total_memtable_size =
+ current_->MemoryAllocatedBytesExcludingLast();
+ current_memory_allocted_bytes_excluding_last_.store(
+ total_memtable_size, std::memory_order_relaxed);
+
+ const bool has_history = current_->HasHistory();
+ current_has_history_.store(has_history, std::memory_order_relaxed);
+}
+
+uint64_t MemTableList::ApproximateOldestKeyTime() const {
+ if (!current_->memlist_.empty()) {
+ return current_->memlist_.back()->ApproximateOldestKeyTime();
+ }
+ return std::numeric_limits<uint64_t>::max();
+}
+
+void MemTableList::InstallNewVersion() {
+ if (current_->refs_ == 1) {
+ // we're the only one using the version, just keep using it
+ } else {
+ // somebody else holds the current version, we need to create new one
+ MemTableListVersion* version = current_;
+ current_ = new MemTableListVersion(&current_memory_usage_, *version);
+ current_->Ref();
+ version->Unref();
+ }
+}
+
+void MemTableList::RemoveMemTablesOrRestoreFlags(
+ const Status& s, ColumnFamilyData* cfd, size_t batch_count,
+ LogBuffer* log_buffer, autovector<MemTable*>* to_delete,
+ InstrumentedMutex* mu) {
+ assert(mu);
+ mu->AssertHeld();
+ assert(to_delete);
+ // we will be changing the version in the next code path,
+ // so we better create a new one, since versions are immutable
+ InstallNewVersion();
+
+ // All the later memtables that have the same filenum
+ // are part of the same batch. They can be committed now.
+ uint64_t mem_id = 1; // how many memtables have been flushed.
+
+ // commit new state only if the column family is NOT dropped.
+ // The reason is as follows (refer to
+ // ColumnFamilyTest.FlushAndDropRaceCondition).
+ // If the column family is dropped, then according to LogAndApply, its
+ // corresponding flush operation is NOT written to the MANIFEST. This
+ // means the DB is not aware of the L0 files generated from the flush.
+ // By committing the new state, we remove the memtable from the memtable
+ // list. Creating an iterator on this column family will not be able to
+ // read full data since the memtable is removed, and the DB is not aware
+ // of the L0 files, leaving the MergingIterator unable to build child
+ // iterators. RocksDB contract requires that the iterator can be created
+ // on a dropped column family, and we must be able to
+ // read full data as long as column family handle is not deleted, even if
+ // the column family is dropped.
+ if (s.ok() && !cfd->IsDropped()) { // commit new state
+ while (batch_count-- > 0) {
+ MemTable* m = current_->memlist_.back();
+ if (m->edit_.GetBlobFileAdditions().empty()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " done",
+ cfd->GetName().c_str(), m->file_number_, mem_id);
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ " (+%zu blob files)"
+ ": memtable #%" PRIu64 " done",
+ cfd->GetName().c_str(), m->file_number_,
+ m->edit_.GetBlobFileAdditions().size(), mem_id);
+ }
+
+ assert(m->file_number_ > 0);
+ current_->Remove(m, to_delete);
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+ ++mem_id;
+ }
+ } else {
+ for (auto it = current_->memlist_.rbegin(); batch_count-- > 0; ++it) {
+ MemTable* m = *it;
+ // commit failed. setup state so that we can flush again.
+ if (m->edit_.GetBlobFileAdditions().empty()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Level-0 commit table #%" PRIu64 ": memtable #%" PRIu64
+ " failed",
+ m->file_number_, mem_id);
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "Level-0 commit table #%" PRIu64
+ " (+%zu blob files)"
+ ": memtable #%" PRIu64 " failed",
+ m->file_number_,
+ m->edit_.GetBlobFileAdditions().size(), mem_id);
+ }
+
+ m->flush_completed_ = false;
+ m->flush_in_progress_ = false;
+ m->edit_.Clear();
+ num_flush_not_started_++;
+ m->file_number_ = 0;
+ imm_flush_needed.store(true, std::memory_order_release);
+ ++mem_id;
+ }
+ }
+}
+
+uint64_t MemTableList::PrecomputeMinLogContainingPrepSection(
+ const std::unordered_set<MemTable*>* memtables_to_flush) {
+ uint64_t min_log = 0;
+
+ for (auto& m : current_->memlist_) {
+ if (memtables_to_flush && memtables_to_flush->count(m)) {
+ continue;
+ }
+
+ auto log = m->GetMinLogContainingPrepSection();
+
+ if (log > 0 && (min_log == 0 || log < min_log)) {
+ min_log = log;
+ }
+ }
+
+ return min_log;
+}
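
The function takes the minimum, over every memtable that is not about to be flushed, of the earliest WAL holding one of its prepared sections, with 0 meaning "none". A standalone restatement using plain vectors and indexes instead of MemTable pointers:

// Sketch only: memtables are represented by the WAL number of their earliest
// prepared section (0 == none), indexed by position instead of pointer.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <unordered_set>
#include <vector>

uint64_t MinPrepLog(const std::vector<uint64_t>& prep_logs,
                    const std::unordered_set<size_t>& being_flushed) {
  uint64_t min_log = 0;
  for (size_t i = 0; i < prep_logs.size(); ++i) {
    if (being_flushed.count(i)) continue;  // its prep sections get persisted
    uint64_t log = prep_logs[i];
    if (log > 0 && (min_log == 0 || log < min_log)) {
      min_log = log;
    }
  }
  return min_log;
}

int main() {
  // Memtable 0 references WAL 5, memtable 1 references WAL 3 but is being
  // flushed, memtable 2 has no prepared section.
  std::cout << MinPrepLog({5, 3, 0}, {1}) << "\n";  // prints 5
  return 0;
}
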
+
+// Commit a successful atomic flush in the manifest file.
+Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
+ LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
+ const autovector<FileMetaData*>& file_metas,
+ const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+ committed_flush_jobs_info,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer) {
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS);
+ mu->AssertHeld();
+
+ size_t num = mems_list.size();
+ assert(cfds.size() == num);
+ if (imm_lists != nullptr) {
+ assert(imm_lists->size() == num);
+ }
+ if (num == 0) {
+ return Status::OK();
+ }
+
+ for (size_t k = 0; k != num; ++k) {
+#ifndef NDEBUG
+ const auto* imm =
+ (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+ if (!mems_list[k]->empty()) {
+ assert((*mems_list[k])[0]->GetID() == imm->GetEarliestMemTableID());
+ }
+#endif
+ assert(nullptr != file_metas[k]);
+ for (size_t i = 0; i != mems_list[k]->size(); ++i) {
+ assert(i == 0 || (*mems_list[k])[i]->GetEdits()->NumEntries() == 0);
+ (*mems_list[k])[i]->SetFlushCompleted(true);
+ (*mems_list[k])[i]->SetFileNumber(file_metas[k]->fd.GetNumber());
+ }
+#ifndef ROCKSDB_LITE
+ if (committed_flush_jobs_info[k]) {
+ assert(!mems_list[k]->empty());
+ assert((*mems_list[k])[0]);
+ std::unique_ptr<FlushJobInfo> flush_job_info =
+ (*mems_list[k])[0]->ReleaseFlushJobInfo();
+ committed_flush_jobs_info[k]->push_back(std::move(flush_job_info));
+ }
+#else //! ROCKSDB_LITE
+ (void)committed_flush_jobs_info;
+#endif // ROCKSDB_LITE
+ }
+
+ Status s;
+
+ autovector<autovector<VersionEdit*>> edit_lists;
+ uint32_t num_entries = 0;
+ for (const auto mems : mems_list) {
+ assert(mems != nullptr);
+ autovector<VersionEdit*> edits;
+ assert(!mems->empty());
+ edits.emplace_back((*mems)[0]->GetEdits());
+ ++num_entries;
+ edit_lists.emplace_back(edits);
+ }
+
+ WalNumber min_wal_number_to_keep = 0;
+ if (vset->db_options()->allow_2pc) {
+ min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC(
+ vset, cfds, edit_lists, mems_list, prep_tracker);
+ } else {
+ min_wal_number_to_keep =
+ PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists);
+ }
+
+ VersionEdit wal_deletion;
+ wal_deletion.SetMinLogNumberToKeep(min_wal_number_to_keep);
+ if (vset->db_options()->track_and_verify_wals_in_manifest &&
+ min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) {
+ wal_deletion.DeleteWalsBefore(min_wal_number_to_keep);
+ }
+ edit_lists.back().push_back(&wal_deletion);
+ ++num_entries;
+
+ // Mark the version edits as an atomic group if the number of version edits
+ // exceeds 1.
+ if (cfds.size() > 1) {
+ for (size_t i = 0; i < edit_lists.size(); i++) {
+ assert((edit_lists[i].size() == 1) ||
+ ((edit_lists[i].size() == 2) && (i == edit_lists.size() - 1)));
+ for (auto& e : edit_lists[i]) {
+ e->MarkAtomicGroup(--num_entries);
+ }
+ }
+ assert(0 == num_entries);
+ }
+
+ // this can release and reacquire the mutex.
+ s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ db_directory);
+
+ for (size_t k = 0; k != cfds.size(); ++k) {
+ auto* imm = (imm_lists == nullptr) ? cfds[k]->imm() : imm_lists->at(k);
+ imm->InstallNewVersion();
+ }
+
+ if (s.ok() || s.IsColumnFamilyDropped()) {
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ if (cfds[i]->IsDropped()) {
+ continue;
+ }
+ auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
+ for (auto m : *mems_list[i]) {
+ assert(m->GetFileNumber() > 0);
+ uint64_t mem_id = m->GetID();
+
+ const VersionEdit* const edit = m->GetEdits();
+ assert(edit);
+
+ if (edit->GetBlobFileAdditions().empty()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " done",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ mem_id);
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ " (+%zu blob files)"
+ ": memtable #%" PRIu64 " done",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ edit->GetBlobFileAdditions().size(), mem_id);
+ }
+
+ imm->current_->Remove(m, to_delete);
+ imm->UpdateCachedValuesFromMemTableListVersion();
+ imm->ResetTrimHistoryNeeded();
+ }
+ }
+ } else {
+ for (size_t i = 0; i != cfds.size(); ++i) {
+ auto* imm = (imm_lists == nullptr) ? cfds[i]->imm() : imm_lists->at(i);
+ for (auto m : *mems_list[i]) {
+ uint64_t mem_id = m->GetID();
+
+ const VersionEdit* const edit = m->GetEdits();
+ assert(edit);
+
+ if (edit->GetBlobFileAdditions().empty()) {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ ": memtable #%" PRIu64 " failed",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ mem_id);
+ } else {
+ ROCKS_LOG_BUFFER(log_buffer,
+ "[%s] Level-0 commit table #%" PRIu64
+ " (+%zu blob files)"
+ ": memtable #%" PRIu64 " failed",
+ cfds[i]->GetName().c_str(), m->GetFileNumber(),
+ edit->GetBlobFileAdditions().size(), mem_id);
+ }
+
+ m->SetFlushCompleted(false);
+ m->SetFlushInProgress(false);
+ m->GetEdits()->Clear();
+ m->SetFileNumber(0);
+ imm->num_flush_not_started_++;
+ }
+ imm->imm_flush_needed.store(true, std::memory_order_release);
+ }
+ }
+
+ return s;
+}
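
When several column families flush atomically, each VersionEdit is tagged with the number of edits still to follow, so recovery can detect a partially persisted group. The sketch below reproduces the MarkAtomicGroup(--num_entries) countdown from the code above; ToyEdit is an invented stand-in for VersionEdit.

// Sketch only: ToyEdit stands in for VersionEdit; the countdown mirrors the
// MarkAtomicGroup(--num_entries) calls above.
#include <cstdint>
#include <iostream>
#include <vector>

struct ToyEdit {
  uint32_t remaining_entries = 0;  // edits still to come in this atomic group
};

void MarkAtomicGroup(std::vector<std::vector<ToyEdit>>& edit_lists) {
  uint32_t num_entries = 0;
  for (const auto& edits : edit_lists) {
    num_entries += static_cast<uint32_t>(edits.size());
  }
  // The first edit written says N-1 entries follow, the last one says 0, so
  // recovery can spot a truncated group.
  for (auto& edits : edit_lists) {
    for (auto& e : edits) {
      e.remaining_entries = --num_entries;
    }
  }
}

int main() {
  std::vector<std::vector<ToyEdit>> edit_lists{{ToyEdit{}},
                                               {ToyEdit{}, ToyEdit{}}};
  MarkAtomicGroup(edit_lists);
  for (const auto& edits : edit_lists) {
    for (const auto& e : edits) {
      std::cout << e.remaining_entries << " ";  // prints "2 1 0"
    }
  }
  std::cout << "\n";
  return 0;
}
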
+
+void MemTableList::RemoveOldMemTables(uint64_t log_number,
+ autovector<MemTable*>* to_delete) {
+ assert(to_delete != nullptr);
+ InstallNewVersion();
+ auto& memlist = current_->memlist_;
+ autovector<MemTable*> old_memtables;
+ for (auto it = memlist.rbegin(); it != memlist.rend(); ++it) {
+ MemTable* mem = *it;
+ if (mem->GetNextLogNumber() > log_number) {
+ break;
+ }
+ old_memtables.push_back(mem);
+ }
+
+ for (auto it = old_memtables.begin(); it != old_memtables.end(); ++it) {
+ MemTable* mem = *it;
+ current_->Remove(mem, to_delete);
+ --num_flush_not_started_;
+ if (0 == num_flush_not_started_) {
+ imm_flush_needed.store(false, std::memory_order_release);
+ }
+ }
+
+ UpdateCachedValuesFromMemTableListVersion();
+ ResetTrimHistoryNeeded();
+}
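
RemoveOldMemTables() drops, starting from the oldest end, every memtable whose data is fully covered by WALs up to log_number (mem->GetNextLogNumber() <= log_number). A minimal sketch of just that selection, representing each memtable by its next-log number:

// Sketch only: each memtable is represented by its next-log number.
#include <cstdint>
#include <iostream>
#include <list>

// During secondary log replay, a memtable can be dropped once its data is
// entirely covered by WALs up to `log_number`.
int CountRemovable(const std::list<uint64_t>& next_logs_newest_first,
                   uint64_t log_number) {
  int removable = 0;
  for (auto it = next_logs_newest_first.rbegin();
       it != next_logs_newest_first.rend(); ++it) {
    if (*it > log_number) break;  // this and all newer memtables must stay
    ++removable;
  }
  return removable;
}

int main() {
  // Oldest memtable needs WAL 5, then 7; the newest needs WAL 9.
  std::list<uint64_t> next_logs{9, 7, 5};  // front == newest
  std::cout << CountRemovable(next_logs, /*log_number=*/7) << "\n";  // prints 2
  return 0;
}
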
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list.h b/src/rocksdb/db/memtable_list.h
new file mode 100644
index 000000000..1ad28a59e
--- /dev/null
+++ b/src/rocksdb/db/memtable_list.h
@@ -0,0 +1,471 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <deque>
+#include <limits>
+#include <list>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/logs_with_prep_tracker.h"
+#include "db/memtable.h"
+#include "db/range_del_aggregator.h"
+#include "file/filename.h"
+#include "logging/log_buffer.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/db.h"
+#include "rocksdb/iterator.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+class InternalKeyComparator;
+class InstrumentedMutex;
+class MergeIteratorBuilder;
+class MemTableList;
+
+struct FlushJobInfo;
+
+// Keeps a list of immutable memtables. The list is immutable if the refcount
+// is bigger than one. It is used as a state for the Get() and iterator code
+// paths.
+//
+// This class is not thread-safe. External synchronization is required
+// (such as holding the db mutex or being on the write thread).
+class MemTableListVersion {
+ public:
+ explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+ const MemTableListVersion& old);
+ explicit MemTableListVersion(size_t* parent_memtable_list_memory_usage,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain);
+
+ void Ref();
+ void Unref(autovector<MemTable*>* to_delete = nullptr);
+
+ // Search all the memtables starting from the most recent one.
+ // Return the most recent value found, if any.
+ //
+ // If any operation was found for this key, its most recent sequence number
+ // will be stored in *seq on success (regardless of whether true/false is
+ // returned). Otherwise, *seq will be set to kMaxSequenceNumber.
+ bool Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq, SequenceNumber* seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr);
+
+ bool Get(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts, ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr) {
+ SequenceNumber seq;
+ return Get(key, value, columns, timestamp, s, merge_context,
+ max_covering_tombstone_seq, &seq, read_opts, callback,
+ is_blob_index);
+ }
+
+ void MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback);
+
+ // Returns all the merge operands corresponding to the key by searching all
+ // memtables starting from the most recent one.
+ bool GetMergeOperands(const LookupKey& key, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts);
+
+ // Similar to Get(), but searches the Memtable history of memtables that
+ // have already been flushed. Should only be used from in-memory only
+ // queries (such as Transaction validation) as the history may contain
+ // writes that are also present in the SST files.
+ bool GetFromHistory(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp,
+ Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ bool* is_blob_index = nullptr);
+ bool GetFromHistory(const LookupKey& key, std::string* value,
+ PinnableWideColumns* columns, std::string* timestamp,
+ Status* s, MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ const ReadOptions& read_opts,
+ bool* is_blob_index = nullptr) {
+ SequenceNumber seq;
+ return GetFromHistory(key, value, columns, timestamp, s, merge_context,
+ max_covering_tombstone_seq, &seq, read_opts,
+ is_blob_index);
+ }
+
+ Status AddRangeTombstoneIterators(const ReadOptions& read_opts, Arena* arena,
+ RangeDelAggregator* range_del_agg);
+
+ void AddIterators(const ReadOptions& options,
+ std::vector<InternalIterator*>* iterator_list,
+ Arena* arena);
+
+ void AddIterators(const ReadOptions& options,
+ MergeIteratorBuilder* merge_iter_builder,
+ bool add_range_tombstone_iter);
+
+ uint64_t GetTotalNumEntries() const;
+
+ uint64_t GetTotalNumDeletes() const;
+
+ MemTable::MemTableStats ApproximateStats(const Slice& start_ikey,
+ const Slice& end_ikey);
+
+ // Returns the value of MemTable::GetEarliestSequenceNumber() on the most
+ // recent MemTable in this list or kMaxSequenceNumber if the list is empty.
+ // If include_history=true, will also search Memtables in MemTableList
+ // History.
+ SequenceNumber GetEarliestSequenceNumber(bool include_history = false) const;
+
+ // Return the first sequence number from the memtable list, which is the
+ // smallest sequence number of all FirstSequenceNumber.
+ // Return kMaxSequenceNumber if the list is empty.
+ SequenceNumber GetFirstSequenceNumber() const;
+
+ private:
+ friend class MemTableList;
+
+ friend Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ VersionSet* vset, LogsWithPrepTracker* prep_tracker,
+ InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+ const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+ committed_flush_jobs_info,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer);
+
+ // REQUIRE: m is an immutable memtable
+ void Add(MemTable* m, autovector<MemTable*>* to_delete);
+ // REQUIRE: m is an immutable memtable
+ void Remove(MemTable* m, autovector<MemTable*>* to_delete);
+
+ // Return true if memtable is trimmed
+ bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+ bool GetFromList(std::list<MemTable*>* list, const LookupKey& key,
+ std::string* value, PinnableWideColumns* columns,
+ std::string* timestamp, Status* s,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ SequenceNumber* seq, const ReadOptions& read_opts,
+ ReadCallback* callback = nullptr,
+ bool* is_blob_index = nullptr);
+
+ void AddMemTable(MemTable* m);
+
+ void UnrefMemTable(autovector<MemTable*>* to_delete, MemTable* m);
+
+ // Calculate the total amount of memory used by memlist_ and memlist_history_
+ // excluding the last MemTable in memlist_history_. The reason for excluding
+ // the last MemTable is to see if dropping the last MemTable will keep total
+ // memory usage above or equal to max_write_buffer_size_to_maintain_
+ size_t MemoryAllocatedBytesExcludingLast() const;
+
+ // Whether this version contains flushed memtables that are only kept around
+ // for transaction conflict checking.
+ bool HasHistory() const { return !memlist_history_.empty(); }
+
+ bool MemtableLimitExceeded(size_t usage);
+
+ // Immutable MemTables that have not yet been flushed.
+ std::list<MemTable*> memlist_;
+
+ // MemTables that have already been flushed
+ // (used during Transaction validation)
+ std::list<MemTable*> memlist_history_;
+
+ // Maximum number of MemTables to keep in memory (including both flushed
+ // and not-yet-flushed tables).
+ const int max_write_buffer_number_to_maintain_;
+ // Maximum size of MemTables to keep in memory (including both flushed
+ // and not-yet-flushed tables).
+ const int64_t max_write_buffer_size_to_maintain_;
+
+ int refs_ = 0;
+
+ size_t* parent_memtable_list_memory_usage_;
+};
+
+// This class stores references to all the immutable memtables.
+// The memtables are flushed to L0 as soon as possible and in
+// any order. If there is more than one immutable memtable, their
+// flushes can occur concurrently. However, they are 'committed'
+// to the manifest in FIFO order to maintain correctness and
+// recoverability from a crash.
+//
+//
+// Other than imm_flush_needed and imm_trim_needed, this class is not
+// thread-safe and requires external synchronization (such as holding the db
+// mutex or being on the write thread).
+class MemTableList {
+ public:
+ // A list of memtables.
+ explicit MemTableList(int min_write_buffer_number_to_merge,
+ int max_write_buffer_number_to_maintain,
+ int64_t max_write_buffer_size_to_maintain)
+ : imm_flush_needed(false),
+ imm_trim_needed(false),
+ min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge),
+ current_(new MemTableListVersion(&current_memory_usage_,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain)),
+ num_flush_not_started_(0),
+ commit_in_progress_(false),
+ flush_requested_(false),
+ current_memory_usage_(0),
+ current_memory_allocted_bytes_excluding_last_(0),
+ current_has_history_(false) {
+ current_->Ref();
+ }
+
+ // Should not delete MemTableList without making sure MemTableList::current()
+ // is Unref()'d.
+ ~MemTableList() {}
+
+ MemTableListVersion* current() const { return current_; }
+
+ // Atomic flag that background threads can check to determine whether
+ // there is anything more to start flushing.
+ std::atomic<bool> imm_flush_needed;
+
+ std::atomic<bool> imm_trim_needed;
+
+ // Returns the total number of memtables in the list that haven't yet
+ // been flushed and logged.
+ int NumNotFlushed() const;
+
+ // Returns total number of memtables in the list that have been
+ // completely flushed and logged.
+ int NumFlushed() const;
+
+ // Returns true if there is at least one memtable on which flush has
+ // not yet started.
+ bool IsFlushPending() const;
+
+ // Returns true if there is at least one memtable that is pending flush or
+ // flushing.
+ bool IsFlushPendingOrRunning() const;
+
+ // Returns the earliest memtables that need to be flushed. The returned
+ // memtables are guaranteed to be in the ascending order of created time.
+ void PickMemtablesToFlush(uint64_t max_memtable_id,
+ autovector<MemTable*>* mems,
+ uint64_t* max_next_log_number = nullptr);
+
+ // Reset the flush status of the given memtables back to the pending state
+ // so that they can get picked up again in the next round of flush.
+ void RollbackMemtableFlush(const autovector<MemTable*>& mems,
+ uint64_t file_number);
+
+ // Try to commit a successful flush in the manifest file. It might just
+ // return Status::OK, letting a concurrent flush do the actual recording.
+ Status TryInstallMemtableFlushResults(
+ ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& m, LogsWithPrepTracker* prep_tracker,
+ VersionSet* vset, InstrumentedMutex* mu, uint64_t file_number,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer,
+ std::list<std::unique_ptr<FlushJobInfo>>* committed_flush_jobs_info,
+ bool write_edits = true);
+
+ // New memtables are inserted at the front of the list.
+ // Takes ownership of the reference held on *m by the caller of Add().
+ // By default, adding memtables will flag that the memtable list needs to be
+ // flushed, but in certain situations, like after a mempurge, we may want to
+ // avoid flushing the memtable list upon addition of a memtable.
+ void Add(MemTable* m, autovector<MemTable*>* to_delete);
+
+ // Returns an estimate of the number of bytes of data in use.
+ size_t ApproximateMemoryUsage();
+
+ // Returns the cached current_memory_allocted_bytes_excluding_last_ value.
+ size_t MemoryAllocatedBytesExcludingLast() const;
+
+ // Returns the cached current_has_history_ value.
+ bool HasHistory() const;
+
+ // Updates current_memory_allocted_bytes_excluding_last_ and
+ // current_has_history_ from MemTableListVersion. Must be called whenever
+ // InstallNewVersion is called.
+ void UpdateCachedValuesFromMemTableListVersion();
+
+ // `usage` is the current size of the mutable Memtable. When
+ // max_write_buffer_size_to_maintain is used, total size of mutable and
+ // immutable memtables is checked against it to decide whether to trim
+ // memtable list.
+ //
+ // Return true if memtable is trimmed
+ bool TrimHistory(autovector<MemTable*>* to_delete, size_t usage);
+
+ // Returns an estimate of the number of bytes of data used by
+ // the unflushed mem-tables.
+ size_t ApproximateUnflushedMemTablesMemoryUsage();
+
+ // Returns an estimate of the timestamp of the earliest key.
+ uint64_t ApproximateOldestKeyTime() const;
+
+ // Request a flush of all existing memtables to storage. This will
+ // cause future calls to IsFlushPending() to return true if this list is
+ // non-empty (regardless of the min_write_buffer_number_to_merge
+ // parameter). This flush request will persist until the next time
+ // PickMemtablesToFlush() is called.
+ void FlushRequested() {
+ flush_requested_ = true;
+ // If there are some memtables stored in imm() that don't trigger
+ // flush (e.g., mempurge output memtables), then update imm_flush_needed.
+ // Note: if there is a race and imm_flush_needed is set to true
+ // while num_flush_not_started_ == 0, there is no impact whatsoever;
+ // imm_flush_needed is only used in an assert in IsFlushPending().
+ if (num_flush_not_started_ > 0) {
+ imm_flush_needed.store(true, std::memory_order_release);
+ }
+ }
+
+ bool HasFlushRequested() { return flush_requested_; }
+
+ // Returns true if a trim history should be scheduled and the caller should
+ // be the one to schedule it
+ bool MarkTrimHistoryNeeded() {
+ auto expected = false;
+ return imm_trim_needed.compare_exchange_strong(
+ expected, true, std::memory_order_relaxed, std::memory_order_relaxed);
+ }
+
+ void ResetTrimHistoryNeeded() {
+ auto expected = true;
+ imm_trim_needed.compare_exchange_strong(
+ expected, false, std::memory_order_relaxed, std::memory_order_relaxed);
+ }
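
MarkTrimHistoryNeeded() and ResetTrimHistoryNeeded() use a single compare-and-swap so that exactly one caller wins the right to schedule a trim until it is reset. The same handshake, extracted into a runnable snippet on a bare std::atomic<bool>:

// Sketch only: the handshake above, isolated on a plain atomic flag.
#include <atomic>
#include <iostream>

std::atomic<bool> trim_needed{false};

bool MarkTrimHistoryNeeded() {  // true => this caller schedules the trim
  bool expected = false;
  return trim_needed.compare_exchange_strong(expected, true,
                                             std::memory_order_relaxed,
                                             std::memory_order_relaxed);
}

void ResetTrimHistoryNeeded() {  // trim done; allow the next scheduling
  bool expected = true;
  trim_needed.compare_exchange_strong(expected, false,
                                      std::memory_order_relaxed,
                                      std::memory_order_relaxed);
}

int main() {
  std::cout << MarkTrimHistoryNeeded() << " "    // 1: first caller wins
            << MarkTrimHistoryNeeded() << "\n";  // 0: already scheduled
  ResetTrimHistoryNeeded();
  std::cout << MarkTrimHistoryNeeded() << "\n";  // 1: can be scheduled again
  return 0;
}
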
+
+ // Copying allowed
+ // MemTableList(const MemTableList&);
+ // void operator=(const MemTableList&);
+
+ size_t* current_memory_usage() { return &current_memory_usage_; }
+
+ // Returns the min log containing the prep section after memtables listed in
+ // `memtables_to_flush` are flushed and their status is persisted in the
+ // manifest.
+ uint64_t PrecomputeMinLogContainingPrepSection(
+ const std::unordered_set<MemTable*>* memtables_to_flush = nullptr);
+
+ uint64_t GetEarliestMemTableID() const {
+ auto& memlist = current_->memlist_;
+ if (memlist.empty()) {
+ return std::numeric_limits<uint64_t>::max();
+ }
+ return memlist.back()->GetID();
+ }
+
+ uint64_t GetLatestMemTableID() const {
+ auto& memlist = current_->memlist_;
+ if (memlist.empty()) {
+ return 0;
+ }
+ return memlist.front()->GetID();
+ }
+
+ void AssignAtomicFlushSeq(const SequenceNumber& seq) {
+ const auto& memlist = current_->memlist_;
+ // Scan the memtable list from new to old
+ for (auto it = memlist.begin(); it != memlist.end(); ++it) {
+ MemTable* mem = *it;
+ if (mem->atomic_flush_seqno_ == kMaxSequenceNumber) {
+ mem->atomic_flush_seqno_ = seq;
+ } else {
+ // Earlier memtables must have been assigned an atomic flush seq, no
+ // need to continue scan.
+ break;
+ }
+ }
+ }
+
+ // Used only by DBImplSecondary during log replay.
+ // Remove memtables whose data were written before the WAL with log_number
+ // was created, i.e. mem->GetNextLogNumber() <= log_number. The memtables are
+ // not freed, but put into a vector for future deref and reclamation.
+ void RemoveOldMemTables(uint64_t log_number,
+ autovector<MemTable*>* to_delete);
+
+ private:
+ friend Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ VersionSet* vset, LogsWithPrepTracker* prep_tracker,
+ InstrumentedMutex* mu, const autovector<FileMetaData*>& file_meta,
+ const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+ committed_flush_jobs_info,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer);
+
+ // DB mutex held
+ void InstallNewVersion();
+
+ // DB mutex held
+ // Called after writing to MANIFEST
+ void RemoveMemTablesOrRestoreFlags(const Status& s, ColumnFamilyData* cfd,
+ size_t batch_count, LogBuffer* log_buffer,
+ autovector<MemTable*>* to_delete,
+ InstrumentedMutex* mu);
+
+ const int min_write_buffer_number_to_merge_;
+
+ MemTableListVersion* current_;
+
+ // the number of elements that still need flushing
+ int num_flush_not_started_;
+
+ // committing in progress
+ bool commit_in_progress_;
+
+ // Requested a flush of memtables to storage. It's possible to request that
+ // a subset of memtables be flushed.
+ bool flush_requested_;
+
+ // The current memory usage.
+ size_t current_memory_usage_;
+
+ // Cached value of current_->MemoryAllocatedBytesExcludingLast().
+ std::atomic<size_t> current_memory_allocted_bytes_excluding_last_;
+
+ // Cached value of current_->HasHistory().
+ std::atomic<bool> current_has_history_;
+};
+
+// Installs memtable atomic flush results.
+// In most cases, imm_lists is nullptr, and the function simply uses the
+// immutable memtable lists associated with the cfds. There are unit tests that
+// install flush results for external immutable memtable lists other than the
+// cfds' own immutable memtable lists, e.g. MemTableListTest. In that case, the
+// imm_lists parameter is not nullptr.
+extern Status InstallMemtableAtomicFlushResults(
+ const autovector<MemTableList*>* imm_lists,
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list, VersionSet* vset,
+ LogsWithPrepTracker* prep_tracker, InstrumentedMutex* mu,
+ const autovector<FileMetaData*>& file_meta,
+ const autovector<std::list<std::unique_ptr<FlushJobInfo>>*>&
+ committed_flush_jobs_info,
+ autovector<MemTable*>* to_delete, FSDirectory* db_directory,
+ LogBuffer* log_buffer);
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/memtable_list_test.cc b/src/rocksdb/db/memtable_list_test.cc
new file mode 100644
index 000000000..8242061af
--- /dev/null
+++ b/src/rocksdb/db/memtable_list_test.cc
@@ -0,0 +1,1039 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/memtable_list.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/merge_context.h"
+#include "db/version_set.h"
+#include "db/write_controller.h"
+#include "rocksdb/db.h"
+#include "rocksdb/status.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTableListTest : public testing::Test {
+ public:
+ std::string dbname;
+ DB* db;
+ Options options;
+ std::vector<ColumnFamilyHandle*> handles;
+ std::atomic<uint64_t> file_number;
+
+ MemTableListTest() : db(nullptr), file_number(1) {
+ dbname = test::PerThreadDBPath("memtable_list_test");
+ options.create_if_missing = true;
+ EXPECT_OK(DestroyDB(dbname, options));
+ }
+
+ // Create a test db if not yet created
+ void CreateDB() {
+ if (db == nullptr) {
+ options.create_if_missing = true;
+ EXPECT_OK(DestroyDB(dbname, options));
+ // Open DB only with default column family
+ ColumnFamilyOptions cf_options;
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, cf_options);
+ Status s = DB::Open(options, dbname, cf_descs, &handles, &db);
+ EXPECT_OK(s);
+
+ ColumnFamilyOptions cf_opt1, cf_opt2;
+ cf_opt1.cf_paths.emplace_back(dbname + "_one_1",
+ std::numeric_limits<uint64_t>::max());
+ cf_opt2.cf_paths.emplace_back(dbname + "_two_1",
+ std::numeric_limits<uint64_t>::max());
+ int sz = static_cast<int>(handles.size());
+ handles.resize(sz + 2);
+ s = db->CreateColumnFamily(cf_opt1, "one", &handles[1]);
+ EXPECT_OK(s);
+ s = db->CreateColumnFamily(cf_opt2, "two", &handles[2]);
+ EXPECT_OK(s);
+
+ cf_descs.emplace_back("one", cf_options);
+ cf_descs.emplace_back("two", cf_options);
+ }
+ }
+
+ ~MemTableListTest() override {
+ if (db) {
+ std::vector<ColumnFamilyDescriptor> cf_descs(handles.size());
+#ifndef ROCKSDB_LITE
+ for (int i = 0; i != static_cast<int>(handles.size()); ++i) {
+ EXPECT_OK(handles[i]->GetDescriptor(&cf_descs[i]));
+ }
+#endif // !ROCKSDB_LITE
+ for (auto h : handles) {
+ if (h) {
+ EXPECT_OK(db->DestroyColumnFamilyHandle(h));
+ }
+ }
+ handles.clear();
+ delete db;
+ db = nullptr;
+ EXPECT_OK(DestroyDB(dbname, options, cf_descs));
+ }
+ }
+
+ // Calls MemTableList::TryInstallMemtableFlushResults() and sets up all
+ // structures needed to call this function.
+ Status Mock_InstallMemtableFlushResults(
+ MemTableList* list, const MutableCFOptions& mutable_cf_options,
+ const autovector<MemTable*>& m, autovector<MemTable*>* to_delete) {
+ // Create a mock Logger
+ test::NullLogger logger;
+ LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+ CreateDB();
+ // Create a mock VersionSet
+ DBOptions db_options;
+ ImmutableDBOptions immutable_db_options(db_options);
+ EnvOptions env_options;
+ std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+ WriteController write_controller(10000000u);
+
+ VersionSet versions(dbname, &immutable_db_options, env_options,
+ table_cache.get(), &write_buffer_manager,
+ &write_controller, /*block_cache_tracer=*/nullptr,
+ /*io_tracer=*/nullptr, /*db_id*/ "",
+ /*db_session_id*/ "");
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+ cf_descs.emplace_back("one", ColumnFamilyOptions());
+ cf_descs.emplace_back("two", ColumnFamilyOptions());
+
+ EXPECT_OK(versions.Recover(cf_descs, false));
+
+ // Create mock default ColumnFamilyData
+ auto column_family_set = versions.GetColumnFamilySet();
+ LogsWithPrepTracker dummy_prep_tracker;
+ auto cfd = column_family_set->GetDefault();
+ EXPECT_TRUE(nullptr != cfd);
+ uint64_t file_num = file_number.fetch_add(1);
+ IOStatus io_s;
+ // Create dummy mutex.
+ InstrumentedMutex mutex;
+ InstrumentedMutexLock l(&mutex);
+ std::list<std::unique_ptr<FlushJobInfo>> flush_jobs_info;
+ Status s = list->TryInstallMemtableFlushResults(
+ cfd, mutable_cf_options, m, &dummy_prep_tracker, &versions, &mutex,
+ file_num, to_delete, nullptr, &log_buffer, &flush_jobs_info);
+ EXPECT_OK(io_s);
+ return s;
+ }
+
+  // Calls InstallMemtableAtomicFlushResults() and sets up all structures
+  // needed to call this function.
+ Status Mock_InstallMemtableAtomicFlushResults(
+ autovector<MemTableList*>& lists, const autovector<uint32_t>& cf_ids,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<const autovector<MemTable*>*>& mems_list,
+ autovector<MemTable*>* to_delete) {
+ // Create a mock Logger
+ test::NullLogger logger;
+ LogBuffer log_buffer(DEBUG_LEVEL, &logger);
+
+ CreateDB();
+ // Create a mock VersionSet
+ DBOptions db_options;
+
+ ImmutableDBOptions immutable_db_options(db_options);
+ EnvOptions env_options;
+ std::shared_ptr<Cache> table_cache(NewLRUCache(50000, 16));
+ WriteBufferManager write_buffer_manager(db_options.db_write_buffer_size);
+ WriteController write_controller(10000000u);
+
+ VersionSet versions(dbname, &immutable_db_options, env_options,
+ table_cache.get(), &write_buffer_manager,
+ &write_controller, /*block_cache_tracer=*/nullptr,
+ /*io_tracer=*/nullptr, /*db_id*/ "",
+ /*db_session_id*/ "");
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ cf_descs.emplace_back(kDefaultColumnFamilyName, ColumnFamilyOptions());
+ cf_descs.emplace_back("one", ColumnFamilyOptions());
+ cf_descs.emplace_back("two", ColumnFamilyOptions());
+ EXPECT_OK(versions.Recover(cf_descs, false));
+
+ // Create mock default ColumnFamilyData
+
+ auto column_family_set = versions.GetColumnFamilySet();
+
+ LogsWithPrepTracker dummy_prep_tracker;
+ autovector<ColumnFamilyData*> cfds;
+ for (int i = 0; i != static_cast<int>(cf_ids.size()); ++i) {
+ cfds.emplace_back(column_family_set->GetColumnFamily(cf_ids[i]));
+ EXPECT_NE(nullptr, cfds[i]);
+ }
+ std::vector<FileMetaData> file_metas;
+ file_metas.reserve(cf_ids.size());
+ for (size_t i = 0; i != cf_ids.size(); ++i) {
+ FileMetaData meta;
+ uint64_t file_num = file_number.fetch_add(1);
+ meta.fd = FileDescriptor(file_num, 0, 0);
+ file_metas.emplace_back(meta);
+ }
+ autovector<FileMetaData*> file_meta_ptrs;
+ for (auto& meta : file_metas) {
+ file_meta_ptrs.push_back(&meta);
+ }
+ std::vector<std::list<std::unique_ptr<FlushJobInfo>>>
+ committed_flush_jobs_info_storage(cf_ids.size());
+ autovector<std::list<std::unique_ptr<FlushJobInfo>>*>
+ committed_flush_jobs_info;
+ for (int i = 0; i < static_cast<int>(cf_ids.size()); ++i) {
+ committed_flush_jobs_info.push_back(
+ &committed_flush_jobs_info_storage[i]);
+ }
+
+ InstrumentedMutex mutex;
+ InstrumentedMutexLock l(&mutex);
+ return InstallMemtableAtomicFlushResults(
+ &lists, cfds, mutable_cf_options_list, mems_list, &versions,
+ nullptr /* prep_tracker */, &mutex, file_meta_ptrs,
+ committed_flush_jobs_info, to_delete, nullptr, &log_buffer);
+ }
+};
+
+TEST_F(MemTableListTest, Empty) {
+ // Create an empty MemTableList and validate basic functions.
+ MemTableList list(1, 0, 0);
+
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+
+ autovector<MemTable*> mems;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &mems);
+ ASSERT_EQ(0, mems.size());
+
+ autovector<MemTable*> to_delete;
+ list.current()->Unref(&to_delete);
+ ASSERT_EQ(0, to_delete.size());
+}
+
+TEST_F(MemTableListTest, GetTest) {
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 2;
+ int max_write_buffer_number_to_maintain = 0;
+ int64_t max_write_buffer_size_to_maintain = 0;
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ SequenceNumber seq = 1;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ autovector<MemTable*> to_delete;
+
+ LookupKey lkey("key1", seq);
+ bool found = list.current()->Get(lkey, &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableOptions ioptions(options);
+
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+
+ // Write some keys to this memtable.
+ ASSERT_OK(
+ mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", "value1",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2",
+ nullptr /* kv_prot_info */));
+
+ // Fetch the newly written keys
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
+ /*timestamp*/ nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions(),
+ false /* immutable_memtable */);
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value1");
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", 2), &value, /*columns*/ nullptr,
+ /*timestamp*/ nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions(),
+ false /* immutable_memtable */);
+ // MemTable found out that this key is *not* found (at this sequence#)
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
+ /*timestamp*/ nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions(),
+ false /* immutable_memtable */);
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.2");
+
+ ASSERT_EQ(4, mem->num_entries());
+ ASSERT_EQ(1, mem->num_deletes());
+
+ // Add memtable to list
+ // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+ // in MemTableListVersion::GetFromList work.
+ mem->ConstructFragmentedRangeTombstones();
+ list.Add(mem, &to_delete);
+
+ SequenceNumber saved_seq = seq;
+
+ // Create another memtable and write some keys to it
+ WriteBufferManager wb2(options.db_write_buffer_size);
+ MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem2->Ref();
+
+ ASSERT_OK(
+ mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem2->Add(++seq, kTypeValue, "key2", "value2.3",
+ nullptr /* kv_prot_info */));
+
+ // Add second memtable to list
+ // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+ // in MemTableListVersion::GetFromList work.
+ mem2->ConstructFragmentedRangeTombstones();
+ list.Add(mem2, &to_delete);
+
+ // Fetch keys via MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->Get(LookupKey("key1", saved_seq), &value,
+ /*columns=*/nullptr, /*timestamp=*/nullptr, &s,
+ &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ("value1", value);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.3");
+
+ merge_context.Clear();
+ found = list.current()->Get(LookupKey("key2", 1), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ ASSERT_EQ(2, list.NumNotFlushed());
+
+ list.current()->Unref(&to_delete);
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+}
+
+TEST_F(MemTableListTest, GetFromHistoryTest) {
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 2;
+ int max_write_buffer_number_to_maintain = 2;
+ int64_t max_write_buffer_size_to_maintain = 2 * Arena::kInlineSize;
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ SequenceNumber seq = 1;
+ std::string value;
+ Status s;
+ MergeContext merge_context;
+ InternalKeyComparator ikey_cmp(options.comparator);
+ SequenceNumber max_covering_tombstone_seq = 0;
+ autovector<MemTable*> to_delete;
+
+ LookupKey lkey("key1", seq);
+ bool found = list.current()->Get(lkey, &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Create a MemTable
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableOptions ioptions(options);
+
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+
+ // Write some keys to this memtable.
+ ASSERT_OK(
+ mem->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key2", "value2.2",
+ nullptr /* kv_prot_info */));
+
+ // Fetch the newly written keys
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key1", seq), &value, /*columns*/ nullptr,
+ /*timestamp*/ nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions(),
+ false /* immutable_memtable */);
+ // MemTable found out that this key is *not* found (at this sequence#)
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = mem->Get(LookupKey("key2", seq), &value, /*columns*/ nullptr,
+ /*timestamp*/ nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions(),
+ false /* immutable_memtable */);
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ(value, "value2.2");
+
+ // Add memtable to list
+ // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+ // in MemTableListVersion::GetFromList work.
+ mem->ConstructFragmentedRangeTombstones();
+ list.Add(mem, &to_delete);
+ ASSERT_EQ(0, to_delete.size());
+
+ // Fetch keys via MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_TRUE(s.ok() && found);
+ ASSERT_EQ("value2.2", value);
+
+ // Flush this memtable from the list.
+ // (It will then be a part of the memtable history).
+ autovector<MemTable*> to_flush;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(1, to_flush.size());
+
+ MutableCFOptions mutable_cf_options(options);
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_EQ(1, list.NumFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Verify keys are no longer in MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Verify keys are present in history
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(found);
+ ASSERT_EQ("value2.2", value);
+
+ // Create another memtable and write some keys to it
+ WriteBufferManager wb2(options.db_write_buffer_size);
+ MemTable* mem2 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb2,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem2->Ref();
+
+ ASSERT_OK(
+ mem2->Add(++seq, kTypeDeletion, "key1", "", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem2->Add(++seq, kTypeValue, "key3", "value3",
+ nullptr /* kv_prot_info */));
+
+ // Add second memtable to list
+ // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+ // in MemTableListVersion::GetFromList work.
+ mem2->ConstructFragmentedRangeTombstones();
+ list.Add(mem2, &to_delete);
+ ASSERT_EQ(0, to_delete.size());
+
+ to_flush.clear();
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(1, to_flush.size());
+
+ // Flush second memtable
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+ ASSERT_EQ(0, list.NumNotFlushed());
+ ASSERT_EQ(2, list.NumFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Add a third memtable to push the first memtable out of the history
+ WriteBufferManager wb3(options.db_write_buffer_size);
+ MemTable* mem3 = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb3,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem3->Ref();
+ // This is to make assert(memtable->IsFragmentedRangeTombstonesConstructed())
+ // in MemTableListVersion::GetFromList work.
+ mem3->ConstructFragmentedRangeTombstones();
+ list.Add(mem3, &to_delete);
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_EQ(1, list.NumFlushed());
+ ASSERT_EQ(1, to_delete.size());
+
+ // Verify keys are no longer in MemTableList
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key3", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Verify that the second memtable's keys are in the history
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key1", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(found && s.IsNotFound());
+
+ merge_context.Clear();
+ found = list.current()->GetFromHistory(
+ LookupKey("key3", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context, &max_covering_tombstone_seq,
+ ReadOptions());
+ ASSERT_TRUE(found);
+ ASSERT_EQ("value3", value);
+
+ // Verify that key2 from the first memtable is no longer in the history
+ merge_context.Clear();
+ found =
+ list.current()->Get(LookupKey("key2", seq), &value, /*columns=*/nullptr,
+ /*timestamp=*/nullptr, &s, &merge_context,
+ &max_covering_tombstone_seq, ReadOptions());
+ ASSERT_FALSE(found);
+
+ // Cleanup
+ list.current()->Unref(&to_delete);
+ ASSERT_EQ(3, to_delete.size());
+ for (MemTable* m : to_delete) {
+ delete m;
+ }
+}
+
+TEST_F(MemTableListTest, FlushPendingTest) {
+ const int num_tables = 6;
+ SequenceNumber seq = 1;
+ Status s;
+
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableOptions ioptions(options);
+ InternalKeyComparator cmp(BytewiseComparator());
+ WriteBufferManager wb(options.db_write_buffer_size);
+ autovector<MemTable*> to_delete;
+
+ // Create MemTableList
+ int min_write_buffer_number_to_merge = 3;
+ int max_write_buffer_number_to_maintain = 7;
+ int64_t max_write_buffer_size_to_maintain =
+ 7 * static_cast<int>(options.write_buffer_size);
+ MemTableList list(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain);
+
+ // Create some MemTables
+ uint64_t memtable_id = 0;
+ std::vector<MemTable*> tables;
+ MutableCFOptions mutable_cf_options(options);
+ for (int i = 0; i < num_tables; i++) {
+ MemTable* mem = new MemTable(cmp, ioptions, mutable_cf_options, &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->SetID(memtable_id++);
+ mem->Ref();
+
+ std::string value;
+ MergeContext merge_context;
+
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", std::to_string(i),
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i), "valueN",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i), "valueM",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "",
+ nullptr /* kv_prot_info */));
+
+ tables.push_back(mem);
+ }
+
+ // Nothing to flush
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ autovector<MemTable*> to_flush;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(0, to_flush.size());
+
+ // Request a flush even though there is nothing to flush
+ list.FlushRequested();
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Attempt to 'flush' to clear request for flush
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(0, to_flush.size());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Request a flush again
+ list.FlushRequested();
+ // No flush pending since the list is empty.
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Add 2 tables
+ list.Add(tables[0], &to_delete);
+ list.Add(tables[1], &to_delete);
+ ASSERT_EQ(2, list.NumNotFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+  // Even though we have fewer than the minimum number of memtables to flush,
+  // a flush is still pending because we previously requested a flush and never
+  // called PickMemtablesToFlush() to clear the request.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(2, to_flush.size());
+ ASSERT_EQ(2, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Revert flush
+ list.RollbackMemtableFlush(to_flush, 0);
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ to_flush.clear();
+
+ // Add another table
+ list.Add(tables[2], &to_delete);
+  // We now have the minimum number of memtables to flush, regardless of
+  // whether FlushRequested() was called.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ ASSERT_EQ(3, to_flush.size());
+ ASSERT_EQ(3, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ autovector<MemTable*> to_flush2;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush2);
+ ASSERT_EQ(0, to_flush2.size());
+ ASSERT_EQ(3, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Add another table
+ list.Add(tables[3], &to_delete);
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Request a flush again
+ list.FlushRequested();
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush2);
+ ASSERT_EQ(1, to_flush2.size());
+ ASSERT_EQ(4, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Rollback first pick of tables
+ list.RollbackMemtableFlush(to_flush, 0);
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ to_flush.clear();
+
+  // Add another table
+ list.Add(tables[4], &to_delete);
+ ASSERT_EQ(5, list.NumNotFlushed());
+  // We now have the minimum number of memtables to flush, regardless of
+  // whether FlushRequested() was called.
+ ASSERT_TRUE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_EQ(0, to_delete.size());
+
+ // Pick tables to flush
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
+ // Picks three oldest memtables. The fourth oldest is picked in `to_flush2` so
+ // must be excluded. The newest (fifth oldest) is non-consecutive with the
+ // three oldest due to omitting the fourth oldest so must not be picked.
+ ASSERT_EQ(3, to_flush.size());
+ ASSERT_EQ(5, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Pick tables to flush again
+ autovector<MemTable*> to_flush3;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush3);
+ // Picks newest (fifth oldest)
+ ASSERT_EQ(1, to_flush3.size());
+ ASSERT_EQ(5, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Nothing left to flush
+ autovector<MemTable*> to_flush4;
+ list.PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush4);
+ ASSERT_EQ(0, to_flush4.size());
+ ASSERT_EQ(5, list.NumNotFlushed());
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Flush the 3 memtables that were picked in to_flush
+ s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
+ &to_delete);
+ ASSERT_OK(s);
+
+ // Note: now to_flush contains tables[0,1,2]. to_flush2 contains
+ // tables[3]. to_flush3 contains tables[4].
+ // Current implementation will only commit memtables in the order they were
+ // created. So TryInstallMemtableFlushResults will install the first 3 tables
+ // in to_flush and stop when it encounters a table not yet flushed.
+ ASSERT_EQ(2, list.NumNotFlushed());
+ int num_in_history =
+ std::min(3, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(num_in_history, list.NumFlushed());
+ ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+ // Request a flush again. Should be nothing to flush
+ list.FlushRequested();
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+ // Flush the 1 memtable (tables[4]) that was picked in to_flush3
+ s = MemTableListTest::Mock_InstallMemtableFlushResults(
+ &list, mutable_cf_options, to_flush3, &to_delete);
+ ASSERT_OK(s);
+
+  // This will install 0 tables since tables[4] was flushed while tables[3] has
+  // not yet been flushed.
+ ASSERT_EQ(2, list.NumNotFlushed());
+ ASSERT_EQ(0, to_delete.size());
+
+ // Flush the 1 memtable (tables[3]) that was picked in to_flush2
+ s = MemTableListTest::Mock_InstallMemtableFlushResults(
+ &list, mutable_cf_options, to_flush2, &to_delete);
+ ASSERT_OK(s);
+
+ // This will actually install 2 tables. The 1 we told it to flush, and also
+ // tables[4] which has been waiting for tables[3] to commit.
+ ASSERT_EQ(0, list.NumNotFlushed());
+ num_in_history =
+ std::min(5, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(num_in_history, list.NumFlushed());
+ ASSERT_EQ(5 - list.NumNotFlushed() - num_in_history, to_delete.size());
+
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+    // Verify this by Ref'ing and then Unref'ing:
+ m->Ref();
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+ to_delete.clear();
+
+ // Add another table
+ list.Add(tables[5], &to_delete);
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_EQ(5, list.GetLatestMemTableID());
+ memtable_id = 4;
+ // Pick tables to flush. The tables to pick must have ID smaller than or
+ // equal to 4. Therefore, no table will be selected in this case.
+ autovector<MemTable*> to_flush5;
+ list.FlushRequested();
+ ASSERT_TRUE(list.HasFlushRequested());
+ list.PickMemtablesToFlush(memtable_id, &to_flush5);
+ ASSERT_TRUE(to_flush5.empty());
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+ ASSERT_FALSE(list.HasFlushRequested());
+
+ // Pick tables to flush. The tables to pick must have ID smaller than or
+ // equal to 5. Therefore, only tables[5] will be selected.
+ memtable_id = 5;
+ list.FlushRequested();
+ list.PickMemtablesToFlush(memtable_id, &to_flush5);
+ ASSERT_EQ(1, static_cast<int>(to_flush5.size()));
+ ASSERT_EQ(1, list.NumNotFlushed());
+ ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+ ASSERT_FALSE(list.IsFlushPending());
+ to_delete.clear();
+
+ list.current()->Unref(&to_delete);
+ int to_delete_size =
+ std::min(num_tables, static_cast<int>(max_write_buffer_size_to_maintain) /
+ static_cast<int>(options.write_buffer_size));
+ ASSERT_EQ(to_delete_size, to_delete.size());
+
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling TryInstallMemtableFlushResults.
+ // Verify this, by Ref'ing then UnRef'ing:
+    // Verify this by Ref'ing and then Unref'ing:
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+ to_delete.clear();
+}
+
+TEST_F(MemTableListTest, EmptyAtomicFlushTest) {
+ autovector<MemTableList*> lists;
+ autovector<uint32_t> cf_ids;
+ autovector<const MutableCFOptions*> options_list;
+ autovector<const autovector<MemTable*>*> to_flush;
+ autovector<MemTable*> to_delete;
+ Status s = Mock_InstallMemtableAtomicFlushResults(lists, cf_ids, options_list,
+ to_flush, &to_delete);
+ ASSERT_OK(s);
+ ASSERT_TRUE(to_delete.empty());
+}
+
+TEST_F(MemTableListTest, AtomicFlushTest) {
+ const int num_cfs = 3;
+ const int num_tables_per_cf = 2;
+ SequenceNumber seq = 1;
+
+ auto factory = std::make_shared<SkipListFactory>();
+ options.memtable_factory = factory;
+ ImmutableOptions ioptions(options);
+ InternalKeyComparator cmp(BytewiseComparator());
+ WriteBufferManager wb(options.db_write_buffer_size);
+
+ // Create MemTableLists
+ int min_write_buffer_number_to_merge = 3;
+ int max_write_buffer_number_to_maintain = 7;
+ int64_t max_write_buffer_size_to_maintain =
+ 7 * static_cast<int64_t>(options.write_buffer_size);
+ autovector<MemTableList*> lists;
+ for (int i = 0; i != num_cfs; ++i) {
+ lists.emplace_back(new MemTableList(min_write_buffer_number_to_merge,
+ max_write_buffer_number_to_maintain,
+ max_write_buffer_size_to_maintain));
+ }
+
+ autovector<uint32_t> cf_ids;
+ std::vector<std::vector<MemTable*>> tables(num_cfs);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ uint32_t cf_id = 0;
+ for (auto& elem : tables) {
+ mutable_cf_options_list.emplace_back(new MutableCFOptions(options));
+ uint64_t memtable_id = 0;
+ for (int i = 0; i != num_tables_per_cf; ++i) {
+ MemTable* mem =
+ new MemTable(cmp, ioptions, *(mutable_cf_options_list.back()), &wb,
+ kMaxSequenceNumber, cf_id);
+ mem->SetID(memtable_id++);
+ mem->Ref();
+
+ std::string value;
+
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "key1", std::to_string(i),
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyN" + std::to_string(i),
+ "valueN", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyX" + std::to_string(i), "value",
+ nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeValue, "keyM" + std::to_string(i),
+ "valueM", nullptr /* kv_prot_info */));
+ ASSERT_OK(mem->Add(++seq, kTypeDeletion, "keyX" + std::to_string(i), "",
+ nullptr /* kv_prot_info */));
+
+ elem.push_back(mem);
+ }
+ cf_ids.push_back(cf_id++);
+ }
+
+ std::vector<autovector<MemTable*>> flush_candidates(num_cfs);
+
+ // Nothing to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ auto* list = lists[i];
+ ASSERT_FALSE(list->IsFlushPending());
+ ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+ list->PickMemtablesToFlush(
+ std::numeric_limits<uint64_t>::max() /* memtable_id */,
+ &flush_candidates[i]);
+ ASSERT_EQ(0, flush_candidates[i].size());
+ }
+ // Request flush even though there is nothing to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ auto* list = lists[i];
+ list->FlushRequested();
+ ASSERT_FALSE(list->IsFlushPending());
+ ASSERT_FALSE(list->imm_flush_needed.load(std::memory_order_acquire));
+ }
+ autovector<MemTable*> to_delete;
+  // Add tables to the immutable memtable lists associated with column families
+ for (auto i = 0; i != num_cfs; ++i) {
+ for (auto j = 0; j != num_tables_per_cf; ++j) {
+ lists[i]->Add(tables[i][j], &to_delete);
+ }
+ ASSERT_EQ(num_tables_per_cf, lists[i]->NumNotFlushed());
+ ASSERT_TRUE(lists[i]->IsFlushPending());
+ ASSERT_TRUE(lists[i]->imm_flush_needed.load(std::memory_order_acquire));
+ }
+ std::vector<uint64_t> flush_memtable_ids = {1, 1, 0};
+ // +----+
+ // list[0]: |0 1|
+ // list[1]: |0 1|
+ // | +--+
+ // list[2]: |0| 1
+ // +-+
+ // Pick memtables to flush
+ for (auto i = 0; i != num_cfs; ++i) {
+ flush_candidates[i].clear();
+ lists[i]->PickMemtablesToFlush(flush_memtable_ids[i], &flush_candidates[i]);
+ ASSERT_EQ(flush_memtable_ids[i] - 0 + 1,
+ static_cast<uint64_t>(flush_candidates[i].size()));
+ }
+ autovector<MemTableList*> tmp_lists;
+ autovector<uint32_t> tmp_cf_ids;
+ autovector<const MutableCFOptions*> tmp_options_list;
+ autovector<const autovector<MemTable*>*> to_flush;
+ for (auto i = 0; i != num_cfs; ++i) {
+ if (!flush_candidates[i].empty()) {
+ to_flush.push_back(&flush_candidates[i]);
+ tmp_lists.push_back(lists[i]);
+ tmp_cf_ids.push_back(i);
+ tmp_options_list.push_back(mutable_cf_options_list[i]);
+ }
+ }
+ Status s = Mock_InstallMemtableAtomicFlushResults(
+ tmp_lists, tmp_cf_ids, tmp_options_list, to_flush, &to_delete);
+ ASSERT_OK(s);
+
+ for (auto i = 0; i != num_cfs; ++i) {
+ for (auto j = 0; j != num_tables_per_cf; ++j) {
+ if (static_cast<uint64_t>(j) <= flush_memtable_ids[i]) {
+ ASSERT_LT(0, tables[i][j]->GetFileNumber());
+ }
+ }
+ ASSERT_EQ(
+ static_cast<size_t>(num_tables_per_cf) - flush_candidates[i].size(),
+ lists[i]->NumNotFlushed());
+ }
+
+ to_delete.clear();
+ for (auto list : lists) {
+ list->current()->Unref(&to_delete);
+ delete list;
+ }
+ for (auto& mutable_cf_options : mutable_cf_options_list) {
+ if (mutable_cf_options != nullptr) {
+ delete mutable_cf_options;
+ mutable_cf_options = nullptr;
+ }
+ }
+ // All memtables in tables array must have been flushed, thus ready to be
+ // deleted.
+ ASSERT_EQ(to_delete.size(), tables.size() * tables.front().size());
+ for (const auto& m : to_delete) {
+ // Refcount should be 0 after calling InstallMemtableFlushResults.
+ // Verify this by Ref'ing and then Unref'ing.
+ m->Ref();
+ ASSERT_EQ(m, m->Unref());
+ delete m;
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
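FlushPendingTest above leans on the commit-in-creation-order rule spelled out in its comments: TryInstallMemtableFlushResults only installs the oldest consecutive run of flush-completed memtables and stops at the first memtable whose flush has not finished, which is why installing to_flush3 (tables[4]) before to_flush2 (tables[3]) commits nothing. A minimal standalone sketch of that prefix rule, with FakeMemTable as an illustrative stand-in rather than the RocksDB type:

#include <cstddef>
#include <iostream>
#include <vector>

struct FakeMemTable {
  int id;
  bool flush_completed;
};

// Returns how many memtables can be committed, scanning oldest first and
// stopping at the first memtable whose flush has not completed yet.
std::size_t CommittablePrefix(const std::vector<FakeMemTable>& oldest_first) {
  std::size_t n = 0;
  for (const FakeMemTable& m : oldest_first) {
    if (!m.flush_completed) {
      break;
    }
    ++n;
  }
  return n;
}

int main() {
  // tables[0..2] flushed, tables[3] still pending, tables[4] already flushed.
  std::vector<FakeMemTable> imm = {
      {0, true}, {1, true}, {2, true}, {3, false}, {4, true}};
  std::cout << CommittablePrefix(imm) << "\n";  // prints: 3
  imm[3].flush_completed = true;  // tables[3] finishes flushing later
  std::cout << CommittablePrefix(imm) << "\n";  // prints: 5
  return 0;
}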
diff --git a/src/rocksdb/db/merge_context.h b/src/rocksdb/db/merge_context.h
new file mode 100644
index 000000000..8a7b07290
--- /dev/null
+++ b/src/rocksdb/db/merge_context.h
@@ -0,0 +1,147 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "rocksdb/slice.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const std::vector<Slice> empty_operand_list;
+
+// The merge context for merging a user key.
+// When doing a Get(), the DB creates an instance of this class and passes it
+// to the memtables and version_set as part of the Get() operation. The
+// operands are then fetched from the context when issuing a partial or full
+// merge.
+class MergeContext {
+ public:
+ // Clear all the operands
+ void Clear() {
+ if (operand_list_) {
+ operand_list_->clear();
+ copied_operands_->clear();
+ }
+ }
+
+ // Push a merge operand
+ void PushOperand(const Slice& operand_slice, bool operand_pinned = false) {
+ Initialize();
+ SetDirectionBackward();
+
+ if (operand_pinned) {
+ operand_list_->push_back(operand_slice);
+ } else {
+ // We need to have our own copy of the operand since it's not pinned
+ copied_operands_->emplace_back(
+ new std::string(operand_slice.data(), operand_slice.size()));
+ operand_list_->push_back(*copied_operands_->back());
+ }
+ }
+
+ // Push back a merge operand
+ void PushOperandBack(const Slice& operand_slice,
+ bool operand_pinned = false) {
+ Initialize();
+ SetDirectionForward();
+
+ if (operand_pinned) {
+ operand_list_->push_back(operand_slice);
+ } else {
+ // We need to have our own copy of the operand since it's not pinned
+ copied_operands_->emplace_back(
+ new std::string(operand_slice.data(), operand_slice.size()));
+ operand_list_->push_back(*copied_operands_->back());
+ }
+ }
+
+ // return total number of operands in the list
+ size_t GetNumOperands() const {
+ if (!operand_list_) {
+ return 0;
+ }
+ return operand_list_->size();
+ }
+
+ // Get the operand at the index.
+ Slice GetOperand(int index) const {
+ assert(operand_list_);
+
+ SetDirectionForward();
+ return (*operand_list_)[index];
+ }
+
+ // Same as GetOperandsDirectionForward
+ //
+ // Note that the returned reference is only good until another call
+ // to this MergeContext. If the returned value is needed for longer,
+ // a copy must be made.
+ const std::vector<Slice>& GetOperands() const {
+ return GetOperandsDirectionForward();
+ }
+
+ // Return all the operands in the order as they were merged (passed to
+ // FullMerge or FullMergeV2)
+ //
+ // Note that the returned reference is only good until another call
+ // to this MergeContext. If the returned value is needed for longer,
+ // a copy must be made.
+ const std::vector<Slice>& GetOperandsDirectionForward() const {
+ if (!operand_list_) {
+ return empty_operand_list;
+ }
+
+ SetDirectionForward();
+ return *operand_list_;
+ }
+
+ // Return all the operands in the reversed order relative to how they were
+ // merged (passed to FullMerge or FullMergeV2)
+ //
+ // Note that the returned reference is only good until another call
+ // to this MergeContext. If the returned value is needed for longer,
+ // a copy must be made.
+ const std::vector<Slice>& GetOperandsDirectionBackward() const {
+ if (!operand_list_) {
+ return empty_operand_list;
+ }
+
+ SetDirectionBackward();
+ return *operand_list_;
+ }
+
+ private:
+ void Initialize() {
+ if (!operand_list_) {
+ operand_list_.reset(new std::vector<Slice>());
+ copied_operands_.reset(new std::vector<std::unique_ptr<std::string>>());
+ }
+ }
+
+ void SetDirectionForward() const {
+ if (operands_reversed_ == true) {
+ std::reverse(operand_list_->begin(), operand_list_->end());
+ operands_reversed_ = false;
+ }
+ }
+
+ void SetDirectionBackward() const {
+ if (operands_reversed_ == false) {
+ std::reverse(operand_list_->begin(), operand_list_->end());
+ operands_reversed_ = true;
+ }
+ }
+
+ // List of operands
+ mutable std::unique_ptr<std::vector<Slice>> operand_list_;
+ // Copy of operands that are not pinned.
+ std::unique_ptr<std::vector<std::unique_ptr<std::string>>> copied_operands_;
+ mutable bool operands_reversed_ = true;
+};
+
+} // namespace ROCKSDB_NAMESPACE
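MergeContext above keeps a single operand vector and flips it lazily: operands are appended in whichever direction they arrive, the mutable operands_reversed_ flag records the current direction, and std::reverse runs only when the caller asks for the opposite order. A minimal standalone sketch of the same lazy-flip idea, assuming operands are pushed newest-first as a Get() traversal sees them; TinyMergeContext uses std::string instead of Slice and is not the RocksDB class:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

class TinyMergeContext {
 public:
  // Callers push operands newest-first (the order a Get() traversal sees
  // them), so the stored order is "backward" relative to merge order.
  void PushOperand(std::string operand) {
    SetBackward();
    operands_.push_back(std::move(operand));
  }

  // Operands in merge order (oldest first); reverses in place only if needed.
  const std::vector<std::string>& GetOperandsForward() {
    SetForward();
    return operands_;
  }

 private:
  void SetForward() {
    if (reversed_) {
      std::reverse(operands_.begin(), operands_.end());
      reversed_ = false;
    }
  }
  void SetBackward() {
    if (!reversed_) {
      std::reverse(operands_.begin(), operands_.end());
      reversed_ = true;
    }
  }

  std::vector<std::string> operands_;
  bool reversed_ = true;
};

int main() {
  TinyMergeContext ctx;
  ctx.PushOperand("v3");  // newest operand, seen first by the traversal
  ctx.PushOperand("v2");
  ctx.PushOperand("v1");  // oldest operand, seen last
  for (const std::string& op : ctx.GetOperandsForward()) {
    std::cout << op << " ";  // prints: v1 v2 v3
  }
  std::cout << "\n";
  return 0;
}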
diff --git a/src/rocksdb/db/merge_helper.cc b/src/rocksdb/db/merge_helper.cc
new file mode 100644
index 000000000..6df841012
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.cc
@@ -0,0 +1,583 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/merge_helper.h"
+
+#include <string>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/prefetch_buffer_collection.h"
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/wide/wide_column_serialization.h"
+#include "logging/logging.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/likely.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/system_clock.h"
+#include "table/format.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+MergeHelper::MergeHelper(Env* env, const Comparator* user_comparator,
+ const MergeOperator* user_merge_operator,
+ const CompactionFilter* compaction_filter,
+ Logger* logger, bool assert_valid_internal_key,
+ SequenceNumber latest_snapshot,
+ const SnapshotChecker* snapshot_checker, int level,
+ Statistics* stats,
+ const std::atomic<bool>* shutting_down)
+ : env_(env),
+ clock_(env->GetSystemClock().get()),
+ user_comparator_(user_comparator),
+ user_merge_operator_(user_merge_operator),
+ compaction_filter_(compaction_filter),
+ shutting_down_(shutting_down),
+ logger_(logger),
+ assert_valid_internal_key_(assert_valid_internal_key),
+ allow_single_operand_(false),
+ latest_snapshot_(latest_snapshot),
+ snapshot_checker_(snapshot_checker),
+ level_(level),
+ keys_(),
+ filter_timer_(clock_),
+ total_filter_time_(0U),
+ stats_(stats) {
+ assert(user_comparator_ != nullptr);
+ if (user_merge_operator_) {
+ allow_single_operand_ = user_merge_operator_->AllowSingleOperand();
+ }
+}
+
+Status MergeHelper::TimedFullMerge(const MergeOperator* merge_operator,
+ const Slice& key, const Slice* value,
+ const std::vector<Slice>& operands,
+ std::string* result, Logger* logger,
+ Statistics* statistics, SystemClock* clock,
+ Slice* result_operand,
+ bool update_num_ops_stats) {
+ assert(merge_operator != nullptr);
+
+ if (operands.empty()) {
+ assert(value != nullptr && result != nullptr);
+ result->assign(value->data(), value->size());
+ return Status::OK();
+ }
+
+ if (update_num_ops_stats) {
+ RecordInHistogram(statistics, READ_NUM_MERGE_OPERANDS,
+ static_cast<uint64_t>(operands.size()));
+ }
+
+ bool success = false;
+ Slice tmp_result_operand(nullptr, 0);
+ const MergeOperator::MergeOperationInput merge_in(key, value, operands,
+ logger);
+ MergeOperator::MergeOperationOutput merge_out(*result, tmp_result_operand);
+ {
+ // Setup to time the merge
+ StopWatchNano timer(clock, statistics != nullptr);
+ PERF_TIMER_GUARD(merge_operator_time_nanos);
+
+ // Do the merge
+ success = merge_operator->FullMergeV2(merge_in, &merge_out);
+
+ if (tmp_result_operand.data()) {
+ // FullMergeV2 result is an existing operand
+ if (result_operand != nullptr) {
+ *result_operand = tmp_result_operand;
+ } else {
+ result->assign(tmp_result_operand.data(), tmp_result_operand.size());
+ }
+ } else if (result_operand) {
+ *result_operand = Slice(nullptr, 0);
+ }
+
+ RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME,
+ statistics ? timer.ElapsedNanos() : 0);
+ }
+
+ if (!success) {
+ RecordTick(statistics, NUMBER_MERGE_FAILURES);
+ return Status::Corruption("Error: Could not perform merge.");
+ }
+
+ return Status::OK();
+}
+
+Status MergeHelper::TimedFullMergeWithEntity(
+ const MergeOperator* merge_operator, const Slice& key, Slice base_entity,
+ const std::vector<Slice>& operands, std::string* result, Logger* logger,
+ Statistics* statistics, SystemClock* clock, bool update_num_ops_stats) {
+ WideColumns base_columns;
+
+ {
+ const Status s =
+ WideColumnSerialization::Deserialize(base_entity, base_columns);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ const bool has_default_column =
+ !base_columns.empty() && base_columns[0].name() == kDefaultWideColumnName;
+
+ Slice value_of_default;
+ if (has_default_column) {
+ value_of_default = base_columns[0].value();
+ }
+
+ std::string merge_result;
+
+ {
+ constexpr Slice* result_operand = nullptr;
+
+ const Status s = TimedFullMerge(
+ merge_operator, key, &value_of_default, operands, &merge_result, logger,
+ statistics, clock, result_operand, update_num_ops_stats);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ if (has_default_column) {
+ base_columns[0].value() = merge_result;
+
+ const Status s = WideColumnSerialization::Serialize(base_columns, *result);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ const Status s =
+ WideColumnSerialization::Serialize(merge_result, base_columns, *result);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+// PRE: iter points to the first merge type entry
+// POST: iter points to the first entry beyond the merge process (or the end)
+// keys_, operands_ are updated to reflect the merge result.
+// keys_ stores the list of keys encountered while merging.
+// operands_ stores the list of merge operands encountered while merging.
+// keys_[i] corresponds to operands_[i] for each i.
+//
+// TODO: Avoid the snapshot stripe map lookup in CompactionRangeDelAggregator
+// and just pass the StripeRep corresponding to the stripe being merged.
+Status MergeHelper::MergeUntil(InternalIterator* iter,
+ CompactionRangeDelAggregator* range_del_agg,
+ const SequenceNumber stop_before,
+ const bool at_bottom,
+ const bool allow_data_in_errors,
+ const BlobFetcher* blob_fetcher,
+ const std::string* const full_history_ts_low,
+ PrefetchBufferCollection* prefetch_buffers,
+ CompactionIterationStats* c_iter_stats) {
+ // Get a copy of the internal key, before it's invalidated by iter->Next()
+ // Also maintain the list of merge operands seen.
+ assert(HasOperator());
+ keys_.clear();
+ merge_context_.Clear();
+ has_compaction_filter_skip_until_ = false;
+ assert(user_merge_operator_);
+ assert(user_comparator_);
+ const size_t ts_sz = user_comparator_->timestamp_size();
+ if (full_history_ts_low) {
+ assert(ts_sz > 0);
+ assert(ts_sz == full_history_ts_low->size());
+ }
+ bool first_key = true;
+
+ // We need to parse the internal key again as the parsed key is
+ // backed by the internal key!
+ // Assume no internal key corruption as it has been successfully parsed
+ // by the caller.
+ // original_key_is_iter variable is just caching the information:
+ // original_key_is_iter == (iter->key().ToString() == original_key)
+ bool original_key_is_iter = true;
+ std::string original_key = iter->key().ToString();
+ // Important:
+ // orig_ikey is backed by original_key if keys_.empty()
+ // orig_ikey is backed by keys_.back() if !keys_.empty()
+ ParsedInternalKey orig_ikey;
+
+ Status s = ParseInternalKey(original_key, &orig_ikey, allow_data_in_errors);
+ assert(s.ok());
+ if (!s.ok()) return s;
+
+ assert(kTypeMerge == orig_ikey.type);
+
+ bool hit_the_next_user_key = false;
+ int cmp_with_full_history_ts_low = 0;
+ for (; iter->Valid(); iter->Next(), original_key_is_iter = false) {
+ if (IsShuttingDown()) {
+ s = Status::ShutdownInProgress();
+ return s;
+ }
+
+ ParsedInternalKey ikey;
+ assert(keys_.size() == merge_context_.GetNumOperands());
+
+ Status pik_status =
+ ParseInternalKey(iter->key(), &ikey, allow_data_in_errors);
+ Slice ts;
+ if (pik_status.ok()) {
+ ts = ExtractTimestampFromUserKey(ikey.user_key, ts_sz);
+ if (full_history_ts_low) {
+ cmp_with_full_history_ts_low =
+ user_comparator_->CompareTimestamp(ts, *full_history_ts_low);
+ }
+ }
+ if (!pik_status.ok()) {
+ // stop at corrupted key
+ if (assert_valid_internal_key_) {
+ return pik_status;
+ }
+ break;
+ } else if (first_key) {
+ // If user-defined timestamp is enabled, we expect both user key and
+ // timestamps are equal, as a sanity check.
+ assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key));
+ first_key = false;
+ } else if (!user_comparator_->EqualWithoutTimestamp(ikey.user_key,
+ orig_ikey.user_key) ||
+ (ts_sz > 0 &&
+ !user_comparator_->Equal(ikey.user_key, orig_ikey.user_key) &&
+ cmp_with_full_history_ts_low >= 0)) {
+      // Stop right here if we
+      // 1) hit a different user key, or
+      // 2) have user-defined timestamps enabled and hit a version of the user
+      //    key that is NOT eligible for GC.
+ hit_the_next_user_key = true;
+ break;
+ } else if (stop_before > 0 && ikey.sequence <= stop_before &&
+ LIKELY(snapshot_checker_ == nullptr ||
+ snapshot_checker_->CheckInSnapshot(ikey.sequence,
+ stop_before) !=
+ SnapshotCheckerResult::kNotInSnapshot)) {
+ // hit an entry that's possibly visible by the previous snapshot, can't
+ // touch that
+ break;
+ }
+
+ // At this point we are guaranteed that we need to process this key.
+
+ assert(IsValueType(ikey.type));
+ if (ikey.type != kTypeMerge) {
+ // hit a put/delete/single delete
+ // => merge the put value or a nullptr with operands_
+ // => store result in operands_.back() (and update keys_.back())
+ // => change the entry type to kTypeValue for keys_.back()
+ // We are done! Success!
+
+      // If there are no operands, just return Status::OK(). That will cause
+      // the compaction iterator to write out the key we're currently at, which
+      // is the put/delete we just encountered.
+ if (keys_.empty()) {
+ return s;
+ }
+
+ // TODO(noetzli) If the merge operator returns false, we are currently
+ // (almost) silently dropping the put/delete. That's probably not what we
+ // want. Also if we're in compaction and it's a put, it would be nice to
+ // run compaction filter on it.
+ std::string merge_result;
+
+ if (range_del_agg &&
+ range_del_agg->ShouldDelete(
+ ikey, RangeDelPositioningMode::kForwardTraversal)) {
+ s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, clock_,
+ /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+ } else if (ikey.type == kTypeValue) {
+ const Slice val = iter->value();
+
+ s = TimedFullMerge(user_merge_operator_, ikey.user_key, &val,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, clock_,
+ /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+ } else if (ikey.type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+
+ s = blob_index.DecodeFrom(iter->value());
+ if (!s.ok()) {
+ return s;
+ }
+
+ FilePrefetchBuffer* prefetch_buffer =
+ prefetch_buffers ? prefetch_buffers->GetOrCreatePrefetchBuffer(
+ blob_index.file_number())
+ : nullptr;
+
+ uint64_t bytes_read = 0;
+
+ assert(blob_fetcher);
+
+ PinnableSlice blob_value;
+ s = blob_fetcher->FetchBlob(ikey.user_key, blob_index, prefetch_buffer,
+ &blob_value, &bytes_read);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (c_iter_stats) {
+ ++c_iter_stats->num_blobs_read;
+ c_iter_stats->total_blob_bytes_read += bytes_read;
+ }
+
+ s = TimedFullMerge(user_merge_operator_, ikey.user_key, &blob_value,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, clock_,
+ /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+ } else if (ikey.type == kTypeWideColumnEntity) {
+ s = TimedFullMergeWithEntity(
+ user_merge_operator_, ikey.user_key, iter->value(),
+ merge_context_.GetOperands(), &merge_result, logger_, stats_,
+ clock_, /* update_num_ops_stats */ false);
+ } else {
+ s = TimedFullMerge(user_merge_operator_, ikey.user_key, nullptr,
+ merge_context_.GetOperands(), &merge_result, logger_,
+ stats_, clock_,
+ /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+ }
+
+ // We store the result in keys_.back() and operands_.back()
+ // if nothing went wrong (i.e.: no operand corruption on disk)
+ if (s.ok()) {
+ // The original key encountered
+ original_key = std::move(keys_.back());
+ orig_ikey.type = ikey.type == kTypeWideColumnEntity
+ ? kTypeWideColumnEntity
+ : kTypeValue;
+ UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+ keys_.clear();
+ merge_context_.Clear();
+ keys_.emplace_front(std::move(original_key));
+ merge_context_.PushOperand(merge_result);
+ }
+
+ // move iter to the next entry
+ iter->Next();
+ return s;
+ } else {
+ // hit a merge
+ // => if there is a compaction filter, apply it.
+ // => check for range tombstones covering the operand
+ // => merge the operand into the front of the operands_ list
+ // if not filtered
+ // => then continue because we haven't yet seen a Put/Delete.
+ //
+      // Keep queuing keys and operands until we either meet a put/delete
+      // request or reach the point where we attempt a partial merge.
+
+ Slice value_slice = iter->value();
+ // add an operand to the list if:
+ // 1) it's included in one of the snapshots. in that case we *must* write
+ // it out, no matter what compaction filter says
+ // 2) it's not filtered by a compaction filter
+ CompactionFilter::Decision filter =
+ ikey.sequence <= latest_snapshot_
+ ? CompactionFilter::Decision::kKeep
+ : FilterMerge(orig_ikey.user_key, value_slice);
+ if (filter != CompactionFilter::Decision::kRemoveAndSkipUntil &&
+ range_del_agg != nullptr &&
+ range_del_agg->ShouldDelete(
+ iter->key(), RangeDelPositioningMode::kForwardTraversal)) {
+ filter = CompactionFilter::Decision::kRemove;
+ }
+ if (filter == CompactionFilter::Decision::kKeep ||
+ filter == CompactionFilter::Decision::kChangeValue) {
+ if (original_key_is_iter) {
+ // this is just an optimization that saves us one memcpy
+ keys_.emplace_front(original_key);
+ } else {
+ keys_.emplace_front(iter->key().ToString());
+ }
+ if (keys_.size() == 1) {
+ // we need to re-anchor the orig_ikey because it was anchored by
+ // original_key before
+ pik_status =
+ ParseInternalKey(keys_.back(), &orig_ikey, allow_data_in_errors);
+ pik_status.PermitUncheckedError();
+ assert(pik_status.ok());
+ }
+ if (filter == CompactionFilter::Decision::kKeep) {
+ merge_context_.PushOperand(
+ value_slice, iter->IsValuePinned() /* operand_pinned */);
+ } else {
+ assert(filter == CompactionFilter::Decision::kChangeValue);
+ // Compaction filter asked us to change the operand from value_slice
+ // to compaction_filter_value_.
+ merge_context_.PushOperand(compaction_filter_value_, false);
+ }
+ } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ // Compaction filter asked us to remove this key altogether
+ // (not just this operand), along with some keys following it.
+ keys_.clear();
+ merge_context_.Clear();
+ has_compaction_filter_skip_until_ = true;
+ return s;
+ }
+ }
+ }
+
+ if (cmp_with_full_history_ts_low >= 0) {
+ size_t num_merge_operands = merge_context_.GetNumOperands();
+ if (ts_sz && num_merge_operands > 1) {
+ // We do not merge merge operands with different timestamps if they are
+ // not eligible for GC.
+      ROCKS_LOG_ERROR(logger_, "ts_sz=%d, %d merge operands",
+ static_cast<int>(ts_sz),
+ static_cast<int>(num_merge_operands));
+ assert(false);
+ }
+ }
+
+ if (merge_context_.GetNumOperands() == 0) {
+ // we filtered out all the merge operands
+ return s;
+ }
+
+ // We are sure we have seen this key's entire history if:
+  // at_bottom == true (this does not necessarily mean it is the bottommost
+  // layer, but rather that we are confident the key does not appear on any of
+  // the lower layers; at_bottom == false doesn't mean it does appear, just
+  // that we can't be sure; see Compaction::IsBottommostLevel for details)
+ // AND
+ // we have either encountered another key or end of key history on this
+ // layer.
+ // Note that if user-defined timestamp is enabled, we need some extra caution
+ // here: if full_history_ts_low is nullptr, or it's not null but the key's
+ // timestamp is greater than or equal to full_history_ts_low, it means this
+ // key cannot be dropped. We may not have seen the beginning of the key.
+ //
+ // When these conditions are true we are able to merge all the keys
+ // using full merge.
+ //
+ // For these cases we are not sure about, we simply miss the opportunity
+ // to combine the keys. Since VersionSet::SetupOtherInputs() always makes
+ // sure that all merge-operands on the same level get compacted together,
+ // this will simply lead to these merge operands moving to the next level.
+ bool surely_seen_the_beginning =
+ (hit_the_next_user_key || !iter->Valid()) && at_bottom &&
+ (ts_sz == 0 || cmp_with_full_history_ts_low < 0);
+ if (surely_seen_the_beginning) {
+ // do a final merge with nullptr as the existing value and say
+ // bye to the merge type (it's now converted to a Put)
+ assert(kTypeMerge == orig_ikey.type);
+ assert(merge_context_.GetNumOperands() >= 1);
+ assert(merge_context_.GetNumOperands() == keys_.size());
+ std::string merge_result;
+ s = TimedFullMerge(
+ user_merge_operator_, orig_ikey.user_key, nullptr,
+ merge_context_.GetOperands(), &merge_result, logger_, stats_, clock_,
+ /* result_operand */ nullptr, /* update_num_ops_stats */ false);
+ if (s.ok()) {
+ // The original key encountered
+    // We are certain that keys_ is not empty here (see the assertions a
+    // couple of lines above).
+ original_key = std::move(keys_.back());
+ orig_ikey.type = kTypeValue;
+ UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
+ keys_.clear();
+ merge_context_.Clear();
+ keys_.emplace_front(std::move(original_key));
+ merge_context_.PushOperand(merge_result);
+ }
+ } else {
+ // We haven't seen the beginning of the key nor a Put/Delete.
+ // Attempt to use the user's associative merge function to
+ // merge the stacked merge operands into a single operand.
+ s = Status::MergeInProgress();
+ if (merge_context_.GetNumOperands() >= 2 ||
+ (allow_single_operand_ && merge_context_.GetNumOperands() == 1)) {
+ bool merge_success = false;
+ std::string merge_result;
+ {
+ StopWatchNano timer(clock_, stats_ != nullptr);
+ PERF_TIMER_GUARD(merge_operator_time_nanos);
+ merge_success = user_merge_operator_->PartialMergeMulti(
+ orig_ikey.user_key,
+ std::deque<Slice>(merge_context_.GetOperands().begin(),
+ merge_context_.GetOperands().end()),
+ &merge_result, logger_);
+ RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME,
+ stats_ ? timer.ElapsedNanosSafe() : 0);
+ }
+ if (merge_success) {
+ // Merging of operands (associative merge) was successful.
+ // Replace operands with the merge result
+ merge_context_.Clear();
+ merge_context_.PushOperand(merge_result);
+ keys_.erase(keys_.begin(), keys_.end() - 1);
+ }
+ }
+ }
+
+ return s;
+}
+
+MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper)
+ : merge_helper_(merge_helper) {
+ it_keys_ = merge_helper_->keys().rend();
+ it_values_ = merge_helper_->values().rend();
+}
+
+void MergeOutputIterator::SeekToFirst() {
+ const auto& keys = merge_helper_->keys();
+ const auto& values = merge_helper_->values();
+ assert(keys.size() == values.size());
+ it_keys_ = keys.rbegin();
+ it_values_ = values.rbegin();
+}
+
+void MergeOutputIterator::Next() {
+ ++it_keys_;
+ ++it_values_;
+}
+
+CompactionFilter::Decision MergeHelper::FilterMerge(const Slice& user_key,
+ const Slice& value_slice) {
+ if (compaction_filter_ == nullptr) {
+ return CompactionFilter::Decision::kKeep;
+ }
+ if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
+ filter_timer_.Start();
+ }
+ compaction_filter_value_.clear();
+ compaction_filter_skip_until_.Clear();
+ auto ret = compaction_filter_->FilterV2(
+ level_, user_key, CompactionFilter::ValueType::kMergeOperand, value_slice,
+ &compaction_filter_value_, compaction_filter_skip_until_.rep());
+ if (ret == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+ if (user_comparator_->Compare(*compaction_filter_skip_until_.rep(),
+ user_key) <= 0) {
+ // Invalid skip_until returned from compaction filter.
+ // Keep the key as per FilterV2 documentation.
+ ret = CompactionFilter::Decision::kKeep;
+ } else {
+ compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+ kValueTypeForSeek);
+ }
+ }
+ if (stats_ != nullptr && ShouldReportDetailedTime(env_, stats_)) {
+ total_filter_time_ += filter_timer_.ElapsedNanosSafe();
+ }
+ return ret;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper.h b/src/rocksdb/db/merge_helper.h
new file mode 100644
index 000000000..790ec6239
--- /dev/null
+++ b/src/rocksdb/db/merge_helper.h
@@ -0,0 +1,216 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+
+#include <deque>
+#include <string>
+#include <vector>
+
+#include "db/merge_context.h"
+#include "db/range_del_aggregator.h"
+#include "db/snapshot_checker.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/wide_columns.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Comparator;
+class Iterator;
+class Logger;
+class MergeOperator;
+class Statistics;
+class SystemClock;
+class BlobFetcher;
+class PrefetchBufferCollection;
+struct CompactionIterationStats;
+
+class MergeHelper {
+ public:
+ MergeHelper(Env* env, const Comparator* user_comparator,
+ const MergeOperator* user_merge_operator,
+ const CompactionFilter* compaction_filter, Logger* logger,
+ bool assert_valid_internal_key, SequenceNumber latest_snapshot,
+ const SnapshotChecker* snapshot_checker = nullptr, int level = 0,
+ Statistics* stats = nullptr,
+ const std::atomic<bool>* shutting_down = nullptr);
+
+ // Wrapper around MergeOperator::FullMergeV2() that records perf statistics.
+ // Result of merge will be written to result if status returned is OK.
+ // If operands is empty, the value will simply be copied to result.
+  // Set `update_num_ops_stats` to true if the merge is performed on behalf of
+  // a user read, so that the number-of-merge-operands statistics are updated.
+ // Returns one of the following statuses:
+ // - OK: Entries were successfully merged.
+ // - Corruption: Merge operator reported unsuccessful merge.
+ static Status TimedFullMerge(const MergeOperator* merge_operator,
+ const Slice& key, const Slice* value,
+ const std::vector<Slice>& operands,
+ std::string* result, Logger* logger,
+ Statistics* statistics, SystemClock* clock,
+ Slice* result_operand,
+ bool update_num_ops_stats);
+
+ static Status TimedFullMergeWithEntity(
+ const MergeOperator* merge_operator, const Slice& key, Slice base_entity,
+ const std::vector<Slice>& operands, std::string* result, Logger* logger,
+ Statistics* statistics, SystemClock* clock, bool update_num_ops_stats);
+
+ // During compaction, merge entries until we hit
+ // - a corrupted key
+ // - a Put/Delete,
+ // - a different user key,
+ // - a specific sequence number (snapshot boundary),
+ // - REMOVE_AND_SKIP_UNTIL returned from compaction filter,
+ // or - the end of iteration
+ // iter: (IN) points to the first merge type entry
+ // (OUT) points to the first entry not included in the merge process
+ // range_del_agg: (IN) filters merge operands covered by range tombstones.
+ // stop_before: (IN) a sequence number that merge should not cross.
+ // 0 means no restriction
+  // at_bottom: (IN) true if the iterator covers the bottom level, which means
+ // we could reach the start of the history of this user key.
+ // allow_data_in_errors: (IN) if true, data details will be displayed in
+ // error/log messages.
+ // blob_fetcher: (IN) blob fetcher object for the compaction's input version.
+ // prefetch_buffers: (IN/OUT) a collection of blob file prefetch buffers
+ // used for compaction readahead.
+ // c_iter_stats: (OUT) compaction iteration statistics.
+ //
+ // Returns one of the following statuses:
+ // - OK: Entries were successfully merged.
+ // - MergeInProgress: Put/Delete not encountered, and didn't reach the start
+ // of key's history. Output consists of merge operands only.
+ // - Corruption: Merge operator reported unsuccessful merge or a corrupted
+ // key has been encountered and not expected (applies only when compiling
+ // with asserts removed).
+ // - ShutdownInProgress: interrupted by shutdown (*shutting_down == true).
+ //
+ // REQUIRED: The first key in the input is not corrupted.
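+  //
+  // Illustrative call (a sketch only; the variable names and argument values
+  // shown are assumptions for the example, not the actual compaction code):
+  //
+  //   Status s = merge.MergeUntil(input_iter, range_del_agg,
+  //                               /* stop_before */ 0, /* at_bottom */ true,
+  //                               /* allow_data_in_errors */ false,
+  //                               /* blob_fetcher */ nullptr,
+  //                               /* full_history_ts_low */ nullptr,
+  //                               /* prefetch_buffers */ nullptr,
+  //                               /* c_iter_stats */ nullptr);
+  //   if (s.ok() || s.IsMergeInProgress()) {
+  //     // consume merge.keys() / merge.values() (see below)
+  //   }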
+ Status MergeUntil(InternalIterator* iter,
+ CompactionRangeDelAggregator* range_del_agg,
+ const SequenceNumber stop_before, const bool at_bottom,
+ const bool allow_data_in_errors,
+ const BlobFetcher* blob_fetcher,
+ const std::string* const full_history_ts_low,
+ PrefetchBufferCollection* prefetch_buffers,
+ CompactionIterationStats* c_iter_stats);
+
+ // Filters a merge operand using the compaction filter specified
+ // in the constructor. Returns the decision that the filter made.
+ // Uses compaction_filter_value_ and compaction_filter_skip_until_ for the
+ // optional outputs of compaction filter.
+ // user_key includes timestamp if user-defined timestamp is enabled.
+ CompactionFilter::Decision FilterMerge(const Slice& user_key,
+ const Slice& value_slice);
+
+ // Query the merge result
+ // These are valid until the next MergeUntil call
+ // If the merging was successful:
+ // - keys() contains a single element with the latest sequence number of
+ // the merges. The type will be Put or Merge. See IMPORTANT 1 note, below.
+ // - values() contains a single element with the result of merging all the
+ // operands together
+ //
+ // IMPORTANT 1: the key type could change after the MergeUntil call.
+ // Put/Delete + Merge + ... + Merge => Put
+ // Merge + ... + Merge => Merge
+ //
+ // If the merge operator is not associative, and if a Put/Delete is not found
+ // then the merging will be unsuccessful. In this case:
+ // - keys() contains the list of internal keys seen in order of iteration.
+ // - values() contains the list of values (merges) seen in the same order.
+ // values() is parallel to keys() so that the first entry in
+ // keys() is the key associated with the first entry in values()
+ // and so on. These lists will be the same length.
+ // All of these pairs will be merges over the same user key.
+ // See IMPORTANT 2 note below.
+ //
+ // IMPORTANT 2: The entries were traversed in order from BACK to FRONT.
+ // So keys().back() was the first key seen by iterator.
+ // TODO: Re-style this comment to be like the first one
+ const std::deque<std::string>& keys() const { return keys_; }
+ const std::vector<Slice>& values() const {
+ return merge_context_.GetOperands();
+ }
+ uint64_t TotalFilterTime() const { return total_filter_time_; }
+ bool HasOperator() const { return user_merge_operator_ != nullptr; }
+
+ // If compaction filter returned REMOVE_AND_SKIP_UNTIL, this method will
+ // return true and fill *until with the key to which we should skip.
+ // If true, keys() and values() are empty.
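+  //
+  // Typical caller pattern (a sketch; `merge` and the skipping logic are
+  // illustrative only):
+  //   Slice skip_until;
+  //   if (merge.FilteredUntil(&skip_until)) {
+  //     // advance the compaction input past `skip_until`
+  //   }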
+ bool FilteredUntil(Slice* skip_until) const {
+ if (!has_compaction_filter_skip_until_) {
+ return false;
+ }
+ assert(compaction_filter_ != nullptr);
+ assert(skip_until != nullptr);
+ assert(compaction_filter_skip_until_.Valid());
+ *skip_until = compaction_filter_skip_until_.Encode();
+ return true;
+ }
+
+ private:
+ Env* env_;
+ SystemClock* clock_;
+ const Comparator* user_comparator_;
+ const MergeOperator* user_merge_operator_;
+ const CompactionFilter* compaction_filter_;
+ const std::atomic<bool>* shutting_down_;
+ Logger* logger_;
+ bool assert_valid_internal_key_; // enforce no internal key corruption?
+ bool allow_single_operand_;
+ SequenceNumber latest_snapshot_;
+ const SnapshotChecker* const snapshot_checker_;
+ int level_;
+
+ // the scratch area that holds the result of MergeUntil
+ // valid up to the next MergeUntil call
+
+ // Keeps track of the sequence of keys seen
+ std::deque<std::string> keys_;
+ // Parallel with keys_; stores the operands
+ mutable MergeContext merge_context_;
+
+ StopWatchNano filter_timer_;
+ uint64_t total_filter_time_;
+ Statistics* stats_;
+
+ bool has_compaction_filter_skip_until_ = false;
+ std::string compaction_filter_value_;
+ InternalKey compaction_filter_skip_until_;
+
+ bool IsShuttingDown() {
+ // This is a best-effort facility, so memory_order_relaxed is sufficient.
+ return shutting_down_ && shutting_down_->load(std::memory_order_relaxed);
+ }
+};
+
+// MergeOutputIterator can be used to iterate over the result of a merge.
+class MergeOutputIterator {
+ public:
+ // The MergeOutputIterator is bound to a MergeHelper instance.
+ explicit MergeOutputIterator(const MergeHelper* merge_helper);
+
+ // Seeks to the first record in the output.
+ void SeekToFirst();
+ // Advances to the next record in the output.
+ void Next();
+
+ Slice key() { return Slice(*it_keys_); }
+ Slice value() { return Slice(*it_values_); }
+ bool Valid() { return it_keys_ != merge_helper_->keys().rend(); }
+
+ private:
+ const MergeHelper* merge_helper_;
+ std::deque<std::string>::const_reverse_iterator it_keys_;
+ std::vector<Slice>::const_reverse_iterator it_values_;
+};
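+
+// Example usage (an illustrative sketch; `helper` is assumed to be a
+// MergeHelper on which MergeUntil() has already been called):
+//
+//   MergeOutputIterator out(&helper);
+//   for (out.SeekToFirst(); out.Valid(); out.Next()) {
+//     // Entries come out in the order they were first seen, i.e. highest
+//     // sequence number first.
+//     Slice k = out.key();
+//     Slice v = out.value();
+//   }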
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_helper_test.cc b/src/rocksdb/db/merge_helper_test.cc
new file mode 100644
index 000000000..05408d5b9
--- /dev/null
+++ b/src/rocksdb/db/merge_helper_test.cc
@@ -0,0 +1,298 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/merge_helper.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/vector_iterator.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MergeHelperTest : public testing::Test {
+ public:
+ MergeHelperTest() : icmp_(BytewiseComparator()) { env_ = Env::Default(); }
+
+ ~MergeHelperTest() override = default;
+
+ Status Run(SequenceNumber stop_before, bool at_bottom,
+ SequenceNumber latest_snapshot = 0) {
+ iter_.reset(new VectorIterator(ks_, vs_, &icmp_));
+ iter_->SeekToFirst();
+ merge_helper_.reset(new MergeHelper(env_, icmp_.user_comparator(),
+ merge_op_.get(), filter_.get(), nullptr,
+ false, latest_snapshot));
+ return merge_helper_->MergeUntil(
+ iter_.get(), nullptr /* range_del_agg */, stop_before, at_bottom,
+ false /* allow_data_in_errors */, nullptr /* blob_fetcher */,
+ nullptr /* full_history_ts_low */, nullptr /* prefetch_buffers */,
+ nullptr /* c_iter_stats */);
+ }
+
+ void AddKeyVal(const std::string& user_key, const SequenceNumber& seq,
+ const ValueType& t, const std::string& val,
+ bool corrupt = false) {
+ InternalKey ikey(user_key, seq, t);
+ if (corrupt) {
+ test::CorruptKeyType(&ikey);
+ }
+ ks_.push_back(ikey.Encode().ToString());
+ vs_.push_back(val);
+ }
+
+ Env* env_;
+ InternalKeyComparator icmp_;
+ std::unique_ptr<VectorIterator> iter_;
+ std::shared_ptr<MergeOperator> merge_op_;
+ std::unique_ptr<MergeHelper> merge_helper_;
+ std::vector<std::string> ks_;
+ std::vector<std::string> vs_;
+ std::unique_ptr<test::FilterNumber> filter_;
+};
+
+// If MergeHelper encounters a new key on the last level, we know that
+// the key has no more history and the operands can be fully merged.
+TEST_F(MergeHelperTest, MergeAtBottomSuccess) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 20, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("b", 10, kTypeMerge, test::EncodeInt(4U)); // <- iter_ after merge
+
+ ASSERT_TRUE(Run(0, true).ok());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 20, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a value results in a successful merge.
+TEST_F(MergeHelperTest, MergeValue) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U)); // <- iter_ after merge
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(0, false).ok());
+ ASSERT_EQ(ks_[3], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 40, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(8U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging stops before a snapshot.
+TEST_F(MergeHelperTest, SnapshotBeforeValue) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(3U)); // <- iter_ after merge
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));
+ AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// MergeHelper preserves the operand stack for merge operators that
+// cannot do a partial merge.
+TEST_F(MergeHelperTest, NoPartialMerge) {
+ merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, "v2");
+ AddKeyVal("a", 40, kTypeMerge, "v"); // <- iter_ after merge
+ AddKeyVal("a", 30, kTypeMerge, "v");
+
+ ASSERT_TRUE(Run(31, true).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 40, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ("v", merge_helper_->values()[0]);
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[1]);
+ ASSERT_EQ("v2", merge_helper_->values()[1]);
+ ASSERT_EQ(2U, merge_helper_->keys().size());
+ ASSERT_EQ(2U, merge_helper_->values().size());
+}
+
+// A single operand cannot be merged.
+TEST_F(MergeHelperTest, SingleOperand) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(31, false).IsMergeInProgress());
+ ASSERT_FALSE(iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(1U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// Merging with a deletion turns the deletion into a value
+TEST_F(MergeHelperTest, MergeDeletion) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 20, kTypeDeletion, "");
+
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeValue), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(3U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The merge helper stops upon encountering a corrupt key
+TEST_F(MergeHelperTest, CorruptKey) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(1U));
+ // Corrupt key
+ AddKeyVal("a", 20, kTypeDeletion, "", true); // <- iter_ after merge
+
+ ASSERT_TRUE(Run(15, false).IsMergeInProgress());
+ ASSERT_EQ(ks_[2], iter_->key());
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeMerge), merge_helper_->keys()[0]);
+ ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
+ ASSERT_EQ(1U, merge_helper_->keys().size());
+ ASSERT_EQ(1U, merge_helper_->values().size());
+}
+
+// The compaction filter is called on every merge operand
+TEST_F(MergeHelperTest, FilterMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 25, kTypeValue, test::EncodeInt(1U));
+
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_EQ(test::KeyStr("a", 30, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(8U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+TEST_F(MergeHelperTest, FilterAllMergeOperands) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+
+ // filtered out all
+ ASSERT_TRUE(Run(15, false).ok());
+ ASSERT_FALSE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // we have one operand that will survive because it's a delete
+ AddKeyVal("a", 24, kTypeDeletion, test::EncodeInt(5U));
+ AddKeyVal("b", 23, kTypeValue, test::EncodeInt(5U));
+ ASSERT_TRUE(Run(15, true).ok());
+ merge_output_iter = MergeOutputIterator(merge_helper_.get());
+ ASSERT_TRUE(iter_->Valid());
+ merge_output_iter.SeekToFirst();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // when all merge operands are filtered out, we leave the iterator pointing to
+ // the Put/Delete that survived
+ ASSERT_EQ(test::KeyStr("a", 24, kTypeDeletion), iter_->key().ToString());
+ ASSERT_EQ(test::EncodeInt(5U), iter_->value().ToString());
+}
+
+// Make sure that merge operands are filtered at the beginning
+TEST_F(MergeHelperTest, FilterFirstMergeOperand) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U)); // Filtered
+ AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U)); // next user key
+
+ ASSERT_OK(Run(15, true));
+ ASSERT_TRUE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ // sequence number is 29 here, because the first merge operand got filtered
+ // out
+ ASSERT_EQ(test::KeyStr("a", 29, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(6U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+
+ // make sure that we're passing user keys into the filter
+ ASSERT_EQ("a", filter_->last_merge_operand_key());
+}
+
+// Make sure that merge operands are not filtered out if there's a snapshot
+// pointing at them
+TEST_F(MergeHelperTest, DontFilterMergeOperandsBeforeSnapshotTest) {
+ merge_op_ = MergeOperators::CreateUInt64AddOperator();
+ filter_.reset(new test::FilterNumber(5U));
+
+ AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
+ AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
+ AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
+ AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
+ AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));
+
+ ASSERT_OK(Run(15, true, 32));
+ ASSERT_TRUE(iter_->Valid());
+ MergeOutputIterator merge_output_iter(merge_helper_.get());
+ merge_output_iter.SeekToFirst();
+ ASSERT_EQ(test::KeyStr("a", 31, kTypeValue),
+ merge_output_iter.key().ToString());
+ ASSERT_EQ(test::EncodeInt(26U), merge_output_iter.value().ToString());
+ merge_output_iter.Next();
+ ASSERT_FALSE(merge_output_iter.Valid());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/merge_operator.cc b/src/rocksdb/db/merge_operator.cc
new file mode 100644
index 000000000..d32585640
--- /dev/null
+++ b/src/rocksdb/db/merge_operator.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+/**
+ * Back-end implementation details specific to the Merge Operator.
+ */
+
+#include "rocksdb/merge_operator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool MergeOperator::FullMergeV2(const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ // If FullMergeV2 is not implemented, we convert the operand_list to
+ // std::deque<std::string> and pass it to FullMerge
+ std::deque<std::string> operand_list_str;
+ for (auto& op : merge_in.operand_list) {
+ operand_list_str.emplace_back(op.data(), op.size());
+ }
+ return FullMerge(merge_in.key, merge_in.existing_value, operand_list_str,
+ &merge_out->new_value, merge_in.logger);
+}
+
+// The default implementation of PartialMergeMulti, which invokes
+// PartialMerge multiple times internally and merges two operands at
+// a time.
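+//
+// For example (a sketch): for operands [A, B, C] this computes
+// PartialMerge(key, PartialMerge(key, A, B), C), failing as soon as any
+// pairwise PartialMerge() call fails.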
+bool MergeOperator::PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* logger) const {
+ assert(operand_list.size() >= 2);
+ // Simply loop through the operands
+ Slice temp_slice(operand_list[0]);
+
+ for (size_t i = 1; i < operand_list.size(); ++i) {
+ auto& operand = operand_list[i];
+ std::string temp_value;
+ if (!PartialMerge(key, temp_slice, operand, &temp_value, logger)) {
+ return false;
+ }
+ swap(temp_value, *new_value);
+ temp_slice = Slice(*new_value);
+ }
+
+ // The result will be in *new_value. All merges succeeded.
+ return true;
+}
+
+// Given a "real" merge from the library, call the user's
+// associative merge function one-by-one on each of the operands.
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
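+//
+// For example (a sketch): with existing value E and operands [A, B] this
+// computes Merge(key, Merge(key, E, A), B); if there is no existing value,
+// the first call receives nullptr instead of E.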
+bool AssociativeMergeOperator::FullMergeV2(
+ const MergeOperationInput& merge_in,
+ MergeOperationOutput* merge_out) const {
+ // Simply loop through the operands
+ Slice temp_existing;
+ const Slice* existing_value = merge_in.existing_value;
+ for (const auto& operand : merge_in.operand_list) {
+ std::string temp_value;
+ if (!Merge(merge_in.key, existing_value, operand, &temp_value,
+ merge_in.logger)) {
+ return false;
+ }
+ swap(temp_value, merge_out->new_value);
+ temp_existing = Slice(merge_out->new_value);
+ existing_value = &temp_existing;
+ }
+
+ // The result will be in *new_value. All merges succeeded.
+ return true;
+}
+
+// Call the user defined simple merge on the operands;
+// NOTE: It is assumed that the client's merge-operator will handle any errors.
+bool AssociativeMergeOperator::PartialMerge(const Slice& key,
+ const Slice& left_operand,
+ const Slice& right_operand,
+ std::string* new_value,
+ Logger* logger) const {
+ return Merge(key, &left_operand, right_operand, new_value, logger);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/merge_test.cc b/src/rocksdb/db/merge_test.cc
new file mode 100644
index 000000000..0d373d41e
--- /dev/null
+++ b/src/rocksdb/db/merge_test.cc
@@ -0,0 +1,629 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include <assert.h>
+
+#include <iostream>
+#include <memory>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/write_batch_internal.h"
+#include "port/stack_trace.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/utilities/db_ttl.h"
+#include "test_util/testharness.h"
+#include "util/coding.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+bool use_compression;
+
+class MergeTest : public testing::Test {};
+
+size_t num_merge_operator_calls;
+void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
+
+size_t num_partial_merge_calls;
+void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
+
+class CountMergeOperator : public AssociativeMergeOperator {
+ public:
+ CountMergeOperator() {
+ mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
+ }
+
+ bool Merge(const Slice& key, const Slice* existing_value, const Slice& value,
+ std::string* new_value, Logger* logger) const override {
+ assert(new_value->empty());
+ ++num_merge_operator_calls;
+ if (existing_value == nullptr) {
+ new_value->assign(value.data(), value.size());
+ return true;
+ }
+
+ return mergeOperator_->PartialMerge(key, *existing_value, value, new_value,
+ logger);
+ }
+
+ bool PartialMergeMulti(const Slice& key,
+ const std::deque<Slice>& operand_list,
+ std::string* new_value,
+ Logger* logger) const override {
+ assert(new_value->empty());
+ ++num_partial_merge_calls;
+ return mergeOperator_->PartialMergeMulti(key, operand_list, new_value,
+ logger);
+ }
+
+ const char* Name() const override { return "UInt64AddOperator"; }
+
+ private:
+ std::shared_ptr<MergeOperator> mergeOperator_;
+};
+
+class EnvMergeTest : public EnvWrapper {
+ public:
+ EnvMergeTest() : EnvWrapper(Env::Default()) {}
+ static const char* kClassName() { return "MergeEnv"; }
+ const char* Name() const override { return kClassName(); }
+ // ~EnvMergeTest() override {}
+
+ uint64_t NowNanos() override {
+ ++now_nanos_count_;
+ return target()->NowNanos();
+ }
+
+ static uint64_t now_nanos_count_;
+
+ static std::unique_ptr<EnvMergeTest> singleton_;
+
+ static EnvMergeTest* GetInstance() {
+ if (nullptr == singleton_) singleton_.reset(new EnvMergeTest);
+ return singleton_.get();
+ }
+};
+
+uint64_t EnvMergeTest::now_nanos_count_{0};
+std::unique_ptr<EnvMergeTest> EnvMergeTest::singleton_;
+
+std::shared_ptr<DB> OpenDb(const std::string& dbname, const bool ttl = false,
+ const size_t max_successive_merges = 0) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = std::make_shared<CountMergeOperator>();
+ options.max_successive_merges = max_successive_merges;
+ options.env = EnvMergeTest::GetInstance();
+ EXPECT_OK(DestroyDB(dbname, Options()));
+ Status s;
+// DBWithTTL is not supported in ROCKSDB_LITE
+#ifndef ROCKSDB_LITE
+ if (ttl) {
+ DBWithTTL* db_with_ttl;
+ s = DBWithTTL::Open(options, dbname, &db_with_ttl);
+ db = db_with_ttl;
+ } else {
+ s = DB::Open(options, dbname, &db);
+ }
+#else
+ assert(!ttl);
+ s = DB::Open(options, dbname, &db);
+#endif // !ROCKSDB_LITE
+ EXPECT_OK(s);
+ assert(s.ok());
+ // Allowed to call NowNanos during DB creation (in GenerateRawUniqueId() for
+ // session ID)
+ EnvMergeTest::now_nanos_count_ = 0;
+ return std::shared_ptr<DB>(db);
+}
+
+// Imagine we are maintaining a set of uint64 counters.
+// Each counter has a distinct name. And we would like
+// to support four high level operations:
+// set, add, get and remove
+// This is a quick implementation without a Merge operation.
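+//
+// Usage sketch (the key name is illustrative only):
+//
+//   Counters counters(db);
+//   counters.add("hits", 1);
+//   uint64_t hits = 0;
+//   counters.get("hits", &hits);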
+class Counters {
+ protected:
+ std::shared_ptr<DB> db_;
+
+ WriteOptions put_option_;
+ ReadOptions get_option_;
+ WriteOptions delete_option_;
+
+ uint64_t default_;
+
+ public:
+ explicit Counters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+ : db_(db),
+ put_option_(),
+ get_option_(),
+ delete_option_(),
+ default_(defaultCount) {
+ assert(db_);
+ }
+
+ virtual ~Counters() {}
+
+ // public interface of Counters.
+  // All four functions return false
+  // if the underlying rocksdb operation failed.
+
+  // mapped to a rocksdb Put
+  bool set(const std::string& key, uint64_t value) {
+    // just treat the internal rep of the uint64 as the string
+ char buf[sizeof(value)];
+ EncodeFixed64(buf, value);
+ Slice slice(buf, sizeof(value));
+ auto s = db_->Put(put_option_, key, slice);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // mapped to a rocksdb Delete
+ bool remove(const std::string& key) {
+ auto s = db_->Delete(delete_option_, key);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // mapped to a rocksdb Get
+ bool get(const std::string& key, uint64_t* value) {
+ std::string str;
+ auto s = db_->Get(get_option_, key, &str);
+
+ if (s.IsNotFound()) {
+ // return default value if not found;
+ *value = default_;
+ return true;
+ } else if (s.ok()) {
+ // deserialization
+ if (str.size() != sizeof(uint64_t)) {
+ std::cerr << "value corruption\n";
+ return false;
+ }
+ *value = DecodeFixed64(&str[0]);
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+
+ // 'add' is implemented as get -> modify -> set
+ // An alternative is a single merge operation, see MergeBasedCounters
+ virtual bool add(const std::string& key, uint64_t value) {
+ uint64_t base = default_;
+ return get(key, &base) && set(key, base + value);
+ }
+
+ // convenience functions for testing
+ void assert_set(const std::string& key, uint64_t value) {
+ assert(set(key, value));
+ }
+
+ void assert_remove(const std::string& key) { assert(remove(key)); }
+
+ uint64_t assert_get(const std::string& key) {
+ uint64_t value = default_;
+ int result = get(key, &value);
+ assert(result);
+ if (result == 0) exit(1); // Disable unused variable warning.
+ return value;
+ }
+
+ void assert_add(const std::string& key, uint64_t value) {
+ int result = add(key, value);
+ assert(result);
+ if (result == 0) exit(1); // Disable unused variable warning.
+ }
+};
+
+// Implement 'add' directly with the new Merge operation
+class MergeBasedCounters : public Counters {
+ private:
+ WriteOptions merge_option_; // for merge
+
+ public:
+ explicit MergeBasedCounters(std::shared_ptr<DB> db, uint64_t defaultCount = 0)
+ : Counters(db, defaultCount), merge_option_() {}
+
+ // mapped to a rocksdb Merge operation
+ bool add(const std::string& key, uint64_t value) override {
+ char encoded[sizeof(uint64_t)];
+ EncodeFixed64(encoded, value);
+ Slice slice(encoded, sizeof(uint64_t));
+ auto s = db_->Merge(merge_option_, key, slice);
+
+ if (s.ok()) {
+ return true;
+ } else {
+ std::cerr << s.ToString() << std::endl;
+ return false;
+ }
+ }
+};
+
+void dumpDb(DB* db) {
+ auto it = std::unique_ptr<Iterator>(db->NewIterator(ReadOptions()));
+ for (it->SeekToFirst(); it->Valid(); it->Next()) {
+ // uint64_t value = DecodeFixed64(it->value().data());
+ // std::cout << it->key().ToString() << ": " << value << std::endl;
+ }
+ assert(it->status().ok()); // Check for any errors found during the scan
+}
+
+void testCounters(Counters& counters, DB* db, bool test_compaction) {
+ FlushOptions o;
+ o.wait = true;
+
+ counters.assert_set("a", 1);
+
+ if (test_compaction) {
+ ASSERT_OK(db->Flush(o));
+ }
+
+ ASSERT_EQ(counters.assert_get("a"), 1);
+
+ counters.assert_remove("b");
+
+  // default value is 0 if non-existent
+ ASSERT_EQ(counters.assert_get("b"), 0);
+
+ counters.assert_add("a", 2);
+
+ if (test_compaction) {
+ ASSERT_OK(db->Flush(o));
+ }
+
+ // 1+2 = 3
+ ASSERT_EQ(counters.assert_get("a"), 3);
+
+ dumpDb(db);
+
+ // 1+...+49 = ?
+ uint64_t sum = 0;
+ for (int i = 1; i < 50; i++) {
+ counters.assert_add("b", i);
+ sum += i;
+ }
+ ASSERT_EQ(counters.assert_get("b"), sum);
+
+ dumpDb(db);
+
+ if (test_compaction) {
+ ASSERT_OK(db->Flush(o));
+
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ dumpDb(db);
+
+ ASSERT_EQ(counters.assert_get("a"), 3);
+ ASSERT_EQ(counters.assert_get("b"), sum);
+ }
+}
+
+void testCountersWithFlushAndCompaction(Counters& counters, DB* db) {
+ ASSERT_OK(db->Put({}, "1", "1"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ std::atomic<int> cnt{0};
+ const auto get_thread_id = [&cnt]() {
+ thread_local int thread_id{cnt++};
+ return thread_id;
+ };
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:BeforeWriterWaiting", [&](void* /*arg*/) {
+ int thread_id = get_thread_id();
+ if (1 == thread_id) {
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::bg_compact_thread:0");
+ } else if (2 == thread_id) {
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::bg_flush_thread:0");
+ }
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WriteManifest", [&](void* /*arg*/) {
+ int thread_id = get_thread_id();
+ if (0 == thread_id) {
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::set_options_thread:0");
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::set_options_thread:1");
+ }
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::LogAndApply:WakeUpAndDone", [&](void* arg) {
+ auto* mutex = reinterpret_cast<InstrumentedMutex*>(arg);
+ mutex->AssertHeld();
+ int thread_id = get_thread_id();
+ ASSERT_EQ(2, thread_id);
+ mutex->Unlock();
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::bg_flush_thread:1");
+ TEST_SYNC_POINT(
+ "testCountersWithFlushAndCompaction::bg_flush_thread:2");
+ mutex->Lock();
+ });
+ SyncPoint::GetInstance()->LoadDependency({
+ {"testCountersWithFlushAndCompaction::set_options_thread:0",
+ "testCountersWithCompactionAndFlush:BeforeCompact"},
+ {"testCountersWithFlushAndCompaction::bg_compact_thread:0",
+ "testCountersWithFlushAndCompaction:BeforeIncCounters"},
+ {"testCountersWithFlushAndCompaction::bg_flush_thread:0",
+ "testCountersWithFlushAndCompaction::set_options_thread:1"},
+ {"testCountersWithFlushAndCompaction::bg_flush_thread:1",
+ "testCountersWithFlushAndCompaction:BeforeVerification"},
+ {"testCountersWithFlushAndCompaction:AfterGet",
+ "testCountersWithFlushAndCompaction::bg_flush_thread:2"},
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ port::Thread set_options_thread([&]() {
+ ASSERT_OK(reinterpret_cast<DBImpl*>(db)->SetOptions(
+ {{"disable_auto_compactions", "false"}}));
+ });
+ TEST_SYNC_POINT("testCountersWithCompactionAndFlush:BeforeCompact");
+ port::Thread compact_thread([&]() {
+ ASSERT_OK(reinterpret_cast<DBImpl*>(db)->CompactRange(
+ CompactRangeOptions(), db->DefaultColumnFamily(), nullptr, nullptr));
+ });
+
+ TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeIncCounters");
+ counters.add("test-key", 1);
+
+ FlushOptions flush_opts;
+ flush_opts.wait = false;
+ ASSERT_OK(db->Flush(flush_opts));
+
+ TEST_SYNC_POINT("testCountersWithFlushAndCompaction:BeforeVerification");
+ std::string expected;
+ PutFixed64(&expected, 1);
+ std::string actual;
+ Status s = db->Get(ReadOptions(), "test-key", &actual);
+ TEST_SYNC_POINT("testCountersWithFlushAndCompaction:AfterGet");
+ set_options_thread.join();
+ compact_thread.join();
+ ASSERT_OK(s);
+ ASSERT_EQ(expected, actual);
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+void testSuccessiveMerge(Counters& counters, size_t max_num_merges,
+ size_t num_merges) {
+ counters.assert_remove("z");
+ uint64_t sum = 0;
+
+ for (size_t i = 1; i <= num_merges; ++i) {
+ resetNumMergeOperatorCalls();
+ counters.assert_add("z", i);
+ sum += i;
+
+ if (i % (max_num_merges + 1) == 0) {
+ ASSERT_EQ(num_merge_operator_calls, max_num_merges + 1);
+ } else {
+ ASSERT_EQ(num_merge_operator_calls, 0);
+ }
+
+ resetNumMergeOperatorCalls();
+ ASSERT_EQ(counters.assert_get("z"), sum);
+ ASSERT_EQ(num_merge_operator_calls, i % (max_num_merges + 1));
+ }
+}
+
+void testPartialMerge(Counters* counters, DB* db, size_t max_merge,
+ size_t min_merge, size_t count) {
+ FlushOptions o;
+ o.wait = true;
+
+ // Test case 1: partial merge should be called when the number of merge
+ // operands exceeds the threshold.
+ uint64_t tmp_sum = 0;
+ resetNumPartialMergeCalls();
+ for (size_t i = 1; i <= count; i++) {
+ counters->assert_add("b", i);
+ tmp_sum += i;
+ }
+ ASSERT_OK(db->Flush(o));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(tmp_sum, counters->assert_get("b"));
+ if (count > max_merge) {
+ // in this case, FullMerge should be called instead.
+ ASSERT_EQ(num_partial_merge_calls, 0U);
+ } else {
+ // if count >= min_merge, then partial merge should be called once.
+ ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1));
+ }
+
+ // Test case 2: partial merge should not be called when a put is found.
+ resetNumPartialMergeCalls();
+ tmp_sum = 0;
+ ASSERT_OK(db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "c", "10"));
+ for (size_t i = 1; i <= count; i++) {
+ counters->assert_add("c", i);
+ tmp_sum += i;
+ }
+ ASSERT_OK(db->Flush(o));
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ(tmp_sum, counters->assert_get("c"));
+ ASSERT_EQ(num_partial_merge_calls, 0U);
+ // NowNanos was previously called in MergeHelper::FilterMerge(), which
+ // harmed performance.
+ ASSERT_EQ(EnvMergeTest::now_nanos_count_, 0U);
+}
+
+void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges,
+ size_t num_merges) {
+ ASSERT_GT(num_merges, max_num_merges);
+
+ Slice key("BatchSuccessiveMerge");
+ uint64_t merge_value = 1;
+ char buf[sizeof(merge_value)];
+ EncodeFixed64(buf, merge_value);
+ Slice merge_value_slice(buf, sizeof(merge_value));
+
+ // Create the batch
+ WriteBatch batch;
+ for (size_t i = 0; i < num_merges; ++i) {
+ ASSERT_OK(batch.Merge(key, merge_value_slice));
+ }
+
+ // Apply to memtable and count the number of merges
+ resetNumMergeOperatorCalls();
+ ASSERT_OK(db->Write(WriteOptions(), &batch));
+ ASSERT_EQ(
+ num_merge_operator_calls,
+ static_cast<size_t>(num_merges - (num_merges % (max_num_merges + 1))));
+
+ // Get the value
+ resetNumMergeOperatorCalls();
+ std::string get_value_str;
+ ASSERT_OK(db->Get(ReadOptions(), key, &get_value_str));
+ assert(get_value_str.size() == sizeof(uint64_t));
+ uint64_t get_value = DecodeFixed64(&get_value_str[0]);
+ ASSERT_EQ(get_value, num_merges * merge_value);
+ ASSERT_EQ(num_merge_operator_calls,
+ static_cast<size_t>((num_merges % (max_num_merges + 1))));
+}
+
+void runTest(const std::string& dbname, const bool use_ttl = false) {
+ {
+ auto db = OpenDb(dbname, use_ttl);
+
+ {
+ Counters counters(db, 0);
+ testCounters(counters, db.get(), true);
+ }
+
+ {
+ MergeBasedCounters counters(db, 0);
+ testCounters(counters, db.get(), use_compression);
+ }
+ }
+
+ ASSERT_OK(DestroyDB(dbname, Options()));
+
+ {
+ size_t max_merge = 5;
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testCounters(counters, db.get(), use_compression);
+ testSuccessiveMerge(counters, max_merge, max_merge * 2);
+ testSingleBatchSuccessiveMerge(db.get(), 5, 7);
+ ASSERT_OK(db->Close());
+ ASSERT_OK(DestroyDB(dbname, Options()));
+ }
+
+ {
+ size_t max_merge = 100;
+ // Min merge is hard-coded to 2.
+ uint32_t min_merge = 2;
+ for (uint32_t count = min_merge - 1; count <= min_merge + 1; count++) {
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testPartialMerge(&counters, db.get(), max_merge, min_merge, count);
+ ASSERT_OK(db->Close());
+ ASSERT_OK(DestroyDB(dbname, Options()));
+ }
+ {
+ auto db = OpenDb(dbname, use_ttl, max_merge);
+ MergeBasedCounters counters(db, 0);
+ testPartialMerge(&counters, db.get(), max_merge, min_merge,
+ min_merge * 10);
+ ASSERT_OK(db->Close());
+ ASSERT_OK(DestroyDB(dbname, Options()));
+ }
+ }
+
+ {
+ {
+ auto db = OpenDb(dbname);
+ MergeBasedCounters counters(db, 0);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ }
+
+ DB* reopen_db;
+ ASSERT_OK(DB::Open(Options(), dbname, &reopen_db));
+ std::string value;
+ ASSERT_NOK(reopen_db->Get(ReadOptions(), "test-key", &value));
+ delete reopen_db;
+ ASSERT_OK(DestroyDB(dbname, Options()));
+ }
+
+  /* Temporarily remove this test
+ {
+ std::cout << "Test merge-operator not set after reopen (recovery case)\n";
+ {
+ auto db = OpenDb(dbname);
+ MergeBasedCounters counters(db, 0);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ counters.add("test-key", 1);
+ }
+
+ DB* reopen_db;
+ ASSERT_TRUE(DB::Open(Options(), dbname, &reopen_db).IsInvalidArgument());
+ }
+ */
+}
+
+TEST_F(MergeTest, MergeDbTest) {
+ runTest(test::PerThreadDBPath("merge_testdb"));
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(MergeTest, MergeDbTtlTest) {
+ runTest(test::PerThreadDBPath("merge_testdbttl"),
+ true); // Run test on TTL database
+}
+
+TEST_F(MergeTest, MergeWithCompactionAndFlush) {
+ const std::string dbname =
+ test::PerThreadDBPath("merge_with_compaction_and_flush");
+ {
+ auto db = OpenDb(dbname);
+ {
+ MergeBasedCounters counters(db, 0);
+ testCountersWithFlushAndCompaction(counters, db.get());
+ }
+ }
+ ASSERT_OK(DestroyDB(dbname, Options()));
+}
+#endif // !ROCKSDB_LITE
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::use_compression = false;
+ if (argc > 1) {
+ ROCKSDB_NAMESPACE::use_compression = true;
+ }
+
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/obsolete_files_test.cc b/src/rocksdb/db/obsolete_files_test.cc
new file mode 100644
index 000000000..8e9f28f65
--- /dev/null
+++ b/src/rocksdb/db/obsolete_files_test.cc
@@ -0,0 +1,328 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <stdlib.h>
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "port/stack_trace.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/transaction_log.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ObsoleteFilesTest : public DBTestBase {
+ public:
+ ObsoleteFilesTest()
+ : DBTestBase("obsolete_files_test", /*env_do_fsync=*/true),
+ wal_dir_(dbname_ + "/wal_files") {}
+
+ void AddKeys(int numkeys, int startkey) {
+ WriteOptions options;
+ options.sync = false;
+ for (int i = startkey; i < (numkeys + startkey); i++) {
+ std::string temp = std::to_string(i);
+ Slice key(temp);
+ Slice value(temp);
+ ASSERT_OK(db_->Put(options, key, value));
+ }
+ }
+
+ void createLevel0Files(int numFiles, int numKeysPerFile) {
+ int startKey = 0;
+ for (int i = 0; i < numFiles; i++) {
+ AddKeys(numKeysPerFile, startKey);
+ startKey += numKeysPerFile;
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_OK(
+ dbfull()->TEST_WaitForCompact()); // wait for background flush (flush
+ // is also a kind of compaction).
+ }
+ }
+
+ void CheckFileTypeCounts(const std::string& dir, int required_log,
+ int required_sst, int required_manifest) {
+ std::vector<std::string> filenames;
+ ASSERT_OK(env_->GetChildren(dir, &filenames));
+
+ int log_cnt = 0;
+ int sst_cnt = 0;
+ int manifest_cnt = 0;
+ for (auto file : filenames) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(file, &number, &type)) {
+ log_cnt += (type == kWalFile);
+ sst_cnt += (type == kTableFile);
+ manifest_cnt += (type == kDescriptorFile);
+ }
+ }
+ ASSERT_EQ(required_log, log_cnt);
+ ASSERT_EQ(required_sst, sst_cnt);
+ ASSERT_EQ(required_manifest, manifest_cnt);
+ }
+
+ void ReopenDB() {
+ Options options = CurrentOptions();
+ // Trigger compaction when the number of level 0 files reaches 2.
+ options.create_if_missing = true;
+ options.level0_file_num_compaction_trigger = 2;
+ options.disable_auto_compactions = false;
+ options.delete_obsolete_files_period_micros = 0; // always do full purge
+ options.enable_thread_tracking = true;
+ options.write_buffer_size = 1024 * 1024 * 1000;
+ options.target_file_size_base = 1024 * 1024 * 1000;
+ options.max_bytes_for_level_base = 1024 * 1024 * 1000;
+ options.WAL_ttl_seconds = 300; // Used to test log files
+ options.WAL_size_limit_MB = 1024; // Used to test log files
+ options.wal_dir = wal_dir_;
+
+ // Note: the following prevents an otherwise harmless data race between the
+ // test setup code (AddBlobFile) in ObsoleteFilesTest.BlobFiles and the
+ // periodic stat dumping thread.
+ options.stats_dump_period_sec = 0;
+
+ Destroy(options);
+ Reopen(options);
+ }
+
+ const std::string wal_dir_;
+};
+
+TEST_F(ObsoleteFilesTest, RaceForObsoleteFileDeletion) {
+ ReopenDB();
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->LoadDependency({
+ {"DBImpl::BackgroundCallCompaction:FoundObsoleteFiles",
+ "ObsoleteFilesTest::RaceForObsoleteFileDeletion:1"},
+ {"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+ "ObsoleteFilesTest::RaceForObsoleteFileDeletion:2"},
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DeleteObsoleteFileImpl:AfterDeletion", [&](void* arg) {
+ Status* p_status = reinterpret_cast<Status*>(arg);
+ ASSERT_OK(*p_status);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::CloseHelper:PendingPurgeFinished", [&](void* arg) {
+ std::unordered_set<uint64_t>* files_grabbed_for_purge_ptr =
+ reinterpret_cast<std::unordered_set<uint64_t>*>(arg);
+ ASSERT_TRUE(files_grabbed_for_purge_ptr->empty());
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ createLevel0Files(2, 50000);
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+ port::Thread user_thread([this]() {
+ JobContext jobCxt(0);
+ TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:1");
+ dbfull()->TEST_LockMutex();
+ dbfull()->FindObsoleteFiles(&jobCxt, true /* force=true */,
+ false /* no_full_scan=false */);
+ dbfull()->TEST_UnlockMutex();
+ TEST_SYNC_POINT("ObsoleteFilesTest::RaceForObsoleteFileDeletion:2");
+ dbfull()->PurgeObsoleteFiles(jobCxt);
+ jobCxt.Clean();
+ });
+
+ user_thread.join();
+}
+
+TEST_F(ObsoleteFilesTest, DeleteObsoleteOptionsFile) {
+ ReopenDB();
+
+ createLevel0Files(2, 50000);
+ CheckFileTypeCounts(wal_dir_, 1, 0, 0);
+
+ ASSERT_OK(dbfull()->DisableFileDeletions());
+ for (int i = 0; i != 4; ++i) {
+ if (i % 2) {
+ ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+ {{"paranoid_file_checks", "false"}}));
+ } else {
+ ASSERT_OK(dbfull()->SetOptions(dbfull()->DefaultColumnFamily(),
+ {{"paranoid_file_checks", "true"}}));
+ }
+ }
+ ASSERT_OK(dbfull()->EnableFileDeletions(true /* force */));
+
+ Close();
+
+ std::vector<std::string> files;
+ int opts_file_count = 0;
+ ASSERT_OK(env_->GetChildren(dbname_, &files));
+ for (const auto& file : files) {
+ uint64_t file_num;
+ Slice dummy_info_log_name_prefix;
+ FileType type;
+ WalFileType log_type;
+ if (ParseFileName(file, &file_num, dummy_info_log_name_prefix, &type,
+ &log_type) &&
+ type == kOptionsFile) {
+ opts_file_count++;
+ }
+ }
+ ASSERT_EQ(2, opts_file_count);
+}
+
+TEST_F(ObsoleteFilesTest, BlobFiles) {
+ ReopenDB();
+
+ VersionSet* const versions = dbfull()->GetVersionSet();
+ assert(versions);
+ assert(versions->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ const ImmutableCFOptions* const ioptions = cfd->ioptions();
+ assert(ioptions);
+ assert(!ioptions->cf_paths.empty());
+
+ const std::string& path = ioptions->cf_paths.front().path;
+
+ // Add an obsolete blob file.
+ constexpr uint64_t first_blob_file_number = 234;
+ versions->AddObsoleteBlobFile(first_blob_file_number, path);
+
+ // Add a live blob file.
+ Version* const version = cfd->current();
+ assert(version);
+
+ VersionStorageInfo* const storage_info = version->storage_info();
+ assert(storage_info);
+
+ constexpr uint64_t second_blob_file_number = 456;
+ constexpr uint64_t second_total_blob_count = 100;
+ constexpr uint64_t second_total_blob_bytes = 2000000;
+ constexpr char second_checksum_method[] = "CRC32B";
+ constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a";
+
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ second_blob_file_number, second_total_blob_count, second_total_blob_bytes,
+ second_checksum_method, second_checksum_value);
+
+ constexpr uint64_t second_garbage_blob_count = 0;
+ constexpr uint64_t second_garbage_blob_bytes = 0;
+
+ auto meta = BlobFileMetaData::Create(
+ std::move(shared_meta), BlobFileMetaData::LinkedSsts(),
+ second_garbage_blob_count, second_garbage_blob_bytes);
+
+ storage_info->AddBlobFile(std::move(meta));
+
+ // Check for obsolete files and make sure the first blob file is picked up
+ // and grabbed for purge. The second blob file should be on the live list.
+ constexpr int job_id = 0;
+ JobContext job_context{job_id};
+
+ dbfull()->TEST_LockMutex();
+ constexpr bool force_full_scan = false;
+ dbfull()->FindObsoleteFiles(&job_context, force_full_scan);
+ dbfull()->TEST_UnlockMutex();
+
+ ASSERT_TRUE(job_context.HaveSomethingToDelete());
+ ASSERT_EQ(job_context.blob_delete_files.size(), 1);
+ ASSERT_EQ(job_context.blob_delete_files[0].GetBlobFileNumber(),
+ first_blob_file_number);
+
+ const auto& files_grabbed_for_purge =
+ dbfull()->TEST_GetFilesGrabbedForPurge();
+ ASSERT_NE(files_grabbed_for_purge.find(first_blob_file_number),
+ files_grabbed_for_purge.end());
+
+ ASSERT_EQ(job_context.blob_live.size(), 1);
+ ASSERT_EQ(job_context.blob_live[0], second_blob_file_number);
+
+ // Hack the job context a bit by adding a few files to the full scan
+ // list and adjusting the pending file number. We add the two files
+ // above as well as two additional ones, where one is old
+ // and should be cleaned up, and the other is still pending.
+ constexpr uint64_t old_blob_file_number = 123;
+ constexpr uint64_t pending_blob_file_number = 567;
+
+ job_context.full_scan_candidate_files.emplace_back(
+ BlobFileName(old_blob_file_number), path);
+ job_context.full_scan_candidate_files.emplace_back(
+ BlobFileName(first_blob_file_number), path);
+ job_context.full_scan_candidate_files.emplace_back(
+ BlobFileName(second_blob_file_number), path);
+ job_context.full_scan_candidate_files.emplace_back(
+ BlobFileName(pending_blob_file_number), path);
+
+ job_context.min_pending_output = pending_blob_file_number;
+
+ // Purge obsolete files and make sure we purge the old file and the first file
+ // (and keep the second file and the pending file).
+ std::vector<std::string> deleted_files;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::DeleteObsoleteFileImpl::BeforeDeletion", [&](void* arg) {
+ const std::string* file = static_cast<std::string*>(arg);
+ assert(file);
+
+ constexpr char blob_extension[] = ".blob";
+
+ if (file->find(blob_extension) != std::string::npos) {
+ deleted_files.emplace_back(*file);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ dbfull()->PurgeObsoleteFiles(job_context);
+ job_context.Clean();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(files_grabbed_for_purge.find(first_blob_file_number),
+ files_grabbed_for_purge.end());
+
+ std::sort(deleted_files.begin(), deleted_files.end());
+ const std::vector<std::string> expected_deleted_files{
+ BlobFileName(path, old_blob_file_number),
+ BlobFileName(path, first_blob_file_number)};
+
+ ASSERT_EQ(deleted_files, expected_deleted_files);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as DBImpl::DeleteFile is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/options_file_test.cc b/src/rocksdb/db/options_file_test.cc
new file mode 100644
index 000000000..eb02e6ca4
--- /dev/null
+++ b/src/rocksdb/db/options_file_test.cc
@@ -0,0 +1,120 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+#include <string>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+class OptionsFileTest : public testing::Test {
+ public:
+ OptionsFileTest() : dbname_(test::PerThreadDBPath("options_file_test")) {}
+
+ std::string dbname_;
+};
+
+namespace {
+void UpdateOptionsFiles(DB* db,
+ std::unordered_set<std::string>* filename_history,
+ int* options_files_count) {
+ std::vector<std::string> filenames;
+ EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames));
+ uint64_t number;
+ FileType type;
+ *options_files_count = 0;
+ for (auto filename : filenames) {
+ if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+ filename_history->insert(filename);
+ (*options_files_count)++;
+ }
+ }
+}
+
+// Verify that the current options files are the latest ones.
+void VerifyOptionsFileName(
+ DB* db, const std::unordered_set<std::string>& past_filenames) {
+ std::vector<std::string> filenames;
+ std::unordered_set<std::string> current_filenames;
+ EXPECT_OK(db->GetEnv()->GetChildren(db->GetName(), &filenames));
+ uint64_t number;
+ FileType type;
+ for (auto filename : filenames) {
+ if (ParseFileName(filename, &number, &type) && type == kOptionsFile) {
+ current_filenames.insert(filename);
+ }
+ }
+ for (auto past_filename : past_filenames) {
+ if (current_filenames.find(past_filename) != current_filenames.end()) {
+ continue;
+ }
+ for (auto filename : current_filenames) {
+ ASSERT_GT(filename, past_filename);
+ }
+ }
+}
+} // anonymous namespace
+
+TEST_F(OptionsFileTest, NumberOfOptionsFiles) {
+ const int kReopenCount = 20;
+ Options opt;
+ opt.create_if_missing = true;
+ ASSERT_OK(DestroyDB(dbname_, opt));
+ std::unordered_set<std::string> filename_history;
+ DB* db;
+ for (int i = 0; i < kReopenCount; ++i) {
+ ASSERT_OK(DB::Open(opt, dbname_, &db));
+ int num_options_files = 0;
+ UpdateOptionsFiles(db, &filename_history, &num_options_files);
+ ASSERT_GT(num_options_files, 0);
+ ASSERT_LE(num_options_files, 2);
+ // Make sure we always keep the latest option files.
+ VerifyOptionsFileName(db, filename_history);
+ delete db;
+ }
+}
+
+TEST_F(OptionsFileTest, OptionsFileName) {
+ const uint64_t kOptionsFileNum = 12345;
+ uint64_t number;
+ FileType type;
+
+ auto options_file_name = OptionsFileName("", kOptionsFileNum);
+ ASSERT_TRUE(ParseFileName(options_file_name, &number, &type, nullptr));
+ ASSERT_EQ(type, kOptionsFile);
+ ASSERT_EQ(number, kOptionsFileNum);
+
+ const uint64_t kTempOptionsFileNum = 54352;
+ auto temp_options_file_name = TempOptionsFileName("", kTempOptionsFileNum);
+ ASSERT_TRUE(ParseFileName(temp_options_file_name, &number, &type, nullptr));
+ ASSERT_NE(temp_options_file_name.find(kTempFileNameSuffix),
+ std::string::npos);
+ ASSERT_EQ(type, kTempFile);
+ ASSERT_EQ(number, kTempOptionsFileNum);
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+#if !(defined NDEBUG) || !defined(OS_WIN)
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+#else
+ return 0;
+#endif // !(defined NDEBUG) || !defined(OS_WIN)
+}
+#else
+
+#include <cstdio>
+
+int main(int /*argc*/, char** /*argv*/) {
+ printf("Skipped as Options file is not supported in RocksDBLite.\n");
+ return 0;
+}
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/output_validator.cc b/src/rocksdb/db/output_validator.cc
new file mode 100644
index 000000000..e93e2d68c
--- /dev/null
+++ b/src/rocksdb/db/output_validator.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/output_validator.h"
+
+#include "test_util/sync_point.h"
+#include "util/hash.h"
+
+namespace ROCKSDB_NAMESPACE {
+Status OutputValidator::Add(const Slice& key, const Slice& value) {
+ if (enable_hash_) {
+    // Generate a rolling 64-bit hash over all keys and values
+ paranoid_hash_ = NPHash64(key.data(), key.size(), paranoid_hash_);
+ paranoid_hash_ = NPHash64(value.data(), value.size(), paranoid_hash_);
+ }
+ if (enable_order_check_) {
+ TEST_SYNC_POINT_CALLBACK("OutputValidator::Add:order_check",
+ /*arg=*/nullptr);
+ if (key.size() < kNumInternalBytes) {
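+      // Internal keys carry a trailer of kNumInternalBytes (sequence number
+      // and value type) after the user key, so anything shorter cannot be a
+      // well-formed internal key.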
+ return Status::Corruption(
+ "Compaction tries to write a key without internal bytes.");
+ }
+    // prev_key_ starts out empty.
+ if (!prev_key_.empty() && icmp_.Compare(key, prev_key_) < 0) {
+ return Status::Corruption("Compaction sees out-of-order keys.");
+ }
+ prev_key_.assign(key.data(), key.size());
+ }
+ return Status::OK();
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/output_validator.h b/src/rocksdb/db/output_validator.h
new file mode 100644
index 000000000..40635f9c4
--- /dev/null
+++ b/src/rocksdb/db/output_validator.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include "db/dbformat.h"
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+// A class that validates the key/value pairs inserted into an SST file.
+// Pass every key/value pair of the file through OutputValidator::Add();
+// the class validates the key order and optionally calculates a hash
+// of all the keys and values.
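+//
+// A minimal usage sketch (illustrative only; it assumes a
+// BytewiseComparator-based InternalKeyComparator and a caller-provided
+// container `kvs` of internal key/value pairs):
+//
+//   InternalKeyComparator icmp(BytewiseComparator());
+//   OutputValidator validator(icmp, /*enable_order_check=*/true,
+//                             /*enable_hash=*/true);
+//   for (const auto& kv : kvs) {
+//     Status s = validator.Add(kv.first, kv.second);  // check s.ok()
+//   }
+//   // After feeding what was read back into another validator `read_back`,
+//   // validator.CompareValidator(read_back) returns true on a match.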
+class OutputValidator {
+ public:
+ explicit OutputValidator(const InternalKeyComparator& icmp,
+ bool enable_order_check, bool enable_hash,
+ uint64_t precalculated_hash = 0)
+ : icmp_(icmp),
+ paranoid_hash_(precalculated_hash),
+ enable_order_check_(enable_order_check),
+ enable_hash_(enable_hash) {}
+
+  // Add a key/value pair to the KV sequence, and return whether the key
+  // meets the criteria, e.g. that the keys are in order.
+ Status Add(const Slice& key, const Slice& value);
+
+  // Compare whether the validation results of two validators are the same.
+  // It can be used to compare the keys inserted into a file against what is
+  // read back. Returns true if the validation passes.
+ bool CompareValidator(const OutputValidator& other_validator) {
+ return GetHash() == other_validator.GetHash();
+ }
+
+ // Not (yet) intended to be persisted, so subject to change
+ // without notice between releases.
+ uint64_t GetHash() const { return paranoid_hash_; }
+
+ private:
+ const InternalKeyComparator& icmp_;
+ std::string prev_key_;
+ uint64_t paranoid_hash_ = 0;
+ bool enable_order_check_;
+ bool enable_hash_;
+};
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/perf_context_test.cc b/src/rocksdb/db/perf_context_test.cc
new file mode 100644
index 000000000..454d12dc5
--- /dev/null
+++ b/src/rocksdb/db/perf_context_test.cc
@@ -0,0 +1,1010 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "rocksdb/perf_context.h"
+
+#include <algorithm>
+#include <iostream>
+#include <thread>
+#include <vector>
+
+#include "monitoring/histogram.h"
+#include "monitoring/instrumented_mutex.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/thread_status_util.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/system_clock.h"
+#include "test_util/testharness.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+bool FLAGS_random_key = false;
+bool FLAGS_use_set_based_memetable = false;
+int FLAGS_total_keys = 100;
+int FLAGS_write_buffer_size = 1000000000;
+int FLAGS_max_write_buffer_number = 8;
+int FLAGS_min_write_buffer_number_to_merge = 7;
+bool FLAGS_verbose = false;
+
+// Path to the database on file system
+const std::string kDbName =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("perf_context_test");
+
+namespace ROCKSDB_NAMESPACE {
+
+std::shared_ptr<DB> OpenDb(bool read_only = false) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.max_open_files = -1;
+ options.write_buffer_size = FLAGS_write_buffer_size;
+ options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+ options.min_write_buffer_number_to_merge =
+ FLAGS_min_write_buffer_number_to_merge;
+
+ if (FLAGS_use_set_based_memetable) {
+#ifndef ROCKSDB_LITE
+ options.prefix_extractor.reset(
+ ROCKSDB_NAMESPACE::NewFixedPrefixTransform(0));
+ options.memtable_factory.reset(NewHashSkipListRepFactory());
+#endif // ROCKSDB_LITE
+ }
+
+ Status s;
+ if (!read_only) {
+ s = DB::Open(options, kDbName, &db);
+ } else {
+ s = DB::OpenForReadOnly(options, kDbName, &db);
+ }
+ EXPECT_OK(s);
+ return std::shared_ptr<DB>(db);
+}
+
+class PerfContextTest : public testing::Test {};
+
+TEST_F(PerfContextTest, SeekIntoDeletion) {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string key = "k" + std::to_string(i);
+ std::string value = "v" + std::to_string(i);
+
+ ASSERT_OK(db->Put(write_options, key, value));
+ }
+
+ for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+ std::string key = "k" + std::to_string(i);
+ ASSERT_OK(db->Delete(write_options, key));
+ }
+
+ HistogramImpl hist_get;
+ HistogramImpl hist_get_time;
+ for (int i = 0; i < FLAGS_total_keys - 1; ++i) {
+ std::string key = "k" + std::to_string(i);
+ std::string value;
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get());
+ timer.Start();
+ auto status = db->Get(read_options, key, &value);
+ auto elapsed_nanos = timer.ElapsedNanos();
+ ASSERT_TRUE(status.IsNotFound());
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+ hist_get_time.Add(elapsed_nanos);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Get user key comparison: \n"
+ << hist_get.ToString() << "Get time: \n"
+ << hist_get_time.ToString();
+ }
+
+ {
+ HistogramImpl hist_seek_to_first;
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ iter->SeekToFirst();
+ hist_seek_to_first.Add(get_perf_context()->user_key_comparison_count);
+ auto elapsed_nanos = timer.ElapsedNanos();
+
+ if (FLAGS_verbose) {
+ std::cout << "SeekToFirst user key comparison: \n"
+ << hist_seek_to_first.ToString() << "ikey skipped: "
+ << get_perf_context()->internal_key_skipped_count << "\n"
+ << "idelete skipped: "
+ << get_perf_context()->internal_delete_skipped_count << "\n"
+ << "elapsed: " << elapsed_nanos << "\n";
+ }
+ }
+
+ HistogramImpl hist_seek;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ std::string key = "k" + std::to_string(i);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ iter->Seek(key);
+ auto elapsed_nanos = timer.ElapsedNanos();
+ hist_seek.Add(get_perf_context()->user_key_comparison_count);
+ if (FLAGS_verbose) {
+ std::cout << "seek cmp: " << get_perf_context()->user_key_comparison_count
+ << " ikey skipped "
+ << get_perf_context()->internal_key_skipped_count
+ << " idelete skipped "
+ << get_perf_context()->internal_delete_skipped_count
+ << " elapsed: " << elapsed_nanos << "ns\n";
+ }
+
+ get_perf_context()->Reset();
+ ASSERT_TRUE(iter->Valid());
+ StopWatchNano timer2(SystemClock::Default().get(), true);
+ iter->Next();
+ auto elapsed_nanos2 = timer2.ElapsedNanos();
+ if (FLAGS_verbose) {
+ std::cout << "next cmp: " << get_perf_context()->user_key_comparison_count
+ << "elapsed: " << elapsed_nanos2 << "ns\n";
+ }
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Seek user key comparison: \n" << hist_seek.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, StopWatchNanoOverhead) {
+ // profile the timer cost by itself!
+ const int kTotalIterations = 1000000;
+ std::vector<uint64_t> timings(kTotalIterations);
+
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ for (auto& timing : timings) {
+ timing = timer.ElapsedNanos(true /* reset */);
+ }
+
+ HistogramImpl histogram;
+ for (const auto timing : timings) {
+ histogram.Add(timing);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << histogram.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, StopWatchOverhead) {
+ // profile the timer cost by itself!
+ const int kTotalIterations = 1000000;
+ uint64_t elapsed = 0;
+ std::vector<uint64_t> timings(kTotalIterations);
+
+ StopWatch timer(SystemClock::Default().get(), nullptr, 0, &elapsed);
+ for (auto& timing : timings) {
+ timing = elapsed;
+ }
+
+ HistogramImpl histogram;
+ uint64_t prev_timing = 0;
+ for (const auto timing : timings) {
+ histogram.Add(timing - prev_timing);
+ prev_timing = timing;
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << histogram.ToString();
+ }
+}
+
+void ProfileQueries(bool enabled_time = false) {
+ ASSERT_OK(DestroyDB(kDbName, Options())); // Start this test with a fresh DB
+
+ auto db = OpenDb();
+
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ HistogramImpl hist_put;
+
+ HistogramImpl hist_get;
+ HistogramImpl hist_get_snapshot;
+ HistogramImpl hist_get_memtable;
+ HistogramImpl hist_get_files;
+ HistogramImpl hist_get_post_process;
+ HistogramImpl hist_num_memtable_checked;
+
+ HistogramImpl hist_mget;
+ HistogramImpl hist_mget_snapshot;
+ HistogramImpl hist_mget_memtable;
+ HistogramImpl hist_mget_files;
+ HistogramImpl hist_mget_post_process;
+ HistogramImpl hist_mget_num_memtable_checked;
+
+ HistogramImpl hist_write_pre_post;
+ HistogramImpl hist_write_wal_time;
+ HistogramImpl hist_write_memtable_time;
+ HistogramImpl hist_write_delay_time;
+ HistogramImpl hist_write_thread_wait_nanos;
+ HistogramImpl hist_write_scheduling_time;
+
+ uint64_t total_db_mutex_nanos = 0;
+
+ if (FLAGS_verbose) {
+ std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+ }
+
+ std::vector<int> keys;
+ const int kFlushFlag = -1;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ keys.push_back(i);
+ if (i == FLAGS_total_keys / 2) {
+ // Issuing a flush in the middle.
+ keys.push_back(kFlushFlag);
+ }
+ }
+
+ if (FLAGS_random_key) {
+ RandomShuffle(std::begin(keys), std::end(keys));
+ }
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U);
+#endif
+ int num_mutex_waited = 0;
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ FlushOptions fo;
+      ASSERT_OK(db->Flush(fo));
+ continue;
+ }
+
+ std::string key = "k" + std::to_string(i);
+ std::string value = "v" + std::to_string(i);
+
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Put(write_options, key, value));
+ if (++num_mutex_waited > 3) {
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+ }
+ hist_write_pre_post.Add(
+ get_perf_context()->write_pre_and_post_process_time);
+ hist_write_wal_time.Add(get_perf_context()->write_wal_time);
+ hist_write_memtable_time.Add(get_perf_context()->write_memtable_time);
+ hist_write_delay_time.Add(get_perf_context()->write_delay_time);
+ hist_write_thread_wait_nanos.Add(
+ get_perf_context()->write_thread_wait_nanos);
+ hist_write_scheduling_time.Add(
+ get_perf_context()->write_scheduling_flushes_compactions_time);
+ hist_put.Add(get_perf_context()->user_key_comparison_count);
+ total_db_mutex_nanos += get_perf_context()->db_mutex_lock_nanos;
+ }
+#ifndef NDEBUG
+ ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U);
+#endif
+
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ continue;
+ }
+ std::string key = "k" + std::to_string(i);
+ std::string expected_value = "v" + std::to_string(i);
+ std::string value;
+
+ std::vector<Slice> multiget_keys = {Slice(key)};
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(read_options, key, &value));
+ ASSERT_EQ(expected_value, value);
+ hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+ get_perf_context()->Reset();
+ auto statuses = db->MultiGet(read_options, multiget_keys, &values);
+ for (const auto& s : statuses) {
+ ASSERT_OK(s);
+ }
+ hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_mget_num_memtable_checked.Add(
+ get_perf_context()->get_from_memtable_count);
+ hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_mget.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ if (FLAGS_verbose) {
+    std::cout << "Put user key comparison: \n"
+              << hist_put.ToString() << "Get user key comparison: \n"
+              << hist_get.ToString() << "MultiGet user key comparison: \n"
+              << hist_mget.ToString();
+ std::cout << "Put(): Pre and Post Process Time: \n"
+ << hist_write_pre_post.ToString() << " Writing WAL time: \n"
+ << hist_write_wal_time.ToString() << "\n"
+ << " Writing Mem Table time: \n"
+ << hist_write_memtable_time.ToString() << "\n"
+ << " Write Delay: \n"
+ << hist_write_delay_time.ToString() << "\n"
+ << " Waiting for Batch time: \n"
+ << hist_write_thread_wait_nanos.ToString() << "\n"
+ << " Scheduling Flushes and Compactions Time: \n"
+ << hist_write_scheduling_time.ToString() << "\n"
+ << " Total DB mutex nanos: \n"
+ << total_db_mutex_nanos << "\n";
+
+ std::cout << "Get(): Time to get snapshot: \n"
+ << hist_get_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_get_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_get_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_get_post_process.ToString() << "\n";
+
+ std::cout << "MultiGet(): Time to get snapshot: \n"
+ << hist_mget_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_mget_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_mget_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_mget_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_mget_post_process.ToString() << "\n";
+ }
+
+ if (enabled_time) {
+ ASSERT_GT(hist_get.Average(), 0);
+ ASSERT_GT(hist_get_snapshot.Average(), 0);
+ ASSERT_GT(hist_get_memtable.Average(), 0);
+ ASSERT_GT(hist_get_files.Average(), 0);
+ ASSERT_GT(hist_get_post_process.Average(), 0);
+ ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+
+ ASSERT_GT(hist_mget.Average(), 0);
+ ASSERT_GT(hist_mget_snapshot.Average(), 0);
+ ASSERT_GT(hist_mget_memtable.Average(), 0);
+ ASSERT_GT(hist_mget_files.Average(), 0);
+ ASSERT_GT(hist_mget_post_process.Average(), 0);
+ ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+
+ EXPECT_GT(hist_write_pre_post.Average(), 0);
+ EXPECT_GT(hist_write_wal_time.Average(), 0);
+ EXPECT_GT(hist_write_memtable_time.Average(), 0);
+ EXPECT_EQ(hist_write_delay_time.Average(), 0);
+ EXPECT_EQ(hist_write_thread_wait_nanos.Average(), 0);
+ EXPECT_GT(hist_write_scheduling_time.Average(), 0);
+
+#ifndef NDEBUG
+ ASSERT_LT(total_db_mutex_nanos, 100U);
+#endif
+ }
+
+ db.reset();
+ db = OpenDb(true);
+
+ hist_get.Clear();
+ hist_get_snapshot.Clear();
+ hist_get_memtable.Clear();
+ hist_get_files.Clear();
+ hist_get_post_process.Clear();
+ hist_num_memtable_checked.Clear();
+
+ hist_mget.Clear();
+ hist_mget_snapshot.Clear();
+ hist_mget_memtable.Clear();
+ hist_mget_files.Clear();
+ hist_mget_post_process.Clear();
+ hist_mget_num_memtable_checked.Clear();
+
+ for (const int i : keys) {
+ if (i == kFlushFlag) {
+ continue;
+ }
+ std::string key = "k" + std::to_string(i);
+ std::string expected_value = "v" + std::to_string(i);
+ std::string value;
+
+ std::vector<Slice> multiget_keys = {Slice(key)};
+ std::vector<std::string> values;
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(read_options, key, &value));
+ ASSERT_EQ(expected_value, value);
+ hist_get_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_get_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_get_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_num_memtable_checked.Add(get_perf_context()->get_from_memtable_count);
+ hist_get_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_get.Add(get_perf_context()->user_key_comparison_count);
+
+ get_perf_context()->Reset();
+ auto statuses = db->MultiGet(read_options, multiget_keys, &values);
+ for (const auto& s : statuses) {
+ ASSERT_OK(s);
+ }
+ hist_mget_snapshot.Add(get_perf_context()->get_snapshot_time);
+ hist_mget_memtable.Add(get_perf_context()->get_from_memtable_time);
+ hist_mget_files.Add(get_perf_context()->get_from_output_files_time);
+ hist_mget_num_memtable_checked.Add(
+ get_perf_context()->get_from_memtable_count);
+ hist_mget_post_process.Add(get_perf_context()->get_post_process_time);
+ hist_mget.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "ReadOnly Get user key comparison: \n"
+ << hist_get.ToString()
+ << "ReadOnly MultiGet user key comparison: \n"
+ << hist_mget.ToString();
+
+ std::cout << "ReadOnly Get(): Time to get snapshot: \n"
+ << hist_get_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_get_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_get_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_get_post_process.ToString() << "\n";
+
+ std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n"
+ << hist_mget_snapshot.ToString()
+ << " Time to get value from memtables: \n"
+ << hist_mget_memtable.ToString() << "\n"
+ << " Time to get value from output files: \n"
+ << hist_mget_files.ToString() << "\n"
+ << " Number of memtables checked: \n"
+ << hist_mget_num_memtable_checked.ToString() << "\n"
+ << " Time to post process: \n"
+ << hist_mget_post_process.ToString() << "\n";
+ }
+
+ if (enabled_time) {
+ ASSERT_GT(hist_get.Average(), 0);
+ ASSERT_GT(hist_get_memtable.Average(), 0);
+ ASSERT_GT(hist_get_files.Average(), 0);
+ ASSERT_GT(hist_num_memtable_checked.Average(), 0);
+ // In read-only mode Get(), no super version operation is needed
+ ASSERT_EQ(hist_get_post_process.Average(), 0);
+ ASSERT_GT(hist_get_snapshot.Average(), 0);
+
+ ASSERT_GT(hist_mget.Average(), 0);
+ ASSERT_GT(hist_mget_snapshot.Average(), 0);
+ ASSERT_GT(hist_mget_memtable.Average(), 0);
+ ASSERT_GT(hist_mget_files.Average(), 0);
+ ASSERT_GT(hist_mget_post_process.Average(), 0);
+ ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0);
+ }
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(PerfContextTest, KeyComparisonCount) {
+ SetPerfLevel(kEnableCount);
+ ProfileQueries();
+
+ SetPerfLevel(kDisable);
+ ProfileQueries();
+
+ SetPerfLevel(kEnableTime);
+ ProfileQueries(true);
+}
+#endif // ROCKSDB_LITE
+
+// make perf_context_test
+// export ROCKSDB_TESTS=PerfContextTest.SeekKeyComparison
+// For one memtable:
+// ./perf_context_test --write_buffer_size=500000 --total_keys=10000
+// For two memtables:
+// ./perf_context_test --write_buffer_size=250000 --total_keys=10000
+// Specify --random_key=1 to shuffle the key before insertion
+// Results show that, for sequential insertion, the worst-case Seek key
+// comparison count is close to the total number of keys (linear) when there
+// is only one memtable. When there are two memtables, even the average Seek
+// key comparison count starts to grow linearly with the input size.
+
+TEST_F(PerfContextTest, SeekKeyComparison) {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ if (FLAGS_verbose) {
+ std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n";
+ }
+
+ std::vector<int> keys;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ keys.push_back(i);
+ }
+
+ if (FLAGS_random_key) {
+ RandomShuffle(std::begin(keys), std::end(keys));
+ }
+
+ HistogramImpl hist_put_time;
+ HistogramImpl hist_wal_time;
+ HistogramImpl hist_time_diff;
+
+ SetPerfLevel(kEnableTime);
+ StopWatchNano timer(SystemClock::Default().get());
+ for (const int i : keys) {
+ std::string key = "k" + std::to_string(i);
+ std::string value = "v" + std::to_string(i);
+
+ get_perf_context()->Reset();
+ timer.Start();
+ ASSERT_OK(db->Put(write_options, key, value));
+ auto put_time = timer.ElapsedNanos();
+ hist_put_time.Add(put_time);
+ hist_wal_time.Add(get_perf_context()->write_wal_time);
+ hist_time_diff.Add(put_time - get_perf_context()->write_wal_time);
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << "Put time:\n"
+ << hist_put_time.ToString() << "WAL time:\n"
+ << hist_wal_time.ToString() << "time diff:\n"
+ << hist_time_diff.ToString();
+ }
+
+ HistogramImpl hist_seek;
+ HistogramImpl hist_next;
+
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string key = "k" + std::to_string(i);
+ std::string value = "v" + std::to_string(i);
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ get_perf_context()->Reset();
+ iter->Seek(key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->value().ToString(), value);
+ hist_seek.Add(get_perf_context()->user_key_comparison_count);
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ for (iter->SeekToFirst(); iter->Valid();) {
+ get_perf_context()->Reset();
+ iter->Next();
+ hist_next.Add(get_perf_context()->user_key_comparison_count);
+ }
+ ASSERT_OK(iter->status());
+ if (FLAGS_verbose) {
+ std::cout << "Seek:\n"
+ << hist_seek.ToString() << "Next:\n"
+ << hist_next.ToString();
+ }
+}
+
+TEST_F(PerfContextTest, DBMutexLockCounter) {
+ int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+ for (PerfLevel perf_level_test :
+ {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) {
+ for (int c = 0; c < 2; ++c) {
+ InstrumentedMutex mutex(nullptr, SystemClock::Default().get(),
+ stats_code[c]);
+ mutex.Lock();
+ ROCKSDB_NAMESPACE::port::Thread child_thread([&] {
+ SetPerfLevel(perf_level_test);
+ get_perf_context()->Reset();
+ ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+ mutex.Lock();
+ mutex.Unlock();
+ if (perf_level_test == PerfLevel::kEnableTimeExceptForMutex ||
+ stats_code[c] != DB_MUTEX_WAIT_MICROS) {
+ ASSERT_EQ(get_perf_context()->db_mutex_lock_nanos, 0);
+ } else {
+ // increment the counter only when it's a DB Mutex
+ ASSERT_GT(get_perf_context()->db_mutex_lock_nanos, 0);
+ }
+ });
+ SystemClock::Default()->SleepForMicroseconds(100);
+ mutex.Unlock();
+ child_thread.join();
+ }
+ }
+}
+
+TEST_F(PerfContextTest, FalseDBMutexWait) {
+ SetPerfLevel(kEnableTime);
+ int stats_code[] = {0, static_cast<int>(DB_MUTEX_WAIT_MICROS)};
+ for (int c = 0; c < 2; ++c) {
+ InstrumentedMutex mutex(nullptr, SystemClock::Default().get(),
+ stats_code[c]);
+ InstrumentedCondVar lock(&mutex);
+ get_perf_context()->Reset();
+ mutex.Lock();
+ lock.TimedWait(100);
+ mutex.Unlock();
+ if (stats_code[c] == static_cast<int>(DB_MUTEX_WAIT_MICROS)) {
+ // increment the counter only when it's a DB Mutex
+ ASSERT_GT(get_perf_context()->db_condition_wait_nanos, 0);
+ } else {
+ ASSERT_EQ(get_perf_context()->db_condition_wait_nanos, 0);
+ }
+ }
+}
+
+TEST_F(PerfContextTest, ToString) {
+ get_perf_context()->Reset();
+ get_perf_context()->block_read_count = 12345;
+
+ std::string zero_included = get_perf_context()->ToString();
+ ASSERT_NE(std::string::npos, zero_included.find("= 0"));
+ ASSERT_NE(std::string::npos, zero_included.find("= 12345"));
+
+ std::string zero_excluded = get_perf_context()->ToString(true);
+ ASSERT_EQ(std::string::npos, zero_excluded.find("= 0"));
+ ASSERT_NE(std::string::npos, zero_excluded.find("= 12345"));
+}
+
+TEST_F(PerfContextTest, MergeOperatorTime) {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Status s = DB::Open(options, kDbName, &db);
+ EXPECT_OK(s);
+
+ std::string val;
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val1"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val2"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val3"));
+ ASSERT_OK(db->Merge(WriteOptions(), "k1", "val4"));
+
+ SetPerfLevel(kEnableTime);
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ ASSERT_OK(db->Flush(FlushOptions()));
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ ASSERT_OK(db->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+
+ get_perf_context()->Reset();
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+#ifdef OS_SOLARIS
+ for (int i = 0; i < 100; i++) {
+ ASSERT_OK(db->Get(ReadOptions(), "k1", &val));
+ }
+#endif
+ EXPECT_GT(get_perf_context()->merge_operator_time_nanos, 0);
+
+ delete db;
+}
+
+TEST_F(PerfContextTest, CopyAndMove) {
+ // Assignment operator
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_assign;
+ perf_context_assign = *get_perf_context();
+ ASSERT_EQ(
+ 1,
+ (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1,
+ (*(perf_context_assign.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_assign.ClearPerLevelPerfContext();
+ perf_context_assign.Reset();
+ }
+ // Copy constructor
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_copy(*get_perf_context());
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_copy.ClearPerLevelPerfContext();
+ perf_context_copy.Reset();
+ }
+ // Move constructor
+ {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ PerfContext perf_context_move = std::move(*get_perf_context());
+ ASSERT_EQ(
+ 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful);
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+ ASSERT_EQ(
+ 1, (*(perf_context_move.level_to_perf_context))[5].bloom_filter_useful);
+ perf_context_move.ClearPerLevelPerfContext();
+ perf_context_move.Reset();
+ }
+}
+
+TEST_F(PerfContextTest, PerfContextDisableEnable) {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0);
+ get_perf_context()->DisablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0);
+ get_perf_context()->DisablePerLevelPerfContext();
+ PerfContext perf_context_copy(*get_perf_context());
+ ASSERT_EQ(1, (*(perf_context_copy.level_to_perf_context))[0]
+ .bloom_filter_full_positive);
+  // This was set while the per-level perf context was disabled, so it should
+  // not be copied.
+ ASSERT_NE(
+ 1, (*(perf_context_copy.level_to_perf_context))[5].bloom_filter_useful);
+ ASSERT_EQ(
+ 1, (*(perf_context_copy.level_to_perf_context))[0].block_cache_hit_count);
+ perf_context_copy.ClearPerLevelPerfContext();
+ perf_context_copy.Reset();
+ get_perf_context()->ClearPerLevelPerfContext();
+ get_perf_context()->Reset();
+}
+
+TEST_F(PerfContextTest, PerfContextByLevelGetSet) {
+ get_perf_context()->Reset();
+ get_perf_context()->EnablePerLevelPerfContext();
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_positive, 1, 0);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 5);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_useful, 1, 7);
+ PERF_COUNTER_BY_LEVEL_ADD(bloom_filter_full_true_positive, 1, 2);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 1, 0);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_hit_count, 5, 2);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 2, 3);
+ PERF_COUNTER_BY_LEVEL_ADD(block_cache_miss_count, 4, 1);
+ ASSERT_EQ(
+ 0, (*(get_perf_context()->level_to_perf_context))[0].bloom_filter_useful);
+ ASSERT_EQ(
+ 1, (*(get_perf_context()->level_to_perf_context))[5].bloom_filter_useful);
+ ASSERT_EQ(
+ 2, (*(get_perf_context()->level_to_perf_context))[7].bloom_filter_useful);
+ ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[0]
+ .bloom_filter_full_positive);
+ ASSERT_EQ(1, (*(get_perf_context()->level_to_perf_context))[2]
+ .bloom_filter_full_true_positive);
+ ASSERT_EQ(
+ 1,
+ (*(get_perf_context()->level_to_perf_context))[0].block_cache_hit_count);
+ ASSERT_EQ(
+ 5,
+ (*(get_perf_context()->level_to_perf_context))[2].block_cache_hit_count);
+ ASSERT_EQ(
+ 2,
+ (*(get_perf_context()->level_to_perf_context))[3].block_cache_miss_count);
+ ASSERT_EQ(
+ 4,
+ (*(get_perf_context()->level_to_perf_context))[1].block_cache_miss_count);
+ std::string zero_excluded = get_perf_context()->ToString(true);
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_useful = 1@level5, 2@level7"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_full_positive = 1@level0"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("bloom_filter_full_true_positive = 1@level2"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("block_cache_hit_count = 1@level0, 5@level2"));
+ ASSERT_NE(std::string::npos,
+ zero_excluded.find("block_cache_miss_count = 4@level1, 2@level3"));
+}
+
+TEST_F(PerfContextTest, CPUTimer) {
+ if (SystemClock::Default()->CPUNanos() == 0) {
+ ROCKSDB_GTEST_SKIP("Target without CPUNanos support");
+ return;
+ }
+
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex);
+
+ std::string max_str = "0";
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ std::string i_str = std::to_string(i);
+ std::string key = "k" + i_str;
+ std::string value = "v" + i_str;
+ max_str = max_str > i_str ? max_str : i_str;
+
+ ASSERT_OK(db->Put(write_options, key, value));
+ }
+ std::string last_key = "k" + max_str;
+ std::string last_value = "v" + max_str;
+
+ {
+ // Get
+ get_perf_context()->Reset();
+ std::string value;
+ ASSERT_OK(db->Get(read_options, "k0", &value));
+ ASSERT_EQ(value, "v0");
+
+ if (FLAGS_verbose) {
+ std::cout << "Get CPU time nanos: " << get_perf_context()->get_cpu_nanos
+ << "ns\n";
+ }
+
+ // Iter
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ // Seek
+ get_perf_context()->Reset();
+ iter->Seek(last_key);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Seek CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekForPrev
+ get_perf_context()->Reset();
+ iter->SeekForPrev(last_key);
+ ASSERT_TRUE(iter->Valid());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekForPrev CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekToLast
+ get_perf_context()->Reset();
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(last_value, iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekToLast CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // SeekToFirst
+ get_perf_context()->Reset();
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter SeekToFirst CPU time nanos: "
+ << get_perf_context()->iter_seek_cpu_nanos << "ns\n";
+ }
+
+ // Next
+ get_perf_context()->Reset();
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Next CPU time nanos: "
+ << get_perf_context()->iter_next_cpu_nanos << "ns\n";
+ }
+
+ // Prev
+ get_perf_context()->Reset();
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v0", iter->value().ToString());
+
+ if (FLAGS_verbose) {
+ std::cout << "Iter Prev CPU time nanos: "
+ << get_perf_context()->iter_prev_cpu_nanos << "ns\n";
+ }
+
+ // monotonically increasing
+ get_perf_context()->Reset();
+ auto count = get_perf_context()->iter_seek_cpu_nanos;
+ for (int i = 0; i < FLAGS_total_keys; ++i) {
+ iter->Seek("k" + std::to_string(i));
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("v" + std::to_string(i), iter->value().ToString());
+ auto next_count = get_perf_context()->iter_seek_cpu_nanos;
+ ASSERT_GT(next_count, count);
+ count = next_count;
+ }
+
+ // iterator creation/destruction; multiple iterators
+ {
+ std::unique_ptr<Iterator> iter2(db->NewIterator(read_options));
+ ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+ iter2->Seek(last_key);
+ ASSERT_TRUE(iter2->Valid());
+ ASSERT_EQ(last_value, iter2->value().ToString());
+ ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, count);
+ count = get_perf_context()->iter_seek_cpu_nanos;
+ }
+ ASSERT_EQ(count, get_perf_context()->iter_seek_cpu_nanos);
+ }
+}
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+
+ for (int i = 1; i < argc; i++) {
+ int n;
+ char junk;
+
+ if (sscanf(argv[i], "--write_buffer_size=%d%c", &n, &junk) == 1) {
+ FLAGS_write_buffer_size = n;
+ }
+
+ if (sscanf(argv[i], "--total_keys=%d%c", &n, &junk) == 1) {
+ FLAGS_total_keys = n;
+ }
+
+ if (sscanf(argv[i], "--random_key=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_random_key = n;
+ }
+
+ if (sscanf(argv[i], "--use_set_based_memetable=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_use_set_based_memetable = n;
+ }
+
+ if (sscanf(argv[i], "--verbose=%d%c", &n, &junk) == 1 &&
+ (n == 0 || n == 1)) {
+ FLAGS_verbose = n;
+ }
+ }
+
+ if (FLAGS_verbose) {
+ std::cout << kDbName << "\n";
+ }
+
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/periodic_task_scheduler.cc b/src/rocksdb/db/periodic_task_scheduler.cc
new file mode 100644
index 000000000..2024510dd
--- /dev/null
+++ b/src/rocksdb/db/periodic_task_scheduler.cc
@@ -0,0 +1,113 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/periodic_task_scheduler.h"
+
+#include "rocksdb/system_clock.h"
+
+#ifndef ROCKSDB_LITE
+namespace ROCKSDB_NAMESPACE {
+
+// `timer_mutex` is a global mutex that currently serves 3 purposes:
+// (1) to ensure calls to `Start()` and `Shutdown()` are serialized, as
+//     they are currently not implemented in a thread-safe way;
+// (2) to ensure the `Timer::Add()`s and `Timer::Start()` run atomically, and
+//     the `Timer::Cancel()`s and `Timer::Shutdown()` run atomically; and
+// (3) to protect tasks_map_ in PeriodicTaskScheduler.
+// Note: a static global mutex is not efficient, but for
+// PeriodicTaskScheduler it should be okay, as the operations are called
+// infrequently.
+static port::Mutex timer_mutex;
+
+static const std::map<PeriodicTaskType, uint64_t> kDefaultPeriodSeconds = {
+ {PeriodicTaskType::kDumpStats, kInvalidPeriodSec},
+ {PeriodicTaskType::kPersistStats, kInvalidPeriodSec},
+ {PeriodicTaskType::kFlushInfoLog, 10},
+ {PeriodicTaskType::kRecordSeqnoTime, kInvalidPeriodSec},
+};
+
+static const std::map<PeriodicTaskType, std::string> kPeriodicTaskTypeNames = {
+ {PeriodicTaskType::kDumpStats, "dump_st"},
+ {PeriodicTaskType::kPersistStats, "pst_st"},
+ {PeriodicTaskType::kFlushInfoLog, "flush_info_log"},
+ {PeriodicTaskType::kRecordSeqnoTime, "record_seq_time"},
+};
+
+Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
+ const PeriodicTaskFunc& fn) {
+ return Register(task_type, fn, kDefaultPeriodSeconds.at(task_type));
+}
+
+Status PeriodicTaskScheduler::Register(PeriodicTaskType task_type,
+ const PeriodicTaskFunc& fn,
+ uint64_t repeat_period_seconds) {
+ MutexLock l(&timer_mutex);
+ static std::atomic<uint64_t> initial_delay(0);
+
+ if (repeat_period_seconds == kInvalidPeriodSec) {
+ return Status::InvalidArgument("Invalid task repeat period");
+ }
+ auto it = tasks_map_.find(task_type);
+ if (it != tasks_map_.end()) {
+    // The task already exists with the same period; no update is needed.
+ if (it->second.repeat_every_sec == repeat_period_seconds) {
+ return Status::OK();
+ }
+    // Cancel the existing task before registering the new one.
+ timer_->Cancel(it->second.name);
+ tasks_map_.erase(it);
+ }
+
+ timer_->Start();
+  // Put the task type name as a prefix for easier debugging.
+ std::string unique_id =
+ kPeriodicTaskTypeNames.at(task_type) + std::to_string(id_++);
+
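+  // Stagger the initial delay across registrations (modulo the repeat
+  // period) so that tasks registered around the same time, e.g. by multiple
+  // DB instances sharing the global timer, do not all fire at the same
+  // instant.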
+ bool succeeded = timer_->Add(
+ fn, unique_id,
+ (initial_delay.fetch_add(1) % repeat_period_seconds) * kMicrosInSecond,
+ repeat_period_seconds * kMicrosInSecond);
+ if (!succeeded) {
+ return Status::Aborted("Failed to register periodic task");
+ }
+ auto result = tasks_map_.try_emplace(
+ task_type, TaskInfo{unique_id, repeat_period_seconds});
+ if (!result.second) {
+ return Status::Aborted("Failed to add periodic task");
+  }
+ return Status::OK();
+}
+
+Status PeriodicTaskScheduler::Unregister(PeriodicTaskType task_type) {
+ MutexLock l(&timer_mutex);
+ auto it = tasks_map_.find(task_type);
+ if (it != tasks_map_.end()) {
+ timer_->Cancel(it->second.name);
+ tasks_map_.erase(it);
+ }
+ if (!timer_->HasPendingTask()) {
+ timer_->Shutdown();
+ }
+ return Status::OK();
+}
+
+Timer* PeriodicTaskScheduler::Default() {
+ static Timer timer(SystemClock::Default().get());
+ return &timer;
+}
+
+#ifndef NDEBUG
+void PeriodicTaskScheduler::TEST_OverrideTimer(SystemClock* clock) {
+ static Timer test_timer(clock);
+ test_timer.TEST_OverrideTimer(clock);
+ MutexLock l(&timer_mutex);
+ timer_ = &test_timer;
+}
+#endif // NDEBUG
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/periodic_task_scheduler.h b/src/rocksdb/db/periodic_task_scheduler.h
new file mode 100644
index 000000000..f45b80c4d
--- /dev/null
+++ b/src/rocksdb/db/periodic_task_scheduler.h
@@ -0,0 +1,110 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "util/timer.h"
+
+namespace ROCKSDB_NAMESPACE {
+class SystemClock;
+
+using PeriodicTaskFunc = std::function<void()>;
+
+constexpr uint64_t kInvalidPeriodSec = 0;
+
+// List of task types
+enum class PeriodicTaskType : uint8_t {
+ kDumpStats = 0,
+ kPersistStats,
+ kFlushInfoLog,
+ kRecordSeqnoTime,
+ kMax,
+};
+
+// PeriodicTaskScheduler contains the periodic tasks scheduled from a DB
+// instance. It is used to schedule/unschedule DumpStats(), PersistStats(),
+// FlushInfoLog(), etc. Each task type can only have one instance;
+// re-registering the same task type only updates the repeat period.
+//
+// Internally, it uses a global single-threaded timer object to run the
+// periodic task functions. The timer thread is always started since info log
+// flushing cannot be disabled.
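+//
+// A minimal usage sketch (illustrative only; the lambda body and the 600s
+// repeat period are made-up example values):
+//
+//   PeriodicTaskScheduler scheduler;
+//   Status s = scheduler.Register(
+//       PeriodicTaskType::kDumpStats, [] { /* dump stats */ },
+//       /*repeat_period_seconds=*/600);
+//   // ... later, to stop the task:
+//   s = scheduler.Unregister(PeriodicTaskType::kDumpStats);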
+class PeriodicTaskScheduler {
+ public:
+ explicit PeriodicTaskScheduler() = default;
+
+ PeriodicTaskScheduler(const PeriodicTaskScheduler&) = delete;
+ PeriodicTaskScheduler(PeriodicTaskScheduler&&) = delete;
+ PeriodicTaskScheduler& operator=(const PeriodicTaskScheduler&) = delete;
+ PeriodicTaskScheduler& operator=(PeriodicTaskScheduler&&) = delete;
+
+ // Register a task with its default repeat period
+ Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn);
+
+  // Register a task with the specified repeat period. 0 (kInvalidPeriodSec)
+  // is an invalid argument. To stop a task, use Unregister() instead.
+ Status Register(PeriodicTaskType task_type, const PeriodicTaskFunc& fn,
+ uint64_t repeat_period_seconds);
+
+ // Unregister the task
+ Status Unregister(PeriodicTaskType task_type);
+
+#ifndef NDEBUG
+ // Override the timer for the unittest
+ void TEST_OverrideTimer(SystemClock* clock);
+
+  // Call Timer::TEST_WaitForRun(), which waits until the Timer starts waiting.
+ void TEST_WaitForRun(const std::function<void()>& callback) const {
+ if (timer_ != nullptr) {
+ timer_->TEST_WaitForRun(callback);
+ }
+ }
+
+  // Get the number of valid (pending) tasks in the global Timer
+ size_t TEST_GetValidTaskNum() const {
+ if (timer_ != nullptr) {
+ return timer_->TEST_GetPendingTaskNum();
+ }
+ return 0;
+ }
+
+  // Whether the specified task type is registered
+ bool TEST_HasTask(PeriodicTaskType task_type) const {
+ auto it = tasks_map_.find(task_type);
+ return it != tasks_map_.end();
+ }
+#endif // NDEBUG
+
+ private:
+ // default global Timer instance
+ static Timer* Default();
+
+ // Internal structure to store task information
+ struct TaskInfo {
+ TaskInfo(std::string _name, uint64_t _repeat_every_sec)
+ : name(std::move(_name)), repeat_every_sec(_repeat_every_sec) {}
+ std::string name;
+ uint64_t repeat_every_sec;
+ };
+
+ // Internal tasks map
+ std::map<PeriodicTaskType, TaskInfo> tasks_map_;
+
+  // Global timer pointer. The Timer doesn't support adding/canceling tasks
+  // synchronously, so the global `timer_mutex` is used when adding/canceling
+  // tasks.
+ Timer* timer_ = Default();
+
+ // Global task id, protected by the global `timer_mutex`
+ inline static uint64_t id_;
+
+ static constexpr uint64_t kMicrosInSecond = 1000U * 1000U;
+};
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/periodic_task_scheduler_test.cc b/src/rocksdb/db/periodic_task_scheduler_test.cc
new file mode 100644
index 000000000..4abea4d5e
--- /dev/null
+++ b/src/rocksdb/db/periodic_task_scheduler_test.cc
@@ -0,0 +1,231 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/periodic_task_scheduler.h"
+
+#include "db/db_test_util.h"
+#include "env/composite_env_wrapper.h"
+#include "test_util/mock_time_env.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class PeriodicTaskSchedulerTest : public DBTestBase {
+ public:
+ PeriodicTaskSchedulerTest()
+ : DBTestBase("periodic_task_scheduler_test", /*env_do_fsync=*/true) {
+ mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+ mock_env_.reset(new CompositeEnvWrapper(env_, mock_clock_));
+ }
+
+ protected:
+ std::unique_ptr<Env> mock_env_;
+ std::shared_ptr<MockSystemClock> mock_clock_;
+
+ void SetUp() override {
+ mock_clock_->InstallTimedWaitFixCallback();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+ auto periodic_task_scheduler_ptr =
+ reinterpret_cast<PeriodicTaskScheduler*>(arg);
+ periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
+ });
+ }
+};
+
+TEST_F(PeriodicTaskSchedulerTest, Basic) {
+ constexpr unsigned int kPeriodSec = 10;
+ Close();
+ Options options;
+ options.stats_dump_period_sec = kPeriodSec;
+ options.stats_persist_period_sec = kPeriodSec;
+ options.create_if_missing = true;
+ options.env = mock_env_.get();
+
+ int dump_st_counter = 0;
+ SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:StartRunning",
+ [&](void*) { dump_st_counter++; });
+
+ int pst_st_counter = 0;
+ SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning",
+ [&](void*) { pst_st_counter++; });
+
+ int flush_info_log_counter = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::FlushInfoLog:StartRunning",
+ [&](void*) { flush_info_log_counter++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ Reopen(options);
+
+ ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_dump_period_sec);
+ ASSERT_EQ(kPeriodSec, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+ ASSERT_GT(kPeriodSec, 1u);
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec) - 1);
+ });
+
+ const PeriodicTaskScheduler& scheduler =
+ dbfull()->TEST_GetPeriodicTaskScheduler();
+ ASSERT_EQ(3, scheduler.TEST_GetValidTaskNum());
+
+ ASSERT_EQ(1, dump_st_counter);
+ ASSERT_EQ(1, pst_st_counter);
+ ASSERT_EQ(1, flush_info_log_counter);
+
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+
+ ASSERT_EQ(2, dump_st_counter);
+ ASSERT_EQ(2, pst_st_counter);
+ ASSERT_EQ(2, flush_info_log_counter);
+
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+
+ ASSERT_EQ(3, dump_st_counter);
+ ASSERT_EQ(3, pst_st_counter);
+ ASSERT_EQ(3, flush_info_log_counter);
+
+ // Disable scheduler with SetOption
+ ASSERT_OK(dbfull()->SetDBOptions(
+ {{"stats_dump_period_sec", "0"}, {"stats_persist_period_sec", "0"}}));
+ ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_dump_period_sec);
+ ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+ // Info log flush should still run.
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+ ASSERT_EQ(3, dump_st_counter);
+ ASSERT_EQ(3, pst_st_counter);
+ ASSERT_EQ(4, flush_info_log_counter);
+
+ ASSERT_EQ(1u, scheduler.TEST_GetValidTaskNum());
+
+ // Re-enable one task
+ ASSERT_OK(dbfull()->SetDBOptions({{"stats_dump_period_sec", "5"}}));
+ ASSERT_EQ(5u, dbfull()->GetDBOptions().stats_dump_period_sec);
+ ASSERT_EQ(0u, dbfull()->GetDBOptions().stats_persist_period_sec);
+
+ ASSERT_EQ(2, scheduler.TEST_GetValidTaskNum());
+
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kPeriodSec)); });
+ ASSERT_EQ(4, dump_st_counter);
+ ASSERT_EQ(3, pst_st_counter);
+ ASSERT_EQ(5, flush_info_log_counter);
+
+ Close();
+}
+
+TEST_F(PeriodicTaskSchedulerTest, MultiInstances) {
+ constexpr int kPeriodSec = 5;
+ const int kInstanceNum = 10;
+
+ Close();
+ Options options;
+ options.stats_dump_period_sec = kPeriodSec;
+ options.stats_persist_period_sec = kPeriodSec;
+ options.create_if_missing = true;
+ options.env = mock_env_.get();
+
+ int dump_st_counter = 0;
+ SyncPoint::GetInstance()->SetCallBack("DBImpl::DumpStats:2",
+ [&](void*) { dump_st_counter++; });
+
+ int pst_st_counter = 0;
+ SyncPoint::GetInstance()->SetCallBack("DBImpl::PersistStats:StartRunning",
+ [&](void*) { pst_st_counter++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ auto dbs = std::vector<DB*>(kInstanceNum);
+ for (int i = 0; i < kInstanceNum; i++) {
+ ASSERT_OK(
+ DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
+ }
+
+ auto dbi = static_cast_with_check<DBImpl>(dbs[kInstanceNum - 1]);
+
+ const PeriodicTaskScheduler& scheduler = dbi->TEST_GetPeriodicTaskScheduler();
+ ASSERT_EQ(kInstanceNum * 3, scheduler.TEST_GetValidTaskNum());
+
+ int expected_run = kInstanceNum;
+ dbi->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kPeriodSec - 1); });
+ ASSERT_EQ(expected_run, dump_st_counter);
+ ASSERT_EQ(expected_run, pst_st_counter);
+
+ expected_run += kInstanceNum;
+ dbi->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+ ASSERT_EQ(expected_run, dump_st_counter);
+ ASSERT_EQ(expected_run, pst_st_counter);
+
+ expected_run += kInstanceNum;
+ dbi->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+ ASSERT_EQ(expected_run, dump_st_counter);
+ ASSERT_EQ(expected_run, pst_st_counter);
+
+ int half = kInstanceNum / 2;
+ for (int i = 0; i < half; i++) {
+ delete dbs[i];
+ }
+
+ expected_run += (kInstanceNum - half) * 2;
+
+ dbi->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+ dbi->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(kPeriodSec); });
+ ASSERT_EQ(expected_run, dump_st_counter);
+ ASSERT_EQ(expected_run, pst_st_counter);
+
+ for (int i = half; i < kInstanceNum; i++) {
+ ASSERT_OK(dbs[i]->Close());
+ delete dbs[i];
+ }
+}
+
+TEST_F(PeriodicTaskSchedulerTest, MultiEnv) {
+ constexpr int kDumpPeriodSec = 5;
+ constexpr int kPersistPeriodSec = 10;
+ Close();
+ Options options1;
+ options1.stats_dump_period_sec = kDumpPeriodSec;
+ options1.stats_persist_period_sec = kPersistPeriodSec;
+ options1.create_if_missing = true;
+ options1.env = mock_env_.get();
+
+ Reopen(options1);
+
+ std::unique_ptr<Env> mock_env2(
+ new CompositeEnvWrapper(Env::Default(), mock_clock_));
+ Options options2;
+ options2.stats_dump_period_sec = kDumpPeriodSec;
+ options2.stats_persist_period_sec = kPersistPeriodSec;
+ options2.create_if_missing = true;
+  options2.env = mock_env2.get();
+
+ std::string dbname = test::PerThreadDBPath("multi_env_test");
+ DB* db;
+ ASSERT_OK(DB::Open(options2, dbname, &db));
+
+ ASSERT_OK(db->Close());
+ delete db;
+ Close();
+}
+
+#endif // !ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/pinned_iterators_manager.h b/src/rocksdb/db/pinned_iterators_manager.h
new file mode 100644
index 000000000..0fcf231da
--- /dev/null
+++ b/src/rocksdb/db/pinned_iterators_manager.h
@@ -0,0 +1,92 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#pragma once
+#include <algorithm>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// PinnedIteratorsManager will be notified whenever we need to pin an Iterator
+// and it will be responsible for deleting pinned Iterators when they are
+// not needed anymore.
+class PinnedIteratorsManager : public Cleanable {
+ public:
+ PinnedIteratorsManager() : pinning_enabled(false) {}
+ ~PinnedIteratorsManager() {
+ if (pinning_enabled) {
+ ReleasePinnedData();
+ }
+ }
+
+ // Move constructor and move assignment are allowed.
+ PinnedIteratorsManager(PinnedIteratorsManager&& other) noexcept = default;
+ PinnedIteratorsManager& operator=(PinnedIteratorsManager&& other) noexcept =
+ default;
+
+ // Enable Iterators pinning
+ void StartPinning() {
+ assert(pinning_enabled == false);
+ pinning_enabled = true;
+ }
+
+ // Is pinning enabled?
+ bool PinningEnabled() { return pinning_enabled; }
+
+ // Take ownership of iter and delete it when ReleasePinnedData() is called
+ void PinIterator(InternalIterator* iter, bool arena = false) {
+ if (arena) {
+ PinPtr(iter, &PinnedIteratorsManager::ReleaseArenaInternalIterator);
+ } else {
+ PinPtr(iter, &PinnedIteratorsManager::ReleaseInternalIterator);
+ }
+ }
+
+ using ReleaseFunction = void (*)(void* arg1);
+ void PinPtr(void* ptr, ReleaseFunction release_func) {
+ assert(pinning_enabled);
+ if (ptr == nullptr) {
+ return;
+ }
+ pinned_ptrs_.emplace_back(ptr, release_func);
+ }
+
+ // Release pinned Iterators
+ inline void ReleasePinnedData() {
+ assert(pinning_enabled == true);
+ pinning_enabled = false;
+
+ // Remove duplicate pointers
+ std::sort(pinned_ptrs_.begin(), pinned_ptrs_.end());
+ auto unique_end = std::unique(pinned_ptrs_.begin(), pinned_ptrs_.end());
+
+ for (auto i = pinned_ptrs_.begin(); i != unique_end; ++i) {
+ void* ptr = i->first;
+ ReleaseFunction release_func = i->second;
+ release_func(ptr);
+ }
+ pinned_ptrs_.clear();
+ // Also do cleanups from the base Cleanable
+ Cleanable::Reset();
+ }
+
+ private:
+ static void ReleaseInternalIterator(void* ptr) {
+ delete reinterpret_cast<InternalIterator*>(ptr);
+ }
+
+ static void ReleaseArenaInternalIterator(void* ptr) {
+ reinterpret_cast<InternalIterator*>(ptr)->~InternalIterator();
+ }
+
+ bool pinning_enabled;
+ std::vector<std::pair<void*, ReleaseFunction>> pinned_ptrs_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
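A minimal usage sketch for the class above (hypothetical caller PinExample; the real call sites such as DBIter are outside this patch): pinning must be enabled before pointers are handed over, heap-allocated iterators are deleted on release while arena-placed ones only have their destructor run, and ReleasePinnedData() deduplicates pointers so each one is released exactly once.

    #include "db/pinned_iterators_manager.h"
    #include "table/internal_iterator.h"

    namespace ROCKSDB_NAMESPACE {

    // Pin a heap-allocated iterator: ownership transfers to the manager and
    // the iterator is deleted when ReleasePinnedData() runs (or in the
    // manager's destructor if pinning is still enabled at that point).
    void PinExample(InternalIterator* heap_iter) {
      PinnedIteratorsManager pin_mgr;
      pin_mgr.StartPinning();
      pin_mgr.PinIterator(heap_iter, /*arena=*/false);
      // ... keep using Slices that point into heap_iter's pinned data ...
      pin_mgr.ReleasePinnedData();  // heap_iter is deleted here, exactly once
    }

    }  // namespace ROCKSDB_NAMESPACE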
diff --git a/src/rocksdb/db/plain_table_db_test.cc b/src/rocksdb/db/plain_table_db_test.cc
new file mode 100644
index 000000000..755b639b0
--- /dev/null
+++ b/src/rocksdb/db/plain_table_db_test.cc
@@ -0,0 +1,1357 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <set>
+
+#include "db/db_impl/db_impl.h"
+#include "db/version_set.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/compaction_filter.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/table.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_bloom.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/plain/plain_table_key_coding.h"
+#include "table/plain/plain_table_reader.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/cast_util.h"
+#include "util/hash.h"
+#include "util/mutexlock.h"
+#include "util/random.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+class PlainTableKeyDecoderTest : public testing::Test {};
+
+TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) {
+ Random rnd(301);
+ const uint32_t kLength = 2222;
+ std::string tmp = rnd.RandomString(kLength);
+ Slice contents(tmp);
+ test::StringSource* string_source =
+ new test::StringSource(contents, 0, false);
+ std::unique_ptr<FSRandomAccessFile> holder(string_source);
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(std::move(holder), "test"));
+ std::unique_ptr<PlainTableReaderFileInfo> file_info(
+ new PlainTableReaderFileInfo(std::move(file_reader), EnvOptions(),
+ kLength));
+
+ {
+ PlainTableFileReader reader(file_info.get());
+
+ const uint32_t kReadSize = 77;
+ for (uint32_t pos = 0; pos < kLength; pos += kReadSize) {
+ uint32_t read_size = std::min(kLength - pos, kReadSize);
+ Slice out;
+ ASSERT_TRUE(reader.Read(pos, read_size, &out));
+ ASSERT_EQ(0, out.compare(tmp.substr(pos, read_size)));
+ }
+
+ ASSERT_LT(uint32_t(string_source->total_reads()), kLength / kReadSize / 2);
+ }
+
+ std::vector<std::vector<std::pair<uint32_t, uint32_t>>> reads = {
+ {{600, 30}, {590, 30}, {600, 20}, {600, 40}},
+ {{800, 20}, {100, 20}, {500, 20}, {1500, 20}, {100, 20}, {80, 20}},
+ {{1000, 20}, {500, 20}, {1000, 50}},
+ {{1000, 20}, {500, 20}, {500, 20}},
+ {{1000, 20}, {500, 20}, {200, 20}, {500, 20}},
+ {{1000, 20}, {500, 20}, {200, 20}, {1000, 50}},
+ {{600, 500}, {610, 20}, {100, 20}},
+ {{500, 100}, {490, 100}, {550, 50}},
+ };
+
+ std::vector<int> num_file_reads = {2, 6, 2, 2, 4, 3, 2, 2};
+
+ for (size_t i = 0; i < reads.size(); i++) {
+ string_source->set_total_reads(0);
+ PlainTableFileReader reader(file_info.get());
+ for (auto p : reads[i]) {
+ Slice out;
+ ASSERT_TRUE(reader.Read(p.first, p.second, &out));
+ ASSERT_EQ(0, out.compare(tmp.substr(p.first, p.second)));
+ }
+ ASSERT_EQ(num_file_reads[i], string_source->total_reads());
+ }
+}
+
+class PlainTableDBTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ protected:
+ private:
+ std::string dbname_;
+ Env* env_;
+ DB* db_;
+
+ bool mmap_mode_;
+ Options last_options_;
+
+ public:
+ PlainTableDBTest() : env_(Env::Default()) {}
+
+ ~PlainTableDBTest() override {
+ delete db_;
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ void SetUp() override {
+ mmap_mode_ = GetParam();
+ dbname_ = test::PerThreadDBPath("plain_table_db_test");
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ db_ = nullptr;
+ Reopen();
+ }
+
+ // Return the current option configuration.
+ Options CurrentOptions() {
+ Options options;
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = 2;
+ plain_table_options.hash_table_ratio = 0.8;
+ plain_table_options.index_sparseness = 3;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPrefix;
+ plain_table_options.full_scan_mode = false;
+ plain_table_options.store_index_in_file = false;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true));
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ options.allow_mmap_reads = mmap_mode_;
+ options.allow_concurrent_memtable_write = false;
+ options.unordered_write = false;
+ return options;
+ }
+
+ DBImpl* dbfull() { return static_cast_with_check<DBImpl>(db_); }
+
+ void Reopen(Options* options = nullptr) { ASSERT_OK(TryReopen(options)); }
+
+ void Close() {
+ delete db_;
+ db_ = nullptr;
+ }
+
+ bool mmap_mode() const { return mmap_mode_; }
+
+ void DestroyAndReopen(Options* options = nullptr) {
+ // Destroy using last options
+ Destroy(&last_options_);
+ ASSERT_OK(TryReopen(options));
+ }
+
+ void Destroy(Options* options) {
+ delete db_;
+ db_ = nullptr;
+ ASSERT_OK(DestroyDB(dbname_, *options));
+ }
+
+ Status PureReopen(Options* options, DB** db) {
+ return DB::Open(*options, dbname_, db);
+ }
+
+ Status ReopenForReadOnly(Options* options) {
+ delete db_;
+ db_ = nullptr;
+ return DB::OpenForReadOnly(*options, dbname_, &db_);
+ }
+
+ Status TryReopen(Options* options = nullptr) {
+ delete db_;
+ db_ = nullptr;
+ Options opts;
+ if (options != nullptr) {
+ opts = *options;
+ } else {
+ opts = CurrentOptions();
+ opts.create_if_missing = true;
+ }
+ last_options_ = opts;
+
+ return DB::Open(opts, dbname_, &db_);
+ }
+
+ Status Put(const Slice& k, const Slice& v) {
+ return db_->Put(WriteOptions(), k, v);
+ }
+
+ Status Delete(const std::string& k) { return db_->Delete(WriteOptions(), k); }
+
+ std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) {
+ ReadOptions options;
+ options.snapshot = snapshot;
+ std::string result;
+ Status s = db_->Get(options, k, &result);
+ if (s.IsNotFound()) {
+ result = "NOT_FOUND";
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+ }
+
+ int NumTableFilesAtLevel(int level) {
+ std::string property;
+ EXPECT_TRUE(db_->GetProperty(
+ "rocksdb.num-files-at-level" + std::to_string(level), &property));
+ return atoi(property.c_str());
+ }
+
+ // Return spread of files per level
+ std::string FilesPerLevel() {
+ std::string result;
+ size_t last_non_zero_offset = 0;
+ for (int level = 0; level < db_->NumberLevels(); level++) {
+ int f = NumTableFilesAtLevel(level);
+ char buf[100];
+ snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f);
+ result += buf;
+ if (f > 0) {
+ last_non_zero_offset = result.size();
+ }
+ }
+ result.resize(last_non_zero_offset);
+ return result;
+ }
+
+ std::string IterStatus(Iterator* iter) {
+ std::string result;
+ if (iter->Valid()) {
+ result = iter->key().ToString() + "->" + iter->value().ToString();
+ } else {
+ result = "(invalid)";
+ }
+ return result;
+ }
+};
+
+TEST_P(PlainTableDBTest, Empty) {
+ ASSERT_TRUE(dbfull() != nullptr);
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000foo"));
+}
+
+extern const uint64_t kPlainTableMagicNumber;
+
+class TestPlainTableReader : public PlainTableReader {
+ public:
+ TestPlainTableReader(
+ const EnvOptions& env_options, const InternalKeyComparator& icomparator,
+ EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key,
+ double hash_table_ratio, size_t index_sparseness,
+ std::unique_ptr<TableProperties>&& props,
+ std::unique_ptr<RandomAccessFileReader>&& file,
+ const ImmutableOptions& ioptions, const SliceTransform* prefix_extractor,
+ bool* expect_bloom_not_match, bool store_index_in_file,
+ uint32_t column_family_id, const std::string& column_family_name)
+ : PlainTableReader(ioptions, std::move(file), env_options, icomparator,
+ encoding_type, file_size, props.get(),
+ prefix_extractor),
+ expect_bloom_not_match_(expect_bloom_not_match) {
+ Status s = MmapDataIfNeeded();
+ EXPECT_TRUE(s.ok());
+
+ s = PopulateIndex(props.get(), bloom_bits_per_key, hash_table_ratio,
+ index_sparseness, 2 * 1024 * 1024);
+ EXPECT_TRUE(s.ok());
+
+ EXPECT_EQ(column_family_id, static_cast<uint32_t>(props->column_family_id));
+ EXPECT_EQ(column_family_name, props->column_family_name);
+ if (store_index_in_file) {
+ auto bloom_version_ptr = props->user_collected_properties.find(
+ PlainTablePropertyNames::kBloomVersion);
+ EXPECT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
+ EXPECT_EQ(bloom_version_ptr->second, std::string("1"));
+ if (ioptions.bloom_locality > 0) {
+ auto num_blocks_ptr = props->user_collected_properties.find(
+ PlainTablePropertyNames::kNumBloomBlocks);
+ EXPECT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
+ }
+ }
+ table_properties_ = std::move(props);
+ }
+
+ ~TestPlainTableReader() override {}
+
+ private:
+ bool MatchBloom(uint32_t hash) const override {
+ bool ret = PlainTableReader::MatchBloom(hash);
+ if (*expect_bloom_not_match_) {
+ EXPECT_TRUE(!ret);
+ } else {
+ EXPECT_TRUE(ret);
+ }
+ return ret;
+ }
+ bool* expect_bloom_not_match_;
+};
+
+extern const uint64_t kPlainTableMagicNumber;
+class TestPlainTableFactory : public PlainTableFactory {
+ public:
+ explicit TestPlainTableFactory(bool* expect_bloom_not_match,
+ const PlainTableOptions& options,
+ uint32_t column_family_id,
+ std::string column_family_name)
+ : PlainTableFactory(options),
+ bloom_bits_per_key_(options.bloom_bits_per_key),
+ hash_table_ratio_(options.hash_table_ratio),
+ index_sparseness_(options.index_sparseness),
+ store_index_in_file_(options.store_index_in_file),
+ expect_bloom_not_match_(expect_bloom_not_match),
+ column_family_id_(column_family_id),
+ column_family_name_(std::move(column_family_name)) {}
+
+ using PlainTableFactory::NewTableReader;
+ Status NewTableReader(
+ const ReadOptions& /*ro*/, const TableReaderOptions& table_reader_options,
+ std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
+ std::unique_ptr<TableReader>* table,
+ bool /*prefetch_index_and_filter_in_cache*/) const override {
+ std::unique_ptr<TableProperties> props;
+ auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions, &props);
+ EXPECT_TRUE(s.ok());
+
+ if (store_index_in_file_) {
+ BlockHandle bloom_block_handle;
+ s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions,
+ BloomBlockBuilder::kBloomBlock,
+ &bloom_block_handle);
+ EXPECT_TRUE(s.ok());
+
+ BlockHandle index_block_handle;
+ s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber,
+ table_reader_options.ioptions,
+ PlainTableIndexBuilder::kPlainTableIndexBlock,
+ &index_block_handle);
+ EXPECT_TRUE(s.ok());
+ }
+
+ auto& user_props = props->user_collected_properties;
+ auto encoding_type_prop =
+ user_props.find(PlainTablePropertyNames::kEncodingType);
+ assert(encoding_type_prop != user_props.end());
+ EncodingType encoding_type = static_cast<EncodingType>(
+ DecodeFixed32(encoding_type_prop->second.c_str()));
+
+ std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
+ table_reader_options.env_options,
+ table_reader_options.internal_comparator, encoding_type, file_size,
+ bloom_bits_per_key_, hash_table_ratio_, index_sparseness_,
+ std::move(props), std::move(file), table_reader_options.ioptions,
+ table_reader_options.prefix_extractor.get(), expect_bloom_not_match_,
+ store_index_in_file_, column_family_id_, column_family_name_));
+
+ *table = std::move(new_reader);
+ return s;
+ }
+
+ private:
+ int bloom_bits_per_key_;
+ double hash_table_ratio_;
+ size_t index_sparseness_;
+ bool store_index_in_file_;
+ bool* expect_bloom_not_match_;
+ const uint32_t column_family_id_;
+ const std::string column_family_name_;
+};
+
+TEST_P(PlainTableDBTest, BadOptions1) {
+ // Build with a prefix extractor
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ // Bad attempt to re-open without a prefix extractor
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset();
+ ASSERT_EQ(
+ "Invalid argument: Prefix extractor is missing when opening a PlainTable "
+ "built using a prefix extractor",
+ TryReopen(&options).ToString());
+
+ // Bad attempt to re-open with different prefix extractor
+ options.prefix_extractor.reset(NewFixedPrefixTransform(6));
+ ASSERT_EQ(
+ "Invalid argument: Prefix extractor given doesn't match the one used to "
+ "build PlainTable",
+ TryReopen(&options).ToString());
+
+ // Correct prefix extractor
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, BadOptions2) {
+ Options options = CurrentOptions();
+ options.prefix_extractor.reset();
+ options.create_if_missing = true;
+ DestroyAndReopen(&options);
+ // Build without a prefix extractor
+ // (apparently works even if hash_table_ratio > 0)
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ // Building the table without a prefix extractor fails; this call returns
+ // the status of that bad attempt.
+ ASSERT_NOK(dbfull()->TEST_FlushMemTable());
+
+ // Bad attempt to re-open with hash_table_ratio > 0 and no prefix extractor
+ Status s = TryReopen(&options);
+ ASSERT_EQ(
+ "Not implemented: PlainTable requires a prefix extractor enable prefix "
+ "hash mode.",
+ s.ToString());
+
+ // OK to open with hash_table_ratio == 0 and no prefix extractor
+ PlainTableOptions plain_table_options;
+ plain_table_options.hash_table_ratio = 0;
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+
+ // OK to open newly with a prefix_extractor and hash table; builds index
+ // in memory.
+ options = CurrentOptions();
+ Reopen(&options);
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+}
+
+TEST_P(PlainTableDBTest, Flush) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom = -1; bloom <= 117; bloom += 117) {
+ const int bloom_bits = std::max(bloom, 0);
+ const bool full_scan_mode = bloom < 0;
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ for (int store_index_in_file = 0; store_index_in_file <= 1;
+ ++store_index_in_file) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ if (total_order) {
+ options.prefix_extractor.reset();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.full_scan_mode = full_scan_mode;
+ plain_table_options.store_index_in_file = store_index_in_file;
+
+ options.table_factory.reset(
+ NewPlainTableFactory(plain_table_options));
+ } else {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.full_scan_mode = full_scan_mode;
+ plain_table_options.store_index_in_file = store_index_in_file;
+
+ options.table_factory.reset(
+ NewPlainTableFactory(plain_table_options));
+ }
+ DestroyAndReopen(&options);
+ uint64_t int_num;
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_EQ(int_num, 0U);
+
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(Put("0000000000000bar", "v2"));
+ ASSERT_OK(Put("1000000000000foo", "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_TRUE(dbfull()->GetIntProperty(
+ "rocksdb.estimate-table-readers-mem", &int_num));
+ ASSERT_GT(int_num, 0U);
+
+ TablePropertiesCollection ptc;
+ ASSERT_OK(reinterpret_cast<DB*>(dbfull())->GetPropertiesOfAllTables(
+ &ptc));
+ ASSERT_EQ(1U, ptc.size());
+ auto row = ptc.begin();
+ auto tp = row->second;
+
+ if (full_scan_mode) {
+ // Does not support Get/Seek
+ std::unique_ptr<Iterator> iter(
+ dbfull()->NewIterator(ReadOptions()));
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("0000000000000bar", iter->key().ToString());
+ ASSERT_EQ("v2", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000000foo", iter->key().ToString());
+ ASSERT_EQ("v3", iter->value().ToString());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_TRUE(iter->status().ok());
+ } else {
+ if (!store_index_in_file) {
+ ASSERT_EQ(total_order ? "4" : "12",
+ (tp->user_collected_properties)
+ .at("plain_table_hash_table_size"));
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_sub_index_size"));
+ } else {
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_hash_table_size"));
+ ASSERT_EQ("0", (tp->user_collected_properties)
+ .at("plain_table_sub_index_size"));
+ }
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, Flush2) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ for (int store_index_in_file = 0; store_index_in_file <= 1;
+ ++store_index_in_file) {
+ if (encoding_type == kPrefix && total_order) {
+ continue;
+ }
+ if (!bloom_bits && store_index_in_file) {
+ continue;
+ }
+ if (total_order && store_index_in_file) {
+ continue;
+ }
+ bool expect_bloom_not_match = false;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ if (total_order) {
+ options.prefix_extractor = nullptr;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ } else {
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ }
+ plain_table_options.user_key_len = kPlainTableVariableLength;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+ plain_table_options.store_index_in_file = store_index_in_file;
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("0000000000000bar", "b"));
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_OK(Put("1000000000000foo", "v2"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v2", Get("1000000000000foo"));
+
+ ASSERT_OK(Put("0000000000000eee", "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v3", Get("0000000000000eee"));
+
+ ASSERT_OK(Delete("0000000000000bar"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000bar"));
+
+ ASSERT_OK(Put("0000000000000eee", "v5"));
+ ASSERT_OK(Put("9000000000000eee", "v5"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v5", Get("0000000000000eee"));
+
+ // Test Bloom Filter
+ if (bloom_bits > 0) {
+ // Neither the key nor its prefix exists.
+ expect_bloom_not_match = true;
+ ASSERT_EQ("NOT_FOUND", Get("5_not00000000bar"));
+ // Key doesn't exist any more but prefix exists.
+ if (total_order) {
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000not"));
+ ASSERT_EQ("NOT_FOUND", Get("0000000000000not"));
+ }
+ expect_bloom_not_match = false;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, Immortal) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ options.max_open_files = -1;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.user_key_len = kPlainTableVariableLength;
+ plain_table_options.bloom_bits_per_key = 10;
+ plain_table_options.encoding_type = encoding_type;
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("0000000000000bar", "b"));
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ int copied = 0;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "GetContext::SaveValue::PinSelf", [&](void* /*arg*/) { copied++; });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_EQ("b", Get("0000000000000bar"));
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+ ASSERT_EQ(2, copied);
+ copied = 0;
+
+ Close();
+ ASSERT_OK(ReopenForReadOnly(&options));
+
+ ASSERT_EQ("b", Get("0000000000000bar"));
+ ASSERT_EQ("v1", Get("1000000000000foo"));
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+ if (mmap_mode()) {
+ ASSERT_EQ(0, copied);
+ } else {
+ ASSERT_EQ(2, copied);
+ }
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+ }
+}
+
+TEST_P(PlainTableDBTest, Iterator) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (EncodingType encoding_type : {kPlain, kPrefix}) {
+ for (int bloom_bits = 0; bloom_bits <= 117; bloom_bits += 117) {
+ for (int total_order = 0; total_order <= 1; total_order++) {
+ if (encoding_type == kPrefix && total_order == 1) {
+ continue;
+ }
+ bool expect_bloom_not_match = false;
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ if (total_order) {
+ options.prefix_extractor = nullptr;
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+ } else {
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = bloom_bits;
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+ plain_table_options.encoding_type = encoding_type;
+
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options,
+ 0 /* column_family_id */, kDefaultColumnFamilyName));
+ }
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000foo002", "v_2"));
+ ASSERT_OK(Put("0000000000000bar", "random"));
+ ASSERT_OK(Put("1000000000foo001", "v1"));
+ ASSERT_OK(Put("3000000000000bar", "bar_v"));
+ ASSERT_OK(Put("1000000000foo003", "v__3"));
+ ASSERT_OK(Put("1000000000foo004", "v__4"));
+ ASSERT_OK(Put("1000000000foo005", "v__5"));
+ ASSERT_OK(Put("1000000000foo007", "v__7"));
+ ASSERT_OK(Put("1000000000foo008", "v__8"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v1", Get("1000000000foo001"));
+ ASSERT_EQ("v__3", Get("1000000000foo003"));
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo001", iter->key().ToString());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo002", iter->key().ToString());
+ ASSERT_EQ("v_2", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo003", iter->key().ToString());
+ ASSERT_EQ("v__3", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo004", iter->key().ToString());
+ ASSERT_EQ("v__4", iter->value().ToString());
+
+ iter->Seek("3000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ ASSERT_EQ("bar_v", iter->value().ToString());
+
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo001", iter->key().ToString());
+ ASSERT_EQ("v1", iter->value().ToString());
+
+ iter->Seek("1000000000foo005");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo006");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo007", iter->key().ToString());
+ ASSERT_EQ("v__7", iter->value().ToString());
+
+ iter->Seek("1000000000foo008");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ if (total_order == 0) {
+ iter->Seek("1000000000foo009");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ }
+
+ // Test Bloom Filter
+ if (bloom_bits > 0) {
+ if (!total_order) {
+ // Neither the key nor its prefix exists.
+ expect_bloom_not_match = true;
+ iter->Seek("2not000000000bar");
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+ expect_bloom_not_match = false;
+ } else {
+ expect_bloom_not_match = true;
+ ASSERT_EQ("NOT_FOUND", Get("2not000000000bar"));
+ expect_bloom_not_match = false;
+ }
+ }
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+ }
+ }
+ }
+}
+
+namespace {
+std::string NthKey(size_t n, char filler) {
+ std::string rv(16, filler);
+ rv[0] = n % 10;
+ rv[1] = (n / 10) % 10;
+ rv[2] = (n / 100) % 10;
+ rv[3] = (n / 1000) % 10;
+ return rv;
+}
+} // anonymous namespace
+
+TEST_P(PlainTableDBTest, BloomSchema) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ for (int bloom_locality = 0; bloom_locality <= 1; bloom_locality++) {
+ options.bloom_locality = bloom_locality;
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 3; // high FP rate for test
+ plain_table_options.hash_table_ratio = 0.75;
+ plain_table_options.index_sparseness = 16;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPlain;
+
+ bool expect_bloom_not_match = false;
+ options.table_factory.reset(new TestPlainTableFactory(
+ &expect_bloom_not_match, plain_table_options, 0 /* column_family_id */,
+ kDefaultColumnFamilyName));
+ DestroyAndReopen(&options);
+
+ for (unsigned i = 0; i < 2345; ++i) {
+ ASSERT_OK(Put(NthKey(i, 'y'), "added"));
+ }
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("added", Get(NthKey(42, 'y')));
+
+ for (unsigned i = 0; i < 32; ++i) {
+ // A known pattern of Bloom filter false positives can detect a schema
+ // change with high probability. The known FPs are stuffed into bits:
+ uint32_t pattern;
+ if (!bloom_locality) {
+ pattern = 1785868347UL;
+ } else if (CACHE_LINE_SIZE == 64U) {
+ pattern = 2421694657UL;
+ } else if (CACHE_LINE_SIZE == 128U) {
+ pattern = 788710956UL;
+ } else {
+ ASSERT_EQ(CACHE_LINE_SIZE, 256U);
+ pattern = 163905UL;
+ }
+ bool expect_fp = pattern & (1UL << i);
+ // fprintf(stderr, "expect_fp@%u: %d\n", i, (int)expect_fp);
+ expect_bloom_not_match = !expect_fp;
+ ASSERT_EQ("NOT_FOUND", Get(NthKey(i, 'n')));
+ }
+ }
+}
+
+namespace {
+std::string MakeLongKey(size_t length, char c) {
+ return std::string(length, c);
+}
+} // anonymous namespace
+
+TEST_P(PlainTableDBTest, IteratorLargeKeys) {
+ Options options = CurrentOptions();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 0;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.create_if_missing = true;
+ options.prefix_extractor.reset();
+ DestroyAndReopen(&options);
+
+ std::string key_list[] = {MakeLongKey(30, '0'), MakeLongKey(16, '1'),
+ MakeLongKey(32, '2'), MakeLongKey(60, '3'),
+ MakeLongKey(90, '4'), MakeLongKey(50, '5'),
+ MakeLongKey(26, '6')};
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_OK(Put(key_list[i], std::to_string(i)));
+ }
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek(key_list[0]);
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(key_list[i], iter->key().ToString());
+ ASSERT_EQ(std::to_string(i), iter->value().ToString());
+ iter->Next();
+ }
+
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+}
+
+namespace {
+std::string MakeLongKeyWithPrefix(size_t length, char c) {
+ return "00000000" + std::string(length - 8, c);
+}
+} // anonymous namespace
+
+TEST_P(PlainTableDBTest, IteratorLargeKeysWithPrefix) {
+ Options options = CurrentOptions();
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0.8;
+ plain_table_options.index_sparseness = 3;
+ plain_table_options.huge_page_tlb_size = 0;
+ plain_table_options.encoding_type = kPrefix;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ options.create_if_missing = true;
+ DestroyAndReopen(&options);
+
+ std::string key_list[] = {
+ MakeLongKeyWithPrefix(30, '0'), MakeLongKeyWithPrefix(16, '1'),
+ MakeLongKeyWithPrefix(32, '2'), MakeLongKeyWithPrefix(60, '3'),
+ MakeLongKeyWithPrefix(90, '4'), MakeLongKeyWithPrefix(50, '5'),
+ MakeLongKeyWithPrefix(26, '6')};
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_OK(Put(key_list[i], std::to_string(i)));
+ }
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek(key_list[0]);
+
+ for (size_t i = 0; i < 7; i++) {
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(key_list[i], iter->key().ToString());
+ ASSERT_EQ(std::to_string(i), iter->value().ToString());
+ iter->Next();
+ }
+
+ ASSERT_TRUE(!iter->Valid());
+
+ delete iter;
+}
+
+TEST_P(PlainTableDBTest, IteratorReverseSuffixComparator) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ test::SimpleSuffixReverseComparator comp;
+ options.comparator = &comp;
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000foo002", "v_2"));
+ ASSERT_OK(Put("0000000000000bar", "random"));
+ ASSERT_OK(Put("1000000000foo001", "v1"));
+ ASSERT_OK(Put("3000000000000bar", "bar_v"));
+ ASSERT_OK(Put("1000000000foo003", "v__3"));
+ ASSERT_OK(Put("1000000000foo004", "v__4"));
+ ASSERT_OK(Put("1000000000foo005", "v__5"));
+ ASSERT_OK(Put("1000000000foo007", "v__7"));
+ ASSERT_OK(Put("1000000000foo008", "v__8"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v1", Get("1000000000foo001"));
+ ASSERT_EQ("v__3", Get("1000000000foo003"));
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+ iter->Seek("1000000000foo009");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo007", iter->key().ToString());
+ ASSERT_EQ("v__7", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo004", iter->key().ToString());
+ ASSERT_EQ("v__4", iter->value().ToString());
+
+ iter->Seek("3000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+ ASSERT_EQ("bar_v", iter->value().ToString());
+
+ iter->Seek("1000000000foo005");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo006");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo005", iter->key().ToString());
+ ASSERT_EQ("v__5", iter->value().ToString());
+
+ iter->Seek("1000000000foo008");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("1000000000foo008", iter->key().ToString());
+ ASSERT_EQ("v__8", iter->value().ToString());
+
+ iter->Seek("1000000000foo000");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("3000000000000bar", iter->key().ToString());
+
+ delete iter;
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflict) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (unsigned char i = 1; i <= 3; i++) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2 ^ i;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo0", "v3"));
+ ASSERT_OK(Put("2000000000000fo1", "v4"));
+ ASSERT_OK(Put("2000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo3", "v"));
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("2000000000000fo0"));
+ ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+ ReadOptions ro;
+ Iterator* iter = dbfull()->NewIterator(ro);
+
+ iter->Seek("5000000000000fo0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo0");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("2000000000000fo8");
+ ASSERT_TRUE(!iter->Valid() ||
+ options.comparator->Compare(iter->key(), "20000001") > 0);
+
+ iter->Seek("5000000000000fo8");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("3000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) {
+ for (size_t huge_page_tlb_size = 0; huge_page_tlb_size <= 2 * 1024 * 1024;
+ huge_page_tlb_size += 2 * 1024 * 1024) {
+ for (unsigned char i = 1; i <= 3; i++) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ test::SimpleSuffixReverseComparator comp;
+ options.comparator = &comp;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 2 ^ i;
+ plain_table_options.huge_page_tlb_size = huge_page_tlb_size;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo0", "v3"));
+ ASSERT_OK(Put("2000000000000fo1", "v4"));
+ ASSERT_OK(Put("2000000000000fo2", "v"));
+ ASSERT_OK(Put("2000000000000fo3", "v"));
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("2000000000000fo0"));
+ ASSERT_EQ("v4", Get("2000000000000fo1"));
+
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("5000000000000fo8"));
+ ASSERT_EQ("NOT_FOUND", Get("2000000000000fo8"));
+
+ ReadOptions ro;
+ Iterator* iter = dbfull()->NewIterator(ro);
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo0", iter->key().ToString());
+
+ iter->Seek("2000000000000fo1");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo1", iter->key().ToString());
+
+ iter->Seek("2000000000000var");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("2000000000000fo3", iter->key().ToString());
+
+ iter->Seek("5000000000000var");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo2", iter->key().ToString());
+
+ std::string seek_key = "2000000000000bar";
+ iter->Seek(seek_key);
+ ASSERT_TRUE(!iter->Valid() ||
+ options.prefix_extractor->Transform(iter->key()) !=
+ options.prefix_extractor->Transform(seek_key));
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("3000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ ASSERT_OK(iter->status());
+ delete iter;
+ }
+ }
+}
+
+TEST_P(PlainTableDBTest, NonExistingKeyToNonEmptyBucket) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+ // Set only one bucket to force bucket conflict.
+ // Test index interval for the same prefix to be 1, 2 and 4
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 16;
+ plain_table_options.bloom_bits_per_key = 0;
+ plain_table_options.hash_table_ratio = 0;
+ plain_table_options.index_sparseness = 5;
+
+ options.table_factory.reset(NewPlainTableFactory(plain_table_options));
+ DestroyAndReopen(&options);
+ ASSERT_OK(Put("5000000000000fo0", "v1"));
+ ASSERT_OK(Put("5000000000000fo1", "v2"));
+ ASSERT_OK(Put("5000000000000fo2", "v3"));
+
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ ASSERT_EQ("v1", Get("5000000000000fo0"));
+ ASSERT_EQ("v2", Get("5000000000000fo1"));
+ ASSERT_EQ("v3", Get("5000000000000fo2"));
+
+ ASSERT_EQ("NOT_FOUND", Get("8000000000000bar"));
+ ASSERT_EQ("NOT_FOUND", Get("1000000000000bar"));
+
+ Iterator* iter = dbfull()->NewIterator(ReadOptions());
+
+ iter->Seek("5000000000000bar");
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ("5000000000000fo0", iter->key().ToString());
+
+ iter->Seek("5000000000000fo8");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("1000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ iter->Seek("8000000000000fo2");
+ ASSERT_TRUE(!iter->Valid());
+
+ ASSERT_OK(iter->status());
+ delete iter;
+}
+
+static std::string Key(int i) {
+ char buf[100];
+ snprintf(buf, sizeof(buf), "key_______%06d", i);
+ return std::string(buf);
+}
+
+TEST_P(PlainTableDBTest, CompactionTrigger) {
+ Options options = CurrentOptions();
+ options.write_buffer_size = 120 << 10; // 120KB
+ options.num_levels = 3;
+ options.level0_file_num_compaction_trigger = 3;
+ Reopen(&options);
+
+ Random rnd(301);
+
+ for (int num = 0; num < options.level0_file_num_compaction_trigger - 1;
+ num++) {
+ std::vector<std::string> values;
+ // Write 120KB (10 values, each 12K)
+ for (int i = 0; i < 10; i++) {
+ values.push_back(rnd.RandomString(12 << 10));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Put(Key(999), ""));
+ ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
+ ASSERT_EQ(NumTableFilesAtLevel(0), num + 1);
+ }
+
+ // generate one more file in level-0, and should trigger level-0 compaction
+ std::vector<std::string> values;
+ for (int i = 0; i < 12; i++) {
+ values.push_back(rnd.RandomString(10000));
+ ASSERT_OK(Put(Key(i), values[i]));
+ }
+ ASSERT_OK(Put(Key(999), ""));
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(1), 1);
+}
+
+TEST_P(PlainTableDBTest, AdaptiveTable) {
+ Options options = CurrentOptions();
+ options.create_if_missing = true;
+
+ options.table_factory.reset(NewPlainTableFactory());
+ DestroyAndReopen(&options);
+
+ ASSERT_OK(Put("1000000000000foo", "v1"));
+ ASSERT_OK(Put("0000000000000bar", "v2"));
+ ASSERT_OK(Put("1000000000000foo", "v3"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+
+ options.create_if_missing = false;
+ std::shared_ptr<TableFactory> block_based_factory(
+ NewBlockBasedTableFactory());
+ std::shared_ptr<TableFactory> plain_table_factory(NewPlainTableFactory());
+ std::shared_ptr<TableFactory> dummy_factory;
+ options.table_factory.reset(NewAdaptiveTableFactory(
+ block_based_factory, block_based_factory, plain_table_factory));
+ Reopen(&options);
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+
+ ASSERT_OK(Put("2000000000000foo", "v4"));
+ ASSERT_OK(Put("3000000000000bar", "v5"));
+ ASSERT_OK(dbfull()->TEST_FlushMemTable());
+ ASSERT_EQ("v4", Get("2000000000000foo"));
+ ASSERT_EQ("v5", Get("3000000000000bar"));
+
+ Reopen(&options);
+ ASSERT_EQ("v3", Get("1000000000000foo"));
+ ASSERT_EQ("v2", Get("0000000000000bar"));
+ ASSERT_EQ("v4", Get("2000000000000foo"));
+ ASSERT_EQ("v5", Get("3000000000000bar"));
+
+ options.paranoid_checks = false;
+ options.table_factory.reset(NewBlockBasedTableFactory());
+ Reopen(&options);
+ ASSERT_NE("v3", Get("1000000000000foo"));
+
+ options.paranoid_checks = false;
+ options.table_factory.reset(NewPlainTableFactory());
+ Reopen(&options);
+ ASSERT_NE("v5", Get("3000000000000bar"));
+}
+
+INSTANTIATE_TEST_CASE_P(PlainTableDBTest, PlainTableDBTest, ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as plain table is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
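For context on the format exercised above, a standalone sketch of the PlainTable setup that the fixture's CurrentOptions() assembles (hypothetical helper OpenPlainTableDb; the bloom and sparseness values are illustrative rather than the test's): a PlainTable factory combined with a fixed-length prefix extractor, which the prefix-hash index requires whenever hash_table_ratio > 0 (see BadOptions1/BadOptions2).

    #include <string>

    #include "rocksdb/db.h"
    #include "rocksdb/options.h"
    #include "rocksdb/slice_transform.h"
    #include "rocksdb/table.h"

    ROCKSDB_NAMESPACE::Status OpenPlainTableDb(const std::string& path,
                                               ROCKSDB_NAMESPACE::DB** db) {
      using namespace ROCKSDB_NAMESPACE;

      Options options;
      options.create_if_missing = true;

      PlainTableOptions pto;
      pto.user_key_len = 0;         // 0 means variable-length user keys
      pto.bloom_bits_per_key = 10;  // illustrative value
      pto.hash_table_ratio = 0.75;  // > 0 enables the prefix hash index
      pto.index_sparseness = 16;
      options.table_factory.reset(NewPlainTableFactory(pto));

      // Required for prefix hash mode, as the BadOptions tests demonstrate.
      options.prefix_extractor.reset(NewFixedPrefixTransform(8));

      return DB::Open(options, path, db);
    }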
diff --git a/src/rocksdb/db/post_memtable_callback.h b/src/rocksdb/db/post_memtable_callback.h
new file mode 100644
index 000000000..fbf2fbe86
--- /dev/null
+++ b/src/rocksdb/db/post_memtable_callback.h
@@ -0,0 +1,25 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Callback invoked after finishing writing to the memtable but before
+// publishing the sequence number to readers.
+// Note that with write-prepared/write-unprepared transactions with
+// two-write-queues, PreReleaseCallback is called before publishing the
+// sequence numbers to readers.
+class PostMemTableCallback {
+ public:
+ virtual ~PostMemTableCallback() {}
+
+ virtual Status operator()(SequenceNumber seq, bool disable_memtable) = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
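A sketch of what an implementation of this interface could look like (hypothetical RecordLastSeqCallback, not defined elsewhere in this patch): it records the last sequence number written to the memtable and returns OK so the write path proceeds to publish it.

    #include <atomic>

    #include "db/post_memtable_callback.h"

    namespace ROCKSDB_NAMESPACE {

    class RecordLastSeqCallback : public PostMemTableCallback {
     public:
      Status operator()(SequenceNumber seq, bool /*disable_memtable*/) override {
        last_seq_.store(seq, std::memory_order_relaxed);
        return Status::OK();
      }

      SequenceNumber LastSeq() const {
        return last_seq_.load(std::memory_order_relaxed);
      }

     private:
      std::atomic<SequenceNumber> last_seq_{0};
    };

    }  // namespace ROCKSDB_NAMESPACE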
diff --git a/src/rocksdb/db/pre_release_callback.h b/src/rocksdb/db/pre_release_callback.h
new file mode 100644
index 000000000..6b9039487
--- /dev/null
+++ b/src/rocksdb/db/pre_release_callback.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class PreReleaseCallback {
+ public:
+ virtual ~PreReleaseCallback() {}
+
+ // Will be called while on the write thread after the write to the WAL and
+ // before the write to memtable. This is useful if any operation needs to be
+ // done before the write gets visible to the readers, or if we want to reduce
+ // the overhead of locking by updating something sequentially while we are on
+ // the write thread. If the callback fails, this function returns a non-OK
+ // status, the sequence number will not be released, and same status will be
+ // propagated to all the writers in the write group.
+ // seq is the sequence number that is used for this write and will be
+ // released.
+ // is_mem_disabled is currently used for debugging purposes to assert that
+ // the callback is done from the right write queue.
+ // If non-zero, log_number indicates the WAL log to which we wrote.
+ // index >= 0 specifies the order of callback in the same write thread.
+ // total > index specifies the total number of callbacks in the same write
+ // thread. Together with index, could be used to reduce the redundant
+ // operations among the callbacks.
+ virtual Status Callback(SequenceNumber seq, bool is_mem_disabled,
+ uint64_t log_number, size_t index, size_t total) = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
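As a concrete illustration of the contract documented above (hypothetical TrackMaxSeqCallback, not part of this patch): the callback tracks the highest sequence number about to be released; returning a non-OK status from it would instead fail every writer in the write group.

    #include <atomic>
    #include <cassert>

    #include "db/pre_release_callback.h"

    namespace ROCKSDB_NAMESPACE {

    class TrackMaxSeqCallback : public PreReleaseCallback {
     public:
      Status Callback(SequenceNumber seq, bool /*is_mem_disabled*/,
                      uint64_t /*log_number*/, size_t index,
                      size_t total) override {
        assert(index < total);  // per the comment above, total > index
        (void)index;
        (void)total;
        SequenceNumber prev = max_seq_.load(std::memory_order_relaxed);
        while (prev < seq && !max_seq_.compare_exchange_weak(
                                 prev, seq, std::memory_order_relaxed)) {
          // prev is refreshed by compare_exchange_weak on failure.
        }
        return Status::OK();
      }

      SequenceNumber MaxSeq() const {
        return max_seq_.load(std::memory_order_relaxed);
      }

     private:
      std::atomic<SequenceNumber> max_seq_{0};
    };

    }  // namespace ROCKSDB_NAMESPACE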
diff --git a/src/rocksdb/db/prefix_test.cc b/src/rocksdb/db/prefix_test.cc
new file mode 100644
index 000000000..8592b8f31
--- /dev/null
+++ b/src/rocksdb/db/prefix_test.cc
@@ -0,0 +1,906 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run this test... Skipping...\n");
+ return 0;
+}
+#else
+
+#include <algorithm>
+#include <iostream>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "monitoring/histogram.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/filter_policy.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/perf_context.h"
+#include "rocksdb/slice_transform.h"
+#include "rocksdb/system_clock.h"
+#include "rocksdb/table.h"
+#include "test_util/testharness.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "utilities/merge_operators.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_bool(trigger_deadlock, false,
+ "issue delete in range scan to trigger PrefixHashMap deadlock");
+DEFINE_int32(bucket_count, 100000, "number of buckets");
+DEFINE_uint64(num_locks, 10001, "number of locks");
+DEFINE_bool(random_prefix, false, "randomize prefix");
+DEFINE_uint64(total_prefixes, 100000, "total number of prefixes");
+DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix");
+DEFINE_int64(write_buffer_size, 33554432, "");
+DEFINE_int32(max_write_buffer_number, 2, "");
+DEFINE_int32(min_write_buffer_number_to_merge, 1, "");
+DEFINE_int32(skiplist_height, 4, "");
+DEFINE_double(memtable_prefix_bloom_size_ratio, 0.1, "");
+DEFINE_int32(memtable_huge_page_size, 2 * 1024 * 1024, "");
+DEFINE_int32(value_size, 40, "");
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+
+// Path to the database on file system
+const std::string kDbName =
+ ROCKSDB_NAMESPACE::test::PerThreadDBPath("prefix_test");
+
+namespace ROCKSDB_NAMESPACE {
+
+struct TestKey {
+ uint64_t prefix;
+ uint64_t sorted;
+
+ TestKey(uint64_t _prefix, uint64_t _sorted)
+ : prefix(_prefix), sorted(_sorted) {}
+};
+
+ // Return a Slice encoding test_key, backed by the caller-provided string s
+inline Slice TestKeyToSlice(std::string& s, const TestKey& test_key) {
+ s.clear();
+ PutFixed64(&s, test_key.prefix);
+ PutFixed64(&s, test_key.sorted);
+ return Slice(s.c_str(), s.size());
+}
+
+inline const TestKey SliceToTestKey(const Slice& slice) {
+ return TestKey(DecodeFixed64(slice.data()), DecodeFixed64(slice.data() + 8));
+}
+
+class TestKeyComparator : public Comparator {
+ public:
+ // Compare needs to be aware of the possibility that a and/or b may be
+ // prefix-only
+ int Compare(const Slice& a, const Slice& b) const override {
+ const TestKey kkey_a = SliceToTestKey(a);
+ const TestKey kkey_b = SliceToTestKey(b);
+ const TestKey* key_a = &kkey_a;
+ const TestKey* key_b = &kkey_b;
+ if (key_a->prefix != key_b->prefix) {
+ if (key_a->prefix < key_b->prefix) return -1;
+ if (key_a->prefix > key_b->prefix) return 1;
+ } else {
+ EXPECT_TRUE(key_a->prefix == key_b->prefix);
+ // note, both a and b could be prefix only
+ if (a.size() != b.size()) {
+ // one of them is prefix
+ EXPECT_TRUE(
+ (a.size() == sizeof(uint64_t) && b.size() == sizeof(TestKey)) ||
+ (b.size() == sizeof(uint64_t) && a.size() == sizeof(TestKey)));
+ if (a.size() < b.size()) return -1;
+ if (a.size() > b.size()) return 1;
+ } else {
+ // both a and b are prefix
+ if (a.size() == sizeof(uint64_t)) {
+ return 0;
+ }
+
+ // both a and b are whole key
+ EXPECT_TRUE(a.size() == sizeof(TestKey) && b.size() == sizeof(TestKey));
+ if (key_a->sorted < key_b->sorted) return -1;
+ if (key_a->sorted > key_b->sorted) return 1;
+ if (key_a->sorted == key_b->sorted) return 0;
+ }
+ }
+ return 0;
+ }
+
+ bool operator()(const TestKey& a, const TestKey& b) const {
+ std::string sa, sb;
+ return Compare(TestKeyToSlice(sa, a), TestKeyToSlice(sb, b)) < 0;
+ }
+
+ const char* Name() const override { return "TestKeyComparator"; }
+
+ void FindShortestSeparator(std::string* /*start*/,
+ const Slice& /*limit*/) const override {}
+
+ void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+namespace {
+void PutKey(DB* db, WriteOptions write_options, uint64_t prefix,
+ uint64_t suffix, const Slice& value) {
+ TestKey test_key(prefix, suffix);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void PutKey(DB* db, WriteOptions write_options, const TestKey& test_key,
+ const Slice& value) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Put(write_options, key, value));
+}
+
+void MergeKey(DB* db, WriteOptions write_options, const TestKey& test_key,
+ const Slice& value) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Merge(write_options, key, value));
+}
+
+void DeleteKey(DB* db, WriteOptions write_options, const TestKey& test_key) {
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ ASSERT_OK(db->Delete(write_options, key));
+}
+
+void SeekIterator(Iterator* iter, uint64_t prefix, uint64_t suffix) {
+ TestKey test_key(prefix, suffix);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ iter->Seek(key);
+}
+
+const std::string kNotFoundResult = "NOT_FOUND";
+
+std::string Get(DB* db, const ReadOptions& read_options, uint64_t prefix,
+ uint64_t suffix) {
+ TestKey test_key(prefix, suffix);
+ std::string s2;
+ Slice key = TestKeyToSlice(s2, test_key);
+
+ std::string result;
+ Status s = db->Get(read_options, key, &result);
+ if (s.IsNotFound()) {
+ result = kNotFoundResult;
+ } else if (!s.ok()) {
+ result = s.ToString();
+ }
+ return result;
+}
+
+class SamePrefixTransform : public SliceTransform {
+ private:
+ const Slice prefix_;
+ std::string name_;
+
+ public:
+ explicit SamePrefixTransform(const Slice& prefix)
+ : prefix_(prefix), name_("rocksdb.SamePrefix." + prefix.ToString()) {}
+
+ const char* Name() const override { return name_.c_str(); }
+
+ Slice Transform(const Slice& src) const override {
+ assert(InDomain(src));
+ return prefix_;
+ }
+
+ bool InDomain(const Slice& src) const override {
+ if (src.size() >= prefix_.size()) {
+ return Slice(src.data(), prefix_.size()) == prefix_;
+ }
+ return false;
+ }
+
+ bool InRange(const Slice& dst) const override { return dst == prefix_; }
+
+ bool FullLengthEnabled(size_t* /*len*/) const override { return false; }
+};
+
+} // anonymous namespace
+
+class PrefixTest : public testing::Test {
+ public:
+ std::shared_ptr<DB> OpenDb() {
+ DB* db;
+
+ options.create_if_missing = true;
+ options.write_buffer_size = FLAGS_write_buffer_size;
+ options.max_write_buffer_number = FLAGS_max_write_buffer_number;
+ options.min_write_buffer_number_to_merge =
+ FLAGS_min_write_buffer_number_to_merge;
+
+ options.memtable_prefix_bloom_size_ratio =
+ FLAGS_memtable_prefix_bloom_size_ratio;
+ options.memtable_huge_page_size = FLAGS_memtable_huge_page_size;
+
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ options.allow_concurrent_memtable_write = false;
+
+ Status s = DB::Open(options, kDbName, &db);
+ EXPECT_OK(s);
+ return std::shared_ptr<DB>(db);
+ }
+
+ void FirstOption() { option_config_ = kBegin; }
+
+ bool NextOptions(int bucket_count) {
+ // Advance to the next memtable option configuration.
+ option_config_++;
+ if (option_config_ < kEnd) {
+ options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+ switch (option_config_) {
+ case kHashSkipList:
+ options.memtable_factory.reset(
+ NewHashSkipListRepFactory(bucket_count, FLAGS_skiplist_height));
+ return true;
+ case kHashLinkList:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count));
+ return true;
+ case kHashLinkListHugePageTlb:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count, 2 * 1024 * 1024));
+ return true;
+ case kHashLinkListTriggerSkipList:
+ options.memtable_factory.reset(
+ NewHashLinkListRepFactory(bucket_count, 0, 3));
+ return true;
+ default:
+ return false;
+ }
+ }
+ return false;
+ }
+
+ PrefixTest() : option_config_(kBegin) {
+ options.comparator = new TestKeyComparator();
+ }
+ ~PrefixTest() override { delete options.comparator; }
+
+ protected:
+ enum OptionConfig {
+ kBegin,
+ kHashSkipList,
+ kHashLinkList,
+ kHashLinkListHugePageTlb,
+ kHashLinkListTriggerSkipList,
+ kEnd
+ };
+ int option_config_;
+ Options options;
+};
+
+TEST(SamePrefixTest, InDomainTest) {
+ DB* db;
+ Options options;
+ options.create_if_missing = true;
+ options.prefix_extractor.reset(new SamePrefixTransform("HHKB"));
+ BlockBasedTableOptions bbto;
+ bbto.filter_policy.reset(NewBloomFilterPolicy(10, false));
+ bbto.whole_key_filtering = false;
+ options.table_factory.reset(NewBlockBasedTableFactory(bbto));
+ WriteOptions write_options;
+ ReadOptions read_options;
+ {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ ASSERT_OK(DB::Open(options, kDbName, &db));
+ ASSERT_OK(db->Put(write_options, "HHKB pro2", "Mar 24, 2006"));
+ ASSERT_OK(db->Put(write_options, "HHKB pro2 Type-S", "June 29, 2011"));
+ ASSERT_OK(db->Put(write_options, "Realforce 87u", "idk"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ std::string result;
+ auto db_iter = db->NewIterator(ReadOptions());
+
+ db_iter->Seek("Realforce 87u");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ ASSERT_EQ(db_iter->key(), "Realforce 87u");
+ ASSERT_EQ(db_iter->value(), "idk");
+
+ delete db_iter;
+ delete db;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ }
+
+ {
+ ASSERT_OK(DB::Open(options, kDbName, &db));
+ ASSERT_OK(db->Put(write_options, "pikachu", "1"));
+ ASSERT_OK(db->Put(write_options, "Meowth", "1"));
+ ASSERT_OK(db->Put(write_options, "Mewtwo", "idk"));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ std::string result;
+ auto db_iter = db->NewIterator(ReadOptions());
+
+ db_iter->Seek("Mewtwo");
+ ASSERT_TRUE(db_iter->Valid());
+ ASSERT_OK(db_iter->status());
+ delete db_iter;
+ delete db;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ }
+}
+
+TEST_F(PrefixTest, TestResult) {
+ for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+ FirstOption();
+ while (NextOptions(num_buckets)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << " number of buckets: " << num_buckets << std::endl;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ // 1. Insert one row.
+ Slice v16("v16");
+ PutKey(db.get(), write_options, 1, 6, v16);
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 1, 7));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 0, 6));
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 2, 6));
+
+ // 2. Insert an entry for the same prefix as the last entry in the bucket.
+ Slice v17("v17");
+ PutKey(db.get(), write_options, 1, 7, v17);
+ iter.reset(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+
+ // 3. Insert an entry for the same prefix as the head of the bucket.
+ Slice v15("v15");
+ PutKey(db.get(), write_options, 1, 5, v15);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+
+ // 4. Insert an entry with a larger prefix
+ Slice v22("v22");
+ PutKey(db.get(), write_options, 2, 2, v22);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 2, 2);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ // 5. Insert an entry with a smaller prefix
+ Slice v02("v02");
+ PutKey(db.get(), write_options, 0, 2, v02);
+ iter.reset(db->NewIterator(read_options));
+
+ SeekIterator(iter.get(), 0, 2);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+ SeekIterator(iter.get(), 0, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ SeekIterator(iter.get(), 1, 5);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ // 6. Insert to the beginning and the end of the first prefix
+ Slice v13("v13");
+ Slice v18("v18");
+ PutKey(db.get(), write_options, 1, 3, v13);
+ PutKey(db.get(), write_options, 1, 8, v18);
+ iter.reset(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 7);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ SeekIterator(iter.get(), 1, 3);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v13 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v15 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v18 == iter->value());
+
+ SeekIterator(iter.get(), 0, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v02 == iter->value());
+
+ SeekIterator(iter.get(), 2, 0);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v22 == iter->value());
+
+ ASSERT_EQ(v22.ToString(), Get(db.get(), read_options, 2, 2));
+ ASSERT_EQ(v02.ToString(), Get(db.get(), read_options, 0, 2));
+ ASSERT_EQ(v13.ToString(), Get(db.get(), read_options, 1, 3));
+ ASSERT_EQ(v15.ToString(), Get(db.get(), read_options, 1, 5));
+ ASSERT_EQ(v16.ToString(), Get(db.get(), read_options, 1, 6));
+ ASSERT_EQ(v17.ToString(), Get(db.get(), read_options, 1, 7));
+ ASSERT_EQ(v18.ToString(), Get(db.get(), read_options, 1, 8));
+ }
+ }
+}
+
+// Verify that iteration stays within the seek prefix (prefix_same_as_start).
+TEST_F(PrefixTest, PrefixValid) {
+ for (int num_buckets = 1; num_buckets <= 2; num_buckets++) {
+ FirstOption();
+ while (NextOptions(num_buckets)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << " number of buckets: " << num_buckets << std::endl;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ // Insert keys with a common prefix and one key with a different prefix.
+ Slice v16("v16");
+ Slice v17("v17");
+ Slice v18("v18");
+ Slice v19("v19");
+ PutKey(db.get(), write_options, 12345, 6, v16);
+ PutKey(db.get(), write_options, 12345, 7, v17);
+ PutKey(db.get(), write_options, 12345, 8, v18);
+ PutKey(db.get(), write_options, 12345, 9, v19);
+ PutKey(db.get(), write_options, 12346, 8, v16);
+ ASSERT_OK(db->Flush(FlushOptions()));
+ TestKey test_key(12346, 8);
+ std::string s;
+ ASSERT_OK(db->Delete(write_options, TestKeyToSlice(s, test_key)));
+ ASSERT_OK(db->Flush(FlushOptions()));
+ read_options.prefix_same_as_start = true;
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 12345, 6);
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v16 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v17 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v18 == iter->value());
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_TRUE(v19 == iter->value());
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_EQ(kNotFoundResult, Get(db.get(), read_options, 12346, 8));
+
+ // Verify seeking past the prefix won't return a result.
+ SeekIterator(iter.get(), 12345, 10);
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+ }
+}
+
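+// Measures Put and Seek cost (wall time and user-key comparison counts) over
+// many prefixes, including seeks to prefixes that were never inserted.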
+TEST_F(PrefixTest, DynamicPrefixIterator) {
+ while (NextOptions(FLAGS_bucket_count)) {
+ std::cout << "*** Mem table: " << options.memtable_factory->Name()
+ << std::endl;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+
+ std::vector<uint64_t> prefixes;
+ for (uint64_t i = 0; i < FLAGS_total_prefixes; ++i) {
+ prefixes.push_back(i);
+ }
+
+ if (FLAGS_random_prefix) {
+ RandomShuffle(prefixes.begin(), prefixes.end());
+ }
+
+ HistogramImpl hist_put_time;
+ HistogramImpl hist_put_comparison;
+ // Insert x random prefixes, each with y consecutive elements.
+ for (auto prefix : prefixes) {
+ for (uint64_t sorted = 0; sorted < FLAGS_items_per_prefix; sorted++) {
+ TestKey test_key(prefix, sorted);
+
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ std::string value(FLAGS_value_size, 0);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ ASSERT_OK(db->Put(write_options, key, value));
+ hist_put_time.Add(timer.ElapsedNanos());
+ hist_put_comparison.Add(get_perf_context()->user_key_comparison_count);
+ }
+ }
+
+ std::cout << "Put key comparison: \n"
+ << hist_put_comparison.ToString() << "Put time: \n"
+ << hist_put_time.ToString();
+
+ // Test seeking to existing keys.
+ HistogramImpl hist_seek_time;
+ HistogramImpl hist_seek_comparison;
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+
+ for (auto prefix : prefixes) {
+ TestKey test_key(prefix, FLAGS_items_per_prefix / 2);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+ std::string value = "v" + std::to_string(0);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ auto key_prefix = options.prefix_extractor->Transform(key);
+ uint64_t total_keys = 0;
+ for (iter->Seek(key);
+ iter->Valid() && iter->key().starts_with(key_prefix); iter->Next()) {
+ if (FLAGS_trigger_deadlock) {
+ std::cout << "Behold the deadlock!\n";
+ db->Delete(write_options, iter->key());
+ }
+ total_keys++;
+ }
+ hist_seek_time.Add(timer.ElapsedNanos());
+ hist_seek_comparison.Add(get_perf_context()->user_key_comparison_count);
+ ASSERT_EQ(total_keys,
+ FLAGS_items_per_prefix - FLAGS_items_per_prefix / 2);
+ }
+
+ std::cout << "Seek key comparison: \n"
+ << hist_seek_comparison.ToString() << "Seek time: \n"
+ << hist_seek_time.ToString();
+
+ // Test seeking to non-existing keys.
+ HistogramImpl hist_no_seek_time;
+ HistogramImpl hist_no_seek_comparison;
+
+ for (auto prefix = FLAGS_total_prefixes;
+ prefix < FLAGS_total_prefixes + 10000; prefix++) {
+ TestKey test_key(prefix, 0);
+ std::string s;
+ Slice key = TestKeyToSlice(s, test_key);
+
+ get_perf_context()->Reset();
+ StopWatchNano timer(SystemClock::Default().get(), true);
+ iter->Seek(key);
+ hist_no_seek_time.Add(timer.ElapsedNanos());
+ hist_no_seek_comparison.Add(
+ get_perf_context()->user_key_comparison_count);
+ ASSERT_TRUE(!iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+
+ std::cout << "non-existing Seek key comparison: \n"
+ << hist_no_seek_comparison.ToString()
+ << "non-existing Seek time: \n"
+ << hist_no_seek_time.ToString();
+ }
+}
+
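+// Randomly mixes Put/Merge/Delete entries across two flushed files and the
+// memtable, then cross-checks Next()/Prev() within each prefix against an
+// in-memory std::map of the expected contents.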
+TEST_F(PrefixTest, PrefixSeekModePrev) {
+ // Only for SkipListFactory
+ options.memtable_factory.reset(new SkipListFactory);
+ options.merge_operator = MergeOperators::CreatePutOperator();
+ options.write_buffer_size = 1024 * 1024;
+ Random rnd(1);
+ for (size_t m = 1; m < 100; m++) {
+ std::cout << "[" + std::to_string(m) + "]" + "*** Mem table: "
+ << options.memtable_factory->Name() << std::endl;
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ std::map<TestKey, std::string, TestKeyComparator> entry_maps[3], whole_map;
+ for (uint64_t i = 0; i < 10; i++) {
+ int div = i % 3 + 1;
+ for (uint64_t j = 0; j < 10; j++) {
+ whole_map[TestKey(i, j)] = entry_maps[rnd.Uniform(div)][TestKey(i, j)] =
+ 'v' + std::to_string(i) + std::to_string(j);
+ }
+ }
+
+ std::map<TestKey, std::string, TestKeyComparator> type_map;
+ for (size_t i = 0; i < 3; i++) {
+ for (auto& kv : entry_maps[i]) {
+ if (rnd.OneIn(3)) {
+ PutKey(db.get(), write_options, kv.first, kv.second);
+ type_map[kv.first] = "value";
+ } else {
+ MergeKey(db.get(), write_options, kv.first, kv.second);
+ type_map[kv.first] = "merge";
+ }
+ }
+ if (i < 2) {
+ ASSERT_OK(db->Flush(FlushOptions()));
+ }
+ }
+
+ for (size_t i = 0; i < 2; i++) {
+ for (auto& kv : entry_maps[i]) {
+ if (rnd.OneIn(10)) {
+ whole_map.erase(kv.first);
+ DeleteKey(db.get(), write_options, kv.first);
+ entry_maps[2][kv.first] = "delete";
+ }
+ }
+ }
+
+ if (FLAGS_enable_print) {
+ for (size_t i = 0; i < 3; i++) {
+ for (auto& kv : entry_maps[i]) {
+ std::cout << "[" << i << "]" << kv.first.prefix << kv.first.sorted
+ << " " << kv.second + " " + type_map[kv.first] << std::endl;
+ }
+ }
+ }
+
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ for (uint64_t prefix = 0; prefix < 10; prefix++) {
+ uint64_t start_suffix = rnd.Uniform(9);
+ SeekIterator(iter.get(), prefix, start_suffix);
+ auto it = whole_map.find(TestKey(prefix, start_suffix));
+ if (it == whole_map.end()) {
+ continue;
+ }
+ ASSERT_NE(it, whole_map.end());
+ ASSERT_TRUE(iter->Valid());
+ if (FLAGS_enable_print) {
+ std::cout << "round " << prefix
+ << " iter: " << SliceToTestKey(iter->key()).prefix
+ << SliceToTestKey(iter->key()).sorted
+ << " | map: " << it->first.prefix << it->first.sorted << " | "
+ << iter->value().ToString() << " " << it->second << std::endl;
+ }
+ ASSERT_EQ(iter->value(), it->second);
+ uint64_t stored_prefix = prefix;
+ for (size_t k = 0; k < 9; k++) {
+ if (rnd.OneIn(2) || it == whole_map.begin()) {
+ iter->Next();
+ ++it;
+ if (FLAGS_enable_print) {
+ std::cout << "Next >> ";
+ }
+ } else {
+ iter->Prev();
+ it--;
+ if (FLAGS_enable_print) {
+ std::cout << "Prev >> ";
+ }
+ }
+ if (!iter->Valid() ||
+ SliceToTestKey(iter->key()).prefix != stored_prefix) {
+ break;
+ }
+ ASSERT_OK(iter->status());
+ stored_prefix = SliceToTestKey(iter->key()).prefix;
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_NE(it, whole_map.end());
+ ASSERT_EQ(iter->value(), it->second);
+ if (FLAGS_enable_print) {
+ std::cout << "iter: " << SliceToTestKey(iter->key()).prefix
+ << SliceToTestKey(iter->key()).sorted
+ << " | map: " << it->first.prefix << it->first.sorted
+ << " | " << iter->value().ToString() << " " << it->second
+ << std::endl;
+ }
+ }
+ }
+ }
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev2) {
+ // Only for SkipListFactory
+ // Test the following case:
+ // iter1 iter2
+ // | prefix | suffix | | prefix | suffix |
+ // | 1 | 1 | | 1 | 2 |
+ // | 1 | 3 | | 1 | 4 |
+ // | 2 | 1 | | 3 | 3 |
+ // | 2 | 2 | | 3 | 4 |
+ // After Seek(15), iter1 will be at 21 and iter2 will be at 33.
+ // Then, when Prev() is called in prefix mode, SeekForPrev(21) gets called and
+ // iter2 should become invalid because of the bloom filter.
+ options.memtable_factory.reset(new SkipListFactory);
+ options.write_buffer_size = 1024 * 1024;
+ std::string v13("v13");
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+ PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+ PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ SeekIterator(iter.get(), 1, 5);
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(iter->value(), v13);
+}
+
+TEST_F(PrefixTest, PrefixSeekModePrev3) {
+ // Only for SkipListFactory
+ // test SeekToLast() with iterate_upper_bound_ in prefix_seek_mode
+ options.memtable_factory.reset(new SkipListFactory);
+ options.write_buffer_size = 1024 * 1024;
+ std::string v14("v14");
+ TestKey upper_bound_key = TestKey(1, 5);
+ std::string s;
+ Slice upper_bound = TestKeyToSlice(s, upper_bound_key);
+
+ {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ read_options.iterate_upper_bound = &upper_bound;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ PutKey(db.get(), write_options, TestKey(2, 1), "v21");
+ PutKey(db.get(), write_options, TestKey(2, 2), "v22");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ iter->SeekToLast();
+ ASSERT_EQ(iter->value(), v14);
+ }
+ {
+ ASSERT_OK(DestroyDB(kDbName, Options()));
+ auto db = OpenDb();
+ WriteOptions write_options;
+ ReadOptions read_options;
+ read_options.iterate_upper_bound = &upper_bound;
+ PutKey(db.get(), write_options, TestKey(1, 2), "v12");
+ PutKey(db.get(), write_options, TestKey(1, 4), "v14");
+ PutKey(db.get(), write_options, TestKey(3, 3), "v33");
+ PutKey(db.get(), write_options, TestKey(3, 4), "v34");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ PutKey(db.get(), write_options, TestKey(1, 1), "v11");
+ PutKey(db.get(), write_options, TestKey(1, 3), "v13");
+ ASSERT_OK(db->Flush(FlushOptions()));
+ ASSERT_OK(
+ static_cast_with_check<DBImpl>(db.get())->TEST_WaitForFlushMemTable());
+ std::unique_ptr<Iterator> iter(db->NewIterator(read_options));
+ iter->SeekToLast();
+ ASSERT_EQ(iter->value(), v14);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ ParseCommandLineFlags(&argc, &argv, true);
+ return RUN_ALL_TESTS();
+}
+
+#endif // GFLAGS
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as HashSkipList and HashLinkList are not supported in "
+ "ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/range_del_aggregator.cc b/src/rocksdb/db/range_del_aggregator.cc
new file mode 100644
index 000000000..c03efa11f
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator.cc
@@ -0,0 +1,524 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TruncatedRangeDelIterator::TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+ const InternalKeyComparator* icmp, const InternalKey* smallest,
+ const InternalKey* largest)
+ : iter_(std::move(iter)),
+ icmp_(icmp),
+ smallest_ikey_(smallest),
+ largest_ikey_(largest) {
+ if (smallest != nullptr) {
+ pinned_bounds_.emplace_back();
+ auto& parsed_smallest = pinned_bounds_.back();
+ Status pik_status = ParseInternalKey(smallest->Encode(), &parsed_smallest,
+ false /* log_err_key */); // TODO
+ pik_status.PermitUncheckedError();
+ assert(pik_status.ok());
+ smallest_ = &parsed_smallest;
+ }
+ if (largest != nullptr) {
+ pinned_bounds_.emplace_back();
+ auto& parsed_largest = pinned_bounds_.back();
+
+ Status pik_status = ParseInternalKey(largest->Encode(), &parsed_largest,
+ false /* log_err_key */); // TODO
+ pik_status.PermitUncheckedError();
+ assert(pik_status.ok());
+
+ if (parsed_largest.type == kTypeRangeDeletion &&
+ parsed_largest.sequence == kMaxSequenceNumber) {
+ // The file boundary has been artificially extended by a range tombstone.
+ // We do not need to adjust largest to properly truncate range
+ // tombstones that extend past the boundary.
+ } else if (parsed_largest.sequence == 0) {
+ // The largest key in the sstable has a sequence number of 0. Since we
+ // guarantee that no internal keys with the same user key and sequence
+ // number can exist in a DB, we know that the largest key in this sstable
+ // cannot exist as the smallest key in the next sstable. This further
+ // implies that no range tombstone in this sstable covers largest;
+ // otherwise, the file boundary would have been artificially extended.
+ //
+ // Therefore, we will never truncate a range tombstone at largest, so we
+ // can leave it unchanged.
+ } else {
+ // The same user key may straddle two sstable boundaries. To ensure that
+ // the truncated end key can cover the largest key in this sstable, reduce
+ // its sequence number by 1.
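+ // For example, if largest is (k, 5, kTypeValue), the truncated end key
+ // becomes (k, 4, kValueTypeForSeek), which still sorts after (k, 5, ...)
+ // because sequence numbers are ordered descending within a user key.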
+ parsed_largest.sequence -= 1;
+ // This line is not needed for correctness, but it ensures that the
+ // truncated end key is not covering keys from the next SST file.
+ parsed_largest.type = kValueTypeForSeek;
+ }
+ largest_ = &parsed_largest;
+ }
+}
+
+bool TruncatedRangeDelIterator::Valid() const {
+ assert(iter_ != nullptr);
+ return iter_->Valid() &&
+ (smallest_ == nullptr ||
+ icmp_->Compare(*smallest_, iter_->parsed_end_key()) < 0) &&
+ (largest_ == nullptr ||
+ icmp_->Compare(iter_->parsed_start_key(), *largest_) < 0);
+}
+
+// NOTE: target is a user key, with timestamp if enabled.
+void TruncatedRangeDelIterator::Seek(const Slice& target) {
+ if (largest_ != nullptr &&
+ icmp_->Compare(*largest_, ParsedInternalKey(target, kMaxSequenceNumber,
+ kTypeRangeDeletion)) <= 0) {
+ iter_->Invalidate();
+ return;
+ }
+ if (smallest_ != nullptr &&
+ icmp_->user_comparator()->Compare(target, smallest_->user_key) < 0) {
+ iter_->Seek(smallest_->user_key);
+ return;
+ }
+ iter_->Seek(target);
+}
+
+// NOTE: target is a user key, with timestamp if enabled.
+void TruncatedRangeDelIterator::SeekForPrev(const Slice& target) {
+ if (smallest_ != nullptr &&
+ icmp_->Compare(ParsedInternalKey(target, 0, kTypeRangeDeletion),
+ *smallest_) < 0) {
+ iter_->Invalidate();
+ return;
+ }
+ if (largest_ != nullptr &&
+ icmp_->user_comparator()->Compare(largest_->user_key, target) < 0) {
+ iter_->SeekForPrev(largest_->user_key);
+ return;
+ }
+ iter_->SeekForPrev(target);
+}
+
+void TruncatedRangeDelIterator::SeekToFirst() {
+ if (smallest_ != nullptr) {
+ iter_->Seek(smallest_->user_key);
+ return;
+ }
+ iter_->SeekToTopFirst();
+}
+
+void TruncatedRangeDelIterator::SeekToLast() {
+ if (largest_ != nullptr) {
+ iter_->SeekForPrev(largest_->user_key);
+ return;
+ }
+ iter_->SeekToTopLast();
+}
+
+std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+TruncatedRangeDelIterator::SplitBySnapshot(
+ const std::vector<SequenceNumber>& snapshots) {
+ using FragmentedIterPair =
+ std::pair<const SequenceNumber,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>>;
+
+ auto split_untruncated_iters = iter_->SplitBySnapshot(snapshots);
+ std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+ split_truncated_iters;
+ std::for_each(
+ split_untruncated_iters.begin(), split_untruncated_iters.end(),
+ [&](FragmentedIterPair& iter_pair) {
+ auto truncated_iter = std::make_unique<TruncatedRangeDelIterator>(
+ std::move(iter_pair.second), icmp_, smallest_ikey_, largest_ikey_);
+ split_truncated_iters.emplace(iter_pair.first,
+ std::move(truncated_iter));
+ });
+ return split_truncated_iters;
+}
+
+ForwardRangeDelIterator::ForwardRangeDelIterator(
+ const InternalKeyComparator* icmp)
+ : icmp_(icmp),
+ unused_idx_(0),
+ active_seqnums_(SeqMaxComparator()),
+ active_iters_(EndKeyMinComparator(icmp)),
+ inactive_iters_(StartKeyMinComparator(icmp)) {}
+
+bool ForwardRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+ // Move active iterators that end before parsed.
+ while (!active_iters_.empty() &&
+ icmp_->Compare((*active_iters_.top())->end_key(), parsed) <= 0) {
+ TruncatedRangeDelIterator* iter = PopActiveIter();
+ do {
+ iter->Next();
+ } while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ // Move inactive iterators that start before parsed.
+ while (!inactive_iters_.empty() &&
+ icmp_->Compare(inactive_iters_.top()->start_key(), parsed) <= 0) {
+ TruncatedRangeDelIterator* iter = PopInactiveIter();
+ while (iter->Valid() && icmp_->Compare(iter->end_key(), parsed) <= 0) {
+ iter->Next();
+ }
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ return active_seqnums_.empty()
+ ? false
+ : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
+
+void ForwardRangeDelIterator::Invalidate() {
+ unused_idx_ = 0;
+ active_iters_.clear();
+ active_seqnums_.clear();
+ inactive_iters_.clear();
+}
+
+ReverseRangeDelIterator::ReverseRangeDelIterator(
+ const InternalKeyComparator* icmp)
+ : icmp_(icmp),
+ unused_idx_(0),
+ active_seqnums_(SeqMaxComparator()),
+ active_iters_(StartKeyMaxComparator(icmp)),
+ inactive_iters_(EndKeyMaxComparator(icmp)) {}
+
+bool ReverseRangeDelIterator::ShouldDelete(const ParsedInternalKey& parsed) {
+ // Move active iterators that start after parsed.
+ while (!active_iters_.empty() &&
+ icmp_->Compare(parsed, (*active_iters_.top())->start_key()) < 0) {
+ TruncatedRangeDelIterator* iter = PopActiveIter();
+ do {
+ iter->Prev();
+ } while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ // Move inactive iterators that end after parsed.
+ while (!inactive_iters_.empty() &&
+ icmp_->Compare(parsed, inactive_iters_.top()->end_key()) < 0) {
+ TruncatedRangeDelIterator* iter = PopInactiveIter();
+ while (iter->Valid() && icmp_->Compare(parsed, iter->start_key()) < 0) {
+ iter->Prev();
+ }
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ return active_seqnums_.empty()
+ ? false
+ : (*active_seqnums_.begin())->seq() > parsed.sequence;
+}
+
+void ReverseRangeDelIterator::Invalidate() {
+ unused_idx_ = 0;
+ active_iters_.clear();
+ active_seqnums_.clear();
+ inactive_iters_.clear();
+}
+
+bool RangeDelAggregator::StripeRep::ShouldDelete(
+ const ParsedInternalKey& parsed, RangeDelPositioningMode mode) {
+ if (!InStripe(parsed.sequence) || IsEmpty()) {
+ return false;
+ }
+ switch (mode) {
+ case RangeDelPositioningMode::kForwardTraversal:
+ InvalidateReverseIter();
+
+ // Pick up previously unseen iterators.
+ for (auto it = std::next(iters_.begin(), forward_iter_.UnusedIdx());
+ it != iters_.end(); ++it, forward_iter_.IncUnusedIdx()) {
+ auto& iter = *it;
+ forward_iter_.AddNewIter(iter.get(), parsed);
+ }
+
+ return forward_iter_.ShouldDelete(parsed);
+ case RangeDelPositioningMode::kBackwardTraversal:
+ InvalidateForwardIter();
+
+ // Pick up previously unseen iterators.
+ for (auto it = std::next(iters_.begin(), reverse_iter_.UnusedIdx());
+ it != iters_.end(); ++it, reverse_iter_.IncUnusedIdx()) {
+ auto& iter = *it;
+ reverse_iter_.AddNewIter(iter.get(), parsed);
+ }
+
+ return reverse_iter_.ShouldDelete(parsed);
+ default:
+ assert(false);
+ return false;
+ }
+}
+
+bool RangeDelAggregator::StripeRep::IsRangeOverlapped(const Slice& start,
+ const Slice& end) {
+ Invalidate();
+
+ // Set the internal start/end keys so that:
+ // - if start_ikey has the same user key and sequence number as the
+ // current end key, start_ikey will be considered greater; and
+ // - if end_ikey has the same user key and sequence number as the current
+ // start key, end_ikey will be considered greater.
+ ParsedInternalKey start_ikey(start, kMaxSequenceNumber,
+ static_cast<ValueType>(0));
+ ParsedInternalKey end_ikey(end, 0, static_cast<ValueType>(0));
+ for (auto& iter : iters_) {
+ bool checked_candidate_tombstones = false;
+ for (iter->SeekForPrev(start);
+ iter->Valid() && icmp_->Compare(iter->start_key(), end_ikey) <= 0;
+ iter->Next()) {
+ checked_candidate_tombstones = true;
+ if (icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+ icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+ return true;
+ }
+ }
+
+ if (!checked_candidate_tombstones) {
+ // Do an additional check for when the end of the range is the begin
+ // key of a tombstone, which we missed earlier since SeekForPrev'ing
+ // to the start was invalid.
+ iter->SeekForPrev(end);
+ if (iter->Valid() && icmp_->Compare(start_ikey, iter->end_key()) < 0 &&
+ icmp_->Compare(iter->start_key(), end_ikey) <= 0) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+void ReadRangeDelAggregator::AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest, const InternalKey* largest) {
+ if (input_iter == nullptr || input_iter->empty()) {
+ return;
+ }
+ rep_.AddTombstones(std::make_unique<TruncatedRangeDelIterator>(
+ std::move(input_iter), icmp_, smallest, largest));
+}
+
+bool ReadRangeDelAggregator::ShouldDeleteImpl(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) {
+ return rep_.ShouldDelete(parsed, mode);
+}
+
+bool ReadRangeDelAggregator::IsRangeOverlapped(const Slice& start,
+ const Slice& end) {
+ InvalidateRangeDelMapPositions();
+ return rep_.IsRangeOverlapped(start, end);
+}
+
+void CompactionRangeDelAggregator::AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest, const InternalKey* largest) {
+ if (input_iter == nullptr || input_iter->empty()) {
+ return;
+ }
+ // This bounds output of CompactionRangeDelAggregator::NewIterator.
+ if (!trim_ts_.empty()) {
+ assert(icmp_->user_comparator()->timestamp_size() > 0);
+ input_iter->SetTimestampUpperBound(&trim_ts_);
+ }
+
+ assert(input_iter->lower_bound() == 0);
+ assert(input_iter->upper_bound() == kMaxSequenceNumber);
+ parent_iters_.emplace_back(new TruncatedRangeDelIterator(
+ std::move(input_iter), icmp_, smallest, largest));
+
+ Slice* ts_upper_bound = nullptr;
+ if (!ts_upper_bound_.empty()) {
+ assert(icmp_->user_comparator()->timestamp_size() > 0);
+ ts_upper_bound = &ts_upper_bound_;
+ }
+ auto split_iters = parent_iters_.back()->SplitBySnapshot(*snapshots_);
+ for (auto& split_iter : split_iters) {
+ auto it = reps_.find(split_iter.first);
+ if (it == reps_.end()) {
+ bool inserted;
+ SequenceNumber upper_bound = split_iter.second->upper_bound();
+ SequenceNumber lower_bound = split_iter.second->lower_bound();
+ std::tie(it, inserted) = reps_.emplace(
+ split_iter.first, StripeRep(icmp_, upper_bound, lower_bound));
+ assert(inserted);
+ }
+ assert(it != reps_.end());
+ // ts_upper_bound is used to bound ShouldDelete() to only consider
+ // range tombstones under full_history_ts_low_ and trim_ts_. Keys covered by
+ // range tombstones that are above full_history_ts_low_ should not be
+ // dropped prematurely: user may read with a timestamp between the range
+ // tombstone and the covered key. Note that we cannot set timestamp
+ // upperbound on the original `input_iter` since `input_iter`s are later
+ // used in CompactionRangeDelAggregator::NewIterator to output range
+ // tombstones for persistence. We do not want to only persist range
+ // tombstones with timestamp lower than ts_upper_bound.
+ split_iter.second->SetTimestampUpperBound(ts_upper_bound);
+ it->second.AddTombstones(std::move(split_iter.second));
+ }
+}
+
+bool CompactionRangeDelAggregator::ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) {
+ auto it = reps_.lower_bound(parsed.sequence);
+ if (it == reps_.end()) {
+ return false;
+ }
+ return it->second.ShouldDelete(parsed, mode);
+}
+
+namespace {
+
+// Produce a sorted (by start internal key) stream of range tombstones from
+// `children`. A lower_bound and an upper_bound on the user key can optionally
+// be specified. Range tombstones that end before lower_bound or start after
+// upper_bound are excluded.
+// If user-defined timestamps are enabled, lower_bound and upper_bound should
+// contain a timestamp, but comparisons are done ignoring timestamps.
+class TruncatedRangeDelMergingIter : public InternalIterator {
+ public:
+ TruncatedRangeDelMergingIter(
+ const InternalKeyComparator* icmp, const Slice* lower_bound,
+ const Slice* upper_bound, bool upper_bound_inclusive,
+ const std::vector<std::unique_ptr<TruncatedRangeDelIterator>>& children)
+ : icmp_(icmp),
+ lower_bound_(lower_bound),
+ upper_bound_(upper_bound),
+ upper_bound_inclusive_(upper_bound_inclusive),
+ heap_(StartKeyMinComparator(icmp)),
+ ts_sz_(icmp_->user_comparator()->timestamp_size()) {
+ for (auto& child : children) {
+ if (child != nullptr) {
+ assert(child->lower_bound() == 0);
+ assert(child->upper_bound() == kMaxSequenceNumber);
+ children_.push_back(child.get());
+ }
+ }
+ }
+
+ bool Valid() const override {
+ return !heap_.empty() && BeforeEndKey(heap_.top());
+ }
+ Status status() const override { return Status::OK(); }
+
+ void SeekToFirst() override {
+ heap_.clear();
+ for (auto& child : children_) {
+ if (lower_bound_ != nullptr) {
+ child->Seek(*lower_bound_);
+ } else {
+ child->SeekToFirst();
+ }
+ if (child->Valid()) {
+ heap_.push(child);
+ }
+ }
+ }
+
+ void Next() override {
+ auto* top = heap_.top();
+ top->InternalNext();
+ if (top->Valid()) {
+ heap_.replace_top(top);
+ } else {
+ heap_.pop();
+ }
+ }
+
+ Slice key() const override {
+ auto* top = heap_.top();
+ if (ts_sz_) {
+ cur_start_key_.Set(top->start_key().user_key, top->seq(),
+ kTypeRangeDeletion, top->timestamp());
+ } else {
+ cur_start_key_.Set(top->start_key().user_key, top->seq(),
+ kTypeRangeDeletion);
+ }
+ assert(top->start_key().user_key.size() >= ts_sz_);
+ return cur_start_key_.Encode();
+ }
+
+ Slice value() const override {
+ auto* top = heap_.top();
+ if (!ts_sz_) {
+ return top->end_key().user_key;
+ }
+ assert(top->timestamp().size() == ts_sz_);
+ cur_end_key_.clear();
+ cur_end_key_.append(top->end_key().user_key.data(),
+ top->end_key().user_key.size() - ts_sz_);
+ cur_end_key_.append(top->timestamp().data(), ts_sz_);
+ return cur_end_key_;
+ }
+
+ // Unused InternalIterator methods
+ void Prev() override { assert(false); }
+ void Seek(const Slice& /* target */) override { assert(false); }
+ void SeekForPrev(const Slice& /* target */) override { assert(false); }
+ void SeekToLast() override { assert(false); }
+
+ private:
+ bool BeforeEndKey(const TruncatedRangeDelIterator* iter) const {
+ if (upper_bound_ == nullptr) {
+ return true;
+ }
+ int cmp = icmp_->user_comparator()->CompareWithoutTimestamp(
+ iter->start_key().user_key, *upper_bound_);
+ return upper_bound_inclusive_ ? cmp <= 0 : cmp < 0;
+ }
+
+ const InternalKeyComparator* icmp_;
+ const Slice* lower_bound_;
+ const Slice* upper_bound_;
+ bool upper_bound_inclusive_;
+ BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> heap_;
+ std::vector<TruncatedRangeDelIterator*> children_;
+
+ mutable InternalKey cur_start_key_;
+ mutable std::string cur_end_key_;
+ size_t ts_sz_;
+};
+
+} // anonymous namespace
+
+std::unique_ptr<FragmentedRangeTombstoneIterator>
+CompactionRangeDelAggregator::NewIterator(const Slice* lower_bound,
+ const Slice* upper_bound,
+ bool upper_bound_inclusive) {
+ InvalidateRangeDelMapPositions();
+ auto merging_iter = std::make_unique<TruncatedRangeDelMergingIter>(
+ icmp_, lower_bound, upper_bound, upper_bound_inclusive, parent_iters_);
+
+ auto fragmented_tombstone_list =
+ std::make_shared<FragmentedRangeTombstoneList>(
+ std::move(merging_iter), *icmp_, true /* for_compaction */,
+ *snapshots_);
+
+ return std::make_unique<FragmentedRangeTombstoneIterator>(
+ fragmented_tombstone_list, *icmp_, kMaxSequenceNumber /* upper_bound */);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_del_aggregator.h b/src/rocksdb/db/range_del_aggregator.h
new file mode 100644
index 000000000..9bd40967d
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator.h
@@ -0,0 +1,476 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <iterator>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/compaction/compaction_iteration_stats.h"
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/version_edit.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/types.h"
+#include "table/internal_iterator.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/table_builder.h"
+#include "util/heap.h"
+#include "util/kv_map.h"
+
+namespace ROCKSDB_NAMESPACE {
+
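+// Wraps a FragmentedRangeTombstoneIterator and truncates its tombstones to an
+// sstable's [smallest, largest] internal key range, so that tombstones split
+// across file boundaries do not appear to cover keys outside this file.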
+class TruncatedRangeDelIterator {
+ public:
+ TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter,
+ const InternalKeyComparator* icmp, const InternalKey* smallest,
+ const InternalKey* largest);
+
+ bool Valid() const;
+
+ void Next() { iter_->TopNext(); }
+ void Prev() { iter_->TopPrev(); }
+
+ void InternalNext() { iter_->Next(); }
+
+ // Seeks to the tombstone with the highest visible sequence number that covers
+ // target (a user key). If no such tombstone exists, the position will be at
+ // the earliest tombstone that ends after target.
+ // REQUIRES: target is a user key.
+ void Seek(const Slice& target);
+
+ // Seeks to the tombstone with the highest visible sequence number that covers
+ // target (a user key). If no such tombstone exists, the position will be at
+ // the latest tombstone that starts before target.
+ void SeekForPrev(const Slice& target);
+
+ void SeekToFirst();
+ void SeekToLast();
+
+ ParsedInternalKey start_key() const {
+ return (smallest_ == nullptr ||
+ icmp_->Compare(*smallest_, iter_->parsed_start_key()) <= 0)
+ ? iter_->parsed_start_key()
+ : *smallest_;
+ }
+
+ ParsedInternalKey end_key() const {
+ return (largest_ == nullptr ||
+ icmp_->Compare(iter_->parsed_end_key(), *largest_) <= 0)
+ ? iter_->parsed_end_key()
+ : *largest_;
+ }
+
+ SequenceNumber seq() const { return iter_->seq(); }
+ Slice timestamp() const {
+ assert(icmp_->user_comparator()->timestamp_size());
+ return iter_->timestamp();
+ }
+ void SetTimestampUpperBound(const Slice* ts_upper_bound) {
+ iter_->SetTimestampUpperBound(ts_upper_bound);
+ }
+
+ std::map<SequenceNumber, std::unique_ptr<TruncatedRangeDelIterator>>
+ SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+ SequenceNumber upper_bound() const { return iter_->upper_bound(); }
+
+ SequenceNumber lower_bound() const { return iter_->lower_bound(); }
+
+ private:
+ std::unique_ptr<FragmentedRangeTombstoneIterator> iter_;
+ const InternalKeyComparator* icmp_;
+ const ParsedInternalKey* smallest_ = nullptr;
+ const ParsedInternalKey* largest_ = nullptr;
+ std::list<ParsedInternalKey> pinned_bounds_;
+
+ const InternalKey* smallest_ikey_;
+ const InternalKey* largest_ikey_;
+};
+
+struct SeqMaxComparator {
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return a->seq() > b->seq();
+ }
+};
+
+struct StartKeyMinComparator {
+ explicit StartKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return icmp->Compare(a->start_key(), b->start_key()) > 0;
+ }
+
+ const InternalKeyComparator* icmp;
+};
+
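+// Answers ShouldDelete() queries for keys visited in ascending internal-key
+// order. Iterators whose current tombstone starts at or before the traversal
+// position are kept in the active heap/seqnum set; the rest wait in the
+// inactive heap until the traversal reaches their start key. ShouldDelete()
+// returns true when the largest active tombstone sequence number exceeds the
+// key's sequence number.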
+class ForwardRangeDelIterator {
+ public:
+ explicit ForwardRangeDelIterator(const InternalKeyComparator* icmp);
+
+ bool ShouldDelete(const ParsedInternalKey& parsed);
+ void Invalidate();
+
+ void AddNewIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ iter->Seek(parsed.user_key);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ size_t UnusedIdx() const { return unused_idx_; }
+ void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+ using ActiveSeqSet =
+ std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+ struct EndKeyMinComparator {
+ explicit EndKeyMinComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const ActiveSeqSet::const_iterator& a,
+ const ActiveSeqSet::const_iterator& b) const {
+ return icmp->Compare((*a)->end_key(), (*b)->end_key()) > 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+
+ void PushIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ if (!iter->Valid()) {
+ // The iterator has been fully consumed, so we don't need to add it to
+ // either of the heaps.
+ return;
+ }
+ int cmp = icmp_->Compare(parsed, iter->start_key());
+ if (cmp < 0) {
+ PushInactiveIter(iter);
+ } else {
+ PushActiveIter(iter);
+ }
+ }
+
+ void PushActiveIter(TruncatedRangeDelIterator* iter) {
+ auto seq_pos = active_seqnums_.insert(iter);
+ active_iters_.push(seq_pos);
+ }
+
+ TruncatedRangeDelIterator* PopActiveIter() {
+ auto active_top = active_iters_.top();
+ auto iter = *active_top;
+ active_iters_.pop();
+ active_seqnums_.erase(active_top);
+ return iter;
+ }
+
+ void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+ inactive_iters_.push(iter);
+ }
+
+ TruncatedRangeDelIterator* PopInactiveIter() {
+ auto* iter = inactive_iters_.top();
+ inactive_iters_.pop();
+ return iter;
+ }
+
+ const InternalKeyComparator* icmp_;
+ size_t unused_idx_;
+ ActiveSeqSet active_seqnums_;
+ BinaryHeap<ActiveSeqSet::const_iterator, EndKeyMinComparator> active_iters_;
+ BinaryHeap<TruncatedRangeDelIterator*, StartKeyMinComparator> inactive_iters_;
+};
+
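+// Mirror of ForwardRangeDelIterator for keys visited in descending
+// internal-key order.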
+class ReverseRangeDelIterator {
+ public:
+ explicit ReverseRangeDelIterator(const InternalKeyComparator* icmp);
+
+ bool ShouldDelete(const ParsedInternalKey& parsed);
+ void Invalidate();
+
+ void AddNewIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ iter->SeekForPrev(parsed.user_key);
+ PushIter(iter, parsed);
+ assert(active_iters_.size() == active_seqnums_.size());
+ }
+
+ size_t UnusedIdx() const { return unused_idx_; }
+ void IncUnusedIdx() { unused_idx_++; }
+
+ private:
+ using ActiveSeqSet =
+ std::multiset<TruncatedRangeDelIterator*, SeqMaxComparator>;
+
+ struct EndKeyMaxComparator {
+ explicit EndKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const TruncatedRangeDelIterator* a,
+ const TruncatedRangeDelIterator* b) const {
+ return icmp->Compare(a->end_key(), b->end_key()) < 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+ struct StartKeyMaxComparator {
+ explicit StartKeyMaxComparator(const InternalKeyComparator* c) : icmp(c) {}
+
+ bool operator()(const ActiveSeqSet::const_iterator& a,
+ const ActiveSeqSet::const_iterator& b) const {
+ return icmp->Compare((*a)->start_key(), (*b)->start_key()) < 0;
+ }
+
+ const InternalKeyComparator* icmp;
+ };
+
+ void PushIter(TruncatedRangeDelIterator* iter,
+ const ParsedInternalKey& parsed) {
+ if (!iter->Valid()) {
+ // The iterator has been fully consumed, so we don't need to add it to
+ // either of the heaps.
+ } else if (icmp_->Compare(iter->end_key(), parsed) <= 0) {
+ PushInactiveIter(iter);
+ } else {
+ PushActiveIter(iter);
+ }
+ }
+
+ void PushActiveIter(TruncatedRangeDelIterator* iter) {
+ auto seq_pos = active_seqnums_.insert(iter);
+ active_iters_.push(seq_pos);
+ }
+
+ TruncatedRangeDelIterator* PopActiveIter() {
+ auto active_top = active_iters_.top();
+ auto iter = *active_top;
+ active_iters_.pop();
+ active_seqnums_.erase(active_top);
+ return iter;
+ }
+
+ void PushInactiveIter(TruncatedRangeDelIterator* iter) {
+ inactive_iters_.push(iter);
+ }
+
+ TruncatedRangeDelIterator* PopInactiveIter() {
+ auto* iter = inactive_iters_.top();
+ inactive_iters_.pop();
+ return iter;
+ }
+
+ const InternalKeyComparator* icmp_;
+ size_t unused_idx_;
+ ActiveSeqSet active_seqnums_;
+ BinaryHeap<ActiveSeqSet::const_iterator, StartKeyMaxComparator> active_iters_;
+ BinaryHeap<TruncatedRangeDelIterator*, EndKeyMaxComparator> inactive_iters_;
+};
+
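+// Indicates the order in which keys will be passed to ShouldDelete():
+// ascending internal-key order (forward) or descending order (backward).
+// Switching direction invalidates the positioning state of the other
+// direction.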
+enum class RangeDelPositioningMode { kForwardTraversal, kBackwardTraversal };
+class RangeDelAggregator {
+ public:
+ explicit RangeDelAggregator(const InternalKeyComparator* icmp)
+ : icmp_(icmp) {}
+ virtual ~RangeDelAggregator() {}
+
+ virtual void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) = 0;
+
+ bool ShouldDelete(const Slice& ikey, RangeDelPositioningMode mode) {
+ ParsedInternalKey parsed;
+
+ Status pik_status =
+ ParseInternalKey(ikey, &parsed, false /* log_err_key */); // TODO
+ assert(pik_status.ok());
+ if (!pik_status.ok()) {
+ return false;
+ }
+
+ return ShouldDelete(parsed, mode);
+ }
+ virtual bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) = 0;
+
+ virtual void InvalidateRangeDelMapPositions() = 0;
+
+ virtual bool IsEmpty() const = 0;
+
+ bool AddFile(uint64_t file_number) {
+ return files_seen_.insert(file_number).second;
+ }
+
+ protected:
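+ // A set of truncated tombstone iterators restricted to a single sequence
+ // number stripe [lower_bound, upper_bound], with forward and reverse
+ // positioning state for ShouldDelete().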
+ class StripeRep {
+ public:
+ StripeRep(const InternalKeyComparator* icmp, SequenceNumber upper_bound,
+ SequenceNumber lower_bound)
+ : icmp_(icmp),
+ forward_iter_(icmp),
+ reverse_iter_(icmp),
+ upper_bound_(upper_bound),
+ lower_bound_(lower_bound) {}
+
+ void AddTombstones(std::unique_ptr<TruncatedRangeDelIterator> input_iter) {
+ iters_.push_back(std::move(input_iter));
+ }
+
+ bool IsEmpty() const { return iters_.empty(); }
+
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode);
+
+ void Invalidate() {
+ if (!IsEmpty()) {
+ InvalidateForwardIter();
+ InvalidateReverseIter();
+ }
+ }
+
+ // If user-defined timestamp is enabled, `start` and `end` are user keys
+ // with timestamp.
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ private:
+ bool InStripe(SequenceNumber seq) const {
+ return lower_bound_ <= seq && seq <= upper_bound_;
+ }
+
+ void InvalidateForwardIter() { forward_iter_.Invalidate(); }
+
+ void InvalidateReverseIter() { reverse_iter_.Invalidate(); }
+
+ const InternalKeyComparator* icmp_;
+ std::vector<std::unique_ptr<TruncatedRangeDelIterator>> iters_;
+ ForwardRangeDelIterator forward_iter_;
+ ReverseRangeDelIterator reverse_iter_;
+ SequenceNumber upper_bound_;
+ SequenceNumber lower_bound_;
+ };
+
+ const InternalKeyComparator* icmp_;
+
+ private:
+ std::set<uint64_t> files_seen_;
+};
+
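+// Aggregator for read paths: a single stripe covers sequence numbers
+// [0, upper_bound], i.e. every tombstone visible at the read sequence number.
+// Minimal usage sketch (placeholder names, not taken from this file):
+//
+//   ReadRangeDelAggregator agg(&icmp, read_seq);
+//   agg.AddTombstones(std::move(fragmented_tombstone_iter));
+//   if (agg.ShouldDelete(ikey, RangeDelPositioningMode::kForwardTraversal)) {
+//     // ikey is covered by a newer range tombstone and can be skipped.
+//   }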
+class ReadRangeDelAggregator final : public RangeDelAggregator {
+ public:
+ ReadRangeDelAggregator(const InternalKeyComparator* icmp,
+ SequenceNumber upper_bound)
+ : RangeDelAggregator(icmp),
+ rep_(icmp, upper_bound, 0 /* lower_bound */) {}
+ ~ReadRangeDelAggregator() override {}
+
+ using RangeDelAggregator::ShouldDelete;
+ void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) override;
+
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) final override {
+ if (rep_.IsEmpty()) {
+ return false;
+ }
+ return ShouldDeleteImpl(parsed, mode);
+ }
+
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ void InvalidateRangeDelMapPositions() override { rep_.Invalidate(); }
+
+ bool IsEmpty() const override { return rep_.IsEmpty(); }
+
+ private:
+ StripeRep rep_;
+
+ bool ShouldDeleteImpl(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode);
+};
+
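+// Aggregator for compactions: added tombstones are split by snapshot into
+// per-stripe StripeReps, so ShouldDelete() only consults tombstones in the
+// same snapshot stripe as the key. NewIterator() re-emits all tombstones for
+// persistence in the compaction output.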
+class CompactionRangeDelAggregator : public RangeDelAggregator {
+ public:
+ CompactionRangeDelAggregator(const InternalKeyComparator* icmp,
+ const std::vector<SequenceNumber>& snapshots,
+ const std::string* full_history_ts_low = nullptr,
+ const std::string* trim_ts = nullptr)
+ : RangeDelAggregator(icmp), snapshots_(&snapshots) {
+ if (full_history_ts_low) {
+ ts_upper_bound_ = *full_history_ts_low;
+ }
+ if (trim_ts) {
+ trim_ts_ = *trim_ts;
+ // Range tombstones newer than `trim_ts` or `full_history_ts_low` should
+ // not be considered in ShouldDelete().
+ if (ts_upper_bound_.empty()) {
+ ts_upper_bound_ = trim_ts_;
+ } else if (!trim_ts_.empty() && icmp->user_comparator()->CompareTimestamp(
+ trim_ts_, ts_upper_bound_) < 0) {
+ ts_upper_bound_ = trim_ts_;
+ }
+ }
+ }
+ ~CompactionRangeDelAggregator() override {}
+
+ void AddTombstones(
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter,
+ const InternalKey* smallest = nullptr,
+ const InternalKey* largest = nullptr) override;
+
+ using RangeDelAggregator::ShouldDelete;
+ bool ShouldDelete(const ParsedInternalKey& parsed,
+ RangeDelPositioningMode mode) override;
+
+ bool IsRangeOverlapped(const Slice& start, const Slice& end);
+
+ void InvalidateRangeDelMapPositions() override {
+ for (auto& rep : reps_) {
+ rep.second.Invalidate();
+ }
+ }
+
+ bool IsEmpty() const override {
+ for (const auto& rep : reps_) {
+ if (!rep.second.IsEmpty()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // Creates an iterator over all the range tombstones in the aggregator, for
+ // use in compaction. Nullptr arguments indicate that the iterator range is
+ // unbounded.
+ // NOTE: the boundaries are used for optimization purposes to reduce the
+ // number of tombstones that are passed to the fragmenter; they do not
+ // guarantee that the resulting iterator only contains range tombstones that
+ // cover keys in the provided range. If required, these bounds must be
+ // enforced during iteration.
+ std::unique_ptr<FragmentedRangeTombstoneIterator> NewIterator(
+ const Slice* lower_bound = nullptr, const Slice* upper_bound = nullptr,
+ bool upper_bound_inclusive = false);
+
+ private:
+ std::vector<std::unique_ptr<TruncatedRangeDelIterator>> parent_iters_;
+ std::map<SequenceNumber, StripeRep> reps_;
+
+ const std::vector<SequenceNumber>* snapshots_;
+ // The minimum of full_history_ts_low and trim_ts_.
+ Slice ts_upper_bound_{};
+ Slice trim_ts_{};
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_del_aggregator_bench.cc b/src/rocksdb/db/range_del_aggregator_bench.cc
new file mode 100644
index 000000000..9dca707e5
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_bench.cc
@@ -0,0 +1,280 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef GFLAGS
+#include <cstdio>
+int main() {
+ fprintf(stderr, "Please install gflags to run rocksdb tools\n");
+ return 1;
+}
+#else
+
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <random>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/system_clock.h"
+#include "util/coding.h"
+#include "util/gflags_compat.h"
+#include "util/random.h"
+#include "util/stop_watch.h"
+#include "util/vector_iterator.h"
+
+using GFLAGS_NAMESPACE::ParseCommandLineFlags;
+
+DEFINE_int32(num_range_tombstones, 1000, "number of range tombstones created");
+
+DEFINE_int32(num_runs, 1000, "number of test runs");
+
+DEFINE_int32(tombstone_start_upper_bound, 1000,
+ "exclusive upper bound on range tombstone start keys");
+
+DEFINE_int32(should_delete_upper_bound, 1000,
+ "exclusive upper bound on keys passed to ShouldDelete");
+
+DEFINE_double(tombstone_width_mean, 100.0, "average range tombstone width");
+
+DEFINE_double(tombstone_width_stddev, 0.0,
+ "standard deviation of range tombstone width");
+
+DEFINE_int32(seed, 0, "random number generator seed");
+
+DEFINE_int32(should_deletes_per_run, 1, "number of ShouldDelete calls per run");
+
+DEFINE_int32(add_tombstones_per_run, 1,
+ "number of AddTombstones calls per run");
+
+DEFINE_bool(use_compaction_range_del_aggregator, false,
+ "Whether to use CompactionRangeDelAggregator. Default is to use "
+ "ReadRangeDelAggregator.");
+
+namespace {
+
+struct Stats {
+ uint64_t time_add_tombstones = 0;
+ uint64_t time_first_should_delete = 0;
+ uint64_t time_rest_should_delete = 0;
+ uint64_t time_fragment_tombstones = 0;
+};
+
+std::ostream& operator<<(std::ostream& os, const Stats& s) {
+ std::ios fmt_holder(nullptr);
+ fmt_holder.copyfmt(os);
+
+ os << std::left;
+ os << std::setw(25) << "Fragment Tombstones: "
+ << s.time_fragment_tombstones /
+ (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3)
+ << " us\n";
+ os << std::setw(25) << "AddTombstones: "
+ << s.time_add_tombstones /
+ (FLAGS_add_tombstones_per_run * FLAGS_num_runs * 1.0e3)
+ << " us\n";
+ os << std::setw(25) << "ShouldDelete (first): "
+ << s.time_first_should_delete / (FLAGS_num_runs * 1.0e3) << " us\n";
+ if (FLAGS_should_deletes_per_run > 1) {
+ os << std::setw(25) << "ShouldDelete (rest): "
+ << s.time_rest_should_delete /
+ ((FLAGS_should_deletes_per_run - 1) * FLAGS_num_runs * 1.0e3)
+ << " us\n";
+ }
+
+ os.copyfmt(fmt_holder);
+ return os;
+}
+
+auto icmp = ROCKSDB_NAMESPACE::InternalKeyComparator(
+ ROCKSDB_NAMESPACE::BytewiseComparator());
+
+} // anonymous namespace
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// A wrapper around a RangeTombstone that owns the underlying data of its start
+// and end keys.
+struct PersistentRangeTombstone {
+ std::string start_key;
+ std::string end_key;
+ RangeTombstone tombstone;
+
+ PersistentRangeTombstone(std::string start, std::string end,
+ SequenceNumber seq)
+ : start_key(std::move(start)), end_key(std::move(end)) {
+ tombstone = RangeTombstone(start_key, end_key, seq);
+ }
+
+ PersistentRangeTombstone() = default;
+
+ PersistentRangeTombstone(const PersistentRangeTombstone& t) { *this = t; }
+
+ PersistentRangeTombstone& operator=(const PersistentRangeTombstone& t) {
+ start_key = t.start_key;
+ end_key = t.end_key;
+ tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+ return *this;
+ }
+
+ PersistentRangeTombstone(PersistentRangeTombstone&& t) noexcept { *this = std::move(t); }
+
+ PersistentRangeTombstone& operator=(PersistentRangeTombstone&& t) {
+ start_key = std::move(t.start_key);
+ end_key = std::move(t.end_key);
+ tombstone = RangeTombstone(start_key, end_key, t.tombstone.seq_);
+
+ return *this;
+ }
+};
+
+struct TombstoneStartKeyComparator {
+ explicit TombstoneStartKeyComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstone& a, const RangeTombstone& b) const {
+ return cmp->Compare(a.start_key_, b.start_key_) < 0;
+ }
+
+ const Comparator* cmp;
+};
+
+std::unique_ptr<InternalIterator> MakeRangeDelIterator(
+ const std::vector<PersistentRangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.tombstone.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<VectorIterator>(
+ new VectorIterator(keys, values, &icmp));
+}
+
+// Converts an int64_t to a big-endian key so that bytewise key ordering
+// matches numeric ordering.
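+// For example, Key(1) yields the bytes 00 00 00 00 00 00 00 01 and Key(256)
+// yields 00 00 00 00 00 00 01 00, so comparing the returned keys with the
+// bytewise comparator agrees with comparing the original integers, which the
+// benchmark's tombstone bounds rely on.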
+static std::string Key(int64_t val) {
+ std::string little_endian_key;
+ std::string big_endian_key;
+ PutFixed64(&little_endian_key, val);
+ assert(little_endian_key.size() == sizeof(val));
+ big_endian_key.resize(sizeof(val));
+ for (size_t i = 0; i < sizeof(val); ++i) {
+ big_endian_key[i] = little_endian_key[sizeof(val) - 1 - i];
+ }
+ return big_endian_key;
+}
+
+} // anonymous namespace
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ParseCommandLineFlags(&argc, &argv, true);
+
+ Stats stats;
+ ROCKSDB_NAMESPACE::SystemClock* clock =
+ ROCKSDB_NAMESPACE::SystemClock::Default().get();
+ ROCKSDB_NAMESPACE::Random64 rnd(FLAGS_seed);
+ std::default_random_engine random_gen(FLAGS_seed);
+ std::normal_distribution<double> normal_dist(FLAGS_tombstone_width_mean,
+ FLAGS_tombstone_width_stddev);
+ std::vector<std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone> >
+ all_persistent_range_tombstones(FLAGS_add_tombstones_per_run);
+ for (int i = 0; i < FLAGS_add_tombstones_per_run; i++) {
+ all_persistent_range_tombstones[i] =
+ std::vector<ROCKSDB_NAMESPACE::PersistentRangeTombstone>(
+ FLAGS_num_range_tombstones);
+ }
+ auto mode = ROCKSDB_NAMESPACE::RangeDelPositioningMode::kForwardTraversal;
+ std::vector<ROCKSDB_NAMESPACE::SequenceNumber> snapshots{0};
+ for (int i = 0; i < FLAGS_num_runs; i++) {
+ std::unique_ptr<ROCKSDB_NAMESPACE::RangeDelAggregator> range_del_agg =
+ nullptr;
+ if (FLAGS_use_compaction_range_del_aggregator) {
+ range_del_agg.reset(new ROCKSDB_NAMESPACE::CompactionRangeDelAggregator(
+ &icmp, snapshots));
+ } else {
+ range_del_agg.reset(new ROCKSDB_NAMESPACE::ReadRangeDelAggregator(
+ &icmp, ROCKSDB_NAMESPACE::kMaxSequenceNumber /* upper_bound */));
+ }
+
+ std::vector<
+ std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList> >
+ fragmented_range_tombstone_lists(FLAGS_add_tombstones_per_run);
+
+ for (auto& persistent_range_tombstones : all_persistent_range_tombstones) {
+ // TODO(abhimadan): consider whether creating the range tombstones right
+ // before AddTombstones is artificially warming the cache compared to
+ // real workloads.
+ for (int j = 0; j < FLAGS_num_range_tombstones; j++) {
+ uint64_t start = rnd.Uniform(FLAGS_tombstone_start_upper_bound);
+ uint64_t end = static_cast<uint64_t>(
+ std::round(start + std::max(1.0, normal_dist(random_gen))));
+ persistent_range_tombstones[j] =
+ ROCKSDB_NAMESPACE::PersistentRangeTombstone(
+ ROCKSDB_NAMESPACE::Key(start), ROCKSDB_NAMESPACE::Key(end), j);
+ }
+ auto iter =
+ ROCKSDB_NAMESPACE::MakeRangeDelIterator(persistent_range_tombstones);
+ ROCKSDB_NAMESPACE::StopWatchNano stop_watch_fragment_tombstones(
+ clock, true /* auto_start */);
+ fragmented_range_tombstone_lists.emplace_back(
+ new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneList(
+ std::move(iter), icmp, FLAGS_use_compaction_range_del_aggregator,
+ snapshots));
+ stats.time_fragment_tombstones +=
+ stop_watch_fragment_tombstones.ElapsedNanos();
+ std::unique_ptr<ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator>
+ fragmented_range_del_iter(
+ new ROCKSDB_NAMESPACE::FragmentedRangeTombstoneIterator(
+ fragmented_range_tombstone_lists.back().get(), icmp,
+ ROCKSDB_NAMESPACE::kMaxSequenceNumber));
+
+ ROCKSDB_NAMESPACE::StopWatchNano stop_watch_add_tombstones(
+ clock, true /* auto_start */);
+ range_del_agg->AddTombstones(std::move(fragmented_range_del_iter));
+ stats.time_add_tombstones += stop_watch_add_tombstones.ElapsedNanos();
+ }
+
+ ROCKSDB_NAMESPACE::ParsedInternalKey parsed_key;
+ parsed_key.sequence = FLAGS_num_range_tombstones / 2;
+ parsed_key.type = ROCKSDB_NAMESPACE::kTypeValue;
+
+ uint64_t first_key = rnd.Uniform(FLAGS_should_delete_upper_bound -
+ FLAGS_should_deletes_per_run + 1);
+
+ for (int j = 0; j < FLAGS_should_deletes_per_run; j++) {
+ std::string key_string = ROCKSDB_NAMESPACE::Key(first_key + j);
+ parsed_key.user_key = key_string;
+
+ ROCKSDB_NAMESPACE::StopWatchNano stop_watch_should_delete(
+ clock, true /* auto_start */);
+ range_del_agg->ShouldDelete(parsed_key, mode);
+ uint64_t call_time = stop_watch_should_delete.ElapsedNanos();
+
+ if (j == 0) {
+ stats.time_first_should_delete += call_time;
+ } else {
+ stats.time_rest_should_delete += call_time;
+ }
+ }
+ }
+
+ std::cout << "=========================\n"
+ << "Results:\n"
+ << "=========================\n"
+ << stats;
+
+ return 0;
+}
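+
+// Example invocation (the flag values are illustrative, and the binary name
+// assumes the usual convention of naming the benchmark target after this
+// file):
+//
+//   ./range_del_aggregator_bench --num_range_tombstones=10000 \
+//       --add_tombstones_per_run=4 --should_deletes_per_run=16 \
+//       --use_compaction_range_del_aggregator=true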
+
+#endif // GFLAGS
diff --git a/src/rocksdb/db/range_del_aggregator_test.cc b/src/rocksdb/db/range_del_aggregator_test.cc
new file mode 100644
index 000000000..7fe35276a
--- /dev/null
+++ b/src/rocksdb/db/range_del_aggregator_test.cc
@@ -0,0 +1,715 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_del_aggregator.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeDelAggregatorTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+ const std::vector<RangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<VectorIterator>(
+ new VectorIterator(keys, values, &bytewise_icmp));
+}
+
+std::vector<std::unique_ptr<FragmentedRangeTombstoneList>>
+MakeFragmentedTombstoneLists(
+ const std::vector<std::vector<RangeTombstone>>& range_dels_list) {
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneList>> fragment_lists;
+ for (const auto& range_dels : range_dels_list) {
+ auto range_del_iter = MakeRangeDelIter(range_dels);
+ fragment_lists.emplace_back(new FragmentedRangeTombstoneList(
+ std::move(range_del_iter), bytewise_icmp));
+ }
+ return fragment_lists;
+}
+
+struct TruncatedIterScanTestCase {
+ ParsedInternalKey start;
+ ParsedInternalKey end;
+ SequenceNumber seq;
+};
+
+struct TruncatedIterSeekTestCase {
+ Slice target;
+ ParsedInternalKey start;
+ ParsedInternalKey end;
+ SequenceNumber seq;
+ bool invalid;
+};
+
+struct ShouldDeleteTestCase {
+ ParsedInternalKey lookup_key;
+ bool result;
+};
+
+struct IsRangeOverlappedTestCase {
+ Slice start;
+ Slice end;
+ bool result;
+};
+
+ParsedInternalKey UncutEndpoint(const Slice& s) {
+ return ParsedInternalKey(s, kMaxSequenceNumber, kTypeRangeDeletion);
+}
+
+ParsedInternalKey InternalValue(const Slice& key, SequenceNumber seq,
+ ValueType type = kTypeValue) {
+ return ParsedInternalKey(key, seq, type);
+}
+
+void VerifyIterator(
+ TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterScanTestCase>& expected_range_dels) {
+ // Test forward iteration.
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_range_dels.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), expected_range_dels[i].start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), expected_range_dels[i].end));
+ EXPECT_EQ(expected_range_dels[i].seq, iter->seq());
+ }
+ EXPECT_FALSE(iter->Valid());
+
+ // Test reverse iteration.
+ iter->SeekToLast();
+ std::vector<TruncatedIterScanTestCase> reverse_expected_range_dels(
+ expected_range_dels.rbegin(), expected_range_dels.rend());
+ for (size_t i = 0; i < reverse_expected_range_dels.size();
+ i++, iter->Prev()) {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(),
+ reverse_expected_range_dels[i].start));
+ EXPECT_EQ(
+ 0, icmp.Compare(iter->end_key(), reverse_expected_range_dels[i].end));
+ EXPECT_EQ(reverse_expected_range_dels[i].seq, iter->seq());
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+void VerifySeek(TruncatedRangeDelIterator* iter,
+ const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ iter->Seek(test_case.target);
+ if (test_case.invalid) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+ EXPECT_EQ(test_case.seq, iter->seq());
+ }
+ }
+}
+
+void VerifySeekForPrev(
+ TruncatedRangeDelIterator* iter, const InternalKeyComparator& icmp,
+ const std::vector<TruncatedIterSeekTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ iter->SeekForPrev(test_case.target);
+ if (test_case.invalid) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ EXPECT_EQ(0, icmp.Compare(iter->start_key(), test_case.start));
+ EXPECT_EQ(0, icmp.Compare(iter->end_key(), test_case.end));
+ EXPECT_EQ(test_case.seq, iter->seq());
+ }
+ }
+}
+
+void VerifyShouldDelete(RangeDelAggregator* range_del_agg,
+ const std::vector<ShouldDeleteTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ EXPECT_EQ(
+ test_case.result,
+ range_del_agg->ShouldDelete(
+ test_case.lookup_key, RangeDelPositioningMode::kForwardTraversal));
+ }
+ for (auto it = test_cases.rbegin(); it != test_cases.rend(); ++it) {
+ const auto& test_case = *it;
+ EXPECT_EQ(
+ test_case.result,
+ range_del_agg->ShouldDelete(
+ test_case.lookup_key, RangeDelPositioningMode::kBackwardTraversal));
+ }
+}
+
+void VerifyIsRangeOverlapped(
+ ReadRangeDelAggregator* range_del_agg,
+ const std::vector<IsRangeOverlappedTestCase>& test_cases) {
+ for (const auto& test_case : test_cases) {
+ EXPECT_EQ(test_case.result,
+ range_del_agg->IsRangeOverlapped(test_case.start, test_case.end));
+ }
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+ const FragmentedRangeTombstoneIterator* iter) {
+ // Test InternalIterator interface.
+ EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+ EXPECT_EQ(tombstone.end_key_, iter->value());
+ EXPECT_EQ(tombstone.seq_, iter->seq());
+
+ // Test FragmentedRangeTombstoneIterator interface.
+ EXPECT_EQ(tombstone.start_key_, iter->start_key());
+ EXPECT_EQ(tombstone.end_key_, iter->end_key());
+ EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+} // anonymous namespace
+
+TEST_F(RangeDelAggregatorTest, EmptyTruncatedIter) {
+ auto range_del_iter = MakeRangeDelIter({});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ iter.SeekToFirst();
+ ASSERT_FALSE(iter.Valid());
+
+ iter.SeekToLast();
+ ASSERT_FALSE(iter.Valid());
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIter) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", UncutEndpoint("a"), UncutEndpoint("e"), 10}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("a"), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, UntruncatedIterWithSnapshot) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ 9 /* snapshot */));
+
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp, nullptr,
+ nullptr);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), UncutEndpoint("n"), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", UncutEndpoint("e"), UncutEndpoint("g"), 8}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), UncutEndpoint("n"), 4},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterPartiallyCutTombstones) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ InternalKey smallest("d", 7, kTypeValue);
+ InternalKey largest("m", 9, kTypeValue);
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+ &smallest, &largest);
+
+ VerifyIterator(
+ &iter, bytewise_icmp,
+ {{InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4,
+ false /* invalid */},
+ {"n", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"", InternalValue("d", 7), UncutEndpoint("e"), 10}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("d", 7), UncutEndpoint("e"), 10},
+ {"e", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"ia", UncutEndpoint("e"), UncutEndpoint("g"), 8},
+ {"n", UncutEndpoint("j"), InternalValue("m", 8, kValueTypeForSeek), 4,
+ false /* invalid */},
+ {"", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+}
+
+TEST_F(RangeDelAggregatorTest, TruncatedIterFullyCutTombstones) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"e", "g", 8}, {"j", "n", 4}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ InternalKey smallest("f", 7, kTypeValue);
+ InternalKey largest("i", 9, kTypeValue);
+ TruncatedRangeDelIterator iter(std::move(input_iter), &bytewise_icmp,
+ &smallest, &largest);
+
+ VerifyIterator(&iter, bytewise_icmp,
+ {{InternalValue("f", 7), UncutEndpoint("g"), 8}});
+
+ VerifySeek(
+ &iter, bytewise_icmp,
+ {{"d", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"j", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */}});
+
+ VerifySeekForPrev(
+ &iter, bytewise_icmp,
+ {{"d", UncutEndpoint(""), UncutEndpoint(""), 0, true /* invalid */},
+ {"f", InternalValue("f", 7), UncutEndpoint("g"), 8},
+ {"j", InternalValue("f", 7), UncutEndpoint("g"), 8}});
+}
+
+TEST_F(RangeDelAggregatorTest, SingleIterInAggregator) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 8}});
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber));
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+ range_del_agg.AddTombstones(std::move(input_iter));
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, kMaxSequenceNumber);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+ {InternalValue("b", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), true},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", true},
+ {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleItersInAggregatorWithUpperBound) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), false},
+ {InternalValue("a", 9), true},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), false},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "c", true},
+ {"d", "f", true},
+ {"g", "l", true},
+ {"x", "y", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+ std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+ {InternalKey("a", 4, kTypeValue),
+ InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("m", 20, kTypeValue),
+ InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+ for (size_t i = 0; i < fragment_lists.size(); i++) {
+ const auto& fragment_list = fragment_lists[i];
+ const auto& bounds = iter_bounds[i];
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter), &bounds.first,
+ &bounds.second);
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+ {InternalValue("a", 9), false},
+ {InternalValue("a", 4), true},
+ {InternalValue("m", 10), false},
+ {InternalValue("m", 9), true},
+ {InternalValue("x", 10), false},
+ {InternalValue("x", 9), false},
+ {InternalValue("x", 5), true},
+ {InternalValue("z", 9), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "n", true},
+ {"l", "x", true},
+ {"w", "z", true},
+ {"zzz", "zz", false},
+ {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, MultipleTruncatedItersInAggregatorSameLevel) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "z", 10}}, {{"a", "z", 10}}, {{"a", "z", 10}}});
+ std::vector<std::pair<InternalKey, InternalKey>> iter_bounds = {
+ {InternalKey("a", 4, kTypeValue),
+ InternalKey("m", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("m", 20, kTypeValue),
+ InternalKey("x", kMaxSequenceNumber, kTypeRangeDeletion)},
+ {InternalKey("x", 5, kTypeValue), InternalKey("zz", 30, kTypeValue)}};
+
+ ReadRangeDelAggregator range_del_agg(&bytewise_icmp, 19);
+
+ auto add_iter_to_agg = [&](size_t i) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_lists[i].get(),
+ bytewise_icmp, 19 /* snapshot */));
+ range_del_agg.AddTombstones(std::move(input_iter), &iter_bounds[i].first,
+ &iter_bounds[i].second);
+ };
+
+ add_iter_to_agg(0);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 10), false},
+ {InternalValue("a", 9), false},
+ {InternalValue("a", 4), true}});
+
+ add_iter_to_agg(1);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("m", 10), false},
+ {InternalValue("m", 9), true}});
+
+ add_iter_to_agg(2);
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("x", 10), false},
+ {InternalValue("x", 9), false},
+ {InternalValue("x", 5), true},
+ {InternalValue("z", 9), false}});
+
+ VerifyIsRangeOverlapped(&range_del_agg, {{"", "_", false},
+ {"_", "a", true},
+ {"a", "n", true},
+ {"l", "x", true},
+ {"w", "z", true},
+ {"zzz", "zz", false},
+ {"zz", "zzz", false}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorNoSnapshots) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots;
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(&range_del_agg, {{InternalValue("a", 19), true},
+ {InternalValue("b", 19), false},
+ {InternalValue("b", 9), true},
+ {InternalValue("d", 9), true},
+ {InternalValue("e", 7), true},
+ {InternalValue("g", 7), false},
+ {InternalValue("h", 24), true},
+ {InternalValue("i", 24), false},
+ {InternalValue("ii", 14), true},
+ {InternalValue("j", 14), false}});
+
+ auto range_del_compaction_iter = range_del_agg.NewIterator();
+ VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+ {"b", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"h", "i", 25},
+ {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorWithSnapshots) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ VerifyShouldDelete(
+ &range_del_agg,
+ {
+ {InternalValue("a", 19), false}, // [10, 19]
+ {InternalValue("a", 9), false}, // [0, 9]
+ {InternalValue("b", 9), false}, // [0, 9]
+ {InternalValue("d", 9), false}, // [0, 9]
+ {InternalValue("d", 7), true}, // [0, 9]
+ {InternalValue("e", 7), true}, // [0, 9]
+ {InternalValue("g", 7), false}, // [0, 9]
+ {InternalValue("h", 24), true}, // [20, kMaxSequenceNumber]
+ {InternalValue("i", 24), false}, // [20, kMaxSequenceNumber]
+ {InternalValue("ii", 14), true}, // [10, 19]
+ {InternalValue("j", 14), false} // [10, 19]
+ });
+
+ auto range_del_compaction_iter = range_del_agg.NewIterator();
+ VerifyFragmentedRangeDels(range_del_compaction_iter.get(), {{"a", "b", 20},
+ {"a", "b", 10},
+ {"b", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"e", "g", 8},
+ {"h", "i", 25},
+ {"ii", "j", 15}});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorLeft) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("_");
+ Slice end("__");
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorEmptyIteratorRight) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("p");
+ Slice end("q");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {});
+}
+
+TEST_F(RangeDelAggregatorTest, CompactionAggregatorBoundedIterator) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "e", 10}, {"c", "g", 8}},
+ {{"a", "b", 20}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("bb");
+ Slice end("e");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(),
+ {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(
+ range_del_compaction_iter2.get(),
+ {{"a", "c", 10}, {"c", "e", 10}, {"c", "e", 8}, {"e", "g", 8}});
+}
+
+TEST_F(RangeDelAggregatorTest,
+ CompactionAggregatorBoundedIteratorExtraFragments) {
+ auto fragment_lists = MakeFragmentedTombstoneLists(
+ {{{"a", "d", 10}, {"c", "g", 8}},
+ {{"b", "c", 20}, {"d", "f", 30}, {"h", "i", 25}, {"ii", "j", 15}}});
+
+ std::vector<SequenceNumber> snapshots{9, 19};
+ CompactionRangeDelAggregator range_del_agg(&bytewise_icmp, snapshots);
+ for (const auto& fragment_list : fragment_lists) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> input_iter(
+ new FragmentedRangeTombstoneIterator(fragment_list.get(), bytewise_icmp,
+ kMaxSequenceNumber));
+ range_del_agg.AddTombstones(std::move(input_iter));
+ }
+
+ Slice start("bb");
+ Slice end("e");
+ auto range_del_compaction_iter1 =
+ range_del_agg.NewIterator(&start, &end, false /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter1.get(), {{"a", "b", 10},
+ {"b", "c", 20},
+ {"b", "c", 10},
+ {"c", "d", 10},
+ {"c", "d", 8},
+ {"d", "f", 30},
+ {"d", "f", 8},
+ {"f", "g", 8}});
+
+ auto range_del_compaction_iter2 =
+ range_del_agg.NewIterator(&start, &end, true /* end_key_inclusive */);
+ VerifyFragmentedRangeDels(range_del_compaction_iter2.get(), {{"a", "b", 10},
+ {"b", "c", 20},
+ {"b", "c", 10},
+ {"c", "d", 10},
+ {"c", "d", 8},
+ {"d", "f", 30},
+ {"d", "f", 8},
+ {"f", "g", 8}});
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.cc b/src/rocksdb/db/range_tombstone_fragmenter.cc
new file mode 100644
index 000000000..7e7cedeca
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.cc
@@ -0,0 +1,502 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <functional>
+#include <set>
+
+#include "util/autovector.h"
+#include "util/kv_map.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+FragmentedRangeTombstoneList::FragmentedRangeTombstoneList(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots) {
+ if (unfragmented_tombstones == nullptr) {
+ return;
+ }
+ bool is_sorted = true;
+ InternalKey pinned_last_start_key;
+ Slice last_start_key;
+ num_unfragmented_tombstones_ = 0;
+ total_tombstone_payload_bytes_ = 0;
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next(), num_unfragmented_tombstones_++) {
+ total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() +
+ unfragmented_tombstones->value().size();
+ if (num_unfragmented_tombstones_ > 0 &&
+ icmp.Compare(last_start_key, unfragmented_tombstones->key()) > 0) {
+ is_sorted = false;
+ break;
+ }
+ if (unfragmented_tombstones->IsKeyPinned()) {
+ last_start_key = unfragmented_tombstones->key();
+ } else {
+ pinned_last_start_key.DecodeFrom(unfragmented_tombstones->key());
+ last_start_key = pinned_last_start_key.Encode();
+ }
+ }
+ if (is_sorted) {
+ FragmentTombstones(std::move(unfragmented_tombstones), icmp, for_compaction,
+ snapshots);
+ return;
+ }
+
+ // Sort the tombstones before fragmenting them.
+ std::vector<std::string> keys, values;
+ keys.reserve(num_unfragmented_tombstones_);
+ values.reserve(num_unfragmented_tombstones_);
+ // Reset the counter to zero for the next iteration over keys.
+ total_tombstone_payload_bytes_ = 0;
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next()) {
+ total_tombstone_payload_bytes_ += unfragmented_tombstones->key().size() +
+ unfragmented_tombstones->value().size();
+ keys.emplace_back(unfragmented_tombstones->key().data(),
+ unfragmented_tombstones->key().size());
+ values.emplace_back(unfragmented_tombstones->value().data(),
+ unfragmented_tombstones->value().size());
+ }
+ // VectorIterator implicitly sorts by key during construction.
+ auto iter = std::make_unique<VectorIterator>(std::move(keys),
+ std::move(values), &icmp);
+ FragmentTombstones(std::move(iter), icmp, for_compaction, snapshots);
+}
+
+void FragmentedRangeTombstoneList::FragmentTombstones(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots) {
+ Slice cur_start_key(nullptr, 0);
+ auto cmp = ParsedInternalKeyComparator(&icmp);
+
+ // Stores the end keys and sequence numbers of range tombstones with a start
+ // key less than or equal to cur_start_key. Provides an ordering by end key
+ // for use in flush_current_tombstones.
+ std::set<ParsedInternalKey, ParsedInternalKeyComparator> cur_end_keys(cmp);
+
+ size_t ts_sz = icmp.user_comparator()->timestamp_size();
+ // Given the next start key in unfragmented_tombstones,
+ // flush_current_tombstones writes every tombstone fragment that starts
+ // and ends with a key before next_start_key, and starts with a key greater
+ // than or equal to cur_start_key.
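+ // For example (a worked illustration with arbitrary keys), fragmenting the
+ // overlapping tombstones [a, e) @ 10 and [c, g) @ 8 produces the
+ // non-overlapping fragments [a, c) @ {10}, [c, e) @ {10, 8}, and
+ // [e, g) @ {8}.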
+ auto flush_current_tombstones = [&](const Slice& next_start_key) {
+ auto it = cur_end_keys.begin();
+ bool reached_next_start_key = false;
+ for (; it != cur_end_keys.end() && !reached_next_start_key; ++it) {
+ Slice cur_end_key = it->user_key;
+ if (icmp.user_comparator()->CompareWithoutTimestamp(cur_start_key,
+ cur_end_key) == 0) {
+ // Empty tombstone.
+ continue;
+ }
+ if (icmp.user_comparator()->CompareWithoutTimestamp(next_start_key,
+ cur_end_key) <= 0) {
+ // All the end keys in [it, cur_end_keys.end()) are after
+ // next_start_key, so the tombstones they represent can be used in
+ // fragments that start with keys greater than or equal to
+ // next_start_key. However, the end keys we already passed will not be
+ // used in any more tombstone fragments.
+ //
+ // Remove the fully fragmented tombstones and stop iteration after a
+ // final round of flushing to preserve the tombstones we can create more
+ // fragments from.
+ reached_next_start_key = true;
+ cur_end_keys.erase(cur_end_keys.begin(), it);
+ cur_end_key = next_start_key;
+ }
+
+ // Flush a range tombstone fragment [cur_start_key, cur_end_key), which
+ // should not overlap with the last-flushed tombstone fragment.
+ assert(tombstones_.empty() ||
+ icmp.user_comparator()->CompareWithoutTimestamp(
+ tombstones_.back().end_key, cur_start_key) <= 0);
+
+ // Sort the sequence numbers of the tombstones being fragmented in
+ // descending order, and then flush them in that order.
+ autovector<SequenceNumber> seqnums_to_flush;
+ autovector<Slice> timestamps_to_flush;
+ for (auto flush_it = it; flush_it != cur_end_keys.end(); ++flush_it) {
+ seqnums_to_flush.push_back(flush_it->sequence);
+ if (ts_sz) {
+ timestamps_to_flush.push_back(
+ ExtractTimestampFromUserKey(flush_it->user_key, ts_sz));
+ }
+ }
+ // TODO: combine the two sorts to be more efficient
+ std::sort(seqnums_to_flush.begin(), seqnums_to_flush.end(),
+ std::greater<SequenceNumber>());
+ if (ts_sz) {
+ std::sort(timestamps_to_flush.begin(), timestamps_to_flush.end(),
+ [icmp](const Slice& ts1, const Slice& ts2) {
+ return icmp.user_comparator()->CompareTimestamp(ts1, ts2) >
+ 0;
+ });
+ }
+
+ size_t start_idx = tombstone_seqs_.size();
+ size_t end_idx = start_idx + seqnums_to_flush.size();
+
+ // If user-defined timestamp is enabled, we should not drop tombstones
+ // from any snapshot stripe. Garbage collection of range tombstones
+ // happens in CompactionOutputs::AddRangeDels().
+ if (for_compaction && ts_sz == 0) {
+ // Drop all tombstone seqnums that are not preserved by a snapshot.
+ SequenceNumber next_snapshot = kMaxSequenceNumber;
+ for (auto seq : seqnums_to_flush) {
+ if (seq <= next_snapshot) {
+ // This seqnum is visible to a lower snapshot.
+ tombstone_seqs_.push_back(seq);
+ auto upper_bound_it =
+ std::lower_bound(snapshots.begin(), snapshots.end(), seq);
+ if (upper_bound_it == snapshots.begin()) {
+ // This seqnum is the topmost one visible to the earliest
+ // snapshot. None of the seqnums below it will be visible, so we
+ // can skip them.
+ break;
+ }
+ next_snapshot = *std::prev(upper_bound_it);
+ }
+ }
+ end_idx = tombstone_seqs_.size();
+ } else {
+ // The fragmentation is being done for reads, so preserve all seqnums.
+ tombstone_seqs_.insert(tombstone_seqs_.end(), seqnums_to_flush.begin(),
+ seqnums_to_flush.end());
+ if (ts_sz) {
+ tombstone_timestamps_.insert(tombstone_timestamps_.end(),
+ timestamps_to_flush.begin(),
+ timestamps_to_flush.end());
+ }
+ }
+
+ assert(start_idx < end_idx);
+ if (ts_sz) {
+ std::string start_key_with_max_ts;
+ AppendUserKeyWithMaxTimestamp(&start_key_with_max_ts, cur_start_key,
+ ts_sz);
+ pinned_slices_.emplace_back(std::move(start_key_with_max_ts));
+ Slice start_key = pinned_slices_.back();
+
+ std::string end_key_with_max_ts;
+ AppendUserKeyWithMaxTimestamp(&end_key_with_max_ts, cur_end_key, ts_sz);
+ pinned_slices_.emplace_back(std::move(end_key_with_max_ts));
+ Slice end_key = pinned_slices_.back();
+
+ // RangeTombstoneStack expects start_key and end_key to have max
+ // timestamp.
+ tombstones_.emplace_back(start_key, end_key, start_idx, end_idx);
+ } else {
+ tombstones_.emplace_back(cur_start_key, cur_end_key, start_idx,
+ end_idx);
+ }
+
+ cur_start_key = cur_end_key;
+ }
+ if (!reached_next_start_key) {
+ // There is a gap between the last flushed tombstone fragment and
+ // the next tombstone's start key. Remove all the end keys in
+ // the working set, since we have fully fragmented their corresponding
+ // tombstones.
+ cur_end_keys.clear();
+ }
+ cur_start_key = next_start_key;
+ };
+
+ pinned_iters_mgr_.StartPinning();
+
+ bool no_tombstones = true;
+ for (unfragmented_tombstones->SeekToFirst(); unfragmented_tombstones->Valid();
+ unfragmented_tombstones->Next()) {
+ const Slice& ikey = unfragmented_tombstones->key();
+ Slice tombstone_start_key = ExtractUserKey(ikey);
+ SequenceNumber tombstone_seq = GetInternalKeySeqno(ikey);
+ if (!unfragmented_tombstones->IsKeyPinned()) {
+ pinned_slices_.emplace_back(tombstone_start_key.data(),
+ tombstone_start_key.size());
+ tombstone_start_key = pinned_slices_.back();
+ }
+ no_tombstones = false;
+
+ Slice tombstone_end_key = unfragmented_tombstones->value();
+ if (!unfragmented_tombstones->IsValuePinned()) {
+ pinned_slices_.emplace_back(tombstone_end_key.data(),
+ tombstone_end_key.size());
+ tombstone_end_key = pinned_slices_.back();
+ }
+ if (!cur_end_keys.empty() &&
+ icmp.user_comparator()->CompareWithoutTimestamp(
+ cur_start_key, tombstone_start_key) != 0) {
+ // The start key has changed. Flush all tombstones that start before
+ // this new start key.
+ flush_current_tombstones(tombstone_start_key);
+ }
+ cur_start_key = tombstone_start_key;
+
+ cur_end_keys.emplace(tombstone_end_key, tombstone_seq, kTypeRangeDeletion);
+ }
+ if (!cur_end_keys.empty()) {
+ ParsedInternalKey last_end_key = *std::prev(cur_end_keys.end());
+ flush_current_tombstones(last_end_key.user_key);
+ }
+
+ if (!no_tombstones) {
+ pinned_iters_mgr_.PinIterator(unfragmented_tombstones.release(),
+ false /* arena */);
+ }
+}
+
+bool FragmentedRangeTombstoneList::ContainsRange(SequenceNumber lower,
+ SequenceNumber upper) {
+ std::call_once(seq_set_init_once_flag_, [this]() {
+ for (auto s : tombstone_seqs_) {
+ seq_set_.insert(s);
+ }
+ });
+ auto seq_it = seq_set_.lower_bound(lower);
+ return seq_it != seq_set_.end() && *seq_it <= upper;
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+ FragmentedRangeTombstoneList* tombstones, const InternalKeyComparator& icmp,
+ SequenceNumber _upper_bound, const Slice* ts_upper_bound,
+ SequenceNumber _lower_bound)
+ : tombstone_start_cmp_(icmp.user_comparator()),
+ tombstone_end_cmp_(icmp.user_comparator()),
+ icmp_(&icmp),
+ ucmp_(icmp.user_comparator()),
+ tombstones_(tombstones),
+ upper_bound_(_upper_bound),
+ lower_bound_(_lower_bound),
+ ts_upper_bound_(ts_upper_bound) {
+ assert(tombstones_ != nullptr);
+ Invalidate();
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<FragmentedRangeTombstoneList>& tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+ const Slice* ts_upper_bound, SequenceNumber _lower_bound)
+ : tombstone_start_cmp_(icmp.user_comparator()),
+ tombstone_end_cmp_(icmp.user_comparator()),
+ icmp_(&icmp),
+ ucmp_(icmp.user_comparator()),
+ tombstones_ref_(tombstones),
+ tombstones_(tombstones_ref_.get()),
+ upper_bound_(_upper_bound),
+ lower_bound_(_lower_bound),
+ ts_upper_bound_(ts_upper_bound) {
+ assert(tombstones_ != nullptr);
+ Invalidate();
+}
+
+FragmentedRangeTombstoneIterator::FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones_cache,
+ const InternalKeyComparator& icmp, SequenceNumber _upper_bound,
+ const Slice* ts_upper_bound, SequenceNumber _lower_bound)
+ : tombstone_start_cmp_(icmp.user_comparator()),
+ tombstone_end_cmp_(icmp.user_comparator()),
+ icmp_(&icmp),
+ ucmp_(icmp.user_comparator()),
+ tombstones_cache_ref_(tombstones_cache),
+ tombstones_(tombstones_cache_ref_->tombstones.get()),
+ upper_bound_(_upper_bound),
+ lower_bound_(_lower_bound) {
+ assert(tombstones_ != nullptr);
+ if (!ts_upper_bound || ts_upper_bound->empty()) {
+ ts_upper_bound_ = nullptr;
+ } else {
+ ts_upper_bound_ = ts_upper_bound;
+ }
+ Invalidate();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToFirst() {
+ pos_ = tombstones_->begin();
+ seq_pos_ = tombstones_->seq_begin();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopFirst() {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = tombstones_->begin();
+ SetMaxVisibleSeqAndTimestamp();
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToLast() {
+ pos_ = std::prev(tombstones_->end());
+ seq_pos_ = std::prev(tombstones_->seq_end());
+}
+
+void FragmentedRangeTombstoneIterator::SeekToTopLast() {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = std::prev(tombstones_->end());
+ SetMaxVisibleSeqAndTimestamp();
+ ScanBackwardToVisibleTombstone();
+}
+
+// @param `target` is a user key, with timestamp if user-defined timestamp is
+// enabled.
+void FragmentedRangeTombstoneIterator::Seek(const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ SeekToCoveringTombstone(target);
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrev(const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ SeekForPrevToCoveringTombstone(target);
+ ScanBackwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::SeekToCoveringTombstone(
+ const Slice& target) {
+ pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+ tombstone_end_cmp_);
+ if (pos_ == tombstones_->end()) {
+ // All tombstones end before target.
+ seq_pos_ = tombstones_->seq_end();
+ return;
+ }
+ SetMaxVisibleSeqAndTimestamp();
+}
+
+void FragmentedRangeTombstoneIterator::SeekForPrevToCoveringTombstone(
+ const Slice& target) {
+ if (tombstones_->empty()) {
+ Invalidate();
+ return;
+ }
+ pos_ = std::upper_bound(tombstones_->begin(), tombstones_->end(), target,
+ tombstone_start_cmp_);
+ if (pos_ == tombstones_->begin()) {
+ // All tombstones start after target.
+ Invalidate();
+ return;
+ }
+ --pos_;
+ SetMaxVisibleSeqAndTimestamp();
+}
+
+void FragmentedRangeTombstoneIterator::ScanForwardToVisibleTombstone() {
+ while (pos_ != tombstones_->end() &&
+ (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+ *seq_pos_ < lower_bound_)) {
+ ++pos_;
+ if (pos_ == tombstones_->end()) {
+ Invalidate();
+ return;
+ }
+ SetMaxVisibleSeqAndTimestamp();
+ }
+}
+
+void FragmentedRangeTombstoneIterator::ScanBackwardToVisibleTombstone() {
+ while (pos_ != tombstones_->end() &&
+ (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx) ||
+ *seq_pos_ < lower_bound_)) {
+ if (pos_ == tombstones_->begin()) {
+ Invalidate();
+ return;
+ }
+ --pos_;
+ SetMaxVisibleSeqAndTimestamp();
+ }
+}
+
+void FragmentedRangeTombstoneIterator::Next() {
+ ++seq_pos_;
+ if (seq_pos_ == tombstones_->seq_iter(pos_->seq_end_idx)) {
+ ++pos_;
+ }
+}
+
+void FragmentedRangeTombstoneIterator::TopNext() {
+ ++pos_;
+ if (pos_ == tombstones_->end()) {
+ return;
+ }
+ SetMaxVisibleSeqAndTimestamp();
+ ScanForwardToVisibleTombstone();
+}
+
+void FragmentedRangeTombstoneIterator::Prev() {
+ if (seq_pos_ == tombstones_->seq_begin()) {
+ Invalidate();
+ return;
+ }
+ --seq_pos_;
+ if (pos_ == tombstones_->end() ||
+ seq_pos_ == tombstones_->seq_iter(pos_->seq_start_idx - 1)) {
+ --pos_;
+ }
+}
+
+void FragmentedRangeTombstoneIterator::TopPrev() {
+ if (pos_ == tombstones_->begin()) {
+ Invalidate();
+ return;
+ }
+ --pos_;
+ SetMaxVisibleSeqAndTimestamp();
+ ScanBackwardToVisibleTombstone();
+}
+
+bool FragmentedRangeTombstoneIterator::Valid() const {
+ return tombstones_ != nullptr && pos_ != tombstones_->end();
+}
+
+SequenceNumber FragmentedRangeTombstoneIterator::MaxCoveringTombstoneSeqnum(
+ const Slice& target_user_key) {
+ SeekToCoveringTombstone(target_user_key);
+ return ValidPos() && ucmp_->CompareWithoutTimestamp(start_key(),
+ target_user_key) <= 0
+ ? seq()
+ : 0;
+}
+
+std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+FragmentedRangeTombstoneIterator::SplitBySnapshot(
+ const std::vector<SequenceNumber>& snapshots) {
+ std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ splits;
+ SequenceNumber lower = 0;
+ SequenceNumber upper;
+ for (size_t i = 0; i <= snapshots.size(); i++) {
+ if (i >= snapshots.size()) {
+ upper = kMaxSequenceNumber;
+ } else {
+ upper = snapshots[i];
+ }
+ if (tombstones_->ContainsRange(lower, upper)) {
+ splits.emplace(upper,
+ std::make_unique<FragmentedRangeTombstoneIterator>(
+ tombstones_, *icmp_, upper, ts_upper_bound_, lower));
+ }
+ lower = upper + 1;
+ }
+ return splits;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter.h b/src/rocksdb/db/range_tombstone_fragmenter.h
new file mode 100644
index 000000000..df07fa894
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter.h
@@ -0,0 +1,357 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <list>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/pinned_iterators_manager.h"
+#include "rocksdb/status.h"
+#include "table/internal_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+struct FragmentedRangeTombstoneList;
+
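+// A lazily-initialized, shareable cache entry for a fragmented tombstone
+// list. The sketch below illustrates the intended double-checked
+// initialization on its read path; it is only a sketch, and
+// BuildFragmentedList() stands in for whatever code actually builds the list:
+//
+//   if (!cache->initialized.load(std::memory_order_acquire)) {
+//     std::lock_guard<std::mutex> lock(cache->reader_mutex);
+//     if (!cache->tombstones) {
+//       cache->tombstones = BuildFragmentedList();  // hypothetical helper
+//     }
+//     cache->initialized.store(true, std::memory_order_release);
+//   }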
+struct FragmentedRangeTombstoneListCache {
+ // Ensures that only the first reader needs to initialize the tombstone list.
+ std::mutex reader_mutex;
+ std::unique_ptr<FragmentedRangeTombstoneList> tombstones = nullptr;
+ // Readers first check this flag to avoid acquiring reader_mutex once the
+ // list has already been initialized.
+ std::atomic<bool> initialized = false;
+};
+
+struct FragmentedRangeTombstoneList {
+ public:
+ // A compact representation of a "stack" of range tombstone fragments, which
+ // start and end at the same user keys but have different sequence numbers.
+ // The members seq_start_idx and seq_end_idx are intended to be parameters to
+ // seq_iter().
+ // If user-defined timestamp is enabled, `start` and `end` should be user
+ // keys with timestamp, and their timestamps are set to the max timestamp,
+ // which is what parsed_start_key()/parsed_end_key() return. seq_start_idx
+ // and seq_end_idx will also be used as parameters to ts_iter().
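+ // For example, a fragment [c, e) that is covered at seqnos 10 and 8 is
+ // stored as a single RangeTombstoneStack whose [seq_start_idx, seq_end_idx)
+ // range spans the two entries {10, 8} in tombstone_seqs_.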
+ struct RangeTombstoneStack {
+ RangeTombstoneStack(const Slice& start, const Slice& end, size_t start_idx,
+ size_t end_idx)
+ : start_key(start),
+ end_key(end),
+ seq_start_idx(start_idx),
+ seq_end_idx(end_idx) {}
+ Slice start_key;
+ Slice end_key;
+ size_t seq_start_idx;
+ size_t seq_end_idx;
+ };
+ // Assumes unfragmented_tombstones->key() and unfragmented_tombstones->value()
+ // both contain timestamp if enabled.
+ FragmentedRangeTombstoneList(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction = false,
+ const std::vector<SequenceNumber>& snapshots = {});
+
+ std::vector<RangeTombstoneStack>::const_iterator begin() const {
+ return tombstones_.begin();
+ }
+
+ std::vector<RangeTombstoneStack>::const_iterator end() const {
+ return tombstones_.end();
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_iter(size_t idx) const {
+ return std::next(tombstone_seqs_.begin(), idx);
+ }
+
+ std::vector<Slice>::const_iterator ts_iter(size_t idx) const {
+ return std::next(tombstone_timestamps_.begin(), idx);
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_begin() const {
+ return tombstone_seqs_.begin();
+ }
+
+ std::vector<SequenceNumber>::const_iterator seq_end() const {
+ return tombstone_seqs_.end();
+ }
+
+ bool empty() const { return tombstones_.empty(); }
+
+ // Returns true if the stored tombstones contain one with a sequence
+ // number in [lower, upper].
+ // This method is not const because it lazily initializes a set of
+ // sequence numbers (`seq_set_`).
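+ // For example, if the stored fragments carry seqnos {4, 8, 10}, then
+ // ContainsRange(5, 9) is true (seqno 8 qualifies), while ContainsRange(5, 7)
+ // and ContainsRange(11, 12) are false.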
+ bool ContainsRange(SequenceNumber lower, SequenceNumber upper);
+
+ uint64_t num_unfragmented_tombstones() const {
+ return num_unfragmented_tombstones_;
+ }
+
+ uint64_t total_tombstone_payload_bytes() const {
+ return total_tombstone_payload_bytes_;
+ }
+
+ private:
+ // Given an ordered range tombstone iterator unfragmented_tombstones,
+ // "fragment" the tombstones into non-overlapping pieces. Each
+ // "non-overlapping piece" is a RangeTombstoneStack in tombstones_, which
+ // contains start_key, end_key, and indices that point to sequence numbers
+ // (in tombstone_seqs_) and timestamps (in tombstone_timestamps_). If
+ // for_compaction is true, then `snapshots` should be provided. When
+ // for_compaction is true and user-defined timestamp is disabled, range
+ // tombstone fragments that are not visible in any snapshot are dropped: for
+ // each snapshot stripe [lower, upper], only the fragment with the largest
+ // seqno in [lower, upper] is preserved, and all other range tombstones in
+ // that stripe are dropped.
+ void FragmentTombstones(
+ std::unique_ptr<InternalIterator> unfragmented_tombstones,
+ const InternalKeyComparator& icmp, bool for_compaction,
+ const std::vector<SequenceNumber>& snapshots);
+
+ std::vector<RangeTombstoneStack> tombstones_;
+ std::vector<SequenceNumber> tombstone_seqs_;
+ std::vector<Slice> tombstone_timestamps_;
+ std::once_flag seq_set_init_once_flag_;
+ std::set<SequenceNumber> seq_set_;
+ std::list<std::string> pinned_slices_;
+ PinnedIteratorsManager pinned_iters_mgr_;
+ uint64_t num_unfragmented_tombstones_;
+ uint64_t total_tombstone_payload_bytes_;
+};
+
+// FragmentedRangeTombstoneIterator converts an InternalIterator of a range-del
+// meta block into an iterator over non-overlapping tombstone fragments. The
+// tombstone fragmentation process should be more efficient than the range
+// tombstone collapsing algorithm in RangeDelAggregator because this leverages
+// the internal key ordering already provided by the input iterator, if
+// applicable (when the iterator is unsorted, a new sorted iterator is created
+// before proceeding). If there are few overlaps, creating a
+// FragmentedRangeTombstoneIterator should be O(n), while the RangeDelAggregator
+// tombstone collapsing is always O(n log n).
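+//
+// A minimal construction sketch, mirroring the unit tests (the variable names
+// here are illustrative only):
+//
+//   FragmentedRangeTombstoneList fragments(std::move(range_del_iter), icmp);
+//   FragmentedRangeTombstoneIterator iter(&fragments, icmp,
+//                                         kMaxSequenceNumber /* upper_bound */);
+//   for (iter.SeekToFirst(); iter.Valid(); iter.Next()) {
+//     // Each position is one fragment: iter.start_key(), iter.end_key(),
+//     // iter.seq().
+//   }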
+class FragmentedRangeTombstoneIterator : public InternalIterator {
+ public:
+ FragmentedRangeTombstoneIterator(FragmentedRangeTombstoneList* tombstones,
+ const InternalKeyComparator& icmp,
+ SequenceNumber upper_bound,
+ const Slice* ts_upper_bound = nullptr,
+ SequenceNumber lower_bound = 0);
+ FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<FragmentedRangeTombstoneList>& tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+ const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
+ FragmentedRangeTombstoneIterator(
+ const std::shared_ptr<FragmentedRangeTombstoneListCache>& tombstones,
+ const InternalKeyComparator& icmp, SequenceNumber upper_bound,
+ const Slice* ts_upper_bound = nullptr, SequenceNumber lower_bound = 0);
+
+ void SeekToFirst() override;
+ void SeekToLast() override;
+
+ void SeekToTopFirst();
+ void SeekToTopLast();
+
+ // NOTE: Seek and SeekForPrev do not behave in the way InternalIterator
+ // seeking should behave. This is OK because they are not currently used, but
+ // eventually FragmentedRangeTombstoneIterator should no longer implement
+ // InternalIterator.
+ //
+ // Seeks to the range tombstone that covers target at a seqnum in the
+ // snapshot. If no such tombstone exists, seek to the earliest tombstone in
+ // the snapshot that ends after target.
+ void Seek(const Slice& target) override;
+ // Seeks to the range tombstone that covers target at a seqnum in the
+ // snapshot. If no such tombstone exists, seek to the latest tombstone in the
+ // snapshot that starts before target.
+ void SeekForPrev(const Slice& target) override;
+
+ void Next() override;
+ void Prev() override;
+
+ void TopNext();
+ void TopPrev();
+
+ bool Valid() const override;
+ // Note that key() and value() do not return the correct timestamp.
+ // Callers should call timestamp() to get the current timestamp.
+ Slice key() const override {
+ MaybePinKey();
+ return current_start_key_.Encode();
+ }
+ Slice value() const override { return pos_->end_key; }
+ bool IsKeyPinned() const override { return false; }
+ bool IsValuePinned() const override { return true; }
+ Status status() const override { return Status::OK(); }
+
+ bool empty() const { return tombstones_->empty(); }
+ void Invalidate() {
+ pos_ = tombstones_->end();
+ seq_pos_ = tombstones_->seq_end();
+ pinned_pos_ = tombstones_->end();
+ pinned_seq_pos_ = tombstones_->seq_end();
+ }
+
+ RangeTombstone Tombstone() const {
+ assert(Valid());
+ if (icmp_->user_comparator()->timestamp_size()) {
+ return RangeTombstone(start_key(), end_key(), seq(), timestamp());
+ }
+ return RangeTombstone(start_key(), end_key(), seq());
+ }
+ // Note that start_key() and end_key() are not guaranteed to have the
+ // correct timestamp. Callers can call timestamp() to get the correct
+ // timestamp.
+ Slice start_key() const { return pos_->start_key; }
+ Slice end_key() const { return pos_->end_key; }
+ SequenceNumber seq() const { return *seq_pos_; }
+ Slice timestamp() const {
+ // seqno and timestamp are stored in the same order.
+ return *tombstones_->ts_iter(seq_pos_ - tombstones_->seq_begin());
+ }
+ // Current use case is by CompactionRangeDelAggregator to set
+ // full_history_ts_low_.
+ void SetTimestampUpperBound(const Slice* ts_upper_bound) {
+ ts_upper_bound_ = ts_upper_bound;
+ }
+
+ ParsedInternalKey parsed_start_key() const {
+ return ParsedInternalKey(pos_->start_key, kMaxSequenceNumber,
+ kTypeRangeDeletion);
+ }
+ ParsedInternalKey parsed_end_key() const {
+ return ParsedInternalKey(pos_->end_key, kMaxSequenceNumber,
+ kTypeRangeDeletion);
+ }
+
+ // Return the max sequence number of a range tombstone that covers
+ // the given user key.
+ // If there is no covering tombstone, then 0 is returned.
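+ // For example, given fragments [a, b)@10 and [c, d)@5 (as in the
+ // NonOverlappingTombstones unit test), MaxCoveringTombstoneSeqnum("a")
+ // returns 10 and MaxCoveringTombstoneSeqnum("b") returns 0.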
+ SequenceNumber MaxCoveringTombstoneSeqnum(const Slice& user_key);
+
+ // Splits the iterator into n+1 iterators (where n is the number of
+ // snapshots), each providing a view over a "stripe" of sequence numbers. The
+ // iterators are keyed by the upper bound of their ranges (the provided
+ // snapshots + kMaxSequenceNumber).
+ //
+ // NOTE: the iterators in the returned map are no longer valid if their
+ // parent iterator is deleted, since they do not modify the refcount of the
+ // underlying tombstone list. Therefore, this map should be deleted before
+ // the parent iterator.
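+ //
+ // For example (mirroring the IteratorSplitWithSnapshots unit test), splitting
+ // with snapshots {3, 5, 7, 9} yields five iterators keyed 3, 5, 7, 9 and
+ // kMaxSequenceNumber, covering the seqnum stripes [0, 3], [4, 5], [6, 7],
+ // [8, 9] and [10, kMaxSequenceNumber] respectively.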
+ std::map<SequenceNumber, std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ SplitBySnapshot(const std::vector<SequenceNumber>& snapshots);
+
+ SequenceNumber upper_bound() const { return upper_bound_; }
+ SequenceNumber lower_bound() const { return lower_bound_; }
+
+ uint64_t num_unfragmented_tombstones() const {
+ return tombstones_->num_unfragmented_tombstones();
+ }
+ uint64_t total_tombstone_payload_bytes() const {
+ return tombstones_->total_tombstone_payload_bytes();
+ }
+
+ private:
+ using RangeTombstoneStack = FragmentedRangeTombstoneList::RangeTombstoneStack;
+
+ struct RangeTombstoneStackStartComparator {
+ explicit RangeTombstoneStackStartComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstoneStack& a,
+ const RangeTombstoneStack& b) const {
+ return cmp->CompareWithoutTimestamp(a.start_key, b.start_key) < 0;
+ }
+
+ bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+ return cmp->CompareWithoutTimestamp(a.start_key, b) < 0;
+ }
+
+ bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+ return cmp->CompareWithoutTimestamp(a, b.start_key) < 0;
+ }
+
+ const Comparator* cmp;
+ };
+
+ struct RangeTombstoneStackEndComparator {
+ explicit RangeTombstoneStackEndComparator(const Comparator* c) : cmp(c) {}
+
+ bool operator()(const RangeTombstoneStack& a,
+ const RangeTombstoneStack& b) const {
+ return cmp->CompareWithoutTimestamp(a.end_key, b.end_key) < 0;
+ }
+
+ bool operator()(const RangeTombstoneStack& a, const Slice& b) const {
+ return cmp->CompareWithoutTimestamp(a.end_key, b) < 0;
+ }
+
+ bool operator()(const Slice& a, const RangeTombstoneStack& b) const {
+ return cmp->CompareWithoutTimestamp(a, b.end_key) < 0;
+ }
+
+ const Comparator* cmp;
+ };
+
+ void MaybePinKey() const {
+ if (pos_ != tombstones_->end() && seq_pos_ != tombstones_->seq_end() &&
+ (pinned_pos_ != pos_ || pinned_seq_pos_ != seq_pos_)) {
+ current_start_key_.Set(pos_->start_key, *seq_pos_, kTypeRangeDeletion);
+ pinned_pos_ = pos_;
+ pinned_seq_pos_ = seq_pos_;
+ }
+ }
+
+ void SeekToCoveringTombstone(const Slice& key);
+ void SeekForPrevToCoveringTombstone(const Slice& key);
+ void ScanForwardToVisibleTombstone();
+ void ScanBackwardToVisibleTombstone();
+ bool ValidPos() const {
+ return Valid() && seq_pos_ != tombstones_->seq_iter(pos_->seq_end_idx);
+ }
+
+ const RangeTombstoneStackStartComparator tombstone_start_cmp_;
+ const RangeTombstoneStackEndComparator tombstone_end_cmp_;
+ const InternalKeyComparator* icmp_;
+ const Comparator* ucmp_;
+ std::shared_ptr<FragmentedRangeTombstoneList> tombstones_ref_;
+ std::shared_ptr<FragmentedRangeTombstoneListCache> tombstones_cache_ref_;
+ FragmentedRangeTombstoneList* tombstones_;
+ SequenceNumber upper_bound_;
+ SequenceNumber lower_bound_;
+ // Only consider timestamps <= ts_upper_bound_.
+ const Slice* ts_upper_bound_;
+ std::vector<RangeTombstoneStack>::const_iterator pos_;
+ std::vector<SequenceNumber>::const_iterator seq_pos_;
+ mutable std::vector<RangeTombstoneStack>::const_iterator pinned_pos_;
+ mutable std::vector<SequenceNumber>::const_iterator pinned_seq_pos_;
+ mutable InternalKey current_start_key_;
+
+ // Check the current RangeTombstoneStack `pos_` against timestamp
+ // upper bound `ts_upper_bound_` and sequence number upper bound
+ // `upper_bound_`. Update the sequence number (and timestamp) pointer
+ // `seq_pos_` to the first valid position satisfying both bounds.
+ void SetMaxVisibleSeqAndTimestamp() {
+ seq_pos_ = std::lower_bound(tombstones_->seq_iter(pos_->seq_start_idx),
+ tombstones_->seq_iter(pos_->seq_end_idx),
+ upper_bound_, std::greater<SequenceNumber>());
+ if (ts_upper_bound_ && !ts_upper_bound_->empty()) {
+ auto ts_pos = std::lower_bound(
+ tombstones_->ts_iter(pos_->seq_start_idx),
+ tombstones_->ts_iter(pos_->seq_end_idx), *ts_upper_bound_,
+ [this](const Slice& s1, const Slice& s2) {
+ return ucmp_->CompareTimestamp(s1, s2) > 0;
+ });
+ auto ts_idx = ts_pos - tombstones_->ts_iter(pos_->seq_start_idx);
+ auto seq_idx = seq_pos_ - tombstones_->seq_iter(pos_->seq_start_idx);
+ if (seq_idx < ts_idx) {
+ // seq and ts are stored in non-increasing order. Only advance seq_pos_
+ // to a larger index, i.e. to a smaller sequence number and timestamp.
+ seq_pos_ = tombstones_->seq_iter(pos_->seq_start_idx + ts_idx);
+ }
+ }
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/range_tombstone_fragmenter_test.cc b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
new file mode 100644
index 000000000..46b3c99b5
--- /dev/null
+++ b/src/rocksdb/db/range_tombstone_fragmenter_test.cc
@@ -0,0 +1,555 @@
+// Copyright (c) 2018-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/range_tombstone_fragmenter.h"
+
+#include "db/db_test_util.h"
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "test_util/testutil.h"
+#include "util/vector_iterator.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class RangeTombstoneFragmenterTest : public testing::Test {};
+
+namespace {
+
+static auto bytewise_icmp = InternalKeyComparator(BytewiseComparator());
+
+std::unique_ptr<InternalIterator> MakeRangeDelIter(
+ const std::vector<RangeTombstone>& range_dels) {
+ std::vector<std::string> keys, values;
+ for (const auto& range_del : range_dels) {
+ auto key_and_value = range_del.Serialize();
+ keys.push_back(key_and_value.first.Encode().ToString());
+ values.push_back(key_and_value.second.ToString());
+ }
+ return std::unique_ptr<VectorIterator>(
+ new VectorIterator(keys, values, &bytewise_icmp));
+}
+
+void CheckIterPosition(const RangeTombstone& tombstone,
+ const FragmentedRangeTombstoneIterator* iter) {
+ // Test InternalIterator interface.
+ EXPECT_EQ(tombstone.start_key_, ExtractUserKey(iter->key()));
+ EXPECT_EQ(tombstone.end_key_, iter->value());
+ EXPECT_EQ(tombstone.seq_, iter->seq());
+
+ // Test FragmentedRangeTombstoneIterator interface.
+ EXPECT_EQ(tombstone.start_key_, iter->start_key());
+ EXPECT_EQ(tombstone.end_key_, iter->end_key());
+ EXPECT_EQ(tombstone.seq_, GetInternalKeySeqno(iter->key()));
+}
+
+void VerifyFragmentedRangeDels(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->Next()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+void VerifyVisibleTombstones(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<RangeTombstone>& expected_tombstones) {
+ iter->SeekToTopFirst();
+ for (size_t i = 0; i < expected_tombstones.size(); i++, iter->TopNext()) {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(expected_tombstones[i], iter);
+ }
+ EXPECT_FALSE(iter->Valid());
+}
+
+struct SeekTestCase {
+ Slice seek_target;
+ RangeTombstone expected_position;
+ bool out_of_range;
+};
+
+void VerifySeek(FragmentedRangeTombstoneIterator* iter,
+ const std::vector<SeekTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ iter->Seek(testcase.seek_target);
+ if (testcase.out_of_range) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(testcase.expected_position, iter);
+ }
+ }
+}
+
+void VerifySeekForPrev(FragmentedRangeTombstoneIterator* iter,
+ const std::vector<SeekTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ iter->SeekForPrev(testcase.seek_target);
+ if (testcase.out_of_range) {
+ ASSERT_FALSE(iter->Valid());
+ } else {
+ ASSERT_TRUE(iter->Valid());
+ CheckIterPosition(testcase.expected_position, iter);
+ }
+ }
+}
+
+struct MaxCoveringTombstoneSeqnumTestCase {
+ Slice user_key;
+ SequenceNumber result;
+};
+
+void VerifyMaxCoveringTombstoneSeqnum(
+ FragmentedRangeTombstoneIterator* iter,
+ const std::vector<MaxCoveringTombstoneSeqnumTestCase>& cases) {
+ for (const auto& testcase : cases) {
+ EXPECT_EQ(testcase.result,
+ iter->MaxCoveringTombstoneSeqnum(testcase.user_key));
+ }
+}
+
+} // anonymous namespace
+
+TEST_F(RangeTombstoneFragmenterTest, NonOverlappingTombstones) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "b", 10}, {"c", "d", 5}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "b", 10}, {"c", "d", 5}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"", 0}, {"a", 10}, {"b", 0}, {"c", 5}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlappingTombstones) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10}, {"c", "g", 15}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(
+ &iter, {{"a", "c", 10}, {"c", "e", 15}, {"c", "e", 10}, {"e", "g", 15}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 15}, {"e", 15}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, ContiguousTombstones) {
+ auto range_del_iter = MakeRangeDelIter(
+ {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(
+ &iter, {{"a", "c", 10}, {"c", "e", 20}, {"c", "e", 5}, {"e", "g", 15}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 20}, {"e", 15}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartAndEndKey) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter,
+ {{"a", "c", 10}, {"a", "c", 7}, {"a", "c", 3}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter, {{"a", 10}, {"b", 10}, {"c", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyDifferentEndKeys) {
+ auto range_del_iter =
+ MakeRangeDelIter({{"a", "e", 10}, {"a", "g", 7}, {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"a", "c", 7},
+ {"a", "c", 3},
+ {"c", "e", 10},
+ {"c", "e", 7},
+ {"e", "g", 7}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 10}, {"c", 10}, {"e", 7}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, RepeatedStartKeyMixedEndKeys) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "c", 30},
+ {"a", "g", 20},
+ {"a", "e", 10},
+ {"a", "g", 7},
+ {"a", "c", 3}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 30},
+ {"a", "c", 20},
+ {"a", "c", 10},
+ {"a", "c", 7},
+ {"a", "c", 3},
+ {"c", "e", 20},
+ {"c", "e", 10},
+ {"c", "e", 7},
+ {"e", "g", 20},
+ {"e", "g", 7}});
+ VerifyMaxCoveringTombstoneSeqnum(&iter,
+ {{"a", 30}, {"c", 20}, {"e", 20}, {"g", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKey) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 9 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter3(&fragment_list, bytewise_icmp,
+ 7 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter4(&fragment_list, bytewise_icmp,
+ 5 /* upper_bound */);
+ FragmentedRangeTombstoneIterator iter5(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ for (auto* iter : {&iter1, &iter2, &iter3, &iter4, &iter5}) {
+ VerifyFragmentedRangeDels(iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"c", "e", 6},
+ {"e", "g", 8},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"j", "l", 2},
+ {"l", "n", 4}});
+ }
+
+ ASSERT_EQ(0, iter1.lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, iter1.upper_bound());
+ VerifyVisibleTombstones(&iter1, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter1, {{"a", 10}, {"c", 10}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter2.lower_bound());
+ ASSERT_EQ(9, iter2.upper_bound());
+ VerifyVisibleTombstones(&iter2, {{"c", "e", 8},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter2, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter3.lower_bound());
+ ASSERT_EQ(7, iter3.upper_bound());
+ VerifyVisibleTombstones(&iter3, {{"c", "e", 6},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter3, {{"a", 0}, {"c", 6}, {"e", 6}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter4.lower_bound());
+ ASSERT_EQ(5, iter4.upper_bound());
+ VerifyVisibleTombstones(&iter4, {{"j", "l", 4}, {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter4, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 4}, {"m", 4}});
+
+ ASSERT_EQ(0, iter5.lower_bound());
+ ASSERT_EQ(3, iter5.upper_bound());
+ VerifyVisibleTombstones(&iter5, {{"j", "l", 2}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter5, {{"a", 0}, {"c", 0}, {"e", 0}, {"i", 0}, {"j", 2}, {"m", 0}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyUnordered) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ 9 /* upper_bound */);
+ ASSERT_EQ(0, iter.lower_bound());
+ ASSERT_EQ(9, iter.upper_bound());
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"c", "e", 6},
+ {"e", "g", 8},
+ {"e", "g", 6},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"j", "l", 2},
+ {"l", "n", 4}});
+ VerifyMaxCoveringTombstoneSeqnum(
+ &iter, {{"a", 0}, {"c", 8}, {"e", 8}, {"i", 0}, {"j", 4}, {"m", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, OverlapAndRepeatedStartKeyForCompaction) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(
+ std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
+ {} /* snapshots */);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest,
+ OverlapAndRepeatedStartKeyForCompactionWithSnapshot) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(
+ std::move(range_del_iter), bytewise_icmp, true /* for_compaction */,
+ {20, 9} /* snapshots */);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+ VerifyFragmentedRangeDels(&iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"c", "e", 8},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, IteratorSplitNoSnapshots) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ auto split_iters = iter.SplitBySnapshot({} /* snapshots */);
+ ASSERT_EQ(1, split_iters.size());
+
+ auto* split_iter = split_iters[kMaxSequenceNumber].get();
+ ASSERT_EQ(0, split_iter->lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, split_iter->upper_bound());
+ VerifyVisibleTombstones(split_iter, {{"a", "c", 10},
+ {"c", "e", 10},
+ {"e", "g", 8},
+ {"g", "i", 6},
+ {"j", "l", 4},
+ {"l", "n", 4}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, IteratorSplitWithSnapshots) {
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"j", "n", 4},
+ {"c", "i", 6},
+ {"c", "g", 8},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ auto split_iters = iter.SplitBySnapshot({3, 5, 7, 9} /* snapshots */);
+ ASSERT_EQ(5, split_iters.size());
+
+ auto* split_iter1 = split_iters[3].get();
+ ASSERT_EQ(0, split_iter1->lower_bound());
+ ASSERT_EQ(3, split_iter1->upper_bound());
+ VerifyVisibleTombstones(split_iter1, {{"j", "l", 2}});
+
+ auto* split_iter2 = split_iters[5].get();
+ ASSERT_EQ(4, split_iter2->lower_bound());
+ ASSERT_EQ(5, split_iter2->upper_bound());
+ VerifyVisibleTombstones(split_iter2, {{"j", "l", 4}, {"l", "n", 4}});
+
+ auto* split_iter3 = split_iters[7].get();
+ ASSERT_EQ(6, split_iter3->lower_bound());
+ ASSERT_EQ(7, split_iter3->upper_bound());
+ VerifyVisibleTombstones(split_iter3,
+ {{"c", "e", 6}, {"e", "g", 6}, {"g", "i", 6}});
+
+ auto* split_iter4 = split_iters[9].get();
+ ASSERT_EQ(8, split_iter4->lower_bound());
+ ASSERT_EQ(9, split_iter4->upper_bound());
+ VerifyVisibleTombstones(split_iter4, {{"c", "e", 8}, {"e", "g", 8}});
+
+ auto* split_iter5 = split_iters[kMaxSequenceNumber].get();
+ ASSERT_EQ(10, split_iter5->lower_bound());
+ ASSERT_EQ(kMaxSequenceNumber, split_iter5->upper_bound());
+ VerifyVisibleTombstones(split_iter5, {{"a", "c", 10}, {"c", "e", 10}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekStartKey) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(
+ &iter1,
+ {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
+ VerifySeekForPrev(
+ &iter1,
+ {{"a", {"a", "c", 10}}, {"e", {"e", "g", 8}}, {"l", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"a", {"j", "l", 2}},
+ {"e", {"j", "l", 2}},
+ {"l", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"a", {}, true /* out of range */},
+ {"e", {}, true /* out of range */},
+ {"l", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekCovered) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(
+ &iter1,
+ {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
+ VerifySeekForPrev(
+ &iter1,
+ {{"b", {"a", "c", 10}}, {"f", {"e", "g", 8}}, {"m", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"b", {"j", "l", 2}},
+ {"f", {"j", "l", 2}},
+ {"m", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"b", {}, true /* out of range */},
+ {"f", {}, true /* out of range */},
+ {"m", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekEndKey) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter1(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(&iter1, {{"c", {"c", "e", 10}},
+ {"g", {"g", "i", 6}},
+ {"i", {"j", "l", 4}},
+ {"n", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter1, {{"c", {"c", "e", 10}},
+ {"g", {"g", "i", 6}},
+ {"i", {"g", "i", 6}},
+ {"n", {"l", "n", 4}}});
+
+ FragmentedRangeTombstoneIterator iter2(&fragment_list, bytewise_icmp,
+ 3 /* upper_bound */);
+ VerifySeek(&iter2, {{"c", {"j", "l", 2}},
+ {"g", {"j", "l", 2}},
+ {"i", {"j", "l", 2}},
+ {"n", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter2, {{"c", {}, true /* out of range */},
+ {"g", {}, true /* out of range */},
+ {"i", {}, true /* out of range */},
+ {"n", {"j", "l", 2}}});
+}
+
+TEST_F(RangeTombstoneFragmenterTest, SeekOutOfBounds) {
+ // Same tombstones as OverlapAndRepeatedStartKey.
+ auto range_del_iter = MakeRangeDelIter({{"a", "e", 10},
+ {"c", "g", 8},
+ {"c", "i", 6},
+ {"j", "n", 4},
+ {"j", "l", 2}});
+
+ FragmentedRangeTombstoneList fragment_list(std::move(range_del_iter),
+ bytewise_icmp);
+
+ FragmentedRangeTombstoneIterator iter(&fragment_list, bytewise_icmp,
+ kMaxSequenceNumber);
+ VerifySeek(&iter, {{"", {"a", "c", 10}}, {"z", {}, true /* out of range */}});
+ VerifySeekForPrev(&iter,
+ {{"", {}, true /* out of range */}, {"z", {"l", "n", 4}}});
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/read_callback.h b/src/rocksdb/db/read_callback.h
new file mode 100644
index 000000000..c042352db
--- /dev/null
+++ b/src/rocksdb/db/read_callback.h
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "db/dbformat.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
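+// Callback used during reads to decide whether a particular sequence number
+// is visible to the reader. A minimal illustrative subclass is sketched below
+// (hypothetical; real implementations, e.g. for write-prepared transactions,
+// apply their own visibility rules in IsVisibleFullCheck):
+//
+//   class SnapshotReadCallback : public ReadCallback {
+//    public:
+//     explicit SnapshotReadCallback(SequenceNumber snapshot_seq)
+//         : ReadCallback(snapshot_seq) {}
+//     // Every seqno at or below the snapshot is visible.
+//     bool IsVisibleFullCheck(SequenceNumber seq) override {
+//       return seq <= max_visible_seq();
+//     }
+//   };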
+class ReadCallback {
+ public:
+ explicit ReadCallback(SequenceNumber last_visible_seq)
+ : max_visible_seq_(last_visible_seq) {}
+ ReadCallback(SequenceNumber last_visible_seq, SequenceNumber min_uncommitted)
+ : max_visible_seq_(last_visible_seq), min_uncommitted_(min_uncommitted) {}
+
+ virtual ~ReadCallback() {}
+
+ // Will be called to see if the seq number is visible; if not, the caller
+ // moves on to the next seq number.
+ virtual bool IsVisibleFullCheck(SequenceNumber seq) = 0;
+
+ inline bool IsVisible(SequenceNumber seq) {
+ assert(min_uncommitted_ > 0);
+ assert(min_uncommitted_ >= kMinUnCommittedSeq);
+ if (seq < min_uncommitted_) { // handles seq == 0 as well
+ assert(seq <= max_visible_seq_);
+ return true;
+ } else if (max_visible_seq_ < seq) {
+ assert(seq != 0);
+ return false;
+ } else {
+ assert(seq != 0); // already handled in the first if-then clause
+ return IsVisibleFullCheck(seq);
+ }
+ }
+
+ inline SequenceNumber max_visible_seq() { return max_visible_seq_; }
+
+ // Refresh to a more recent visible seq
+ virtual void Refresh(SequenceNumber seq) { max_visible_seq_ = seq; }
+
+ protected:
+ // The max visible seq; it is usually the snapshot seq but could be larger
+ // if the transaction has its own writes written to the db.
+ SequenceNumber max_visible_seq_ = kMaxSequenceNumber;
+ // Any seq less than min_uncommitted_ is committed.
+ const SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/repair.cc b/src/rocksdb/db/repair.cc
new file mode 100644
index 000000000..1829a79f2
--- /dev/null
+++ b/src/rocksdb/db/repair.cc
@@ -0,0 +1,771 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Repairer does best-effort recovery to salvage as much data as possible after
+// a disaster without compromising consistency. It does not guarantee bringing
+// the database to a time-consistent state.
+//
+// The repair process is broken into 4 phases:
+// (a) Find files
+// (b) Convert logs to tables
+// (c) Extract metadata
+// (d) Write Descriptor
+//
+// (a) Find files
+//
+// The repairer goes through all the files in the directory, and classifies them
+// based on their file name. Any file that cannot be identified by name will be
+// ignored.
+//
+// (b) Convert logs to tables
+//
+// Every active log file is replayed. All sections of the file where the
+// checksum does not match are skipped over. We intentionally give preference
+// to data consistency.
+//
+// (c) Extract metadata
+//
+// We scan every table to compute
+// (1) smallest/largest for the table
+// (2) largest sequence number in the table
+// (3) oldest blob file referred to by the table (if applicable)
+//
+// If we are unable to scan the file, then we ignore the table.
+//
+// (d) Write Descriptor
+//
+// We generate descriptor contents:
+// - log number is set to zero
+// - next-file-number is set to 1 + largest file number we found
+// - last-sequence-number is set to the largest sequence# found across
+// all tables (see item (2) under phase (c))
+// - compaction pointers are cleared
+// - every table file is added at level 0
+//
+// Possible optimization 1:
+// (a) Compute total size and use to pick appropriate max-level M
+// (b) Sort tables by largest sequence# in the table
+// (c) For each table: if it overlaps earlier table, place in level-0,
+// else place in level-M.
+// (d) We can provide options for time consistent recovery and unsafe recovery
+// (ignore checksum failure when applicable)
+// Possible optimization 2:
+// Store per-table metadata (smallest, largest, largest-seq#, ...)
+// in the table's meta section to speed up ScanTable.
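+//
+// Illustrative usage (the db path here is hypothetical):
+//
+//   Options options;
+//   Status s = RepairDB("/path/to/db", options);
+//   if (s.ok()) {
+//     DB* db = nullptr;
+//     s = DB::Open(options, "/path/to/db", &db);
+//     // ... use db; data that could not be recovered may be missing, and the
+//     // original damaged/unreferenced files are archived under "lost/".
+//     delete db;
+//   }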
+
+#ifndef ROCKSDB_LITE
+
+#include <cinttypes>
+
+#include "db/builder.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/table_cache.h"
+#include "db/version_edit.h"
+#include "db/write_batch_internal.h"
+#include "file/filename.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "options/cf_options.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "table/unique_id_impl.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+class Repairer {
+ public:
+ Repairer(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const ColumnFamilyOptions& default_cf_opts,
+ const ColumnFamilyOptions& unknown_cf_opts, bool create_unknown_cfs)
+ : dbname_(dbname),
+ db_session_id_(DBImpl::GenerateDbSessionId(db_options.env)),
+ env_(db_options.env),
+ file_options_(),
+ db_options_(SanitizeOptions(dbname_, db_options)),
+ immutable_db_options_(ImmutableDBOptions(db_options_)),
+ icmp_(default_cf_opts.comparator),
+ default_cf_opts_(
+ SanitizeOptions(immutable_db_options_, default_cf_opts)),
+ default_iopts_(
+ ImmutableOptions(immutable_db_options_, default_cf_opts_)),
+ unknown_cf_opts_(
+ SanitizeOptions(immutable_db_options_, unknown_cf_opts)),
+ create_unknown_cfs_(create_unknown_cfs),
+ raw_table_cache_(
+ // TableCache can be small since we expect each table to be opened
+ // once.
+ NewLRUCache(10, db_options_.table_cache_numshardbits)),
+ table_cache_(new TableCache(default_iopts_, &file_options_,
+ raw_table_cache_.get(),
+ /*block_cache_tracer=*/nullptr,
+ /*io_tracer=*/nullptr, db_session_id_)),
+ wb_(db_options_.db_write_buffer_size),
+ wc_(db_options_.delayed_write_rate),
+ vset_(dbname_, &immutable_db_options_, file_options_,
+ raw_table_cache_.get(), &wb_, &wc_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id=*/"", db_session_id_),
+ next_file_number_(1),
+ db_lock_(nullptr),
+ closed_(false) {
+ for (const auto& cfd : column_families) {
+ cf_name_to_opts_[cfd.name] = cfd.options;
+ }
+ }
+
+ const ColumnFamilyOptions* GetColumnFamilyOptions(
+ const std::string& cf_name) {
+ if (cf_name_to_opts_.find(cf_name) == cf_name_to_opts_.end()) {
+ if (create_unknown_cfs_) {
+ return &unknown_cf_opts_;
+ }
+ return nullptr;
+ }
+ return &cf_name_to_opts_[cf_name];
+ }
+
+ // Adds a column family to the VersionSet using the options registered for
+ // cf_name and updates the manifest.
+ Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) {
+ const auto* cf_opts = GetColumnFamilyOptions(cf_name);
+ if (cf_opts == nullptr) {
+ return Status::Corruption("Encountered unknown column family with name=" +
+ cf_name + ", id=" + std::to_string(cf_id));
+ }
+ Options opts(db_options_, *cf_opts);
+ MutableCFOptions mut_cf_opts(opts);
+
+ VersionEdit edit;
+ edit.SetComparatorName(opts.comparator->Name());
+ edit.SetLogNumber(0);
+ edit.SetColumnFamily(cf_id);
+ ColumnFamilyData* cfd;
+ cfd = nullptr;
+ edit.AddColumnFamily(cf_name);
+
+ mutex_.Lock();
+ std::unique_ptr<FSDirectory> db_dir;
+ Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(),
+ &db_dir, nullptr);
+ if (status.ok()) {
+ status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_, db_dir.get(),
+ false /* new_descriptor_log */, cf_opts);
+ }
+ mutex_.Unlock();
+ return status;
+ }
+
+ Status Close() {
+ Status s = Status::OK();
+ if (!closed_) {
+ if (db_lock_ != nullptr) {
+ s = env_->UnlockFile(db_lock_);
+ db_lock_ = nullptr;
+ }
+ closed_ = true;
+ }
+ return s;
+ }
+
+ ~Repairer() { Close().PermitUncheckedError(); }
+
+ Status Run() {
+ Status status = env_->LockFile(LockFileName(dbname_), &db_lock_);
+ if (!status.ok()) {
+ return status;
+ }
+ status = FindFiles();
+ DBImpl* db_impl = nullptr;
+ if (status.ok()) {
+ // Discard older manifests and start a fresh one
+ for (size_t i = 0; i < manifests_.size(); i++) {
+ ArchiveFile(dbname_ + "/" + manifests_[i]);
+ }
+ // Just create a DBImpl temporarily so we can reuse NewDB()
+ db_impl = new DBImpl(db_options_, dbname_);
+ status = db_impl->NewDB(/*new_filenames=*/nullptr);
+ }
+ delete db_impl;
+
+ if (status.ok()) {
+ // Recover using the fresh manifest created by NewDB()
+ status =
+ vset_.Recover({{kDefaultColumnFamilyName, default_cf_opts_}}, false);
+ }
+ if (status.ok()) {
+ // Need to scan existing SST files first so the column families are
+ // created before we process WAL files
+ ExtractMetaData();
+
+ // ExtractMetaData() uses table_fds_ to know which SST files' metadata to
+ // extract -- we need to clear it here since metadata for existing SST
+ // files has been extracted already
+ table_fds_.clear();
+ ConvertLogFilesToTables();
+ ExtractMetaData();
+ status = AddTables();
+ }
+ if (status.ok()) {
+ uint64_t bytes = 0;
+ for (size_t i = 0; i < tables_.size(); i++) {
+ bytes += tables_[i].meta.fd.GetFileSize();
+ }
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "**** Repaired rocksdb %s; "
+ "recovered %" ROCKSDB_PRIszt " files; %" PRIu64
+ " bytes. "
+ "Some data may have been lost. "
+ "****",
+ dbname_.c_str(), tables_.size(), bytes);
+ }
+ return status;
+ }
+
+ private:
+ struct TableInfo {
+ FileMetaData meta;
+ uint32_t column_family_id;
+ std::string column_family_name;
+ };
+
+ std::string const dbname_;
+ std::string db_session_id_;
+ Env* const env_;
+ const FileOptions file_options_;
+ const DBOptions db_options_;
+ const ImmutableDBOptions immutable_db_options_;
+ const InternalKeyComparator icmp_;
+ const ColumnFamilyOptions default_cf_opts_;
+ const ImmutableOptions default_iopts_; // table_cache_ holds reference
+ const ColumnFamilyOptions unknown_cf_opts_;
+ const bool create_unknown_cfs_;
+ std::shared_ptr<Cache> raw_table_cache_;
+ std::unique_ptr<TableCache> table_cache_;
+ WriteBufferManager wb_;
+ WriteController wc_;
+ VersionSet vset_;
+ std::unordered_map<std::string, ColumnFamilyOptions> cf_name_to_opts_;
+ InstrumentedMutex mutex_;
+
+ std::vector<std::string> manifests_;
+ std::vector<FileDescriptor> table_fds_;
+ std::vector<uint64_t> logs_;
+ std::vector<TableInfo> tables_;
+ uint64_t next_file_number_;
+ // Lock over the persistent DB state. Non-nullptr iff successfully
+ // acquired.
+ FileLock* db_lock_;
+ bool closed_;
+
+ Status FindFiles() {
+ std::vector<std::string> filenames;
+ bool found_file = false;
+ std::vector<std::string> to_search_paths;
+
+ for (size_t path_id = 0; path_id < db_options_.db_paths.size(); path_id++) {
+ to_search_paths.push_back(db_options_.db_paths[path_id].path);
+ }
+
+ // Also search wal_dir if the user configured a custom wal_dir.
+ bool same = immutable_db_options_.IsWalDirSameAsDBPath(dbname_);
+ if (!same) {
+ to_search_paths.push_back(immutable_db_options_.wal_dir);
+ }
+
+ for (size_t path_id = 0; path_id < to_search_paths.size(); path_id++) {
+ ROCKS_LOG_INFO(db_options_.info_log, "Searching path %s\n",
+ to_search_paths[path_id].c_str());
+ Status status = env_->GetChildren(to_search_paths[path_id], &filenames);
+ if (!status.ok()) {
+ return status;
+ }
+ if (!filenames.empty()) {
+ found_file = true;
+ }
+
+ uint64_t number;
+ FileType type;
+ for (size_t i = 0; i < filenames.size(); i++) {
+ if (ParseFileName(filenames[i], &number, &type)) {
+ if (type == kDescriptorFile) {
+ manifests_.push_back(filenames[i]);
+ } else {
+ if (number + 1 > next_file_number_) {
+ next_file_number_ = number + 1;
+ }
+ if (type == kWalFile) {
+ logs_.push_back(number);
+ } else if (type == kTableFile) {
+ table_fds_.emplace_back(number, static_cast<uint32_t>(path_id),
+ 0);
+ } else {
+ // Ignore other files
+ }
+ }
+ }
+ }
+ }
+ if (!found_file) {
+ return Status::Corruption(dbname_, "repair found no files");
+ }
+ return Status::OK();
+ }
+
+ void ConvertLogFilesToTables() {
+ const auto& wal_dir = immutable_db_options_.GetWalDir();
+ for (size_t i = 0; i < logs_.size(); i++) {
+ // We should use LogFileName(wal_dir, logs_[i]) here; the user might have
+ // set the wal_dir option.
+ std::string logname = LogFileName(wal_dir, logs_[i]);
+ Status status = ConvertLogToTable(wal_dir, logs_[i]);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Log #%" PRIu64 ": ignoring conversion error: %s",
+ logs_[i], status.ToString().c_str());
+ }
+ ArchiveFile(logname);
+ }
+ }
+
+ Status ConvertLogToTable(const std::string& wal_dir, uint64_t log) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ std::shared_ptr<Logger> info_log;
+ uint64_t lognum;
+ void Corruption(size_t bytes, const Status& s) override {
+ // We print error messages for corruption, but continue repairing.
+ ROCKS_LOG_ERROR(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s",
+ lognum, static_cast<int>(bytes), s.ToString().c_str());
+ }
+ };
+
+ // Open the log file
+ std::string logname = LogFileName(wal_dir, log);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<SequentialFileReader> lfile_reader;
+ Status status = SequentialFileReader::Create(
+ fs, logname, fs->OptimizeForLogRead(file_options_), &lfile_reader,
+ nullptr /* dbg */, nullptr /* rate limiter */);
+ if (!status.ok()) {
+ return status;
+ }
+
+ // Create the log reader.
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = db_options_.info_log;
+ reporter.lognum = log;
+ // We intentionally make log::Reader do checksumming so that
+ // corruptions cause entire commits to be skipped instead of
+ // propagating bad information (like overly large sequence
+ // numbers).
+ log::Reader reader(db_options_.info_log, std::move(lfile_reader), &reporter,
+ true /*enable checksum*/, log);
+
+ // Initialize per-column family memtables
+ for (auto* cfd : *vset_.GetColumnFamilySet()) {
+ cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
+ kMaxSequenceNumber);
+ }
+ auto cf_mems = new ColumnFamilyMemTablesImpl(vset_.GetColumnFamilySet());
+
+ // Read all the records and add to a memtable
+ std::string scratch;
+ Slice record;
+ WriteBatch batch;
+ int counter = 0;
+ while (reader.ReadRecord(&record, &scratch)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small"));
+ continue;
+ }
+ Status record_status = WriteBatchInternal::SetContents(&batch, record);
+ if (record_status.ok()) {
+ record_status =
+ WriteBatchInternal::InsertInto(&batch, cf_mems, nullptr, nullptr);
+ }
+ if (record_status.ok()) {
+ counter += WriteBatchInternal::Count(&batch);
+ } else {
+ ROCKS_LOG_WARN(db_options_.info_log, "Log #%" PRIu64 ": ignoring %s",
+ log, record_status.ToString().c_str());
+ }
+ }
+
+ // Dump a table for each column family with entries in this log file.
+ for (auto* cfd : *vset_.GetColumnFamilySet()) {
+ // Do not record a version edit for this conversion to a Table
+ // since ExtractMetaData() will also generate edits.
+ MemTable* mem = cfd->mem();
+ if (mem->IsEmpty()) {
+ continue;
+ }
+
+ FileMetaData meta;
+ meta.fd = FileDescriptor(next_file_number_++, 0, 0);
+ ReadOptions ro;
+ ro.total_order_seek = true;
+ Arena arena;
+ ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
+ int64_t _current_time = 0;
+ immutable_db_options_.clock->GetCurrentTime(&_current_time)
+ .PermitUncheckedError(); // ignore error
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ meta.file_creation_time = current_time;
+ SnapshotChecker* snapshot_checker = DisableGCSnapshotChecker::Instance();
+
+ auto write_hint = cfd->CalculateSSTWriteHint(0);
+ std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
+ range_del_iters;
+ auto range_del_iter = mem->NewRangeTombstoneIterator(
+ ro, kMaxSequenceNumber, false /* immutable_memtable */);
+ if (range_del_iter != nullptr) {
+ range_del_iters.emplace_back(range_del_iter);
+ }
+
+ IOStatus io_s;
+ CompressionOptions default_compression;
+ TableBuilderOptions tboptions(
+ *cfd->ioptions(), *cfd->GetLatestMutableCFOptions(),
+ cfd->internal_comparator(), cfd->int_tbl_prop_collector_factories(),
+ kNoCompression, default_compression, cfd->GetID(), cfd->GetName(),
+ -1 /* level */, false /* is_bottommost */,
+ TableFileCreationReason::kRecovery, 0 /* oldest_key_time */,
+ 0 /* file_creation_time */, "DB Repairer" /* db_id */, db_session_id_,
+ 0 /*target_file_size*/, meta.fd.GetNumber());
+
+ SeqnoToTimeMapping empty_seqno_time_mapping;
+ status = BuildTable(
+ dbname_, /* versions */ nullptr, immutable_db_options_, tboptions,
+ file_options_, table_cache_.get(), iter.get(),
+ std::move(range_del_iters), &meta, nullptr /* blob_file_additions */,
+ {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker,
+ false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s,
+ nullptr /*IOTracer*/, BlobFileCreationReason::kRecovery,
+ empty_seqno_time_mapping, nullptr /* event_logger */, 0 /* job_id */,
+ Env::IO_HIGH, nullptr /* table_properties */, write_hint);
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s",
+ log, counter, meta.fd.GetNumber(),
+ status.ToString().c_str());
+ if (status.ok()) {
+ if (meta.fd.GetFileSize() > 0) {
+ table_fds_.push_back(meta.fd);
+ }
+ } else {
+ break;
+ }
+ }
+ delete cf_mems;
+ return status;
+ }
+
+ void ExtractMetaData() {
+ for (size_t i = 0; i < table_fds_.size(); i++) {
+ TableInfo t;
+ t.meta.fd = table_fds_[i];
+ Status status = ScanTable(&t);
+ if (!status.ok()) {
+ std::string fname = TableFileName(
+ db_options_.db_paths, t.meta.fd.GetNumber(), t.meta.fd.GetPathId());
+ char file_num_buf[kFormatFileNumberBufSize];
+ FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(),
+ file_num_buf, sizeof(file_num_buf));
+ ROCKS_LOG_WARN(db_options_.info_log, "Table #%s: ignoring %s",
+ file_num_buf, status.ToString().c_str());
+ ArchiveFile(fname);
+ } else {
+ tables_.push_back(t);
+ }
+ }
+ }
+
+ Status ScanTable(TableInfo* t) {
+ std::string fname = TableFileName(
+ db_options_.db_paths, t->meta.fd.GetNumber(), t->meta.fd.GetPathId());
+ int counter = 0;
+ uint64_t file_size;
+ Status status = env_->GetFileSize(fname, &file_size);
+ t->meta.fd = FileDescriptor(t->meta.fd.GetNumber(), t->meta.fd.GetPathId(),
+ file_size);
+ std::shared_ptr<const TableProperties> props;
+ if (status.ok()) {
+ status = table_cache_->GetTableProperties(file_options_, icmp_, t->meta,
+ &props);
+ }
+ if (status.ok()) {
+ auto s =
+ GetSstInternalUniqueId(props->db_id, props->db_session_id,
+ props->orig_file_number, &t->meta.unique_id);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Table #%" PRIu64
+ ": unable to get unique id, default to Unknown.",
+ t->meta.fd.GetNumber());
+ }
+ t->column_family_id = static_cast<uint32_t>(props->column_family_id);
+ if (t->column_family_id ==
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily) {
+ ROCKS_LOG_WARN(
+ db_options_.info_log,
+ "Table #%" PRIu64
+ ": column family unknown (probably due to legacy format); "
+ "adding to default column family id 0.",
+ t->meta.fd.GetNumber());
+ t->column_family_id = 0;
+ }
+
+ if (vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id) ==
+ nullptr) {
+ status =
+ AddColumnFamily(props->column_family_name, t->column_family_id);
+ }
+ t->meta.oldest_ancester_time = props->creation_time;
+ }
+ ColumnFamilyData* cfd = nullptr;
+ if (status.ok()) {
+ cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id);
+ if (cfd->GetName() != props->column_family_name) {
+ ROCKS_LOG_ERROR(
+ db_options_.info_log,
+ "Table #%" PRIu64
+ ": inconsistent column family name '%s'; expected '%s' for column "
+ "family id %" PRIu32 ".",
+ t->meta.fd.GetNumber(), props->column_family_name.c_str(),
+ cfd->GetName().c_str(), t->column_family_id);
+ status = Status::Corruption(dbname_, "inconsistent column family name");
+ }
+ }
+ if (status.ok()) {
+ ReadOptions ropts;
+ ropts.total_order_seek = true;
+ InternalIterator* iter = table_cache_->NewIterator(
+ ropts, file_options_, cfd->internal_comparator(), t->meta,
+ nullptr /* range_del_agg */,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor,
+ /*table_reader_ptr=*/nullptr, /*file_read_hist=*/nullptr,
+ TableReaderCaller::kRepair, /*arena=*/nullptr, /*skip_filters=*/false,
+ /*level=*/-1, /*max_file_size_for_l0_meta_pin=*/0,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr,
+ /*allow_unprepared_value=*/false);
+ ParsedInternalKey parsed;
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ Slice key = iter->key();
+ Status pik_status =
+ ParseInternalKey(key, &parsed, db_options_.allow_data_in_errors);
+ if (!pik_status.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Table #%" PRIu64 ": unparsable key - %s",
+ t->meta.fd.GetNumber(), pik_status.getState());
+ continue;
+ }
+
+ counter++;
+
+ status = t->meta.UpdateBoundaries(key, iter->value(), parsed.sequence,
+ parsed.type);
+ if (!status.ok()) {
+ break;
+ }
+ }
+ if (status.ok() && !iter->status().ok()) {
+ status = iter->status();
+ }
+ delete iter;
+
+ ROCKS_LOG_INFO(db_options_.info_log, "Table #%" PRIu64 ": %d entries %s",
+ t->meta.fd.GetNumber(), counter,
+ status.ToString().c_str());
+ }
+ if (status.ok()) {
+ // XXX/FIXME: This is just basic, naive handling of range tombstones,
+ // like the call to UpdateBoundariesForRange in builder.cc where we assume
+ // an SST file is a full sorted run. This probably needs the extra logic
+ // from compaction_job.cc around the call to UpdateBoundariesForRange (to
+ // handle range tombstones extending beyond the range of other entries).
+ ReadOptions ropts;
+ std::unique_ptr<FragmentedRangeTombstoneIterator> r_iter;
+ status = table_cache_->GetRangeTombstoneIterator(
+ ropts, cfd->internal_comparator(), t->meta, &r_iter);
+
+ if (r_iter) {
+ r_iter->SeekToFirst();
+
+ while (r_iter->Valid()) {
+ auto tombstone = r_iter->Tombstone();
+ auto kv = tombstone.Serialize();
+ t->meta.UpdateBoundariesForRange(
+ kv.first, tombstone.SerializeEndKey(), tombstone.seq_,
+ cfd->internal_comparator());
+ r_iter->Next();
+ }
+ }
+ }
+ return status;
+ }
+
+ Status AddTables() {
+ std::unordered_map<uint32_t, std::vector<const TableInfo*>> cf_id_to_tables;
+ SequenceNumber max_sequence = 0;
+ for (size_t i = 0; i < tables_.size(); i++) {
+ cf_id_to_tables[tables_[i].column_family_id].push_back(&tables_[i]);
+ if (max_sequence < tables_[i].meta.fd.largest_seqno) {
+ max_sequence = tables_[i].meta.fd.largest_seqno;
+ }
+ }
+ vset_.SetLastAllocatedSequence(max_sequence);
+ vset_.SetLastPublishedSequence(max_sequence);
+ vset_.SetLastSequence(max_sequence);
+
+ for (const auto& cf_id_and_tables : cf_id_to_tables) {
+ auto* cfd =
+ vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first);
+ VersionEdit edit;
+ edit.SetComparatorName(cfd->user_comparator()->Name());
+ edit.SetLogNumber(0);
+ edit.SetNextFile(next_file_number_);
+ edit.SetColumnFamily(cfd->GetID());
+
+ // TODO(opt): separate out into multiple levels
+ for (const auto* table : cf_id_and_tables.second) {
+ edit.AddFile(
+ 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(),
+ table->meta.fd.GetFileSize(), table->meta.smallest,
+ table->meta.largest, table->meta.fd.smallest_seqno,
+ table->meta.fd.largest_seqno, table->meta.marked_for_compaction,
+ table->meta.temperature, table->meta.oldest_blob_file_number,
+ table->meta.oldest_ancester_time, table->meta.file_creation_time,
+ table->meta.file_checksum, table->meta.file_checksum_func_name,
+ table->meta.unique_id);
+ }
+ assert(next_file_number_ > 0);
+ vset_.MarkFileNumberUsed(next_file_number_ - 1);
+ mutex_.Lock();
+ std::unique_ptr<FSDirectory> db_dir;
+ Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(),
+ &db_dir, nullptr);
+ if (status.ok()) {
+ status = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ &edit, &mutex_, db_dir.get(),
+ false /* new_descriptor_log */);
+ }
+ mutex_.Unlock();
+ if (!status.ok()) {
+ return status;
+ }
+ }
+ return Status::OK();
+ }
+
+ void ArchiveFile(const std::string& fname) {
+ // Move into another directory. E.g., for
+ // dir/foo
+ // rename to
+ // dir/lost/foo
+ const char* slash = strrchr(fname.c_str(), '/');
+ std::string new_dir;
+ if (slash != nullptr) {
+ new_dir.assign(fname.data(), slash - fname.data());
+ }
+ new_dir.append("/lost");
+ env_->CreateDir(new_dir).PermitUncheckedError(); // Ignore error
+ std::string new_file = new_dir;
+ new_file.append("/");
+ new_file.append((slash == nullptr) ? fname.c_str() : slash + 1);
+ Status s = env_->RenameFile(fname, new_file);
+ ROCKS_LOG_INFO(db_options_.info_log, "Archiving %s: %s\n", fname.c_str(),
+ s.ToString().c_str());
+ }
+};
+
+Status GetDefaultCFOptions(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ ColumnFamilyOptions* res) {
+ assert(res != nullptr);
+ auto iter = std::find_if(column_families.begin(), column_families.end(),
+ [](const ColumnFamilyDescriptor& cfd) {
+ return cfd.name == kDefaultColumnFamilyName;
+ });
+ if (iter == column_families.end()) {
+ return Status::InvalidArgument(
+ "column_families", "Must contain entry for default column family");
+ }
+ *res = iter->options;
+ return Status::OK();
+}
+} // anonymous namespace
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families) {
+ ColumnFamilyOptions default_cf_opts;
+ Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+ if (!status.ok()) {
+ return status;
+ }
+
+ Repairer repairer(dbname, db_options, column_families, default_cf_opts,
+ ColumnFamilyOptions() /* unknown_cf_opts */,
+ false /* create_unknown_cfs */);
+ status = repairer.Run();
+ if (status.ok()) {
+ status = repairer.Close();
+ }
+ return status;
+}
+
+Status RepairDB(const std::string& dbname, const DBOptions& db_options,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ const ColumnFamilyOptions& unknown_cf_opts) {
+ ColumnFamilyOptions default_cf_opts;
+ Status status = GetDefaultCFOptions(column_families, &default_cf_opts);
+ if (!status.ok()) {
+ return status;
+ }
+
+ Repairer repairer(dbname, db_options, column_families, default_cf_opts,
+ unknown_cf_opts, true /* create_unknown_cfs */);
+ status = repairer.Run();
+ if (status.ok()) {
+ status = repairer.Close();
+ }
+ return status;
+}
+
+Status RepairDB(const std::string& dbname, const Options& options) {
+ Options opts(options);
+ DBOptions db_options(opts);
+ ColumnFamilyOptions cf_options(opts);
+
+ Repairer repairer(dbname, db_options, {}, cf_options /* default_cf_opts */,
+ cf_options /* unknown_cf_opts */,
+ true /* create_unknown_cfs */);
+ Status status = repairer.Run();
+ if (status.ok()) {
+ status = repairer.Close();
+ }
+ return status;
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/repair_test.cc b/src/rocksdb/db/repair_test.cc
new file mode 100644
index 000000000..644a9270d
--- /dev/null
+++ b/src/rocksdb/db/repair_test.cc
@@ -0,0 +1,442 @@
+// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/options.h"
+#ifndef ROCKSDB_LITE
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "file/file_util.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/transaction_log.h"
+#include "table/unique_id_impl.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+class RepairTest : public DBTestBase {
+ public:
+ RepairTest() : DBTestBase("repair_test", /*env_do_fsync=*/true) {}
+
+ Status GetFirstSstPath(std::string* first_sst_path) {
+ assert(first_sst_path != nullptr);
+ first_sst_path->clear();
+ uint64_t manifest_size;
+ std::vector<std::string> files;
+ Status s = db_->GetLiveFiles(files, &manifest_size);
+ if (s.ok()) {
+ auto sst_iter =
+ std::find_if(files.begin(), files.end(), [](const std::string& file) {
+ uint64_t number;
+ FileType type;
+ bool ok = ParseFileName(file, &number, &type);
+ return ok && type == kTableFile;
+ });
+ *first_sst_path = sst_iter == files.end() ? "" : dbname_ + *sst_iter;
+ }
+ return s;
+ }
+
+ void ReopenWithSstIdVerify() {
+ std::atomic_int verify_passed{0};
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlockBasedTable::Open::PassedVerifyUniqueId", [&](void* arg) {
+ // Count tables whose unique id was successfully read and verified.
+ auto id = static_cast<UniqueId64x2*>(arg);
+ assert(*id != kNullUniqueId64x2);
+ verify_passed++;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ auto options = CurrentOptions();
+ options.verify_sst_unique_id_in_manifest = true;
+ Reopen(options);
+
+ ASSERT_GT(verify_passed, 0);
+ SyncPoint::GetInstance()->DisableProcessing();
+ }
+};
+
+TEST_F(RepairTest, LostManifest) {
+ // Add a couple SST files, delete the manifest, and verify RepairDB() saves
+ // the day.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ ReopenWithSstIdVerify();
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, LostManifestMoreDbFeatures) {
+ // Add a couple SST files, delete the manifest, and verify RepairDB() saves
+ // the day.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Put("key3", "val3"));
+ ASSERT_OK(Put("key4", "val4"));
+ ASSERT_OK(Flush());
+ // Test an SST file containing only a range tombstone
+ ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "key2",
+ "key3z"));
+ ASSERT_OK(Flush());
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ // Repair from SST files should work with unique_id verification.
+ ReopenWithSstIdVerify();
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "NOT_FOUND");
+ ASSERT_EQ(Get("key3"), "NOT_FOUND");
+ ASSERT_EQ(Get("key4"), "val4");
+}
+
+TEST_F(RepairTest, CorruptManifest) {
+ // Manifest is in an invalid format. Expect a full recovery.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ // Need to get path before Close() deletes db_, but overwrite it after Close()
+ // to ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+
+ ASSERT_OK(CreateFile(env_->GetFileSystem(), manifest_path, "blah",
+ false /* use_fsync */));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ ReopenWithSstIdVerify();
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, IncompleteManifest) {
+ // In this case, the manifest is valid but does not reference all of the SST
+ // files. Expect a full recovery.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ std::string orig_manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+ CopyFile(orig_manifest_path, orig_manifest_path + ".tmp");
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ // Need to get path before Close() deletes db_, but overwrite it after Close()
+ // to ensure Close() didn't change the manifest.
+ std::string new_manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(new_manifest_path));
+ // Replace the manifest with one that is only aware of the first SST file.
+ CopyFile(orig_manifest_path + ".tmp", new_manifest_path);
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ ReopenWithSstIdVerify();
+
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("key2"), "val2");
+}
+
+TEST_F(RepairTest, PostRepairSstFileNumbering) {
+ // Verify after a DB is repaired, new files will be assigned higher numbers
+ // than old files.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ uint64_t pre_repair_file_num = dbfull()->TEST_Current_Next_FileNo();
+ Close();
+
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ ReopenWithSstIdVerify();
+
+ uint64_t post_repair_file_num = dbfull()->TEST_Current_Next_FileNo();
+ ASSERT_GE(post_repair_file_num, pre_repair_file_num);
+}
+
+TEST_F(RepairTest, LostSst) {
+ // Delete one of the SST files but preserve the manifest that refers to it,
+ // then verify the DB is still usable for the intact SST.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ std::string sst_path;
+ ASSERT_OK(GetFirstSstPath(&sst_path));
+ ASSERT_FALSE(sst_path.empty());
+ ASSERT_OK(env_->DeleteFile(sst_path));
+
+ Close();
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ ReopenWithSstIdVerify();
+
+ // Exactly one of the key-value pairs should be in the DB now.
+ ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
+}
+
+TEST_F(RepairTest, CorruptSst) {
+ // Corrupt one of the SST files but preserve the manifest that refers to it,
+ // then verify the DB is still usable for the intact SST.
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Put("key2", "val2"));
+ ASSERT_OK(Flush());
+ std::string sst_path;
+ ASSERT_OK(GetFirstSstPath(&sst_path));
+ ASSERT_FALSE(sst_path.empty());
+
+ ASSERT_OK(CreateFile(env_->GetFileSystem(), sst_path, "blah",
+ false /* use_fsync */));
+
+ Close();
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ ReopenWithSstIdVerify();
+
+ // Exactly one of the key-value pairs should be in the DB now.
+ ASSERT_TRUE((Get("key") == "val") != (Get("key2") == "val2"));
+}
+
+TEST_F(RepairTest, UnflushedSst) {
+ // This test case invokes repair while some data is unflushed, then verifies
+ // that data is in the db.
+ ASSERT_OK(Put("key", "val"));
+ VectorLogPtr wal_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 1);
+ {
+ uint64_t total_ssts_size;
+ std::unordered_map<std::string, uint64_t> sst_files;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+ ASSERT_EQ(total_ssts_size, 0);
+ }
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() didn't change the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+ ReopenWithSstIdVerify();
+
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 0);
+ {
+ uint64_t total_ssts_size;
+ std::unordered_map<std::string, uint64_t> sst_files;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+ ASSERT_GT(total_ssts_size, 0);
+ }
+ ASSERT_EQ(Get("key"), "val");
+}
+
+TEST_F(RepairTest, SeparateWalDir) {
+ do {
+ Options options = CurrentOptions();
+ DestroyAndReopen(options);
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Put("foo", "bar"));
+ VectorLogPtr wal_files;
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 1);
+ {
+ uint64_t total_ssts_size;
+ std::unordered_map<std::string, uint64_t> sst_files;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+ ASSERT_EQ(total_ssts_size, 0);
+ }
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+ ASSERT_OK(RepairDB(dbname_, options));
+
+ // make sure that all WALs are converted to SSTables.
+ options.wal_dir = "";
+
+ ReopenWithSstIdVerify();
+ ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files));
+ ASSERT_EQ(wal_files.size(), 0);
+ {
+ uint64_t total_ssts_size;
+ std::unordered_map<std::string, uint64_t> sst_files;
+ ASSERT_OK(GetAllDataFiles(kTableFile, &sst_files, &total_ssts_size));
+ ASSERT_GT(total_ssts_size, 0);
+ }
+ ASSERT_EQ(Get("key"), "val");
+ ASSERT_EQ(Get("foo"), "bar");
+
+ } while (ChangeWalOptions());
+}
+
+TEST_F(RepairTest, RepairMultipleColumnFamilies) {
+ // Verify repair logic associates SST files with their original column
+ // families.
+ const int kNumCfs = 3;
+ const int kEntriesPerCf = 2;
+ DestroyAndReopen(CurrentOptions());
+ CreateAndReopenWithCF({"pikachu1", "pikachu2"}, CurrentOptions());
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_OK(Put(i, "key" + std::to_string(j), "val" + std::to_string(j)));
+ if (j == kEntriesPerCf - 1 && i == kNumCfs - 1) {
+ // Leave one unflushed so we can verify WAL entries are properly
+ // associated with column families.
+ continue;
+ }
+ ASSERT_OK(Flush(i));
+ }
+ }
+
+ // Need to get path before Close() deletes db_, but delete it after Close() to
+ // ensure Close() doesn't re-create the manifest.
+ std::string manifest_path =
+ DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo());
+ Close();
+ ASSERT_OK(env_->FileExists(manifest_path));
+ ASSERT_OK(env_->DeleteFile(manifest_path));
+
+ ASSERT_OK(RepairDB(dbname_, CurrentOptions()));
+
+ ReopenWithColumnFamilies({"default", "pikachu1", "pikachu2"},
+ CurrentOptions());
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+ }
+ }
+}
+
+TEST_F(RepairTest, RepairColumnFamilyOptions) {
+ // Verify repair logic uses correct ColumnFamilyOptions when repairing a
+ // database with different options for column families.
+ const int kNumCfs = 2;
+ const int kEntriesPerCf = 2;
+
+ Options opts(CurrentOptions()), rev_opts(CurrentOptions());
+ opts.comparator = BytewiseComparator();
+ rev_opts.comparator = ReverseBytewiseComparator();
+
+ DestroyAndReopen(opts);
+ CreateColumnFamilies({"reverse"}, rev_opts);
+ ReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts});
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_OK(Put(i, "key" + std::to_string(j), "val" + std::to_string(j)));
+ if (i == kNumCfs - 1 && j == kEntriesPerCf - 1) {
+ // Leave one unflushed so we can verify RepairDB's flush logic
+ continue;
+ }
+ ASSERT_OK(Flush(i));
+ }
+ }
+ Close();
+
+ // RepairDB() records the comparator in the manifest, and DB::Open would fail
+ // if a different comparator were used.
+ ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}, {"reverse", rev_opts}},
+ opts /* unknown_cf_opts */));
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts}));
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+ }
+ }
+
+ // Examine table properties to verify RepairDB() used the right options when
+ // converting WAL->SST
+ TablePropertiesCollection fname_to_props;
+ ASSERT_OK(db_->GetPropertiesOfAllTables(handles_[1], &fname_to_props));
+ ASSERT_EQ(fname_to_props.size(), 2U);
+ for (const auto& fname_and_props : fname_to_props) {
+ std::string comparator_name(rev_opts.comparator->Name());
+ ASSERT_EQ(comparator_name, fname_and_props.second->comparator_name);
+ }
+ Close();
+
+ // Also check comparator when it's provided via "unknown" CF options
+ ASSERT_OK(RepairDB(dbname_, opts, {{"default", opts}},
+ rev_opts /* unknown_cf_opts */));
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "reverse"},
+ std::vector<Options>{opts, rev_opts}));
+ for (int i = 0; i < kNumCfs; ++i) {
+ for (int j = 0; j < kEntriesPerCf; ++j) {
+ ASSERT_EQ(Get(i, "key" + std::to_string(j)), "val" + std::to_string(j));
+ }
+ }
+}
+
+TEST_F(RepairTest, DbNameContainsTrailingSlash) {
+ {
+ bool tmp;
+ if (env_->AreFilesSame("", "", &tmp).IsNotSupported()) {
+ fprintf(stderr,
+ "skipping RepairTest.DbNameContainsTrailingSlash due to "
+ "unsupported Env::AreFilesSame\n");
+ return;
+ }
+ }
+
+ ASSERT_OK(Put("key", "val"));
+ ASSERT_OK(Flush());
+ Close();
+
+ ASSERT_OK(RepairDB(dbname_ + "/", CurrentOptions()));
+ ReopenWithSstIdVerify();
+ ASSERT_EQ(Get("key"), "val");
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as RepairDB is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/seqno_time_test.cc b/src/rocksdb/db/seqno_time_test.cc
new file mode 100644
index 000000000..12394a368
--- /dev/null
+++ b/src/rocksdb/db/seqno_time_test.cc
@@ -0,0 +1,996 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "db/periodic_task_scheduler.h"
+#include "db/seqno_to_time_mapping.h"
+#include "port/stack_trace.h"
+#include "rocksdb/iostats_context.h"
+#include "rocksdb/utilities/debug.h"
+#include "test_util/mock_time_env.h"
+
+#ifndef ROCKSDB_LITE
+
+namespace ROCKSDB_NAMESPACE {
+
+class SeqnoTimeTest : public DBTestBase {
+ public:
+ SeqnoTimeTest() : DBTestBase("seqno_time_test", /*env_do_fsync=*/false) {
+ mock_clock_ = std::make_shared<MockSystemClock>(env_->GetSystemClock());
+ mock_env_ = std::make_unique<CompositeEnvWrapper>(env_, mock_clock_);
+ }
+
+ protected:
+ std::unique_ptr<Env> mock_env_;
+ std::shared_ptr<MockSystemClock> mock_clock_;
+
+ void SetUp() override {
+ mock_clock_->InstallTimedWaitFixCallback();
+ SyncPoint::GetInstance()->SetCallBack(
+ "DBImpl::StartPeriodicTaskScheduler:Init", [&](void* arg) {
+ auto periodic_task_scheduler_ptr =
+ reinterpret_cast<PeriodicTaskScheduler*>(arg);
+ periodic_task_scheduler_ptr->TEST_OverrideTimer(mock_clock_.get());
+ });
+ }
+
+ // make sure the file is not in cache, otherwise it won't have IO info
+ void AssertKeyTemperature(int key_id, Temperature expected_temperature) {
+ get_iostats_context()->Reset();
+ IOStatsContext* iostats = get_iostats_context();
+ std::string result = Get(Key(key_id));
+ ASSERT_FALSE(result.empty());
+ ASSERT_GT(iostats->bytes_read, 0);
+ switch (expected_temperature) {
+ case Temperature::kUnknown:
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count,
+ 0);
+ ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
+ 0);
+ break;
+ case Temperature::kCold:
+ ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_read_count,
+ 0);
+ ASSERT_GT(iostats->file_io_stats_by_temperature.cold_file_bytes_read,
+ 0);
+ break;
+ default:
+        // The test only supports kCold for the bottommost temperature for now
+ FAIL();
+ }
+ }
+};
+
+TEST_F(SeqnoTimeTest, TemperatureBasicUniversal) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+ const int kKeyPerSec = 10;
+
+ Options options = CurrentOptions();
+ options.compaction_style = kCompactionStyleUniversal;
+ options.preclude_last_level_data_seconds = 10000;
+ options.env = mock_env_.get();
+ options.bottommost_temperature = Temperature::kCold;
+ options.num_levels = kNumLevels;
+ DestroyAndReopen(options);
+
+  // Pass some time first, otherwise the write times of the first few keys are
+  // going to be zero, and internally zero has a special meaning:
+  // kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec)); });
+
+ int sst_num = 0;
+  // Write overlapping files, enough to trigger compaction
+ for (; sst_num < kNumTrigger; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+
+ // All data is hot, only output to penultimate level
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // read a random key, which should be hot (kUnknown)
+ AssertKeyTemperature(20, Temperature::kUnknown);
+
+  // Write more data, but it is still all hot until the 10th SST: we write a
+  // key every 10 seconds and 100 keys per SST, so each SST spans 1000 seconds,
+  // while preclude_last_level_data_seconds is 10k.
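+  // (Illustrative arithmetic: by the end of this loop roughly 8 SSTs * 1000
+  // seconds = 8000 seconds have passed since the first write, still within the
+  // 10k window, so nothing has turned cold yet.)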
+ for (; sst_num < kNumTrigger * 2; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+ }
+
+ // Now we have both hot data and cold data
+ for (; sst_num < kNumTrigger * 3; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(kKeyPerSec));
+ });
+ }
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->WaitForCompact(true));
+ }
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+ uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_GT(hot_data_size, 0);
+ ASSERT_GT(cold_data_size, 0);
+  // The first few keys should be cold
+ AssertKeyTemperature(20, Temperature::kCold);
+
+ for (int i = 0; i < 30; i++) {
+ dbfull()->TEST_WaitForPeridicTaskRun([&] {
+ mock_clock_->MockSleepForSeconds(static_cast<int>(20 * kKeyPerSec));
+ });
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+    // The hot/cold cut-off should fall between key i * 20 + 200 and 250
+ AssertKeyTemperature(i * 20 + 250, Temperature::kUnknown);
+ AssertKeyTemperature(i * 20 + 200, Temperature::kCold);
+ }
+
+ ASSERT_LT(GetSstSizeHelper(Temperature::kUnknown), hot_data_size);
+ ASSERT_GT(GetSstSizeHelper(Temperature::kCold), cold_data_size);
+
+  // Wait again; most of the data should be cold after that. It may not all be
+  // cold, because if no new data is written to the SSTs, the compaction will
+  // not get new seqno->time samples to decide the age of the last few entries.
+ for (int i = 0; i < 5; i++) {
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+
+ // any random data close to the end should be cold
+ AssertKeyTemperature(1000, Temperature::kCold);
+
+  // Close explicitly, because the env is a local variable which would be
+  // released first.
+ Close();
+}
+
+TEST_F(SeqnoTimeTest, TemperatureBasicLevel) {
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+
+ Options options = CurrentOptions();
+ options.preclude_last_level_data_seconds = 10000;
+ options.env = mock_env_.get();
+ options.bottommost_temperature = Temperature::kCold;
+ options.num_levels = kNumLevels;
+ options.level_compaction_dynamic_level_bytes = true;
+  // TODO(zjay): for level compaction, auto-compaction may get stuck in a dead
+  // loop, if the penultimate level score is > 1 but the hot data is not cold
+  // enough to compact to the last level, which keeps triggering compaction.
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+  // Pass some time first, otherwise the write times of the first few keys are
+  // going to be zero, and internally zero has a special meaning:
+  // kUnknownSeqnoTime
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+
+ int sst_num = 0;
+  // Write overlapping files
+ for (; sst_num < 4; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ ASSERT_OK(Flush());
+ }
+
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ // All data is hot, only output to penultimate level
+ ASSERT_EQ("0,0,0,0,0,1", FilesPerLevel());
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // read a random key, which should be hot (kUnknown)
+ AssertKeyTemperature(20, Temperature::kUnknown);
+
+ // Adding more data to have mixed hot and cold data
+ for (; sst_num < 14; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ ASSERT_OK(Flush());
+ }
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
+ ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
+
+ // Compact the files to the last level which should split the hot/cold data
+ MoveFilesToLevel(6);
+ uint64_t hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+ uint64_t cold_data_size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_GT(hot_data_size, 0);
+ ASSERT_GT(cold_data_size, 0);
+  // The first few keys should be cold
+ AssertKeyTemperature(20, Temperature::kCold);
+
+  // Wait some time; with each wait, the amount of cold data increases and the
+  // amount of hot data decreases
+ for (int i = 0; i < 30; i++) {
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ uint64_t pre_hot = hot_data_size;
+ uint64_t pre_cold = cold_data_size;
+ hot_data_size = GetSstSizeHelper(Temperature::kUnknown);
+ cold_data_size = GetSstSizeHelper(Temperature::kCold);
+ ASSERT_LT(hot_data_size, pre_hot);
+ ASSERT_GT(cold_data_size, pre_cold);
+
+    // The hot/cold cut-off key should be around key i * 20 + 400 to 450
+ AssertKeyTemperature(i * 20 + 450, Temperature::kUnknown);
+ AssertKeyTemperature(i * 20 + 400, Temperature::kCold);
+ }
+
+  // Wait again; most of the data should be cold after that. The hot data might
+  // not be empty, because if we don't write new data, there's no seqno->time
+  // sampling available to the compaction
+ for (int i = 0; i < 5; i++) {
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1000)); });
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+ }
+
+ // any random data close to the end should be cold
+ AssertKeyTemperature(1000, Temperature::kCold);
+
+ Close();
+}
+
+enum class SeqnoTimeTestType : char {
+ kTrackInternalTimeSeconds = 0,
+ kPrecludeLastLevel = 1,
+ kBothSetTrackSmaller = 2,
+};
+
+class SeqnoTimeTablePropTest
+ : public SeqnoTimeTest,
+ public ::testing::WithParamInterface<SeqnoTimeTestType> {
+ public:
+ SeqnoTimeTablePropTest() : SeqnoTimeTest() {}
+
+ void SetTrackTimeDurationOptions(uint64_t track_time_duration,
+ Options& options) const {
+ // either option set will enable the time tracking feature
+ switch (GetParam()) {
+ case SeqnoTimeTestType::kTrackInternalTimeSeconds:
+ options.preclude_last_level_data_seconds = 0;
+ options.preserve_internal_time_seconds = track_time_duration;
+ break;
+ case SeqnoTimeTestType::kPrecludeLastLevel:
+ options.preclude_last_level_data_seconds = track_time_duration;
+ options.preserve_internal_time_seconds = 0;
+ break;
+ case SeqnoTimeTestType::kBothSetTrackSmaller:
+ options.preclude_last_level_data_seconds = track_time_duration;
+ options.preserve_internal_time_seconds = track_time_duration / 10;
+ break;
+ }
+ }
+};
+
+INSTANTIATE_TEST_CASE_P(
+ SeqnoTimeTablePropTest, SeqnoTimeTablePropTest,
+ ::testing::Values(SeqnoTimeTestType::kTrackInternalTimeSeconds,
+ SeqnoTimeTestType::kPrecludeLastLevel,
+ SeqnoTimeTestType::kBothSetTrackSmaller));
+
+TEST_P(SeqnoTimeTablePropTest, BasicSeqnoToTimeMapping) {
+ Options options = CurrentOptions();
+ SetTrackTimeDurationOptions(10000, options);
+
+ options.env = mock_env_.get();
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ std::set<uint64_t> checked_file_nums;
+ SequenceNumber start_seq = dbfull()->GetLatestSequenceNumber();
+ // Write a key every 10 seconds
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ ASSERT_OK(Flush());
+ TablePropertiesCollection tables_props;
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ auto it = tables_props.begin();
+ SeqnoToTimeMapping tp_mapping;
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ ASSERT_FALSE(tp_mapping.Empty());
+ auto seqs = tp_mapping.TEST_GetInternalMapping();
+  // About ~20 seqno->time entries, because the sample rate is 10000/100 = 100
+  // seconds and about 2k seconds have passed.
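+  // (The tracked duration is 10000 seconds and kMaxSeqnoTimePairsPerCF is 100,
+  // so one sample is taken roughly every 100 seconds; 200 keys * 10 seconds
+  // per key = ~2000 seconds elapsed, hence ~20 samples.)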
+ ASSERT_GE(seqs.size(), 19);
+ ASSERT_LE(seqs.size(), 21);
+ SequenceNumber seq_end = dbfull()->GetLatestSequenceNumber();
+ for (auto i = start_seq; i < start_seq + 10; i++) {
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 1) * 10);
+ }
+ start_seq += 10;
+ for (auto i = start_seq; i < seq_end; i++) {
+ // The result is within the range
+ ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - 10) * 10);
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i + 10) * 10);
+ }
+ checked_file_nums.insert(it->second->orig_file_number);
+ start_seq = seq_end;
+
+ // Write a key every 1 seconds
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i + 190), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(1)); });
+ }
+ seq_end = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Flush());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 2);
+ it = tables_props.begin();
+ while (it != tables_props.end()) {
+ if (!checked_file_nums.count(it->second->orig_file_number)) {
+ break;
+ }
+ it++;
+ }
+ ASSERT_TRUE(it != tables_props.end());
+
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+  // There are only a few time samples
+ ASSERT_GE(seqs.size(), 1);
+ ASSERT_LE(seqs.size(), 3);
+ for (auto i = start_seq; i < seq_end; i++) {
+    // The result is not very accurate, as more data was written within a small
+    // range of time
+ ASSERT_GE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 1000);
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000);
+ }
+ checked_file_nums.insert(it->second->orig_file_number);
+ start_seq = seq_end;
+
+ // Write a key every 200 seconds
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i + 380), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(200)); });
+ }
+ seq_end = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Flush());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 3);
+ it = tables_props.begin();
+ while (it != tables_props.end()) {
+ if (!checked_file_nums.count(it->second->orig_file_number)) {
+ break;
+ }
+ it++;
+ }
+ ASSERT_TRUE(it != tables_props.end());
+
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+  // The number of seqno -> time entries should be at the per-SST max
+ ASSERT_GE(seqs.size(), 99);
+ ASSERT_LE(seqs.size(), 101);
+ for (auto i = start_seq; i < seq_end - 99; i++) {
+    // likely the first 100 entries report 0
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i), (i - start_seq) + 3000);
+ }
+ start_seq += 101;
+
+ for (auto i = start_seq; i < seq_end; i++) {
+ ASSERT_GE(tp_mapping.GetOldestApproximateTime(i),
+ (i - start_seq) * 200 + 22200);
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+ (i - start_seq) * 200 + 22600);
+ }
+ checked_file_nums.insert(it->second->orig_file_number);
+ start_seq = seq_end;
+
+ // Write a key every 100 seconds
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i + 570), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ seq_end = dbfull()->GetLatestSequenceNumber();
+ ASSERT_OK(Flush());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 4);
+ it = tables_props.begin();
+ while (it != tables_props.end()) {
+ if (!checked_file_nums.count(it->second->orig_file_number)) {
+ break;
+ }
+ it++;
+ }
+ ASSERT_TRUE(it != tables_props.end());
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 99);
+ ASSERT_LE(seqs.size(), 101);
+
+ checked_file_nums.insert(it->second->orig_file_number);
+
+ // re-enable compaction
+ ASSERT_OK(dbfull()->SetOptions({
+ {"disable_auto_compactions", "false"},
+ }));
+
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_GE(tables_props.size(), 1);
+ it = tables_props.begin();
+ while (it != tables_props.end()) {
+ if (!checked_file_nums.count(it->second->orig_file_number)) {
+ break;
+ }
+ it++;
+ }
+ ASSERT_TRUE(it != tables_props.end());
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 99);
+ ASSERT_LE(seqs.size(), 101);
+ for (auto i = start_seq; i < seq_end - 99; i++) {
+    // likely the first 100 entries report 0
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+ (i - start_seq) * 100 + 50000);
+ }
+ start_seq += 101;
+
+ for (auto i = start_seq; i < seq_end; i++) {
+ ASSERT_GE(tp_mapping.GetOldestApproximateTime(i),
+ (i - start_seq) * 100 + 52200);
+ ASSERT_LE(tp_mapping.GetOldestApproximateTime(i),
+ (i - start_seq) * 100 + 52400);
+ }
+ ASSERT_OK(db_->Close());
+}
+
+TEST_P(SeqnoTimeTablePropTest, MultiCFs) {
+ Options options = CurrentOptions();
+ options.preclude_last_level_data_seconds = 0;
+ options.preserve_internal_time_seconds = 0;
+ options.env = mock_env_.get();
+ options.stats_dump_period_sec = 0;
+ options.stats_persist_period_sec = 0;
+ ReopenWithColumnFamilies({"default"}, options);
+
+ const PeriodicTaskScheduler& scheduler =
+ dbfull()->TEST_GetPeriodicTaskScheduler();
+ ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+
+ // Write some data and increase the current time
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ ASSERT_OK(Flush());
+ TablePropertiesCollection tables_props;
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ auto it = tables_props.begin();
+ ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
+
+ ASSERT_TRUE(dbfull()->TEST_GetSeqnoToTimeMapping().Empty());
+
+ Options options_1 = options;
+ SetTrackTimeDurationOptions(10000, options_1);
+ CreateColumnFamilies({"one"}, options_1);
+ ASSERT_TRUE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+
+ // Write some data to the default CF (without preclude_last_level feature)
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ ASSERT_OK(Flush());
+
+  // Write some data to CF "one"
+ for (int i = 0; i < 20; i++) {
+ ASSERT_OK(Put(1, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ ASSERT_OK(Flush(1));
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[1], &tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ it = tables_props.begin();
+ SeqnoToTimeMapping tp_mapping;
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ ASSERT_FALSE(tp_mapping.Empty());
+ auto seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 1);
+ ASSERT_LE(seqs.size(), 4);
+
+ // Create one more CF with larger preclude_last_level time
+ Options options_2 = options;
+ SetTrackTimeDurationOptions(1000000, options_2); // 1m
+ CreateColumnFamilies({"two"}, options_2);
+
+  // Add more data to CF "two" to fill the in-memory mapping
+ for (int i = 0; i < 2000; i++) {
+ ASSERT_OK(Put(2, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 1000 - 1);
+ ASSERT_LE(seqs.size(), 1000 + 1);
+
+ ASSERT_OK(Flush(2));
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ it = tables_props.begin();
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+  // the max number of encoded entries is 100
+ ASSERT_GE(seqs.size(), 100 - 1);
+ ASSERT_LE(seqs.size(), 100 + 1);
+
+  // Write some data to the default CF. As all memtables with
+  // preclude_last_level enabled have been flushed, the in-memory seqno->time
+  // mapping should be cleared
+ for (int i = 0; i < 10; i++) {
+ ASSERT_OK(Put(0, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ seqs = dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping();
+ ASSERT_OK(Flush(0));
+
+ // trigger compaction for CF "two" and make sure the compaction output has
+ // seqno_to_time_mapping
+ for (int j = 0; j < 3; j++) {
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(2, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ ASSERT_OK(Flush(2));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[2], &tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ it = tables_props.begin();
+ tp_mapping.Clear();
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 99);
+ ASSERT_LE(seqs.size(), 101);
+
+ for (int j = 0; j < 2; j++) {
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(Put(0, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ ASSERT_OK(Flush(0));
+ }
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(handles_[0], &tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+ it = tables_props.begin();
+ ASSERT_TRUE(it->second->seqno_to_time_mapping.empty());
+
+  // Write some data to CF "two", but don't flush, so the in-memory mapping
+  // keeps accumulating
+ for (int i = 0; i < 1000; i++) {
+ ASSERT_OK(Put(2, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ ASSERT_GE(
+ dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+ 500);
+  // After dropping CF "one", the in-memory mapping will change to only follow
+  // the CF "two" options.
+ ASSERT_OK(db_->DropColumnFamily(handles_[1]));
+ ASSERT_LE(
+ dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+ 100 + 5);
+
+  // After dropping CF "two", the in-memory mapping is also cleared.
+ ASSERT_OK(db_->DropColumnFamily(handles_[2]));
+ ASSERT_EQ(
+ dbfull()->TEST_GetSeqnoToTimeMapping().TEST_GetInternalMapping().size(),
+ 0);
+
+ // And the timer worker is stopped
+ ASSERT_FALSE(scheduler.TEST_HasTask(PeriodicTaskType::kRecordSeqnoTime));
+ Close();
+}
+
+TEST_P(SeqnoTimeTablePropTest, MultiInstancesBasic) {
+ const int kInstanceNum = 2;
+
+ Options options = CurrentOptions();
+ SetTrackTimeDurationOptions(10000, options);
+ options.env = mock_env_.get();
+ options.stats_dump_period_sec = 0;
+ options.stats_persist_period_sec = 0;
+
+ auto dbs = std::vector<DB*>(kInstanceNum);
+ for (int i = 0; i < kInstanceNum; i++) {
+ ASSERT_OK(
+ DB::Open(options, test::PerThreadDBPath(std::to_string(i)), &(dbs[i])));
+ }
+
+ // Make sure the second instance has the worker enabled
+ auto dbi = static_cast_with_check<DBImpl>(dbs[1]);
+ WriteOptions wo;
+ for (int i = 0; i < 200; i++) {
+ ASSERT_OK(dbi->Put(wo, Key(i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(100)); });
+ }
+ SeqnoToTimeMapping seqno_to_time_mapping = dbi->TEST_GetSeqnoToTimeMapping();
+ ASSERT_GT(seqno_to_time_mapping.Size(), 10);
+
+ for (int i = 0; i < kInstanceNum; i++) {
+ ASSERT_OK(dbs[i]->Close());
+ delete dbs[i];
+ }
+}
+
+TEST_P(SeqnoTimeTablePropTest, SeqnoToTimeMappingUniversal) {
+ const int kNumTrigger = 4;
+ const int kNumLevels = 7;
+ const int kNumKeys = 100;
+
+ Options options = CurrentOptions();
+ SetTrackTimeDurationOptions(10000, options);
+ options.compaction_style = kCompactionStyleUniversal;
+ options.num_levels = kNumLevels;
+ options.env = mock_env_.get();
+
+ DestroyAndReopen(options);
+
+ std::atomic_uint64_t num_seqno_zeroing{0};
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "CompactionIterator::PrepareOutput:ZeroingSeq",
+ [&](void* /*arg*/) { num_seqno_zeroing++; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ int sst_num = 0;
+ for (; sst_num < kNumTrigger - 1; sst_num++) {
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ ASSERT_OK(Flush());
+ }
+ TablePropertiesCollection tables_props;
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 3);
+ for (const auto& props : tables_props) {
+ ASSERT_FALSE(props.second->seqno_to_time_mapping.empty());
+ SeqnoToTimeMapping tp_mapping;
+ ASSERT_OK(tp_mapping.Add(props.second->seqno_to_time_mapping));
+ ASSERT_OK(tp_mapping.Sort());
+ ASSERT_FALSE(tp_mapping.Empty());
+ auto seqs = tp_mapping.TEST_GetInternalMapping();
+ ASSERT_GE(seqs.size(), 10 - 1);
+ ASSERT_LE(seqs.size(), 10 + 1);
+ }
+
+ // Trigger a compaction
+ for (int i = 0; i < kNumKeys; i++) {
+ ASSERT_OK(Put(Key(sst_num * (kNumKeys - 1) + i), "value"));
+ dbfull()->TEST_WaitForPeridicTaskRun(
+ [&] { mock_clock_->MockSleepForSeconds(static_cast<int>(10)); });
+ }
+ sst_num++;
+ ASSERT_OK(Flush());
+ ASSERT_OK(dbfull()->TEST_WaitForCompact());
+ tables_props.clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+ ASSERT_EQ(tables_props.size(), 1);
+
+ auto it = tables_props.begin();
+ SeqnoToTimeMapping tp_mapping;
+ ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+
+ // compact to the last level
+ CompactRangeOptions cro;
+ cro.bottommost_level_compaction = BottommostLevelCompaction::kForce;
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+  // Make sure the data is all compacted to the penultimate level if the
+  // feature is on; otherwise, it is compacted to the last level.
+ if (options.preclude_last_level_data_seconds > 0) {
+ ASSERT_GT(NumTableFilesAtLevel(5), 0);
+ ASSERT_EQ(NumTableFilesAtLevel(6), 0);
+ } else {
+ ASSERT_EQ(NumTableFilesAtLevel(5), 0);
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+ }
+
+  // Regardless of whether the file is on the last level or not, it should keep
+  // the time information, and the sequence numbers are not zeroed out
+ tables_props.clear();
+ tp_mapping.Clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+
+ ASSERT_EQ(tables_props.size(), 1);
+ ASSERT_EQ(num_seqno_zeroing, 0);
+
+ it = tables_props.begin();
+ ASSERT_FALSE(it->second->seqno_to_time_mapping.empty());
+ ASSERT_OK(tp_mapping.Add(it->second->seqno_to_time_mapping));
+
+ // make half of the data expired
+ mock_clock_->MockSleepForSeconds(static_cast<int>(8000));
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ tables_props.clear();
+ tp_mapping.Clear();
+ ASSERT_OK(dbfull()->GetPropertiesOfAllTables(&tables_props));
+
+ if (options.preclude_last_level_data_seconds > 0) {
+ ASSERT_EQ(tables_props.size(), 2);
+ } else {
+ ASSERT_EQ(tables_props.size(), 1);
+ }
+ ASSERT_GT(num_seqno_zeroing, 0);
+ std::vector<KeyVersion> key_versions;
+ ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(),
+ std::numeric_limits<size_t>::max(),
+ &key_versions));
+  // Make sure there are more than 300 keys; the first 100 keys have their
+  // seqnos zeroed out and the last 100 keys do not
+ ASSERT_GT(key_versions.size(), 300);
+ for (int i = 0; i < 100; i++) {
+ ASSERT_EQ(key_versions[i].sequence, 0);
+ }
+ auto rit = key_versions.rbegin();
+ for (int i = 0; i < 100; i++) {
+ ASSERT_GT(rit->sequence, 0);
+ rit++;
+ }
+
+  // Make all data expire and compact again to push it to the last level,
+  // regardless of whether the tiering feature is enabled or not
+ mock_clock_->MockSleepForSeconds(static_cast<int>(20000));
+
+ ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr));
+
+ ASSERT_GT(num_seqno_zeroing, 0);
+ ASSERT_GT(NumTableFilesAtLevel(6), 0);
+
+ Close();
+}
+
+TEST_F(SeqnoTimeTest, MappingAppend) {
+ SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
+
+ // ignore seqno == 0, as it may mean the seqno is zeroed out
+ ASSERT_FALSE(test.Append(0, 9));
+
+ ASSERT_TRUE(test.Append(3, 10));
+ auto size = test.Size();
+ // normal add
+ ASSERT_TRUE(test.Append(10, 11));
+ size++;
+ ASSERT_EQ(size, test.Size());
+
+ // Append unsorted
+ ASSERT_FALSE(test.Append(8, 12));
+ ASSERT_EQ(size, test.Size());
+
+ // Append with the same seqno, newer time will be accepted
+ ASSERT_TRUE(test.Append(10, 12));
+ ASSERT_EQ(size, test.Size());
+ // older time will be ignored
+ ASSERT_FALSE(test.Append(10, 9));
+ ASSERT_EQ(size, test.Size());
+
+ // new seqno with old time will be ignored
+ ASSERT_FALSE(test.Append(12, 8));
+ ASSERT_EQ(size, test.Size());
+}
+
+TEST_F(SeqnoTimeTest, GetOldestApproximateTime) {
+ SeqnoToTimeMapping test(/*max_time_duration=*/100, /*max_capacity=*/10);
+
+ ASSERT_EQ(test.GetOldestApproximateTime(10), kUnknownSeqnoTime);
+
+ test.Append(3, 10);
+
+ ASSERT_EQ(test.GetOldestApproximateTime(2), kUnknownSeqnoTime);
+ ASSERT_EQ(test.GetOldestApproximateTime(3), 10);
+ ASSERT_EQ(test.GetOldestApproximateTime(10), 10);
+
+ test.Append(10, 100);
+
+ test.Append(100, 1000);
+ ASSERT_EQ(test.GetOldestApproximateTime(10), 100);
+ ASSERT_EQ(test.GetOldestApproximateTime(40), 100);
+ ASSERT_EQ(test.GetOldestApproximateTime(111), 1000);
+}
+
+TEST_F(SeqnoTimeTest, Sort) {
+ SeqnoToTimeMapping test;
+
+ // single entry
+ test.Add(10, 11);
+ ASSERT_OK(test.Sort());
+ ASSERT_EQ(test.Size(), 1);
+
+ // duplicate, should be removed by sort
+ test.Add(10, 11);
+ // same seqno, but older time, should be removed
+ test.Add(10, 9);
+
+  // useless ones, should be removed by sort
+ test.Add(11, 9);
+ test.Add(9, 8);
+
+ // Good ones
+ test.Add(1, 10);
+ test.Add(100, 100);
+
+ ASSERT_OK(test.Sort());
+
+ auto seqs = test.TEST_GetInternalMapping();
+
+ std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
+ expected.emplace_back(1, 10);
+ expected.emplace_back(10, 11);
+ expected.emplace_back(100, 100);
+
+ ASSERT_EQ(expected, seqs);
+}
+
+TEST_F(SeqnoTimeTest, EncodeDecodeBasic) {
+ SeqnoToTimeMapping test(0, 1000);
+
+ std::string output;
+ test.Encode(output, 0, 1000, 100);
+ ASSERT_TRUE(output.empty());
+
+ for (int i = 1; i <= 1000; i++) {
+ ASSERT_TRUE(test.Append(i, i * 10));
+ }
+ test.Encode(output, 0, 1000, 100);
+
+ ASSERT_FALSE(output.empty());
+
+ SeqnoToTimeMapping decoded;
+ ASSERT_OK(decoded.Add(output));
+ ASSERT_OK(decoded.Sort());
+ ASSERT_EQ(decoded.Size(), SeqnoToTimeMapping::kMaxSeqnoTimePairsPerSST);
+ ASSERT_EQ(test.Size(), 1000);
+
+ for (SequenceNumber seq = 0; seq <= 1000; seq++) {
+    // `test` has the more accurate time mapping; Encode() only picks
+    // kMaxSeqnoTimePairsPerSST entries, which is less accurate
+ uint64_t target_time = test.GetOldestApproximateTime(seq);
+ ASSERT_GE(decoded.GetOldestApproximateTime(seq),
+ target_time < 200 ? 0 : target_time - 200);
+ ASSERT_LE(decoded.GetOldestApproximateTime(seq), target_time);
+ }
+}
+
+TEST_F(SeqnoTimeTest, EncodeDecodePerferNewTime) {
+ SeqnoToTimeMapping test(0, 10);
+
+ test.Append(1, 10);
+ test.Append(5, 17);
+ test.Append(6, 25);
+ test.Append(8, 30);
+
+ std::string output;
+ test.Encode(output, 1, 10, 0, 3);
+
+ SeqnoToTimeMapping decoded;
+ ASSERT_OK(decoded.Add(output));
+ ASSERT_OK(decoded.Sort());
+
+ ASSERT_EQ(decoded.Size(), 3);
+
+ auto seqs = decoded.TEST_GetInternalMapping();
+ std::deque<SeqnoToTimeMapping::SeqnoTimePair> expected;
+ expected.emplace_back(1, 10);
+ expected.emplace_back(6, 25);
+ expected.emplace_back(8, 30);
+ ASSERT_EQ(expected, seqs);
+
+  // Add a few entries with large time values
+ test.Append(10, 100);
+ test.Append(13, 200);
+ test.Append(16, 300);
+
+ output.clear();
+ test.Encode(output, 1, 20, 0, 4);
+ decoded.Clear();
+ ASSERT_OK(decoded.Add(output));
+ ASSERT_OK(decoded.Sort());
+ ASSERT_EQ(decoded.Size(), 4);
+
+ expected.clear();
+ expected.emplace_back(1, 10);
+  // Entries #6 and #8 are skipped as they are too close to #1.
+  // The entry with time 100 is also within the skip range, but if it were
+  // skipped there would not be enough entries left to fill 4, so it is selected.
+ expected.emplace_back(10, 100);
+ expected.emplace_back(13, 200);
+ expected.emplace_back(16, 300);
+ seqs = decoded.TEST_GetInternalMapping();
+ ASSERT_EQ(expected, seqs);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+#endif // ROCKSDB_LITE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/seqno_to_time_mapping.cc b/src/rocksdb/db/seqno_to_time_mapping.cc
new file mode 100644
index 000000000..c69209929
--- /dev/null
+++ b/src/rocksdb/db/seqno_to_time_mapping.cc
@@ -0,0 +1,341 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/seqno_to_time_mapping.h"
+
+#include "db/version_edit.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+uint64_t SeqnoToTimeMapping::GetOldestApproximateTime(
+ const SequenceNumber seqno) const {
+ assert(is_sorted_);
+ auto it = std::upper_bound(seqno_time_mapping_.begin(),
+ seqno_time_mapping_.end(), seqno);
+ if (it == seqno_time_mapping_.begin()) {
+ return 0;
+ }
+ it--;
+ return it->time;
+}
+
+void SeqnoToTimeMapping::Add(SequenceNumber seqno, uint64_t time) {
+ if (seqno == 0) {
+ return;
+ }
+ is_sorted_ = false;
+ seqno_time_mapping_.emplace_back(seqno, time);
+}
+
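+// Drop entries whose time is older than (now - max_time_duration_), keeping
+// the newest entry at or below that cut-off so that older seqnos can still be
+// mapped to an approximate time.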
+void SeqnoToTimeMapping::TruncateOldEntries(const uint64_t now) {
+ assert(is_sorted_);
+
+ if (max_time_duration_ == 0) {
+ return;
+ }
+
+ const uint64_t cut_off_time =
+ now > max_time_duration_ ? now - max_time_duration_ : 0;
+ assert(cut_off_time <= now); // no overflow
+
+ auto it = std::upper_bound(
+ seqno_time_mapping_.begin(), seqno_time_mapping_.end(), cut_off_time,
+ [](uint64_t target, const SeqnoTimePair& other) -> bool {
+ return target < other.time;
+ });
+ if (it == seqno_time_mapping_.begin()) {
+ return;
+ }
+ it--;
+ seqno_time_mapping_.erase(seqno_time_mapping_.begin(), it);
+}
+
+SequenceNumber SeqnoToTimeMapping::GetOldestSequenceNum(uint64_t time) {
+ assert(is_sorted_);
+
+ auto it = std::upper_bound(
+ seqno_time_mapping_.begin(), seqno_time_mapping_.end(), time,
+ [](uint64_t target, const SeqnoTimePair& other) -> bool {
+ return target < other.time;
+ });
+ if (it == seqno_time_mapping_.begin()) {
+ return 0;
+ }
+ it--;
+ return it->seqno;
+}
+
+// The encoded format is:
+// [num_of_entries][[seqno][time],[seqno][time],...]
+// ^ ^
+// var_int delta_encoded (var_int)
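+// A small illustrative example (hypothetical values): encoding the two entries
+// (seqno 10, time 100) and (seqno 50, time 300) writes the varint count 2,
+// then the pairs (10, 100) and (40, 200), where the second pair is the delta
+// from the first.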
+void SeqnoToTimeMapping::Encode(std::string& dest, const SequenceNumber start,
+ const SequenceNumber end, const uint64_t now,
+ const uint64_t output_size) const {
+ assert(is_sorted_);
+ if (start > end) {
+    // This could happen when the SST file is empty: the initial value of the
+    // min sequence number is kMaxSequenceNumber and the max is 0.
+    // The empty output file will be removed in the final step of compaction.
+ return;
+ }
+
+ auto start_it = std::upper_bound(seqno_time_mapping_.begin(),
+ seqno_time_mapping_.end(), start);
+ if (start_it != seqno_time_mapping_.begin()) {
+ start_it--;
+ }
+
+ auto end_it = std::upper_bound(seqno_time_mapping_.begin(),
+ seqno_time_mapping_.end(), end);
+ if (end_it == seqno_time_mapping_.begin()) {
+ return;
+ }
+ if (start_it >= end_it) {
+ return;
+ }
+
+ // truncate old entries that are not needed
+ if (max_time_duration_ > 0) {
+ const uint64_t cut_off_time =
+ now > max_time_duration_ ? now - max_time_duration_ : 0;
+ while (start_it < end_it && start_it->time < cut_off_time) {
+ start_it++;
+ }
+ }
+ // to include the first element
+ if (start_it != seqno_time_mapping_.begin()) {
+ start_it--;
+ }
+
+  // If there is more data than needed, pick the entries for encoding. This is
+  // not the most optimized algorithm for selecting the best representative
+  // entries over time.
+  // It starts from the beginning and makes sure the time distance is larger
+  // than `(end - start) / size` before selecting the next entry. For example,
+  // for the following list, pick 3 entries (it will pick seqno #1, #6, #8):
+  // 1 -> 10
+  // 5 -> 17
+  // 6 -> 25
+  // 8 -> 30
+  // First, it always picks the first entry. Then there are 2 entries left to
+  // fill (num_entries_to_fill) and the time difference between the current
+  // entry and the last one is (30 - 10) = 20. 20 / 2 = 10, so it skips until
+  // time 10 + 10 = 20, which skips #5 and picks #6.
+  // The most optimized solution would pick #1, #5, #8, as that is more evenly
+  // distributed over time. Still, the following algorithm is simple and may
+  // over-select recent data, which is good: we do want more accurate time
+  // information for recent data.
+ std::deque<SeqnoTimePair> output_copy;
+ if (std::distance(start_it, end_it) > static_cast<int64_t>(output_size)) {
+ int64_t num_entries_to_fill = static_cast<int64_t>(output_size);
+ auto last_it = end_it;
+ last_it--;
+ uint64_t end_time = last_it->time;
+ uint64_t skip_until_time = 0;
+ for (auto it = start_it; it < end_it; it++) {
+      // skip if it has not reached skip_until_time yet
+ if (std::distance(it, end_it) > num_entries_to_fill &&
+ it->time < skip_until_time) {
+ continue;
+ }
+ output_copy.push_back(*it);
+ num_entries_to_fill--;
+ if (std::distance(it, end_it) > num_entries_to_fill &&
+ num_entries_to_fill > 0) {
+ // If there are more entries than we need, re-calculate the
+ // skip_until_time, which means skip until that time
+ skip_until_time =
+ it->time + ((end_time - it->time) / num_entries_to_fill);
+ }
+ }
+
+ // Make sure all entries are filled
+ assert(num_entries_to_fill == 0);
+ start_it = output_copy.begin();
+ end_it = output_copy.end();
+ }
+
+ // Delta encode the data
+ uint64_t size = std::distance(start_it, end_it);
+ PutVarint64(&dest, size);
+ SeqnoTimePair base;
+ for (auto it = start_it; it < end_it; it++) {
+ assert(base < *it);
+ SeqnoTimePair val = *it - base;
+ base = *it;
+ val.Encode(dest);
+ }
+}
+
+Status SeqnoToTimeMapping::Add(const std::string& seqno_time_mapping_str) {
+ Slice input(seqno_time_mapping_str);
+ if (input.empty()) {
+ return Status::OK();
+ }
+ uint64_t size;
+ if (!GetVarint64(&input, &size)) {
+ return Status::Corruption("Invalid sequence number time size");
+ }
+ is_sorted_ = false;
+ SeqnoTimePair base;
+ for (uint64_t i = 0; i < size; i++) {
+ SeqnoTimePair val;
+ Status s = val.Decode(input);
+ if (!s.ok()) {
+ return s;
+ }
+ val.Add(base);
+ seqno_time_mapping_.emplace_back(val);
+ base = val;
+ }
+ return Status::OK();
+}
+
+void SeqnoToTimeMapping::SeqnoTimePair::Encode(std::string& dest) const {
+ PutVarint64Varint64(&dest, seqno, time);
+}
+
+Status SeqnoToTimeMapping::SeqnoTimePair::Decode(Slice& input) {
+ if (!GetVarint64(&input, &seqno)) {
+ return Status::Corruption("Invalid sequence number");
+ }
+ if (!GetVarint64(&input, &time)) {
+ return Status::Corruption("Invalid time");
+ }
+ return Status::OK();
+}
+
+bool SeqnoToTimeMapping::Append(SequenceNumber seqno, uint64_t time) {
+ assert(is_sorted_);
+
+ // skip seq number 0, which may have special meaning, like zeroed out data
+ if (seqno == 0) {
+ return false;
+ }
+ if (!Empty()) {
+ if (seqno < Last().seqno || time < Last().time) {
+ return false;
+ }
+ if (seqno == Last().seqno) {
+ Last().time = time;
+ return true;
+ }
+ if (time == Last().time) {
+      // The new sequence number has the same time as the old one, no need to
+      // add a new mapping
+ return false;
+ }
+ }
+
+ seqno_time_mapping_.emplace_back(seqno, time);
+
+ if (seqno_time_mapping_.size() > max_capacity_) {
+ seqno_time_mapping_.pop_front();
+ }
+ return true;
+}
+
+bool SeqnoToTimeMapping::Resize(uint64_t min_time_duration,
+ uint64_t max_time_duration) {
+ uint64_t new_max_capacity =
+ CalculateMaxCapacity(min_time_duration, max_time_duration);
+ if (new_max_capacity == max_capacity_) {
+ return false;
+ } else if (new_max_capacity < seqno_time_mapping_.size()) {
+ uint64_t delta = seqno_time_mapping_.size() - new_max_capacity;
+ seqno_time_mapping_.erase(seqno_time_mapping_.begin(),
+ seqno_time_mapping_.begin() + delta);
+ }
+ max_capacity_ = new_max_capacity;
+ return true;
+}
+
+Status SeqnoToTimeMapping::Sort() {
+ if (is_sorted_) {
+ return Status::OK();
+ }
+ if (seqno_time_mapping_.empty()) {
+ is_sorted_ = true;
+ return Status::OK();
+ }
+
+ std::deque<SeqnoTimePair> copy = std::move(seqno_time_mapping_);
+
+ std::sort(copy.begin(), copy.end());
+
+ seqno_time_mapping_.clear();
+
+ // remove seqno = 0, which may have special meaning, like zeroed out data
+ while (copy.front().seqno == 0) {
+ copy.pop_front();
+ }
+
+ SeqnoTimePair prev = copy.front();
+ for (const auto& it : copy) {
+ // If sequence number is the same, pick the one with larger time, which is
+ // more accurate than the older time.
+ if (it.seqno == prev.seqno) {
+ assert(it.time >= prev.time);
+ prev.time = it.time;
+ } else {
+ assert(it.seqno > prev.seqno);
+      // If a larger sequence number has an older time, the entry is not
+      // useful, so skip it
+ if (it.time > prev.time) {
+ seqno_time_mapping_.push_back(prev);
+ prev = it;
+ }
+ }
+ }
+ seqno_time_mapping_.emplace_back(prev);
+
+ is_sorted_ = true;
+ return Status::OK();
+}
+
+std::string SeqnoToTimeMapping::ToHumanString() const {
+ std::string ret;
+ for (const auto& seq_time : seqno_time_mapping_) {
+ AppendNumberTo(&ret, seq_time.seqno);
+ ret.append("->");
+ AppendNumberTo(&ret, seq_time.time);
+ ret.append(",");
+ }
+ return ret;
+}
+
+SeqnoToTimeMapping SeqnoToTimeMapping::Copy(
+ SequenceNumber smallest_seqno) const {
+ SeqnoToTimeMapping ret;
+ auto it = std::upper_bound(seqno_time_mapping_.begin(),
+ seqno_time_mapping_.end(), smallest_seqno);
+ if (it != seqno_time_mapping_.begin()) {
+ it--;
+ }
+ std::copy(it, seqno_time_mapping_.end(),
+ std::back_inserter(ret.seqno_time_mapping_));
+ return ret;
+}
+
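+// A worked example (hypothetical values): with min_time_duration = 1000 and
+// max_time_duration = 2000, the capacity is
+// min(kMaxSeqnoToTimeEntries (1000), 2000 * 100 / 1000) = 200 entries.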
+uint64_t SeqnoToTimeMapping::CalculateMaxCapacity(uint64_t min_time_duration,
+ uint64_t max_time_duration) {
+ if (min_time_duration == 0) {
+ return 0;
+ }
+ return std::min(
+ kMaxSeqnoToTimeEntries,
+ max_time_duration * kMaxSeqnoTimePairsPerCF / min_time_duration);
+}
+
+SeqnoToTimeMapping::SeqnoTimePair SeqnoToTimeMapping::SeqnoTimePair::operator-(
+ const SeqnoTimePair& other) const {
+ SeqnoTimePair res;
+ res.seqno = seqno - other.seqno;
+ res.time = time - other.time;
+ return res;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/seqno_to_time_mapping.h b/src/rocksdb/db/seqno_to_time_mapping.h
new file mode 100644
index 000000000..4ffc9c199
--- /dev/null
+++ b/src/rocksdb/db/seqno_to_time_mapping.h
@@ -0,0 +1,189 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <algorithm>
+#include <cinttypes>
+#include <deque>
+#include <functional>
+#include <iterator>
+#include <string>
+
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+constexpr uint64_t kUnknownSeqnoTime = 0;
+
+// SeqnoToTimeMapping stores a sequence number to time mapping, so given a
+// sequence number it can estimate the oldest possible time for that sequence
+// number. For example:
+// 10 -> 100
+// 50 -> 300
+// then if a key has seqno 19, the OldestApproximateTime would be 100; for 51
+// it would be 300.
+// As it's a sorted list, new entries are appended at the back. Old entries are
+// popped from the front when they're no longer needed.
+//
+// Note: the data struct is not thread safe, both read and write need to be
+// synchronized by caller.
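+//
+// A minimal usage sketch (illustrative only; values are hypothetical):
+//   SeqnoToTimeMapping mapping(/*max_time_duration=*/100, /*max_capacity=*/10);
+//   mapping.Append(/*seqno=*/10, /*time=*/100);
+//   mapping.Append(/*seqno=*/50, /*time=*/300);
+//   mapping.GetOldestApproximateTime(19);  // returns 100
+//   mapping.GetOldestApproximateTime(51);  // returns 300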
+class SeqnoToTimeMapping {
+ public:
+  // Maximum number of entries that can be encoded into an SST. The data is
+  // delta encoded, so the maximum data usage for each SST is < 0.3KB
+ static constexpr uint64_t kMaxSeqnoTimePairsPerSST = 100;
+
+  // Maximum number of entries per CF. If only one CF has this feature on, the
+  // max duration is divided by this number to get the sampling cadence; for
+  // example, if preclude_last_level_data_seconds = 100000 (~1 day), it will
+  // sample the seqno -> time mapping every 1000 seconds (~17 minutes), and the
+  // maximum number of entries it needs is 100.
+  // When multiple CFs have this feature on, the sampling cadence is determined
+  // by the smallest setting and the capacity is determined by the largest
+  // setting; it's also capped at kMaxSeqnoTimePairsPerCF * 10.
+ static constexpr uint64_t kMaxSeqnoTimePairsPerCF = 100;
+
+ // A simple struct for sequence number to time pair
+ struct SeqnoTimePair {
+ SequenceNumber seqno = 0;
+ uint64_t time = 0;
+
+ SeqnoTimePair() = default;
+ SeqnoTimePair(SequenceNumber _seqno, uint64_t _time)
+ : seqno(_seqno), time(_time) {}
+
+ // Encode to dest string
+ void Encode(std::string& dest) const;
+
+ // Decode the value from input Slice and remove it from the input
+ Status Decode(Slice& input);
+
+    // Subtraction of 2 SeqnoTimePairs
+ SeqnoTimePair operator-(const SeqnoTimePair& other) const;
+
+ // Add 2 values together
+ void Add(const SeqnoTimePair& obj) {
+ seqno += obj.seqno;
+ time += obj.time;
+ }
+
+    // Compare a SeqnoTimePair with a sequence number, used for binary
+    // searching a sequence number in a list of SeqnoTimePairs
+ bool operator<(const SequenceNumber& other) const { return seqno < other; }
+
+ // Compare 2 SeqnoTimePair
+ bool operator<(const SeqnoTimePair& other) const {
+ return std::tie(seqno, time) < std::tie(other.seqno, other.time);
+ }
+
+    // Check if 2 SeqnoTimePairs are the same
+ bool operator==(const SeqnoTimePair& other) const {
+ return std::tie(seqno, time) == std::tie(other.seqno, other.time);
+ }
+ };
+
+  // Constructor of SeqnoToTimeMapping.
+  // max_time_duration is the maximum time span it should track. For example,
+  // if preclude_last_level_data_seconds is 1 day, then entries older than 1
+  // day can be removed.
+  // max_capacity is the maximum number of entries it can hold. For a single
+  // CF, it's capped at 100 (kMaxSeqnoTimePairsPerCF), otherwise at
+  // kMaxSeqnoTimePairsPerCF * 10.
+  // If set to 0, old data is never truncated.
+ explicit SeqnoToTimeMapping(uint64_t max_time_duration = 0,
+ uint64_t max_capacity = 0)
+ : max_time_duration_(max_time_duration), max_capacity_(max_capacity) {}
+
+ // Append a new entry to the list. The new entry should be newer than the
+ // existing ones. It maintains the internal sorted status.
+ bool Append(SequenceNumber seqno, uint64_t time);
+
+  // Given a sequence number, estimate its oldest possible time
+ uint64_t GetOldestApproximateTime(SequenceNumber seqno) const;
+
+ // Truncate the old entries based on the current time and max_time_duration_
+ void TruncateOldEntries(uint64_t now);
+
+  // Given a time, return its oldest possible sequence number
+ SequenceNumber GetOldestSequenceNum(uint64_t time);
+
+ // Encode to a binary string
+ void Encode(std::string& des, SequenceNumber start, SequenceNumber end,
+ uint64_t now,
+ uint64_t output_size = kMaxSeqnoTimePairsPerSST) const;
+
+  // Add a new entry. Unlike Append(), it accepts arbitrary data, but it also
+  // leaves the list unsorted.
+ void Add(SequenceNumber seqno, uint64_t time);
+
+  // Decode and add the entries to the current object. The list will be unsorted.
+ Status Add(const std::string& seqno_time_mapping_str);
+
+ // Return the number of entries
+ size_t Size() const { return seqno_time_mapping_.size(); }
+
+  // Reduce the size of the internal list
+ bool Resize(uint64_t min_time_duration, uint64_t max_time_duration);
+
+ // Override the max_time_duration_
+ void SetMaxTimeDuration(uint64_t max_time_duration) {
+ max_time_duration_ = max_time_duration;
+ }
+
+ uint64_t GetCapacity() const { return max_capacity_; }
+
+  // Sort the list and remove redundant or useless entries, making sure that
+  // both the seqnos and the times are sorted.
+ Status Sort();
+
+  // Copy the current object, starting from the given smallest_seqno.
+ SeqnoToTimeMapping Copy(SequenceNumber smallest_seqno) const;
+
+  // Returns true if the internal list is empty
+ bool Empty() const { return seqno_time_mapping_.empty(); }
+
+ // clear all entries
+ void Clear() { seqno_time_mapping_.clear(); }
+
+  // Return a human-readable string for user messages.
+  // Note: not efficient, but okay for printing.
+ std::string ToHumanString() const;
+
+#ifndef NDEBUG
+ const std::deque<SeqnoTimePair>& TEST_GetInternalMapping() const {
+ return seqno_time_mapping_;
+ }
+#endif
+
+ private:
+ static constexpr uint64_t kMaxSeqnoToTimeEntries =
+ kMaxSeqnoTimePairsPerCF * 10;
+
+ uint64_t max_time_duration_;
+ uint64_t max_capacity_;
+
+ std::deque<SeqnoTimePair> seqno_time_mapping_;
+
+ bool is_sorted_ = true;
+
+ static uint64_t CalculateMaxCapacity(uint64_t min_time_duration,
+ uint64_t max_time_duration);
+
+ SeqnoTimePair& Last() {
+ assert(!Empty());
+ return seqno_time_mapping_.back();
+ }
+};
+
+// For searching for a sequence number in a SeqnoToTimeMapping
+inline bool operator<(const SequenceNumber& seqno,
+ const SeqnoToTimeMapping::SeqnoTimePair& other) {
+ return seqno < other.seqno;
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_checker.h b/src/rocksdb/db/snapshot_checker.h
new file mode 100644
index 000000000..0bfb1aa07
--- /dev/null
+++ b/src/rocksdb/db/snapshot_checker.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+enum class SnapshotCheckerResult : int {
+ kInSnapshot = 0,
+ kNotInSnapshot = 1,
+ // In case snapshot is released and the checker has no clue whether
+ // the given sequence is visible to the snapshot.
+ kSnapshotReleased = 2,
+};
+
+// Callback class that controls GC of duplicate keys in flush/compaction.
+class SnapshotChecker {
+ public:
+ virtual ~SnapshotChecker() {}
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber sequence, SequenceNumber snapshot_sequence) const = 0;
+};
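+
+// Illustrative sketch (hypothetical caller code, not part of this header):
+// flush/compaction asks the checker whether a write at `value_seq` is visible
+// to a snapshot at `snapshot_seq` before deciding what it may garbage-collect.
+//
+//   SnapshotCheckerResult r =
+//       snapshot_checker->CheckInSnapshot(value_seq, snapshot_seq);
+//   if (r == SnapshotCheckerResult::kInSnapshot) {
+//     // visible to the snapshot
+//   } else if (r == SnapshotCheckerResult::kNotInSnapshot) {
+//     // not visible to the snapshot
+//   } else {
+//     // kSnapshotReleased: visibility unknown, be conservative
+//   }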
+
+class DisableGCSnapshotChecker : public SnapshotChecker {
+ public:
+ virtual ~DisableGCSnapshotChecker() {}
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber /*sequence*/,
+ SequenceNumber /*snapshot_sequence*/) const override {
+ // By returning kNotInSnapshot, we prevent all the values from being GCed
+ return SnapshotCheckerResult::kNotInSnapshot;
+ }
+ static DisableGCSnapshotChecker* Instance();
+
+ protected:
+ explicit DisableGCSnapshotChecker() {}
+};
+
+class WritePreparedTxnDB;
+
+// Callback class created by WritePreparedTxnDB to check if a key
+// is visible by a snapshot.
+class WritePreparedSnapshotChecker : public SnapshotChecker {
+ public:
+ explicit WritePreparedSnapshotChecker(WritePreparedTxnDB* txn_db);
+ virtual ~WritePreparedSnapshotChecker() {}
+
+ virtual SnapshotCheckerResult CheckInSnapshot(
+ SequenceNumber sequence, SequenceNumber snapshot_sequence) const override;
+
+ private:
+#ifndef ROCKSDB_LITE
+ const WritePreparedTxnDB* const txn_db_;
+#endif // !ROCKSDB_LITE
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.cc b/src/rocksdb/db/snapshot_impl.cc
new file mode 100644
index 000000000..98b475463
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.cc
@@ -0,0 +1,25 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/db.h"
+#include "rocksdb/snapshot.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+ManagedSnapshot::ManagedSnapshot(DB* db)
+ : db_(db), snapshot_(db->GetSnapshot()) {}
+
+ManagedSnapshot::ManagedSnapshot(DB* db, const Snapshot* _snapshot)
+ : db_(db), snapshot_(_snapshot) {}
+
+ManagedSnapshot::~ManagedSnapshot() {
+ if (snapshot_) {
+ db_->ReleaseSnapshot(snapshot_);
+ }
+}
+
+const Snapshot* ManagedSnapshot::snapshot() { return snapshot_; }
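+
+// Example (illustrative; `db` is a hypothetical open DB instance): the
+// wrapper releases the snapshot automatically when it goes out of scope.
+//
+//   {
+//     ManagedSnapshot managed(db);
+//     ReadOptions read_options;
+//     read_options.snapshot = managed.snapshot();
+//     std::string value;
+//     Status s = db->Get(read_options, "key", &value);
+//   }  // ~ManagedSnapshot() calls db->ReleaseSnapshot()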
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/snapshot_impl.h b/src/rocksdb/db/snapshot_impl.h
new file mode 100644
index 000000000..23e5e98cd
--- /dev/null
+++ b/src/rocksdb/db/snapshot_impl.h
@@ -0,0 +1,239 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/db.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SnapshotList;
+
+// Snapshots are kept in a doubly-linked list in the DB.
+// Each SnapshotImpl corresponds to a particular sequence number.
+class SnapshotImpl : public Snapshot {
+ public:
+ SequenceNumber number_; // const after creation
+ // It indicates the smallest uncommitted data at the time the snapshot was
+ // taken. This is currently used by WritePrepared transactions to limit the
+ // scope of queries to IsInSnapshot.
+ SequenceNumber min_uncommitted_ = kMinUnCommittedSeq;
+
+ SequenceNumber GetSequenceNumber() const override { return number_; }
+
+ int64_t GetUnixTime() const override { return unix_time_; }
+
+ uint64_t GetTimestamp() const override { return timestamp_; }
+
+ private:
+ friend class SnapshotList;
+
+ // SnapshotImpl is kept in a doubly-linked circular list
+ SnapshotImpl* prev_;
+ SnapshotImpl* next_;
+
+ SnapshotList* list_; // just for sanity checks
+
+ int64_t unix_time_;
+
+ uint64_t timestamp_;
+
+ // Will this snapshot be used by a Transaction to do write-conflict checking?
+ bool is_write_conflict_boundary_;
+};
+
+class SnapshotList {
+ public:
+ SnapshotList() {
+ list_.prev_ = &list_;
+ list_.next_ = &list_;
+ list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging
+ // Set all the variables to make UBSAN happy.
+ list_.list_ = nullptr;
+ list_.unix_time_ = 0;
+ list_.timestamp_ = 0;
+ list_.is_write_conflict_boundary_ = false;
+ count_ = 0;
+ }
+
+ // No copy-construct.
+ SnapshotList(const SnapshotList&) = delete;
+
+ bool empty() const {
+ assert(list_.next_ != &list_ || 0 == count_);
+ return list_.next_ == &list_;
+ }
+ SnapshotImpl* oldest() const {
+ assert(!empty());
+ return list_.next_;
+ }
+ SnapshotImpl* newest() const {
+ assert(!empty());
+ return list_.prev_;
+ }
+
+ SnapshotImpl* New(SnapshotImpl* s, SequenceNumber seq, uint64_t unix_time,
+ bool is_write_conflict_boundary,
+ uint64_t ts = std::numeric_limits<uint64_t>::max()) {
+ s->number_ = seq;
+ s->unix_time_ = unix_time;
+ s->timestamp_ = ts;
+ s->is_write_conflict_boundary_ = is_write_conflict_boundary;
+ s->list_ = this;
+ s->next_ = &list_;
+ s->prev_ = list_.prev_;
+ s->prev_->next_ = s;
+ s->next_->prev_ = s;
+ count_++;
+ return s;
+ }
+
+  // Unlinks the snapshot from the list; not responsible for freeing the object.
+ void Delete(const SnapshotImpl* s) {
+ assert(s->list_ == this);
+ s->prev_->next_ = s->next_;
+ s->next_->prev_ = s->prev_;
+ count_--;
+ }
+
+ // retrieve all snapshot numbers up until max_seq. They are sorted in
+ // ascending order (with no duplicates).
+ std::vector<SequenceNumber> GetAll(
+ SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+ const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+ std::vector<SequenceNumber> ret;
+ GetAll(&ret, oldest_write_conflict_snapshot, max_seq);
+ return ret;
+ }
+
+ void GetAll(std::vector<SequenceNumber>* snap_vector,
+ SequenceNumber* oldest_write_conflict_snapshot = nullptr,
+ const SequenceNumber& max_seq = kMaxSequenceNumber) const {
+ std::vector<SequenceNumber>& ret = *snap_vector;
+ // So far we have no use case that would pass a non-empty vector
+ assert(ret.size() == 0);
+
+ if (oldest_write_conflict_snapshot != nullptr) {
+ *oldest_write_conflict_snapshot = kMaxSequenceNumber;
+ }
+
+ if (empty()) {
+ return;
+ }
+ const SnapshotImpl* s = &list_;
+ while (s->next_ != &list_) {
+ if (s->next_->number_ > max_seq) {
+ break;
+ }
+ // Avoid duplicates
+ if (ret.empty() || ret.back() != s->next_->number_) {
+ ret.push_back(s->next_->number_);
+ }
+
+ if (oldest_write_conflict_snapshot != nullptr &&
+ *oldest_write_conflict_snapshot == kMaxSequenceNumber &&
+ s->next_->is_write_conflict_boundary_) {
+ // If this is the first write-conflict boundary snapshot in the list,
+ // it is the oldest
+ *oldest_write_conflict_snapshot = s->next_->number_;
+ }
+
+ s = s->next_;
+ }
+ return;
+ }
+
+ // get the sequence number of the most recent snapshot
+ SequenceNumber GetNewest() {
+ if (empty()) {
+ return 0;
+ }
+ return newest()->number_;
+ }
+
+ int64_t GetOldestSnapshotTime() const {
+ if (empty()) {
+ return 0;
+ } else {
+ return oldest()->unix_time_;
+ }
+ }
+
+ int64_t GetOldestSnapshotSequence() const {
+ if (empty()) {
+ return 0;
+ } else {
+ return oldest()->GetSequenceNumber();
+ }
+ }
+
+ uint64_t count() const { return count_; }
+
+ private:
+ // Dummy head of doubly-linked list of snapshots
+ SnapshotImpl list_;
+ uint64_t count_;
+};
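+
+// Illustrative sketch (hypothetical, simplified from how the DB uses the
+// list): SnapshotList only links and unlinks SnapshotImpl objects; the caller
+// owns them and must free them after Delete().
+//
+//   SnapshotList snapshots;
+//   SnapshotImpl* s = new SnapshotImpl;
+//   snapshots.New(s, /*seq=*/100, /*unix_time=*/0,
+//                 /*is_write_conflict_boundary=*/false);
+//   std::vector<SequenceNumber> all = snapshots.GetAll();  // {100}
+//   snapshots.Delete(s);
+//   delete s;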
+
+// All operations on TimestampedSnapshotList must be protected by db mutex.
+class TimestampedSnapshotList {
+ public:
+ explicit TimestampedSnapshotList() = default;
+
+ std::shared_ptr<const SnapshotImpl> GetSnapshot(uint64_t ts) const {
+ if (ts == std::numeric_limits<uint64_t>::max() && !snapshots_.empty()) {
+ auto it = snapshots_.rbegin();
+ assert(it != snapshots_.rend());
+ return it->second;
+ }
+ auto it = snapshots_.find(ts);
+ if (it == snapshots_.end()) {
+ return std::shared_ptr<const SnapshotImpl>();
+ }
+ return it->second;
+ }
+
+ void GetSnapshots(
+ uint64_t ts_lb, uint64_t ts_ub,
+ std::vector<std::shared_ptr<const Snapshot>>& snapshots) const {
+ assert(ts_lb < ts_ub);
+ auto it_low = snapshots_.lower_bound(ts_lb);
+ auto it_high = snapshots_.lower_bound(ts_ub);
+ for (auto it = it_low; it != it_high; ++it) {
+ snapshots.emplace_back(it->second);
+ }
+ }
+
+ void AddSnapshot(const std::shared_ptr<const SnapshotImpl>& snapshot) {
+ assert(snapshot);
+ snapshots_.try_emplace(snapshot->GetTimestamp(), snapshot);
+ }
+
+  // snapshots_to_release: the container into which the timestamped snapshots
+  // will be moved, so that it retains the last reference to them and they are
+  // not actually released here (actual release requires the db mutex). The
+  // snapshots will be released by the caller of ReleaseSnapshotsOlderThan().
+ void ReleaseSnapshotsOlderThan(
+ uint64_t ts,
+ autovector<std::shared_ptr<const SnapshotImpl>>& snapshots_to_release) {
+ auto ub = snapshots_.lower_bound(ts);
+ for (auto it = snapshots_.begin(); it != ub; ++it) {
+ snapshots_to_release.emplace_back(it->second);
+ }
+ snapshots_.erase(snapshots_.begin(), ub);
+ }
+
+ private:
+ std::map<uint64_t, std::shared_ptr<const SnapshotImpl>> snapshots_;
+};
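+
+// Illustrative sketch (hypothetical caller code; per the class comment, the
+// db mutex protects this call): release all timestamped snapshots with
+// timestamp < 200. The shared_ptrs are handed back in `to_release` and the
+// caller is responsible for actually releasing them afterwards.
+//
+//   autovector<std::shared_ptr<const SnapshotImpl>> to_release;
+//   timestamped_snapshots.ReleaseSnapshotsOlderThan(/*ts=*/200, to_release);
+//   // `timestamped_snapshots` and `to_release` are hypothetical names.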
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.cc b/src/rocksdb/db/table_cache.cc
new file mode 100644
index 000000000..c44c4bb84
--- /dev/null
+++ b/src/rocksdb/db/table_cache.cc
@@ -0,0 +1,753 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/table_cache.h"
+
+#include "db/dbformat.h"
+#include "db/range_tombstone_fragmenter.h"
+#include "db/snapshot_impl.h"
+#include "db/version_edit.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "monitoring/perf_context_imp.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/statistics.h"
+#include "table/block_based/block_based_table_reader.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/iterator_wrapper.h"
+#include "table/multiget_context.h"
+#include "table/table_builder.h"
+#include "table/table_reader.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/stop_watch.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+template <class T>
+static void DeleteEntry(const Slice& /*key*/, void* value) {
+ T* typed_value = reinterpret_cast<T*>(value);
+ delete typed_value;
+}
+} // anonymous namespace
+} // namespace ROCKSDB_NAMESPACE
+
+// Generate the regular and coroutine versions of some methods by
+// including table_cache_sync_and_async.h twice
+// Macros in the header will expand differently based on whether
+// WITH_COROUTINES or WITHOUT_COROUTINES is defined
+// clang-format off
+#define WITHOUT_COROUTINES
+#include "db/table_cache_sync_and_async.h"
+#undef WITHOUT_COROUTINES
+#define WITH_COROUTINES
+#include "db/table_cache_sync_and_async.h"
+#undef WITH_COROUTINES
+// clang-format on
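+// Net effect of the two includes above (an illustrative simplification; the
+// actual macro definitions live in util/coro_utils.h): the first pass, with
+// WITHOUT_COROUTINES defined, turns DEFINE_SYNC_AND_ASYNC(Status,
+// TableCache::MultiGet) into an ordinary synchronous definition returning
+// Status, with CO_AWAIT/CO_RETURN behaving like a plain call and return. The
+// second pass, with WITH_COROUTINES defined, emits a coroutine variant of the
+// same body that returns an awaitable and uses co_await/co_return at those
+// markers.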
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+static void UnrefEntry(void* arg1, void* arg2) {
+ Cache* cache = reinterpret_cast<Cache*>(arg1);
+ Cache::Handle* h = reinterpret_cast<Cache::Handle*>(arg2);
+ cache->Release(h);
+}
+
+static Slice GetSliceForFileNumber(const uint64_t* file_number) {
+ return Slice(reinterpret_cast<const char*>(file_number),
+ sizeof(*file_number));
+}
+
+#ifndef ROCKSDB_LITE
+
+void AppendVarint64(IterKey* key, uint64_t v) {
+ char buf[10];
+ auto ptr = EncodeVarint64(buf, v);
+ key->TrimAppend(key->Size(), buf, ptr - buf);
+}
+
+#endif // ROCKSDB_LITE
+
+} // anonymous namespace
+
+const int kLoadConcurency = 128;
+
+TableCache::TableCache(const ImmutableOptions& ioptions,
+ const FileOptions* file_options, Cache* const cache,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_session_id)
+ : ioptions_(ioptions),
+ file_options_(*file_options),
+ cache_(cache),
+ immortal_tables_(false),
+ block_cache_tracer_(block_cache_tracer),
+ loader_mutex_(kLoadConcurency, kGetSliceNPHash64UnseededFnPtr),
+ io_tracer_(io_tracer),
+ db_session_id_(db_session_id) {
+ if (ioptions_.row_cache) {
+ // If the same cache is shared by multiple instances, we need to
+ // disambiguate its entries.
+ PutVarint64(&row_cache_id_, ioptions_.row_cache->NewId());
+ }
+}
+
+TableCache::~TableCache() {}
+
+TableReader* TableCache::GetTableReaderFromHandle(Cache::Handle* handle) {
+ return reinterpret_cast<TableReader*>(cache_->Value(handle));
+}
+
+void TableCache::ReleaseHandle(Cache::Handle* handle) {
+ cache_->Release(handle);
+}
+
+Status TableCache::GetTableReader(
+ const ReadOptions& ro, const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, bool sequential_mode, bool record_read_stats,
+ HistogramImpl* file_read_hist, std::unique_ptr<TableReader>* table_reader,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
+ size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
+ std::string fname = TableFileName(
+ ioptions_.cf_paths, file_meta.fd.GetNumber(), file_meta.fd.GetPathId());
+ std::unique_ptr<FSRandomAccessFile> file;
+ FileOptions fopts = file_options;
+ fopts.temperature = file_temperature;
+ Status s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
+ TEST_SYNC_POINT_CALLBACK("TableCache::GetTableReader:BeforeOpenFile",
+ const_cast<Status*>(&s));
+ if (s.ok()) {
+ s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
+ }
+ if (s.ok()) {
+ RecordTick(ioptions_.stats, NO_FILE_OPENS);
+ } else if (s.IsPathNotFound()) {
+ fname = Rocks2LevelTableFileName(fname);
+ s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
+ if (s.ok()) {
+ s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
+ nullptr);
+ }
+ if (s.ok()) {
+ RecordTick(ioptions_.stats, NO_FILE_OPENS);
+ }
+ }
+
+ if (s.ok()) {
+ if (!sequential_mode && ioptions_.advise_random_on_open) {
+ file->Hint(FSRandomAccessFile::kRandom);
+ }
+ StopWatch sw(ioptions_.clock, ioptions_.stats, TABLE_OPEN_IO_MICROS);
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(
+ std::move(file), fname, ioptions_.clock, io_tracer_,
+ record_read_stats ? ioptions_.stats : nullptr, SST_READ_MICROS,
+ file_read_hist, ioptions_.rate_limiter.get(), ioptions_.listeners,
+ file_temperature, level == ioptions_.num_levels - 1));
+ UniqueId64x2 expected_unique_id;
+ if (ioptions_.verify_sst_unique_id_in_manifest) {
+ expected_unique_id = file_meta.unique_id;
+ } else {
+ expected_unique_id = kNullUniqueId64x2; // null ID == no verification
+ }
+ s = ioptions_.table_factory->NewTableReader(
+ ro,
+ TableReaderOptions(ioptions_, prefix_extractor, file_options,
+ internal_comparator, skip_filters, immortal_tables_,
+ false /* force_direct_prefetch */, level,
+ block_cache_tracer_, max_file_size_for_l0_meta_pin,
+ db_session_id_, file_meta.fd.GetNumber(),
+ expected_unique_id, file_meta.fd.largest_seqno),
+ std::move(file_reader), file_meta.fd.GetFileSize(), table_reader,
+ prefetch_index_and_filter_in_cache);
+ TEST_SYNC_POINT("TableCache::GetTableReader:0");
+ }
+ return s;
+}
+
+void TableCache::EraseHandle(const FileDescriptor& fd, Cache::Handle* handle) {
+ ReleaseHandle(handle);
+ uint64_t number = fd.GetNumber();
+ Slice key = GetSliceForFileNumber(&number);
+ cache_->Erase(key);
+}
+
+Status TableCache::FindTable(
+ const ReadOptions& ro, const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, Cache::Handle** handle,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ const bool no_io, bool record_read_stats, HistogramImpl* file_read_hist,
+ bool skip_filters, int level, bool prefetch_index_and_filter_in_cache,
+ size_t max_file_size_for_l0_meta_pin, Temperature file_temperature) {
+ PERF_TIMER_GUARD_WITH_CLOCK(find_table_nanos, ioptions_.clock);
+ uint64_t number = file_meta.fd.GetNumber();
+ Slice key = GetSliceForFileNumber(&number);
+ *handle = cache_->Lookup(key);
+ TEST_SYNC_POINT_CALLBACK("TableCache::FindTable:0",
+ const_cast<bool*>(&no_io));
+
+ if (*handle == nullptr) {
+ if (no_io) {
+ return Status::Incomplete("Table not found in table_cache, no_io is set");
+ }
+ MutexLock load_lock(loader_mutex_.get(key));
+ // We check the cache again under loading mutex
+ *handle = cache_->Lookup(key);
+ if (*handle != nullptr) {
+ return Status::OK();
+ }
+
+ std::unique_ptr<TableReader> table_reader;
+ Status s =
+ GetTableReader(ro, file_options, internal_comparator, file_meta,
+ false /* sequential mode */, record_read_stats,
+ file_read_hist, &table_reader, prefix_extractor,
+ skip_filters, level, prefetch_index_and_filter_in_cache,
+ max_file_size_for_l0_meta_pin, file_temperature);
+ if (!s.ok()) {
+ assert(table_reader == nullptr);
+ RecordTick(ioptions_.stats, NO_FILE_ERRORS);
+ // We do not cache error results so that if the error is transient,
+ // or somebody repairs the file, we recover automatically.
+ } else {
+ s = cache_->Insert(key, table_reader.get(), 1, &DeleteEntry<TableReader>,
+ handle);
+ if (s.ok()) {
+ // Release ownership of table reader.
+ table_reader.release();
+ }
+ }
+ return s;
+ }
+ return Status::OK();
+}
+
+InternalIterator* TableCache::NewIterator(
+ const ReadOptions& options, const FileOptions& file_options,
+ const InternalKeyComparator& icomparator, const FileMetaData& file_meta,
+ RangeDelAggregator* range_del_agg,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+ TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+ size_t max_file_size_for_l0_meta_pin,
+ const InternalKey* smallest_compaction_key,
+ const InternalKey* largest_compaction_key, bool allow_unprepared_value,
+ TruncatedRangeDelIterator** range_del_iter) {
+ PERF_TIMER_GUARD(new_table_iterator_nanos);
+
+ Status s;
+ TableReader* table_reader = nullptr;
+ Cache::Handle* handle = nullptr;
+ if (table_reader_ptr != nullptr) {
+ *table_reader_ptr = nullptr;
+ }
+ bool for_compaction = caller == TableReaderCaller::kCompaction;
+ auto& fd = file_meta.fd;
+ table_reader = fd.table_reader;
+ if (table_reader == nullptr) {
+ s = FindTable(
+ options, file_options, icomparator, file_meta, &handle,
+ prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
+ !for_compaction /* record_read_stats */, file_read_hist, skip_filters,
+ level, true /* prefetch_index_and_filter_in_cache */,
+ max_file_size_for_l0_meta_pin, file_meta.temperature);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(handle);
+ }
+ }
+ InternalIterator* result = nullptr;
+ if (s.ok()) {
+ if (options.table_filter &&
+ !options.table_filter(*table_reader->GetTableProperties())) {
+ result = NewEmptyInternalIterator<Slice>(arena);
+ } else {
+ result = table_reader->NewIterator(
+ options, prefix_extractor.get(), arena, skip_filters, caller,
+ file_options.compaction_readahead_size, allow_unprepared_value);
+ }
+ if (handle != nullptr) {
+ result->RegisterCleanup(&UnrefEntry, cache_, handle);
+ handle = nullptr; // prevent from releasing below
+ }
+
+ if (for_compaction) {
+ table_reader->SetupForCompaction();
+ }
+ if (table_reader_ptr != nullptr) {
+ *table_reader_ptr = table_reader;
+ }
+ }
+ if (s.ok() && !options.ignore_range_deletions) {
+ if (range_del_iter != nullptr) {
+ auto new_range_del_iter =
+ table_reader->NewRangeTombstoneIterator(options);
+ if (new_range_del_iter == nullptr || new_range_del_iter->empty()) {
+ delete new_range_del_iter;
+ *range_del_iter = nullptr;
+ } else {
+ *range_del_iter = new TruncatedRangeDelIterator(
+ std::unique_ptr<FragmentedRangeTombstoneIterator>(
+ new_range_del_iter),
+ &icomparator, &file_meta.smallest, &file_meta.largest);
+ }
+ }
+ if (range_del_agg != nullptr) {
+ if (range_del_agg->AddFile(fd.GetNumber())) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> new_range_del_iter(
+ static_cast<FragmentedRangeTombstoneIterator*>(
+ table_reader->NewRangeTombstoneIterator(options)));
+ if (new_range_del_iter != nullptr) {
+ s = new_range_del_iter->status();
+ }
+ if (s.ok()) {
+ const InternalKey* smallest = &file_meta.smallest;
+ const InternalKey* largest = &file_meta.largest;
+ if (smallest_compaction_key != nullptr) {
+ smallest = smallest_compaction_key;
+ }
+ if (largest_compaction_key != nullptr) {
+ largest = largest_compaction_key;
+ }
+ range_del_agg->AddTombstones(std::move(new_range_del_iter), smallest,
+ largest);
+ }
+ }
+ }
+ }
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ if (!s.ok()) {
+ assert(result == nullptr);
+ result = NewErrorInternalIterator<Slice>(s, arena);
+ }
+ return result;
+}
+
+Status TableCache::GetRangeTombstoneIterator(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter) {
+ assert(out_iter);
+ const FileDescriptor& fd = file_meta.fd;
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ if (t == nullptr) {
+ s = FindTable(options, file_options_, internal_comparator, file_meta,
+ &handle);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ }
+ if (s.ok()) {
+ // Note: NewRangeTombstoneIterator could return nullptr
+ out_iter->reset(t->NewRangeTombstoneIterator(options));
+ }
+ if (handle) {
+ if (*out_iter) {
+ (*out_iter)->RegisterCleanup(&UnrefEntry, cache_, handle);
+ } else {
+ ReleaseHandle(handle);
+ }
+ }
+ return s;
+}
+
+#ifndef ROCKSDB_LITE
+void TableCache::CreateRowCacheKeyPrefix(const ReadOptions& options,
+ const FileDescriptor& fd,
+ const Slice& internal_key,
+ GetContext* get_context,
+ IterKey& row_cache_key) {
+ uint64_t fd_number = fd.GetNumber();
+ // We use the user key as cache key instead of the internal key,
+ // otherwise the whole cache would be invalidated every time the
+  // sequence number increases. However, to support caching snapshot
+ // reads, we append the sequence number (incremented by 1 to
+ // distinguish from 0) only in this case.
+ // If the snapshot is larger than the largest seqno in the file,
+ // all data should be exposed to the snapshot, so we treat it
+ // the same as there is no snapshot. The exception is that if
+ // a seq-checking callback is registered, some internal keys
+ // may still be filtered out.
+ uint64_t seq_no = 0;
+  // Maybe we can include the whole file if snapshot == fd.largest_seqno.
+ if (options.snapshot != nullptr &&
+ (get_context->has_callback() ||
+ static_cast_with_check<const SnapshotImpl>(options.snapshot)
+ ->GetSequenceNumber() <= fd.largest_seqno)) {
+    // We should consider using options.snapshot->GetSequenceNumber()
+    // instead of GetInternalKeySeqno(k), which would make the code
+    // easier to understand.
+ seq_no = 1 + GetInternalKeySeqno(internal_key);
+ }
+
+ // Compute row cache key.
+ row_cache_key.TrimAppend(row_cache_key.Size(), row_cache_id_.data(),
+ row_cache_id_.size());
+ AppendVarint64(&row_cache_key, fd_number);
+ AppendVarint64(&row_cache_key, seq_no);
+}
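+
+// Illustrative example of the resulting prefix (hypothetical values): with
+// row_cache_id_ = "\x01", fd_number = 7 and seq_no = 0, the prefix is
+// "\x01" + varint64(7) + varint64(0) = "\x01\x07\x00". GetFromRowCache()
+// later appends the user key at this offset to form the full row cache key.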
+
+bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+ size_t prefix_size, GetContext* get_context) {
+ bool found = false;
+
+ row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size());
+ if (auto row_handle =
+ ioptions_.row_cache->Lookup(row_cache_key.GetUserKey())) {
+ // Cleanable routine to release the cache entry
+ Cleanable value_pinner;
+ auto release_cache_entry_func = [](void* cache_to_clean,
+ void* cache_handle) {
+ ((Cache*)cache_to_clean)->Release((Cache::Handle*)cache_handle);
+ };
+ auto found_row_cache_entry =
+ static_cast<const std::string*>(ioptions_.row_cache->Value(row_handle));
+    // If we get here, the value is located in the cache.
+    // found_row_cache_entry points to the value in the cache,
+    // and value_pinner has the cleanup procedure for the cached entry.
+ // After replayGetContextLog() returns, get_context.pinnable_slice_
+ // will point to cache entry buffer (or a copy based on that) and
+ // cleanup routine under value_pinner will be delegated to
+ // get_context.pinnable_slice_. Cache entry is released when
+ // get_context.pinnable_slice_ is reset.
+ value_pinner.RegisterCleanup(release_cache_entry_func,
+ ioptions_.row_cache.get(), row_handle);
+ replayGetContextLog(*found_row_cache_entry, user_key, get_context,
+ &value_pinner);
+ RecordTick(ioptions_.stats, ROW_CACHE_HIT);
+ found = true;
+ } else {
+ RecordTick(ioptions_.stats, ROW_CACHE_MISS);
+ }
+ return found;
+}
+#endif // ROCKSDB_LITE
+
+Status TableCache::Get(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ HistogramImpl* file_read_hist, bool skip_filters, int level,
+ size_t max_file_size_for_l0_meta_pin) {
+ auto& fd = file_meta.fd;
+ std::string* row_cache_entry = nullptr;
+ bool done = false;
+#ifndef ROCKSDB_LITE
+ IterKey row_cache_key;
+ std::string row_cache_entry_buffer;
+
+ // Check row cache if enabled. Since row cache does not currently store
+ // sequence numbers, we cannot use it if we need to fetch the sequence.
+ if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
+ auto user_key = ExtractUserKey(k);
+ CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key);
+ done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(),
+ get_context);
+ if (!done) {
+ row_cache_entry = &row_cache_entry_buffer;
+ }
+ }
+#endif // ROCKSDB_LITE
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ if (!done) {
+ assert(s.ok());
+ if (t == nullptr) {
+ s = FindTable(options, file_options_, internal_comparator, file_meta,
+ &handle, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ true /* record_read_stats */, file_read_hist, skip_filters,
+ level, true /* prefetch_index_and_filter_in_cache */,
+ max_file_size_for_l0_meta_pin, file_meta.temperature);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ }
+ SequenceNumber* max_covering_tombstone_seq =
+ get_context->max_covering_tombstone_seq();
+ if (s.ok() && max_covering_tombstone_seq != nullptr &&
+ !options.ignore_range_deletions) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ t->NewRangeTombstoneIterator(options));
+ if (range_del_iter != nullptr) {
+ SequenceNumber seq =
+ range_del_iter->MaxCoveringTombstoneSeqnum(ExtractUserKey(k));
+ if (seq > *max_covering_tombstone_seq) {
+ *max_covering_tombstone_seq = seq;
+ if (get_context->NeedTimestamp()) {
+ get_context->SetTimestampFromRangeTombstone(
+ range_del_iter->timestamp());
+ }
+ }
+ }
+ }
+ if (s.ok()) {
+ get_context->SetReplayLog(row_cache_entry); // nullptr if no cache.
+ s = t->Get(options, k, get_context, prefix_extractor.get(), skip_filters);
+ get_context->SetReplayLog(nullptr);
+ } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+ // Couldn't find Table in cache but treat as kFound if no_io set
+ get_context->MarkKeyMayExist();
+ s = Status::OK();
+ done = true;
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ // Put the replay log in row cache only if something was found.
+ if (!done && s.ok() && row_cache_entry && !row_cache_entry->empty()) {
+ size_t charge = row_cache_entry->capacity() + sizeof(std::string);
+ void* row_ptr = new std::string(std::move(*row_cache_entry));
+ // If row cache is full, it's OK to continue.
+ ioptions_.row_cache
+ ->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+ &DeleteEntry<std::string>)
+ .PermitUncheckedError();
+ }
+#endif // ROCKSDB_LITE
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ return s;
+}
+
+void TableCache::UpdateRangeTombstoneSeqnums(
+ const ReadOptions& options, TableReader* t,
+ MultiGetContext::Range& table_range) {
+ std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter(
+ t->NewRangeTombstoneIterator(options));
+ if (range_del_iter != nullptr) {
+ for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+ SequenceNumber* max_covering_tombstone_seq =
+ iter->get_context->max_covering_tombstone_seq();
+ SequenceNumber seq =
+ range_del_iter->MaxCoveringTombstoneSeqnum(iter->ukey_with_ts);
+ if (seq > *max_covering_tombstone_seq) {
+ *max_covering_tombstone_seq = seq;
+ if (iter->get_context->NeedTimestamp()) {
+ iter->get_context->SetTimestampFromRangeTombstone(
+ range_del_iter->timestamp());
+ }
+ }
+ }
+ }
+}
+
+Status TableCache::MultiGetFilter(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ HistogramImpl* file_read_hist, int level,
+ MultiGetContext::Range* mget_range, Cache::Handle** table_handle) {
+ auto& fd = file_meta.fd;
+#ifndef ROCKSDB_LITE
+ IterKey row_cache_key;
+ std::string row_cache_entry_buffer;
+
+ // Check if we need to use the row cache. If yes, then we cannot do the
+ // filtering here, since the filtering needs to happen after the row cache
+ // lookup.
+ KeyContext& first_key = *mget_range->begin();
+ if (ioptions_.row_cache && !first_key.get_context->NeedToReadSequence()) {
+ return Status::NotSupported();
+ }
+#endif // ROCKSDB_LITE
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ MultiGetContext::Range tombstone_range(*mget_range, mget_range->begin(),
+ mget_range->end());
+ if (t == nullptr) {
+ s = FindTable(
+ options, file_options_, internal_comparator, file_meta, &handle,
+ prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */,
+ true /* record_read_stats */, file_read_hist, /*skip_filters=*/false,
+ level, true /* prefetch_index_and_filter_in_cache */,
+ /*max_file_size_for_l0_meta_pin=*/0, file_meta.temperature);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ *table_handle = handle;
+ }
+ if (s.ok()) {
+ s = t->MultiGetFilter(options, prefix_extractor.get(), mget_range);
+ }
+ if (s.ok() && !options.ignore_range_deletions) {
+ // Update the range tombstone sequence numbers for the keys here
+ // as TableCache::MultiGet may or may not be called, and even if it
+    // is, it may be called with fewer keys in the range due to filtering.
+ UpdateRangeTombstoneSeqnums(options, t, tombstone_range);
+ }
+ if (mget_range->empty() && handle) {
+ ReleaseHandle(handle);
+ *table_handle = nullptr;
+ }
+
+ return s;
+}
+
+Status TableCache::GetTableProperties(
+ const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::shared_ptr<const TableProperties>* properties,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor, bool no_io) {
+ auto table_reader = file_meta.fd.table_reader;
+  // Has the table already been pre-loaded?
+ if (table_reader) {
+ *properties = table_reader->GetTableProperties();
+
+ return Status::OK();
+ }
+
+ Cache::Handle* table_handle = nullptr;
+ Status s = FindTable(ReadOptions(), file_options, internal_comparator,
+ file_meta, &table_handle, prefix_extractor, no_io);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(table_handle);
+ auto table = GetTableReaderFromHandle(table_handle);
+ *properties = table->GetTableProperties();
+ ReleaseHandle(table_handle);
+ return s;
+}
+
+Status TableCache::ApproximateKeyAnchors(
+ const ReadOptions& ro, const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, std::vector<TableReader::Anchor>& anchors) {
+ Status s;
+ TableReader* t = file_meta.fd.table_reader;
+ Cache::Handle* handle = nullptr;
+ if (t == nullptr) {
+ s = FindTable(ro, file_options_, internal_comparator, file_meta, &handle);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ }
+ }
+ if (s.ok() && t != nullptr) {
+ s = t->ApproximateKeyAnchors(ro, anchors);
+ }
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ return s;
+}
+
+size_t TableCache::GetMemoryUsageByTableReader(
+ const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+ auto table_reader = file_meta.fd.table_reader;
+  // Has the table already been pre-loaded?
+ if (table_reader) {
+ return table_reader->ApproximateMemoryUsage();
+ }
+
+ Cache::Handle* table_handle = nullptr;
+ Status s = FindTable(ReadOptions(), file_options, internal_comparator,
+ file_meta, &table_handle, prefix_extractor, true);
+ if (!s.ok()) {
+ return 0;
+ }
+ assert(table_handle);
+ auto table = GetTableReaderFromHandle(table_handle);
+ auto ret = table->ApproximateMemoryUsage();
+ ReleaseHandle(table_handle);
+ return ret;
+}
+
+bool TableCache::HasEntry(Cache* cache, uint64_t file_number) {
+ Cache::Handle* handle = cache->Lookup(GetSliceForFileNumber(&file_number));
+ if (handle) {
+ cache->Release(handle);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void TableCache::Evict(Cache* cache, uint64_t file_number) {
+ cache->Erase(GetSliceForFileNumber(&file_number));
+}
+
+uint64_t TableCache::ApproximateOffsetOf(
+ const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+ uint64_t result = 0;
+ TableReader* table_reader = file_meta.fd.table_reader;
+ Cache::Handle* table_handle = nullptr;
+ if (table_reader == nullptr) {
+ const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+ Status s =
+ FindTable(ReadOptions(), file_options_, internal_comparator, file_meta,
+ &table_handle, prefix_extractor, false /* no_io */,
+ !for_compaction /* record_read_stats */);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(table_handle);
+ }
+ }
+
+ if (table_reader != nullptr) {
+ result = table_reader->ApproximateOffsetOf(key, caller);
+ }
+ if (table_handle != nullptr) {
+ ReleaseHandle(table_handle);
+ }
+
+ return result;
+}
+
+uint64_t TableCache::ApproximateSize(
+ const Slice& start, const Slice& end, const FileMetaData& file_meta,
+ TableReaderCaller caller, const InternalKeyComparator& internal_comparator,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor) {
+ uint64_t result = 0;
+ TableReader* table_reader = file_meta.fd.table_reader;
+ Cache::Handle* table_handle = nullptr;
+ if (table_reader == nullptr) {
+ const bool for_compaction = (caller == TableReaderCaller::kCompaction);
+ Status s =
+ FindTable(ReadOptions(), file_options_, internal_comparator, file_meta,
+ &table_handle, prefix_extractor, false /* no_io */,
+ !for_compaction /* record_read_stats */);
+ if (s.ok()) {
+ table_reader = GetTableReaderFromHandle(table_handle);
+ }
+ }
+
+ if (table_reader != nullptr) {
+ result = table_reader->ApproximateSize(start, end, caller);
+ }
+ if (table_handle != nullptr) {
+ ReleaseHandle(table_handle);
+ }
+
+ return result;
+}
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache.h b/src/rocksdb/db/table_cache.h
new file mode 100644
index 000000000..2e50f2c77
--- /dev/null
+++ b/src/rocksdb/db/table_cache.h
@@ -0,0 +1,275 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// Thread-safe (provides internal synchronization)
+
+#pragma once
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/range_del_aggregator.h"
+#include "options/cf_options.h"
+#include "port/port.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/table.h"
+#include "table/table_reader.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/coro_utils.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Env;
+class Arena;
+struct FileDescriptor;
+class GetContext;
+class HistogramImpl;
+
+// Manages caching for TableReader objects for a column family. The actual
+// cache is allocated separately and passed to the constructor. TableCache
+// wraps around the underlying SST file readers by providing Get(),
+// MultiGet() and NewIterator() methods that hide the instantiation,
+// caching and access to the TableReader. The main purpose of this is
+// performance - by caching the TableReader, it avoids unnecessary file opens
+// and object allocation and instantiation. One exception is compaction, where
+// a new TableReader may be instantiated - see the NewIterator() comments.
+//
+// Another service provided by TableCache is managing the row cache - if the
+// DB is configured with a row cache, and the lookup key is present in the row
+// cache, lookup is very fast. The row cache is obtained from
+// ioptions.row_cache.
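+//
+// Illustrative sketch of the low-level lookup flow (hypothetical caller code;
+// the Get()/MultiGet()/NewIterator() wrappers below do this internally):
+//
+//   Cache::Handle* handle = nullptr;
+//   Status s = table_cache->FindTable(ReadOptions(), file_options,
+//                                     internal_comparator, file_meta, &handle);
+//   if (s.ok()) {
+//     TableReader* reader = table_cache->GetTableReaderFromHandle(handle);
+//     // ... read from the table via `reader` ...
+//     table_cache->ReleaseHandle(handle);
+//   }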
+class TableCache {
+ public:
+ TableCache(const ImmutableOptions& ioptions,
+ const FileOptions* storage_options, Cache* cache,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_session_id);
+ ~TableCache();
+
+ // Return an iterator for the specified file number (the corresponding
+ // file length must be exactly "file_size" bytes). If "table_reader_ptr"
+ // is non-nullptr, also sets "*table_reader_ptr" to point to the Table object
+ // underlying the returned iterator, or nullptr if no Table object underlies
+ // the returned iterator. The returned "*table_reader_ptr" object is owned
+ // by the cache and should not be deleted, and is valid for as long as the
+ // returned iterator is live.
+ // If !options.ignore_range_deletions, and range_del_iter is non-nullptr,
+ // then range_del_iter is set to a TruncatedRangeDelIterator for range
+ // tombstones in the SST file corresponding to the specified file number. The
+ // upper/lower bounds for the TruncatedRangeDelIterator are set to the SST
+ // file's boundary.
+ // @param options Must outlive the returned iterator.
+ // @param range_del_agg If non-nullptr, adds range deletions to the
+ // aggregator. If an error occurs, returns it in a NewErrorInternalIterator
+ // @param for_compaction If true, a new TableReader may be allocated (but
+ // not cached), depending on the CF options
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ InternalIterator* NewIterator(
+ const ReadOptions& options, const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, RangeDelAggregator* range_del_agg,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ TableReader** table_reader_ptr, HistogramImpl* file_read_hist,
+ TableReaderCaller caller, Arena* arena, bool skip_filters, int level,
+ size_t max_file_size_for_l0_meta_pin,
+ const InternalKey* smallest_compaction_key,
+ const InternalKey* largest_compaction_key, bool allow_unprepared_value,
+ TruncatedRangeDelIterator** range_del_iter = nullptr);
+
+ // If a seek to internal key "k" in specified file finds an entry,
+ // call get_context->SaveValue() repeatedly until
+ // it returns false. As a side effect, it will insert the TableReader
+ // into the cache and potentially evict another entry
+ // @param get_context Context for get operation. The result of the lookup
+ // can be retrieved by calling get_context->State()
+ // @param file_read_hist If non-nullptr, the file reader statistics are
+ // recorded
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ Status Get(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const Slice& k, GetContext* get_context,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+ int level = -1, size_t max_file_size_for_l0_meta_pin = 0);
+
+ // Return the range delete tombstone iterator of the file specified by
+ // `file_meta`.
+ Status GetRangeTombstoneIterator(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::unique_ptr<FragmentedRangeTombstoneIterator>* out_iter);
+
+ // Call table reader's MultiGetFilter to use the bloom filter to filter out
+ // keys. Returns Status::NotSupported() if row cache needs to be checked.
+ // If the table cache is looked up to get the table reader, the cache handle
+ // is returned in table_handle. This handle should be passed back to
+ // MultiGet() so it can be released.
+ Status MultiGetFilter(
+ const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ HistogramImpl* file_read_hist, int level,
+ MultiGetContext::Range* mget_range, Cache::Handle** table_handle);
+
+ // If a seek to internal key "k" in specified file finds an entry,
+ // call get_context->SaveValue() repeatedly until
+ // it returns false. As a side effect, it will insert the TableReader
+ // into the cache and potentially evict another entry
+ // @param mget_range Pointer to the structure describing a batch of keys to
+ // be looked up in this table file. The result is stored
+ // in the embedded GetContext
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level The level this table is at, -1 for "not set / don't know"
+ DECLARE_SYNC_AND_ASYNC(
+ Status, MultiGet, const ReadOptions& options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+ bool skip_range_deletions = false, int level = -1,
+ Cache::Handle* table_handle = nullptr);
+
+ // Evict any entry for the specified file number
+ static void Evict(Cache* cache, uint64_t file_number);
+
+ // Query whether specified file number is currently in cache
+ static bool HasEntry(Cache* cache, uint64_t file_number);
+
+  // Release the table handle and erase its entry from the table cache.
+  // Used on DB close, or when the file is no longer live.
+ void EraseHandle(const FileDescriptor& fd, Cache::Handle* handle);
+
+ // Find table reader
+ // @param skip_filters Disables loading/accessing the filter block
+ // @param level == -1 means not specified
+ Status FindTable(
+ const ReadOptions& ro, const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, Cache::Handle**,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ const bool no_io = false, bool record_read_stats = true,
+ HistogramImpl* file_read_hist = nullptr, bool skip_filters = false,
+ int level = -1, bool prefetch_index_and_filter_in_cache = true,
+ size_t max_file_size_for_l0_meta_pin = 0,
+ Temperature file_temperature = Temperature::kUnknown);
+
+ // Get TableReader from a cache handle.
+ TableReader* GetTableReaderFromHandle(Cache::Handle* handle);
+
+ // Get the table properties of a given table.
+  // @no_io: if true, the table will not be loaded into the cache if it is not
+  // already present there.
+  // @returns: `properties` will be reset on success. Please note that we will
+  // return Status::Incomplete() if the table is not present in the cache and
+  // `no_io` is set to true.
+ Status GetTableProperties(
+ const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::shared_ptr<const TableProperties>* properties,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ bool no_io = false);
+
+ Status ApproximateKeyAnchors(const ReadOptions& ro,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ std::vector<TableReader::Anchor>& anchors);
+
+ // Return total memory usage of the table reader of the file.
+ // 0 if table reader of the file is not loaded.
+ size_t GetMemoryUsageByTableReader(
+ const FileOptions& toptions,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+ // Returns approximated offset of a key in a file represented by fd.
+ uint64_t ApproximateOffsetOf(
+ const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+ // Returns approximated data size between start and end keys in a file
+ // represented by fd (the start key must not be greater than the end key).
+ uint64_t ApproximateSize(
+ const Slice& start, const Slice& end, const FileMetaData& file_meta,
+ TableReaderCaller caller,
+ const InternalKeyComparator& internal_comparator,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr);
+
+ // Release the handle from a cache
+ void ReleaseHandle(Cache::Handle* handle);
+
+ Cache* get_cache() const { return cache_; }
+
+ // Capacity of the backing Cache that indicates infinite TableCache capacity.
+ // For example when max_open_files is -1 we set the backing Cache to this.
+ static const int kInfiniteCapacity = 0x400000;
+
+ // The tables opened with this TableCache will be immortal, i.e., their
+ // lifetime is as long as that of the DB.
+ void SetTablesAreImmortal() {
+ if (cache_->GetCapacity() >= kInfiniteCapacity) {
+ immortal_tables_ = true;
+ }
+ }
+
+ private:
+ // Build a table reader
+ Status GetTableReader(
+ const ReadOptions& ro, const FileOptions& file_options,
+ const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, bool sequential_mode,
+ bool record_read_stats, HistogramImpl* file_read_hist,
+ std::unique_ptr<TableReader>* table_reader,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor = nullptr,
+ bool skip_filters = false, int level = -1,
+ bool prefetch_index_and_filter_in_cache = true,
+ size_t max_file_size_for_l0_meta_pin = 0,
+ Temperature file_temperature = Temperature::kUnknown);
+
+ // Update the max_covering_tombstone_seq in the GetContext for each key based
+ // on the range deletions in the table
+ void UpdateRangeTombstoneSeqnums(const ReadOptions& options, TableReader* t,
+ MultiGetContext::Range& table_range);
+
+ // Create a key prefix for looking up the row cache. The prefix is of the
+ // format row_cache_id + fd_number + seq_no. Later, the user key can be
+ // appended to form the full key
+ void CreateRowCacheKeyPrefix(const ReadOptions& options,
+ const FileDescriptor& fd,
+ const Slice& internal_key,
+ GetContext* get_context, IterKey& row_cache_key);
+
+ // Helper function to lookup the row cache for a key. It appends the
+ // user key to row_cache_key at offset prefix_size
+ bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
+ size_t prefix_size, GetContext* get_context);
+
+ const ImmutableOptions& ioptions_;
+ const FileOptions& file_options_;
+ Cache* const cache_;
+ std::string row_cache_id_;
+ bool immortal_tables_;
+ BlockCacheTracer* const block_cache_tracer_;
+ Striped<port::Mutex, Slice> loader_mutex_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ std::string db_session_id_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_cache_sync_and_async.h b/src/rocksdb/db/table_cache_sync_and_async.h
new file mode 100644
index 000000000..e72abdd45
--- /dev/null
+++ b/src/rocksdb/db/table_cache_sync_and_async.h
@@ -0,0 +1,135 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "util/coro_utils.h"
+
+#if defined(WITHOUT_COROUTINES) || \
+ (defined(USE_COROUTINES) && defined(WITH_COROUTINES))
+namespace ROCKSDB_NAMESPACE {
+
+#if defined(WITHOUT_COROUTINES)
+#endif
+
+// Batched version of TableCache::MultiGet.
+DEFINE_SYNC_AND_ASYNC(Status, TableCache::MultiGet)
+(const ReadOptions& options, const InternalKeyComparator& internal_comparator,
+ const FileMetaData& file_meta, const MultiGetContext::Range* mget_range,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ HistogramImpl* file_read_hist, bool skip_filters, bool skip_range_deletions,
+ int level, Cache::Handle* table_handle) {
+ auto& fd = file_meta.fd;
+ Status s;
+ TableReader* t = fd.table_reader;
+ Cache::Handle* handle = table_handle;
+ MultiGetRange table_range(*mget_range, mget_range->begin(),
+ mget_range->end());
+ if (handle != nullptr && t == nullptr) {
+ t = GetTableReaderFromHandle(handle);
+ }
+#ifndef ROCKSDB_LITE
+ autovector<std::string, MultiGetContext::MAX_BATCH_SIZE> row_cache_entries;
+ IterKey row_cache_key;
+ size_t row_cache_key_prefix_size = 0;
+ KeyContext& first_key = *table_range.begin();
+ bool lookup_row_cache =
+ ioptions_.row_cache && !first_key.get_context->NeedToReadSequence();
+
+ // Check row cache if enabled. Since row cache does not currently store
+ // sequence numbers, we cannot use it if we need to fetch the sequence.
+ if (lookup_row_cache) {
+ GetContext* first_context = first_key.get_context;
+ CreateRowCacheKeyPrefix(options, fd, first_key.ikey, first_context,
+ row_cache_key);
+ row_cache_key_prefix_size = row_cache_key.Size();
+
+ for (auto miter = table_range.begin(); miter != table_range.end();
+ ++miter) {
+ const Slice& user_key = miter->ukey_with_ts;
+
+ GetContext* get_context = miter->get_context;
+
+ if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
+ get_context)) {
+ table_range.SkipKey(miter);
+ } else {
+ row_cache_entries.emplace_back();
+ get_context->SetReplayLog(&(row_cache_entries.back()));
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+
+  // Check that table_range is not empty. It's possible that all keys were
+  // found in the row cache and thus the range may now be empty.
+ if (s.ok() && !table_range.empty()) {
+ if (t == nullptr) {
+ assert(handle == nullptr);
+ s = FindTable(options, file_options_, internal_comparator, file_meta,
+ &handle, prefix_extractor,
+ options.read_tier == kBlockCacheTier /* no_io */,
+ true /* record_read_stats */, file_read_hist, skip_filters,
+ level, true /* prefetch_index_and_filter_in_cache */,
+ 0 /*max_file_size_for_l0_meta_pin*/, file_meta.temperature);
+ TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
+ if (s.ok()) {
+ t = GetTableReaderFromHandle(handle);
+ assert(t);
+ }
+ }
+ if (s.ok() && !options.ignore_range_deletions && !skip_range_deletions) {
+ UpdateRangeTombstoneSeqnums(options, t, table_range);
+ }
+ if (s.ok()) {
+ CO_AWAIT(t->MultiGet)
+ (options, &table_range, prefix_extractor.get(), skip_filters);
+ } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) {
+ for (auto iter = table_range.begin(); iter != table_range.end(); ++iter) {
+ Status* status = iter->s;
+ if (status->IsIncomplete()) {
+ // Couldn't find Table in cache but treat as kFound if no_io set
+ iter->get_context->MarkKeyMayExist();
+ s = Status::OK();
+ }
+ }
+ }
+ }
+
+#ifndef ROCKSDB_LITE
+ if (lookup_row_cache) {
+ size_t row_idx = 0;
+
+ for (auto miter = table_range.begin(); miter != table_range.end();
+ ++miter) {
+ std::string& row_cache_entry = row_cache_entries[row_idx++];
+ const Slice& user_key = miter->ukey_with_ts;
+ GetContext* get_context = miter->get_context;
+
+ get_context->SetReplayLog(nullptr);
+ // Compute row cache key.
+ row_cache_key.TrimAppend(row_cache_key_prefix_size, user_key.data(),
+ user_key.size());
+ // Put the replay log in row cache only if something was found.
+ if (s.ok() && !row_cache_entry.empty()) {
+ size_t charge = row_cache_entry.capacity() + sizeof(std::string);
+ void* row_ptr = new std::string(std::move(row_cache_entry));
+ // If row cache is full, it's OK.
+ ioptions_.row_cache
+ ->Insert(row_cache_key.GetUserKey(), row_ptr, charge,
+ &DeleteEntry<std::string>)
+ .PermitUncheckedError();
+ }
+ }
+ }
+#endif // ROCKSDB_LITE
+
+ if (handle != nullptr) {
+ ReleaseHandle(handle);
+ }
+ CO_RETURN s;
+}
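+
+// Note on the structure above (descriptive only): via the macros in
+// util/coro_utils.h, this single body is meant to be compiled in two flavors:
+// a plain synchronous TableCache::MultiGet and, when coroutine support is
+// built in, an awaitable variant. CO_AWAIT and CO_RETURN expand to either
+// direct calls/returns or their coroutine counterparts depending on which
+// flavor is being generated.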
+} // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/db/table_properties_collector.cc b/src/rocksdb/db/table_properties_collector.cc
new file mode 100644
index 000000000..edb9a1b63
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/table_properties_collector.h"
+
+#include "db/dbformat.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+uint64_t GetUint64Property(const UserCollectedProperties& props,
+ const std::string& property_name,
+ bool* property_present) {
+ auto pos = props.find(property_name);
+ if (pos == props.end()) {
+ *property_present = false;
+ return 0;
+ }
+ Slice raw = pos->second;
+ uint64_t val = 0;
+ *property_present = true;
+ return GetVarint64(&raw, &val) ? val : 0;
+}
+
+} // anonymous namespace
+
+Status UserKeyTablePropertiesCollector::InternalAdd(const Slice& key,
+ const Slice& value,
+ uint64_t file_size) {
+ ParsedInternalKey ikey;
+ Status s = ParseInternalKey(key, &ikey, false /* log_err_key */); // TODO
+ if (!s.ok()) {
+ return s;
+ }
+
+ return collector_->AddUserKey(ikey.user_key, value, GetEntryType(ikey.type),
+ ikey.sequence, file_size);
+}
+
+void UserKeyTablePropertiesCollector::BlockAdd(
+ uint64_t block_uncomp_bytes, uint64_t block_compressed_bytes_fast,
+ uint64_t block_compressed_bytes_slow) {
+ return collector_->BlockAdd(block_uncomp_bytes, block_compressed_bytes_fast,
+ block_compressed_bytes_slow);
+}
+
+Status UserKeyTablePropertiesCollector::Finish(
+ UserCollectedProperties* properties) {
+ return collector_->Finish(properties);
+}
+
+UserCollectedProperties UserKeyTablePropertiesCollector::GetReadableProperties()
+ const {
+ return collector_->GetReadableProperties();
+}
+
+uint64_t GetDeletedKeys(const UserCollectedProperties& props) {
+ bool property_present_ignored;
+ return GetUint64Property(props, TablePropertiesNames::kDeletedKeys,
+ &property_present_ignored);
+}
+
+uint64_t GetMergeOperands(const UserCollectedProperties& props,
+ bool* property_present) {
+ return GetUint64Property(props, TablePropertiesNames::kMergeOperands,
+ property_present);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_properties_collector.h b/src/rocksdb/db/table_properties_collector.h
new file mode 100644
index 000000000..9035ba793
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector.h
@@ -0,0 +1,175 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// This file defines a collection of statistics collectors.
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/table_properties.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Base class for internal table properties collector.
+class IntTblPropCollector {
+ public:
+ virtual ~IntTblPropCollector() {}
+ virtual Status Finish(UserCollectedProperties* properties) = 0;
+
+ virtual const char* Name() const = 0;
+
+ // @param key the user key that is inserted into the table.
+ // @param value the value that is inserted into the table.
+ virtual Status InternalAdd(const Slice& key, const Slice& value,
+ uint64_t file_size) = 0;
+
+ virtual void BlockAdd(uint64_t block_uncomp_bytes,
+ uint64_t block_compressed_bytes_fast,
+ uint64_t block_compressed_bytes_slow) = 0;
+
+ virtual UserCollectedProperties GetReadableProperties() const = 0;
+
+ virtual bool NeedCompact() const { return false; }
+};
+
+// Factory for internal table properties collector.
+class IntTblPropCollectorFactory {
+ public:
+ virtual ~IntTblPropCollectorFactory() {}
+ // has to be thread-safe
+ virtual IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t column_family_id, int level_at_creation) = 0;
+
+ // The name of the properties collector can be used for debugging purposes.
+ virtual const char* Name() const = 0;
+};
+
+using IntTblPropCollectorFactories =
+ std::vector<std::unique_ptr<IntTblPropCollectorFactory>>;
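+
+// Typical lifecycle (illustrative; the actual call sites live in the table
+// builder code, not in this file): for each table file being built, every
+// factory in an IntTblPropCollectorFactories vector produces one collector
+// via CreateIntTblPropCollector(column_family_id, level_at_creation); the
+// builder then calls InternalAdd() once per entry and BlockAdd() once per
+// data block, and finally Finish() to emit the collected properties into the
+// table's properties block.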
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys", which contain meta information about a given entry.
+//
+// This class extracts the user key from the encoded internal key when Add()
+// is invoked.
+class UserKeyTablePropertiesCollector : public IntTblPropCollector {
+ public:
+ // transfer of ownership
+ explicit UserKeyTablePropertiesCollector(TablePropertiesCollector* collector)
+ : collector_(collector) {}
+
+ virtual ~UserKeyTablePropertiesCollector() {}
+
+ virtual Status InternalAdd(const Slice& key, const Slice& value,
+ uint64_t file_size) override;
+
+ virtual void BlockAdd(uint64_t block_uncomp_bytes,
+ uint64_t block_compressed_bytes_fast,
+ uint64_t block_compressed_bytes_slow) override;
+
+ virtual Status Finish(UserCollectedProperties* properties) override;
+
+ virtual const char* Name() const override { return collector_->Name(); }
+
+ UserCollectedProperties GetReadableProperties() const override;
+
+ virtual bool NeedCompact() const override {
+ return collector_->NeedCompact();
+ }
+
+ protected:
+ std::unique_ptr<TablePropertiesCollector> collector_;
+};
+
+class UserKeyTablePropertiesCollectorFactory
+ : public IntTblPropCollectorFactory {
+ public:
+ explicit UserKeyTablePropertiesCollectorFactory(
+ std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
+ : user_collector_factory_(user_collector_factory) {}
+ virtual IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t column_family_id, int level_at_creation) override {
+ TablePropertiesCollectorFactory::Context context;
+ context.column_family_id = column_family_id;
+ context.level_at_creation = level_at_creation;
+ return new UserKeyTablePropertiesCollector(
+ user_collector_factory_->CreateTablePropertiesCollector(context));
+ }
+
+ virtual const char* Name() const override {
+ return user_collector_factory_->Name();
+ }
+
+ private:
+ std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory_;
+};
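+
+// Illustrative sketch of how a user-defined factory ends up wrapped by the
+// class above (MyCollectorFactory is a hypothetical user type):
+//
+//   Options options;
+//   options.table_properties_collector_factories.emplace_back(
+//       std::make_shared<MyCollectorFactory>());
+//
+// Each user factory registered this way is wrapped in a
+// UserKeyTablePropertiesCollectorFactory so that the collectors it creates
+// receive plain user keys rather than encoded internal keys.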
+
+// When rocksdb creates a new table, it will encode all "user keys" into
+// "internal keys". This class collects the min/max timestamps from the
+// encoded internal keys when Add() is invoked.
+//
+// @param cmp the user comparator used to compare the timestamps in the
+// internal key.
+class TimestampTablePropertiesCollector : public IntTblPropCollector {
+ public:
+ explicit TimestampTablePropertiesCollector(const Comparator* cmp)
+ : cmp_(cmp),
+ timestamp_min_(kDisableUserTimestamp),
+ timestamp_max_(kDisableUserTimestamp) {}
+
+ Status InternalAdd(const Slice& key, const Slice& /* value */,
+ uint64_t /* file_size */) override {
+ auto user_key = ExtractUserKey(key);
+ assert(cmp_ && cmp_->timestamp_size() > 0);
+ if (user_key.size() < cmp_->timestamp_size()) {
+ return Status::Corruption(
+ "User key size mismatch when comparing to timestamp size.");
+ }
+ auto timestamp_in_key =
+ ExtractTimestampFromUserKey(user_key, cmp_->timestamp_size());
+ if (timestamp_max_ == kDisableUserTimestamp ||
+ cmp_->CompareTimestamp(timestamp_in_key, timestamp_max_) > 0) {
+ timestamp_max_.assign(timestamp_in_key.data(), timestamp_in_key.size());
+ }
+ if (timestamp_min_ == kDisableUserTimestamp ||
+ cmp_->CompareTimestamp(timestamp_min_, timestamp_in_key) > 0) {
+ timestamp_min_.assign(timestamp_in_key.data(), timestamp_in_key.size());
+ }
+ return Status::OK();
+ }
+
+ void BlockAdd(uint64_t /* block_uncomp_bytes */,
+ uint64_t /* block_compressed_bytes_fast */,
+ uint64_t /* block_compressed_bytes_slow */) override {
+ return;
+ }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ assert(timestamp_min_.size() == timestamp_max_.size() &&
+ timestamp_max_.size() == cmp_->timestamp_size());
+ properties->insert({"rocksdb.timestamp_min", timestamp_min_});
+ properties->insert({"rocksdb.timestamp_max", timestamp_max_});
+ return Status::OK();
+ }
+
+ const char* Name() const override {
+ return "TimestampTablePropertiesCollector";
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return {{"rocksdb.timestamp_min", Slice(timestamp_min_).ToString(true)},
+ {"rocksdb.timestamp_max", Slice(timestamp_max_).ToString(true)}};
+ }
+
+ protected:
+ const Comparator* const cmp_;
+ std::string timestamp_min_;
+ std::string timestamp_max_;
+};
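+
+// Illustrative note on the collector above: with a comparator whose
+// timestamp_size() is, say, 8 bytes, Finish() stores the raw 8-byte minimum
+// and maximum timestamps seen in the file under "rocksdb.timestamp_min" and
+// "rocksdb.timestamp_max", while GetReadableProperties() exposes the same
+// values hex-encoded for human consumption.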
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/table_properties_collector_test.cc b/src/rocksdb/db/table_properties_collector_test.cc
new file mode 100644
index 000000000..5f0f205da
--- /dev/null
+++ b/src/rocksdb/db/table_properties_collector_test.cc
@@ -0,0 +1,513 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/table_properties_collector.h"
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "file/sequence_file_reader.h"
+#include "file/writable_file_writer.h"
+#include "options/cf_options.h"
+#include "rocksdb/flush_block_policy.h"
+#include "rocksdb/table.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/meta_blocks.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_builder.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class TablePropertiesTest : public testing::Test,
+ public testing::WithParamInterface<bool> {
+ public:
+ void SetUp() override { backward_mode_ = GetParam(); }
+
+ bool backward_mode_;
+};
+
+// Utility functions for the tests
+namespace {
+static const uint32_t kTestColumnFamilyId = 66;
+static const std::string kTestColumnFamilyName = "test_column_fam";
+static const int kTestLevel = 1;
+
+void MakeBuilder(
+ const Options& options, const ImmutableOptions& ioptions,
+ const MutableCFOptions& moptions,
+ const InternalKeyComparator& internal_comparator,
+ const IntTblPropCollectorFactories* int_tbl_prop_collector_factories,
+ std::unique_ptr<WritableFileWriter>* writable,
+ std::unique_ptr<TableBuilder>* builder) {
+ std::unique_ptr<FSWritableFile> wf(new test::StringSink);
+ writable->reset(
+ new WritableFileWriter(std::move(wf), "" /* don't care */, EnvOptions()));
+ TableBuilderOptions tboptions(
+ ioptions, moptions, internal_comparator, int_tbl_prop_collector_factories,
+ options.compression, options.compression_opts, kTestColumnFamilyId,
+ kTestColumnFamilyName, kTestLevel);
+ builder->reset(NewTableBuilder(tboptions, writable->get()));
+}
+} // namespace
+
+// Collects keys that start with "A" in a table.
+class RegularKeysStartWithA : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ std::string encoded_num_puts;
+ std::string encoded_num_deletes;
+ std::string encoded_num_single_deletes;
+ std::string encoded_num_size_changes;
+ PutVarint32(&encoded, count_);
+ PutVarint32(&encoded_num_puts, num_puts_);
+ PutVarint32(&encoded_num_deletes, num_deletes_);
+ PutVarint32(&encoded_num_single_deletes, num_single_deletes_);
+ PutVarint32(&encoded_num_size_changes, num_size_changes_);
+ *properties = UserCollectedProperties{
+ {"TablePropertiesTest", message_},
+ {"Count", encoded},
+ {"NumPuts", encoded_num_puts},
+ {"NumDeletes", encoded_num_deletes},
+ {"NumSingleDeletes", encoded_num_single_deletes},
+ {"NumSizeChanges", encoded_num_size_changes},
+ };
+ return Status::OK();
+ }
+
+ Status AddUserKey(const Slice& user_key, const Slice& /*value*/,
+ EntryType type, SequenceNumber /*seq*/,
+ uint64_t file_size) override {
+ // Simply assume all user keys are not empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ if (type == kEntryPut) {
+ num_puts_++;
+ } else if (type == kEntryDelete) {
+ num_deletes_++;
+ } else if (type == kEntrySingleDelete) {
+ num_single_deletes_++;
+ }
+ if (file_size < file_size_) {
+ message_ = "File size should not decrease.";
+ } else if (file_size != file_size_) {
+ num_size_changes_++;
+ }
+
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ std::string message_ = "Rocksdb";
+ uint32_t count_ = 0;
+ uint32_t num_puts_ = 0;
+ uint32_t num_deletes_ = 0;
+ uint32_t num_single_deletes_ = 0;
+ uint32_t num_size_changes_ = 0;
+ uint64_t file_size_ = 0;
+};
+
+// Collects keys that start with "A" in a table (backward-compatible mode).
+// It is also used to test the internal key table property collector.
+class RegularKeysStartWithABackwardCompatible
+ : public TablePropertiesCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
+ *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+ {"Count", encoded}};
+ return Status::OK();
+ }
+
+ Status Add(const Slice& user_key, const Slice& /*value*/) override {
+ // Simply assume all user keys are not empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ return Status::OK();
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAInternal : public IntTblPropCollector {
+ public:
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ Status Finish(UserCollectedProperties* properties) override {
+ std::string encoded;
+ PutVarint32(&encoded, count_);
+ *properties = UserCollectedProperties{{"TablePropertiesTest", "Rocksdb"},
+ {"Count", encoded}};
+ return Status::OK();
+ }
+
+ Status InternalAdd(const Slice& user_key, const Slice& /*value*/,
+ uint64_t /*file_size*/) override {
+ // Simply assume all user keys are not empty.
+ if (user_key.data()[0] == 'A') {
+ ++count_;
+ }
+ return Status::OK();
+ }
+
+ void BlockAdd(uint64_t /* block_uncomp_bytes */,
+ uint64_t /* block_compressed_bytes_fast */,
+ uint64_t /* block_compressed_bytes_slow */) override {
+ // Nothing to do.
+ return;
+ }
+
+ UserCollectedProperties GetReadableProperties() const override {
+ return UserCollectedProperties{};
+ }
+
+ private:
+ uint32_t count_ = 0;
+};
+
+class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory,
+ public TablePropertiesCollectorFactory {
+ public:
+ explicit RegularKeysStartWithAFactory(bool backward_mode)
+ : backward_mode_(backward_mode) {}
+ TablePropertiesCollector* CreateTablePropertiesCollector(
+ TablePropertiesCollectorFactory::Context context) override {
+ EXPECT_EQ(kTestColumnFamilyId, context.column_family_id);
+ EXPECT_EQ(kTestLevel, context.level_at_creation);
+ if (!backward_mode_) {
+ return new RegularKeysStartWithA();
+ } else {
+ return new RegularKeysStartWithABackwardCompatible();
+ }
+ }
+ IntTblPropCollector* CreateIntTblPropCollector(
+ uint32_t /*column_family_id*/, int /* level_at_creation */) override {
+ return new RegularKeysStartWithAInternal();
+ }
+ const char* Name() const override { return "RegularKeysStartWithA"; }
+
+ bool backward_mode_;
+};
+
+class FlushBlockEveryThreePolicy : public FlushBlockPolicy {
+ public:
+ bool Update(const Slice& /*key*/, const Slice& /*value*/) override {
+ return (++count_ % 3U == 0);
+ }
+
+ private:
+ uint64_t count_ = 0;
+};
+
+class FlushBlockEveryThreePolicyFactory : public FlushBlockPolicyFactory {
+ public:
+ explicit FlushBlockEveryThreePolicyFactory() {}
+
+ const char* Name() const override {
+ return "FlushBlockEveryThreePolicyFactory";
+ }
+
+ FlushBlockPolicy* NewFlushBlockPolicy(
+ const BlockBasedTableOptions& /*table_options*/,
+ const BlockBuilder& /*data_block_builder*/) const override {
+ return new FlushBlockEveryThreePolicy;
+ }
+};
+
+extern const uint64_t kBlockBasedTableMagicNumber;
+extern const uint64_t kPlainTableMagicNumber;
+namespace {
+void TestCustomizedTablePropertiesCollector(
+ bool backward_mode, uint64_t magic_number, bool test_int_tbl_prop_collector,
+ const Options& options, const InternalKeyComparator& internal_comparator) {
+ // Make sure the entries will be inserted in order.
+ std::map<std::pair<std::string, ValueType>, std::string> kvs = {
+ {{"About ", kTypeValue}, "val5"}, // starts with 'A'
+ {{"Abstract", kTypeValue}, "val2"}, // starts with 'A'
+ {{"Around ", kTypeValue}, "val7"}, // starts with 'A'
+ {{"Beyond ", kTypeValue}, "val3"},
+ {{"Builder ", kTypeValue}, "val1"},
+ {{"Love ", kTypeDeletion}, ""},
+ {{"Cancel ", kTypeValue}, "val4"},
+ {{"Find ", kTypeValue}, "val6"},
+ {{"Rocks ", kTypeDeletion}, ""},
+ {{"Foo ", kTypeSingleDeletion}, ""},
+ };
+
+ // -- Step 1: build table
+ std::unique_ptr<TableBuilder> builder;
+ std::unique_ptr<WritableFileWriter> writer;
+ const ImmutableOptions ioptions(options);
+ const MutableCFOptions moptions(options);
+ IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+ if (test_int_tbl_prop_collector) {
+ int_tbl_prop_collector_factories.emplace_back(
+ new RegularKeysStartWithAFactory(backward_mode));
+ } else {
+ GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+ }
+ MakeBuilder(options, ioptions, moptions, internal_comparator,
+ &int_tbl_prop_collector_factories, &writer, &builder);
+
+ SequenceNumber seqNum = 0U;
+ for (const auto& kv : kvs) {
+ InternalKey ikey(kv.first.first, seqNum++, kv.first.second);
+ builder->Add(ikey.Encode(), kv.second);
+ }
+ ASSERT_OK(builder->Finish());
+ ASSERT_OK(writer->Flush());
+
+ // -- Step 2: Read properties
+ test::StringSink* fwf =
+ static_cast<test::StringSink*>(writer->writable_file());
+ std::unique_ptr<FSRandomAccessFile> source(
+ new test::StringSource(fwf->contents()));
+ std::unique_ptr<RandomAccessFileReader> fake_file_reader(
+ new RandomAccessFileReader(std::move(source), "test"));
+
+ std::unique_ptr<TableProperties> props;
+ Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(),
+ magic_number, ioptions, &props);
+ ASSERT_OK(s);
+
+ auto user_collected = props->user_collected_properties;
+
+ ASSERT_NE(user_collected.find("TablePropertiesTest"), user_collected.end());
+ ASSERT_EQ("Rocksdb", user_collected.at("TablePropertiesTest"));
+
+ uint32_t starts_with_A = 0;
+ ASSERT_NE(user_collected.find("Count"), user_collected.end());
+ Slice key(user_collected.at("Count"));
+ ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+ ASSERT_EQ(3u, starts_with_A);
+
+ if (!backward_mode && !test_int_tbl_prop_collector) {
+ uint32_t num_puts;
+ ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+ Slice key_puts(user_collected.at("NumPuts"));
+ ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+ ASSERT_EQ(7u, num_puts);
+
+ uint32_t num_deletes;
+ ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+ Slice key_deletes(user_collected.at("NumDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+ ASSERT_EQ(2u, num_deletes);
+
+ uint32_t num_single_deletes;
+ ASSERT_NE(user_collected.find("NumSingleDeletes"), user_collected.end());
+ Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+ ASSERT_EQ(1u, num_single_deletes);
+
+ uint32_t num_size_changes;
+ ASSERT_NE(user_collected.find("NumSizeChanges"), user_collected.end());
+ Slice key_size_changes(user_collected.at("NumSizeChanges"));
+ ASSERT_TRUE(GetVarint32(&key_size_changes, &num_size_changes));
+ ASSERT_GE(num_size_changes, 2u);
+ }
+}
+} // namespace
+
+TEST_P(TablePropertiesTest, CustomizedTablePropertiesCollector) {
+ // Test properties collectors with internal keys or regular keys
+ // for a block-based table.
+ for (bool encode_as_internal : {true, false}) {
+ Options options;
+ BlockBasedTableOptions table_options;
+ table_options.flush_block_policy_factory =
+ std::make_shared<FlushBlockEveryThreePolicyFactory>();
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+ test::PlainInternalKeyComparator ikc(options.comparator);
+ std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
+ new RegularKeysStartWithAFactory(backward_mode_));
+ options.table_properties_collector_factories.resize(1);
+ options.table_properties_collector_factories[0] = collector_factory;
+
+ TestCustomizedTablePropertiesCollector(backward_mode_,
+ kBlockBasedTableMagicNumber,
+ encode_as_internal, options, ikc);
+
+#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite
+ // test plain table
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 8;
+ plain_table_options.bloom_bits_per_key = 8;
+ plain_table_options.hash_table_ratio = 0;
+
+ options.table_factory =
+ std::make_shared<PlainTableFactory>(plain_table_options);
+ TestCustomizedTablePropertiesCollector(backward_mode_,
+ kPlainTableMagicNumber,
+ encode_as_internal, options, ikc);
+#endif // !ROCKSDB_LITE
+ }
+}
+
+namespace {
+void TestInternalKeyPropertiesCollector(
+ bool backward_mode, uint64_t magic_number, bool sanitized,
+ std::shared_ptr<TableFactory> table_factory) {
+ InternalKey keys[] = {
+ InternalKey("A ", 0, ValueType::kTypeValue),
+ InternalKey("B ", 1, ValueType::kTypeValue),
+ InternalKey("C ", 2, ValueType::kTypeValue),
+ InternalKey("W ", 3, ValueType::kTypeDeletion),
+ InternalKey("X ", 4, ValueType::kTypeDeletion),
+ InternalKey("Y ", 5, ValueType::kTypeDeletion),
+ InternalKey("Z ", 6, ValueType::kTypeDeletion),
+ InternalKey("a ", 7, ValueType::kTypeSingleDeletion),
+ InternalKey("b ", 8, ValueType::kTypeMerge),
+ InternalKey("c ", 9, ValueType::kTypeMerge),
+ };
+
+ std::unique_ptr<TableBuilder> builder;
+ std::unique_ptr<WritableFileWriter> writable;
+ Options options;
+ test::PlainInternalKeyComparator pikc(options.comparator);
+
+ IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+ options.table_factory = table_factory;
+ if (sanitized) {
+ options.table_properties_collector_factories.emplace_back(
+ new RegularKeysStartWithAFactory(backward_mode));
+ // With sanitization, even a regular properties collector will be able
+ // to handle internal keys.
+ auto comparator = options.comparator;
+ // HACK: Set options.info_log to avoid writing log in
+ // SanitizeOptions().
+ options.info_log = std::make_shared<test::NullLogger>();
+ options = SanitizeOptions("db", // just a place holder
+ options);
+ ImmutableOptions ioptions(options);
+ GetIntTblPropCollectorFactory(ioptions, &int_tbl_prop_collector_factories);
+ options.comparator = comparator;
+ }
+ const ImmutableOptions ioptions(options);
+ MutableCFOptions moptions(options);
+
+ for (int iter = 0; iter < 2; ++iter) {
+ MakeBuilder(options, ioptions, moptions, pikc,
+ &int_tbl_prop_collector_factories, &writable, &builder);
+ for (const auto& k : keys) {
+ builder->Add(k.Encode(), "val");
+ }
+
+ ASSERT_OK(builder->Finish());
+ ASSERT_OK(writable->Flush());
+
+ test::StringSink* fwf =
+ static_cast<test::StringSink*>(writable->writable_file());
+ std::unique_ptr<FSRandomAccessFile> source(
+ new test::StringSource(fwf->contents()));
+ std::unique_ptr<RandomAccessFileReader> reader(
+ new RandomAccessFileReader(std::move(source), "test"));
+
+ std::unique_ptr<TableProperties> props;
+ Status s = ReadTableProperties(reader.get(), fwf->contents().size(),
+ magic_number, ioptions, &props);
+ ASSERT_OK(s);
+
+ auto user_collected = props->user_collected_properties;
+ uint64_t deleted = GetDeletedKeys(user_collected);
+ ASSERT_EQ(5u, deleted); // deletes + single-deletes
+
+ bool property_present;
+ uint64_t merges = GetMergeOperands(user_collected, &property_present);
+ ASSERT_TRUE(property_present);
+ ASSERT_EQ(2u, merges);
+
+ if (sanitized) {
+ uint32_t starts_with_A = 0;
+ ASSERT_NE(user_collected.find("Count"), user_collected.end());
+ Slice key(user_collected.at("Count"));
+ ASSERT_TRUE(GetVarint32(&key, &starts_with_A));
+ ASSERT_EQ(1u, starts_with_A);
+
+ if (!backward_mode) {
+ uint32_t num_puts;
+ ASSERT_NE(user_collected.find("NumPuts"), user_collected.end());
+ Slice key_puts(user_collected.at("NumPuts"));
+ ASSERT_TRUE(GetVarint32(&key_puts, &num_puts));
+ ASSERT_EQ(3u, num_puts);
+
+ uint32_t num_deletes;
+ ASSERT_NE(user_collected.find("NumDeletes"), user_collected.end());
+ Slice key_deletes(user_collected.at("NumDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_deletes, &num_deletes));
+ ASSERT_EQ(4u, num_deletes);
+
+ uint32_t num_single_deletes;
+ ASSERT_NE(user_collected.find("NumSingleDeletes"),
+ user_collected.end());
+ Slice key_single_deletes(user_collected.at("NumSingleDeletes"));
+ ASSERT_TRUE(GetVarint32(&key_single_deletes, &num_single_deletes));
+ ASSERT_EQ(1u, num_single_deletes);
+ }
+ }
+ }
+}
+} // namespace
+
+TEST_P(TablePropertiesTest, InternalKeyPropertiesCollector) {
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kBlockBasedTableMagicNumber, true /* sanitize */,
+ std::make_shared<BlockBasedTableFactory>());
+ if (backward_mode_) {
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kBlockBasedTableMagicNumber, false /* not sanitize */,
+ std::make_shared<BlockBasedTableFactory>());
+ }
+
+#ifndef ROCKSDB_LITE // PlainTable is not supported in Lite
+ PlainTableOptions plain_table_options;
+ plain_table_options.user_key_len = 8;
+ plain_table_options.bloom_bits_per_key = 8;
+ plain_table_options.hash_table_ratio = 0;
+
+ TestInternalKeyPropertiesCollector(
+ backward_mode_, kPlainTableMagicNumber, false /* not sanitize */,
+ std::make_shared<PlainTableFactory>(plain_table_options));
+#endif // !ROCKSDB_LITE
+}
+
+INSTANTIATE_TEST_CASE_P(InternalKeyPropertiesCollector, TablePropertiesTest,
+ ::testing::Bool());
+
+INSTANTIATE_TEST_CASE_P(CustomizedTablePropertiesCollector, TablePropertiesTest,
+ ::testing::Bool());
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/transaction_log_impl.cc b/src/rocksdb/db/transaction_log_impl.cc
new file mode 100644
index 000000000..3878b428a
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.cc
@@ -0,0 +1,298 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/transaction_log_impl.h"
+
+#include <cinttypes>
+
+#include "db/write_batch_internal.h"
+#include "file/sequence_file_reader.h"
+#include "util/defer.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TransactionLogIteratorImpl::TransactionLogIteratorImpl(
+ const std::string& dir, const ImmutableDBOptions* options,
+ const TransactionLogIterator::ReadOptions& read_options,
+ const EnvOptions& soptions, const SequenceNumber seq,
+ std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+ const bool seq_per_batch, const std::shared_ptr<IOTracer>& io_tracer)
+ : dir_(dir),
+ options_(options),
+ read_options_(read_options),
+ soptions_(soptions),
+ starting_sequence_number_(seq),
+ files_(std::move(files)),
+ versions_(versions),
+ seq_per_batch_(seq_per_batch),
+ io_tracer_(io_tracer),
+ started_(false),
+ is_valid_(false),
+ current_file_index_(0),
+ current_batch_seq_(0),
+ current_last_seq_(0) {
+ assert(files_ != nullptr);
+ assert(versions_ != nullptr);
+ assert(!seq_per_batch_);
+ current_status_.PermitUncheckedError(); // Clear on start
+ reporter_.env = options_->env;
+ reporter_.info_log = options_->info_log.get();
+ SeekToStartSequence(); // Seek till starting sequence
+}
+
+Status TransactionLogIteratorImpl::OpenLogFile(
+ const LogFile* log_file,
+ std::unique_ptr<SequentialFileReader>* file_reader) {
+ FileSystemPtr fs(options_->fs, io_tracer_);
+ std::unique_ptr<FSSequentialFile> file;
+ std::string fname;
+ Status s;
+ EnvOptions optimized_env_options = fs->OptimizeForLogRead(soptions_);
+ if (log_file->Type() == kArchivedLogFile) {
+ fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+ } else {
+ fname = LogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+ if (!s.ok()) {
+ // If we cannot open the file in the DB directory, try the archive dir,
+ // as the file could have been moved there in the meantime.
+ fname = ArchivedLogFileName(dir_, log_file->LogNumber());
+ s = fs->NewSequentialFile(fname, optimized_env_options, &file, nullptr);
+ }
+ }
+ if (s.ok()) {
+ file_reader->reset(new SequentialFileReader(std::move(file), fname,
+ io_tracer_, options_->listeners,
+ options_->rate_limiter.get()));
+ }
+ return s;
+}
+
+BatchResult TransactionLogIteratorImpl::GetBatch() {
+ assert(is_valid_); // cannot be called in an invalid state.
+ BatchResult result;
+ result.sequence = current_batch_seq_;
+ result.writeBatchPtr = std::move(current_batch_);
+ return result;
+}
+
+Status TransactionLogIteratorImpl::status() { return current_status_; }
+
+bool TransactionLogIteratorImpl::Valid() { return started_ && is_valid_; }
+
+bool TransactionLogIteratorImpl::RestrictedRead(Slice* record) {
+ // Don't read if there are no more complete entries to read from the logs.
+ if (current_last_seq_ >= versions_->LastSequence()) {
+ return false;
+ }
+ return current_log_reader_->ReadRecord(record, &scratch_);
+}
+
+void TransactionLogIteratorImpl::SeekToStartSequence(uint64_t start_file_index,
+ bool strict) {
+ Slice record;
+ started_ = false;
+ is_valid_ = false;
+ // Check invariant of TransactionLogIterator when SeekToStartSequence()
+ // succeeds.
+ const Defer defer([this]() {
+ if (is_valid_) {
+ assert(current_status_.ok());
+ if (starting_sequence_number_ > current_batch_seq_) {
+ assert(current_batch_seq_ < current_last_seq_);
+ assert(current_last_seq_ >= starting_sequence_number_);
+ }
+ }
+ });
+ if (files_->size() <= start_file_index) {
+ return;
+ } else if (!current_status_.ok()) {
+ return;
+ }
+ Status s =
+ OpenLogReader(files_->at(static_cast<size_t>(start_file_index)).get());
+ if (!s.ok()) {
+ current_status_ = s;
+ reporter_.Info(current_status_.ToString().c_str());
+ return;
+ }
+ while (RestrictedRead(&record)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter_.Corruption(record.size(),
+ Status::Corruption("very small log record"));
+ continue;
+ }
+ UpdateCurrentWriteBatch(record);
+ if (current_last_seq_ >= starting_sequence_number_) {
+ if (strict && current_batch_seq_ != starting_sequence_number_) {
+ current_status_ = Status::Corruption(
+ "Gap in sequence number. Could not "
+ "seek to required sequence number");
+ reporter_.Info(current_status_.ToString().c_str());
+ return;
+ } else if (strict) {
+ reporter_.Info(
+ "Could seek required sequence number. Iterator will "
+ "continue.");
+ }
+ is_valid_ = true;
+ started_ = true; // set started_ as we could seek till starting sequence
+ return;
+ } else {
+ is_valid_ = false;
+ }
+ }
+
+ // Could not find the start sequence in the first file. Normally this must
+ // be the only file; otherwise, log the error and let the iterator return
+ // the next available entry. If strict is set, we want to seek exactly to
+ // the start sequence, and it should have been present in the file we
+ // scanned above.
+ if (strict) {
+ current_status_ = Status::Corruption(
+ "Gap in sequence number. Could not "
+ "seek to required sequence number");
+ reporter_.Info(current_status_.ToString().c_str());
+ } else if (files_->size() != 1) {
+ current_status_ = Status::Corruption(
+ "Start sequence was not found, "
+ "skipping to the next available");
+ reporter_.Info(current_status_.ToString().c_str());
+ // Let NextImpl find the next available entry. started_ remains false
+ // because we don't want to check for gaps while moving to the start
+ // sequence.
+ NextImpl(true);
+ }
+}
+
+void TransactionLogIteratorImpl::Next() {
+ if (!current_status_.ok()) {
+ return;
+ }
+ return NextImpl(false);
+}
+
+void TransactionLogIteratorImpl::NextImpl(bool internal) {
+ Slice record;
+ is_valid_ = false;
+ if (!internal && !started_) {
+ // Runs every time until we can seek to the start sequence
+ SeekToStartSequence();
+ }
+ while (true) {
+ assert(current_log_reader_);
+ if (current_log_reader_->IsEOF()) {
+ current_log_reader_->UnmarkEOF();
+ }
+ while (RestrictedRead(&record)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter_.Corruption(record.size(),
+ Status::Corruption("very small log record"));
+ continue;
+ } else {
+ // started_ should be true if called by application
+ assert(internal || started_);
+ // started_ should be false if called internally
+ assert(!internal || !started_);
+ UpdateCurrentWriteBatch(record);
+ if (internal && !started_) {
+ started_ = true;
+ }
+ return;
+ }
+ }
+
+ // Open the next file
+ if (current_file_index_ < files_->size() - 1) {
+ ++current_file_index_;
+ Status s = OpenLogReader(files_->at(current_file_index_).get());
+ if (!s.ok()) {
+ is_valid_ = false;
+ current_status_ = s;
+ return;
+ }
+ } else {
+ is_valid_ = false;
+ if (current_last_seq_ == versions_->LastSequence()) {
+ current_status_ = Status::OK();
+ } else {
+ const char* msg = "Create a new iterator to fetch the new tail.";
+ current_status_ = Status::TryAgain(msg);
+ }
+ return;
+ }
+ }
+}
+
+bool TransactionLogIteratorImpl::IsBatchExpected(
+ const WriteBatch* batch, const SequenceNumber expected_seq) {
+ assert(batch);
+ SequenceNumber batchSeq = WriteBatchInternal::Sequence(batch);
+ if (batchSeq != expected_seq) {
+ char buf[200];
+ snprintf(buf, sizeof(buf),
+ "Discontinuity in log records. Got seq=%" PRIu64
+ ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64
+ ".Log iterator will reseek the correct batch.",
+ batchSeq, expected_seq, versions_->LastSequence());
+ reporter_.Info(buf);
+ return false;
+ }
+ return true;
+}
+
+void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) {
+ std::unique_ptr<WriteBatch> batch(new WriteBatch());
+ Status s = WriteBatchInternal::SetContents(batch.get(), record);
+ s.PermitUncheckedError(); // TODO: What should we do with this error?
+
+ SequenceNumber expected_seq = current_last_seq_ + 1;
+ // If the iterator has started, then confirm that we get continuous batches
+ if (started_ && !IsBatchExpected(batch.get(), expected_seq)) {
+ // Seek to the batch having expected sequence number
+ if (expected_seq < files_->at(current_file_index_)->StartSequence()) {
+ // Expected batch must lie in the previous log file
+ // Avoid underflow.
+ if (current_file_index_ != 0) {
+ current_file_index_--;
+ }
+ }
+ starting_sequence_number_ = expected_seq;
+ // current_status_ will be set to OK if the reseek succeeds.
+ // Note: this is still OK in seq_per_batch_ && two_write_queues_ mode,
+ // which allows gaps in the WAL, since it will still skip over the gap.
+ current_status_ = Status::NotFound("Gap in sequence numbers");
+ // In seq_per_batch_ mode, gaps in the sequence numbers are possible, so
+ // strict mode should be disabled.
+ return SeekToStartSequence(current_file_index_, !seq_per_batch_);
+ }
+
+ current_batch_seq_ = WriteBatchInternal::Sequence(batch.get());
+ assert(!seq_per_batch_);
+ current_last_seq_ =
+ current_batch_seq_ + WriteBatchInternal::Count(batch.get()) - 1;
+ // current_batch_seq_ can only change here.
+ assert(current_last_seq_ <= versions_->LastSequence());
+
+ current_batch_ = std::move(batch);
+ is_valid_ = true;
+ current_status_ = Status::OK();
+}
+
+Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* log_file) {
+ std::unique_ptr<SequentialFileReader> file;
+ Status s = OpenLogFile(log_file, &file);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(file);
+ current_log_reader_.reset(
+ new log::Reader(options_->info_log, std::move(file), &reporter_,
+ read_options_.verify_checksums_, log_file->LogNumber()));
+ return Status::OK();
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/transaction_log_impl.h b/src/rocksdb/db/transaction_log_impl.h
new file mode 100644
index 000000000..e8c6efc02
--- /dev/null
+++ b/src/rocksdb/db/transaction_log_impl.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+#pragma once
+
+#ifndef ROCKSDB_LITE
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/version_set.h"
+#include "file/filename.h"
+#include "logging/logging.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class LogFileImpl : public LogFile {
+ public:
+ LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq,
+ uint64_t sizeBytes)
+ : logNumber_(logNum),
+ type_(logType),
+ startSequence_(startSeq),
+ sizeFileBytes_(sizeBytes) {}
+
+ std::string PathName() const override {
+ if (type_ == kArchivedLogFile) {
+ return ArchivedLogFileName("", logNumber_);
+ }
+ return LogFileName("", logNumber_);
+ }
+
+ uint64_t LogNumber() const override { return logNumber_; }
+
+ WalFileType Type() const override { return type_; }
+
+ SequenceNumber StartSequence() const override { return startSequence_; }
+
+ uint64_t SizeFileBytes() const override { return sizeFileBytes_; }
+
+ bool operator<(const LogFile& that) const {
+ return LogNumber() < that.LogNumber();
+ }
+
+ private:
+ uint64_t logNumber_;
+ WalFileType type_;
+ SequenceNumber startSequence_;
+ uint64_t sizeFileBytes_;
+};
+
+class TransactionLogIteratorImpl : public TransactionLogIterator {
+ public:
+ TransactionLogIteratorImpl(
+ const std::string& dir, const ImmutableDBOptions* options,
+ const TransactionLogIterator::ReadOptions& read_options,
+ const EnvOptions& soptions, const SequenceNumber seqNum,
+ std::unique_ptr<VectorLogPtr> files, VersionSet const* const versions,
+ const bool seq_per_batch, const std::shared_ptr<IOTracer>& io_tracer);
+
+ virtual bool Valid() override;
+
+ virtual void Next() override;
+
+ virtual Status status() override;
+
+ virtual BatchResult GetBatch() override;
+
+ private:
+ const std::string& dir_;
+ const ImmutableDBOptions* options_;
+ const TransactionLogIterator::ReadOptions read_options_;
+ const EnvOptions& soptions_;
+ SequenceNumber starting_sequence_number_;
+ std::unique_ptr<VectorLogPtr> files_;
+ // Used only to get latest seq. num
+ // TODO(icanadi) can this be just a callback?
+ VersionSet const* const versions_;
+ const bool seq_per_batch_;
+ std::shared_ptr<IOTracer> io_tracer_;
+
+ // State variables
+ bool started_;
+ bool is_valid_; // not valid when it starts off.
+ Status current_status_;
+ size_t current_file_index_;
+ std::unique_ptr<WriteBatch> current_batch_;
+ std::unique_ptr<log::Reader> current_log_reader_;
+ std::string scratch_;
+ Status OpenLogFile(const LogFile* log_file,
+ std::unique_ptr<SequentialFileReader>* file);
+
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ virtual void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_ERROR(info_log, "dropping %" ROCKSDB_PRIszt " bytes; %s", bytes,
+ s.ToString().c_str());
+ }
+ virtual void Info(const char* s) { ROCKS_LOG_INFO(info_log, "%s", s); }
+ } reporter_;
+
+ SequenceNumber
+ current_batch_seq_; // sequence number at start of current batch
+ SequenceNumber current_last_seq_; // last sequence in the current batch
+ // Reads from the transaction log only if the write batch record has been
+ // written.
+ bool RestrictedRead(Slice* record);
+ // Seeks to starting_sequence_number_ reading from start_file_index in files_.
+ // If strict is set, then we must get a batch starting with
+ // starting_sequence_number_.
+ void SeekToStartSequence(uint64_t start_file_index = 0, bool strict = false);
+ // Implementation of Next. SeekToStartSequence calls it internally with
+ // internal=true to let it find the next entry even if it has to jump over
+ // gaps, because the iterator may start from the first available entry but
+ // promises to be continuous after that.
+ void NextImpl(bool internal = false);
+ // Check if batch is expected, else return false
+ bool IsBatchExpected(const WriteBatch* batch, SequenceNumber expected_seq);
+ // Update current batch if a continuous batch is found.
+ void UpdateCurrentWriteBatch(const Slice& record);
+ Status OpenLogReader(const LogFile* file);
+};
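+
+// Illustrative consumer loop over the public TransactionLogIterator interface
+// implemented above (assumes the iterator was obtained from
+// DB::GetUpdatesSince()):
+//
+//   std::unique_ptr<TransactionLogIterator> iter;
+//   // ... DB::GetUpdatesSince(start_seq, &iter) ...
+//   for (; iter->Valid(); iter->Next()) {
+//     BatchResult res = iter->GetBatch();
+//     // res.sequence is the sequence number of the first entry in
+//     // res.writeBatchPtr; batches are returned in sequence order.
+//   }
+//   Status s = iter->status();  // e.g. Status::TryAgain() if newer writes
+//                               // exist beyond the files held by this
+//                               // iterator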
+} // namespace ROCKSDB_NAMESPACE
+#endif // ROCKSDB_LITE
diff --git a/src/rocksdb/db/trim_history_scheduler.cc b/src/rocksdb/db/trim_history_scheduler.cc
new file mode 100644
index 000000000..d7ca0899f
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/trim_history_scheduler.h"
+
+#include <cassert>
+
+#include "db/column_family.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void TrimHistoryScheduler::ScheduleWork(ColumnFamilyData* cfd) {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ cfd->Ref();
+ cfds_.push_back(cfd);
+ is_empty_.store(false, std::memory_order_relaxed);
+}
+
+ColumnFamilyData* TrimHistoryScheduler::TakeNextColumnFamily() {
+ std::lock_guard<std::mutex> lock(checking_mutex_);
+ while (true) {
+ if (cfds_.empty()) {
+ return nullptr;
+ }
+ ColumnFamilyData* cfd = cfds_.back();
+ cfds_.pop_back();
+ if (cfds_.empty()) {
+ is_empty_.store(true, std::memory_order_relaxed);
+ }
+
+ if (!cfd->IsDropped()) {
+ // success
+ return cfd;
+ }
+ cfd->UnrefAndTryDelete();
+ }
+}
+
+bool TrimHistoryScheduler::Empty() {
+ bool is_empty = is_empty_.load(std::memory_order_relaxed);
+ return is_empty;
+}
+
+void TrimHistoryScheduler::Clear() {
+ ColumnFamilyData* cfd;
+ while ((cfd = TakeNextColumnFamily()) != nullptr) {
+ cfd->UnrefAndTryDelete();
+ }
+ assert(Empty());
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/trim_history_scheduler.h b/src/rocksdb/db/trim_history_scheduler.h
new file mode 100644
index 000000000..252802a7a
--- /dev/null
+++ b/src/rocksdb/db/trim_history_scheduler.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <mutex>
+
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class ColumnFamilyData;
+
+// Similar to FlushScheduler, TrimHistoryScheduler is a FIFO queue that keeps
+// track of column families whose flushed immutable memtables may need to be
+// removed (aka trimmed). The actual trimming may be slightly delayed. Due to
+// the use of the mutex and atomic variable, ScheduleWork,
+// TakeNextColumnFamily, and Empty can be called concurrently.
+class TrimHistoryScheduler {
+ public:
+ TrimHistoryScheduler() : is_empty_(true) {}
+
+ // When a column family needs history trimming, add cfd to the FIFO queue
+ void ScheduleWork(ColumnFamilyData* cfd);
+
+ // Remove the column family from the queue; the caller is responsible for
+ // calling `MemtableList::TrimHistory`.
+ ColumnFamilyData* TakeNextColumnFamily();
+
+ bool Empty();
+
+ void Clear();
+
+ // Not on the critical path; a mutex is used to ensure thread safety.
+ private:
+ std::atomic<bool> is_empty_;
+ autovector<ColumnFamilyData*> cfds_;
+ std::mutex checking_mutex_;
+};
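+
+// Illustrative producer/consumer usage of the scheduler above (the trimming
+// step itself is only sketched; see `MemtableList::TrimHistory`):
+//
+//   scheduler.ScheduleWork(cfd);  // takes a reference on cfd
+//   ...
+//   while (ColumnFamilyData* next = scheduler.TakeNextColumnFamily()) {
+//     // ... trim next's flushed memtable history ...
+//     next->UnrefAndTryDelete();  // release the reference taken above
+//   }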
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder.cc b/src/rocksdb/db/version_builder.cc
new file mode 100644
index 000000000..2c65dcf71
--- /dev/null
+++ b/src/rocksdb/db/version_builder.cc
@@ -0,0 +1,1372 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_builder.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cinttypes>
+#include <functional>
+#include <map>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "cache/cache_reservation_manager.h"
+#include "db/blob/blob_file_meta.h"
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/table_cache.h"
+#include "db/version_set.h"
+#include "port/port.h"
+#include "table/table_reader.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionBuilder::Rep {
+ class NewestFirstBySeqNo {
+ public:
+ bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
+ assert(lhs);
+ assert(rhs);
+
+ if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) {
+ return lhs->fd.largest_seqno > rhs->fd.largest_seqno;
+ }
+
+ if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) {
+ return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno;
+ }
+
+ // Break ties by file number
+ return lhs->fd.GetNumber() > rhs->fd.GetNumber();
+ }
+ };
+
+ class BySmallestKey {
+ public:
+ explicit BySmallestKey(const InternalKeyComparator* cmp) : cmp_(cmp) {}
+
+ bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const {
+ assert(lhs);
+ assert(rhs);
+ assert(cmp_);
+
+ const int r = cmp_->Compare(lhs->smallest, rhs->smallest);
+ if (r != 0) {
+ return (r < 0);
+ }
+
+ // Break ties by file number
+ return (lhs->fd.GetNumber() < rhs->fd.GetNumber());
+ }
+
+ private:
+ const InternalKeyComparator* cmp_;
+ };
+
+ struct LevelState {
+ std::unordered_set<uint64_t> deleted_files;
+ // Map from file number to file meta data.
+ std::unordered_map<uint64_t, FileMetaData*> added_files;
+ };
+
+ // A class that represents the accumulated changes (like additional garbage or
+ // newly linked/unlinked SST files) for a given blob file after applying a
+ // series of VersionEdits.
+ class BlobFileMetaDataDelta {
+ public:
+ bool IsEmpty() const {
+ return !additional_garbage_count_ && !additional_garbage_bytes_ &&
+ newly_linked_ssts_.empty() && newly_unlinked_ssts_.empty();
+ }
+
+ uint64_t GetAdditionalGarbageCount() const {
+ return additional_garbage_count_;
+ }
+
+ uint64_t GetAdditionalGarbageBytes() const {
+ return additional_garbage_bytes_;
+ }
+
+ const std::unordered_set<uint64_t>& GetNewlyLinkedSsts() const {
+ return newly_linked_ssts_;
+ }
+
+ const std::unordered_set<uint64_t>& GetNewlyUnlinkedSsts() const {
+ return newly_unlinked_ssts_;
+ }
+
+ void AddGarbage(uint64_t count, uint64_t bytes) {
+ additional_garbage_count_ += count;
+ additional_garbage_bytes_ += bytes;
+ }
+
+ void LinkSst(uint64_t sst_file_number) {
+ assert(newly_linked_ssts_.find(sst_file_number) ==
+ newly_linked_ssts_.end());
+
+ // Reconcile with newly unlinked SSTs on the fly. (Note: an SST can be
+ // linked to and unlinked from the same blob file in the case of a trivial
+ // move.)
+ auto it = newly_unlinked_ssts_.find(sst_file_number);
+
+ if (it != newly_unlinked_ssts_.end()) {
+ newly_unlinked_ssts_.erase(it);
+ } else {
+ newly_linked_ssts_.emplace(sst_file_number);
+ }
+ }
+
+ void UnlinkSst(uint64_t sst_file_number) {
+ assert(newly_unlinked_ssts_.find(sst_file_number) ==
+ newly_unlinked_ssts_.end());
+
+ // Reconcile with newly linked SSTs on the fly. (Note: an SST can be
+ // linked to and unlinked from the same blob file in the case of a trivial
+ // move.)
+ auto it = newly_linked_ssts_.find(sst_file_number);
+
+ if (it != newly_linked_ssts_.end()) {
+ newly_linked_ssts_.erase(it);
+ } else {
+ newly_unlinked_ssts_.emplace(sst_file_number);
+ }
+ }
+
+ private:
+ uint64_t additional_garbage_count_ = 0;
+ uint64_t additional_garbage_bytes_ = 0;
+ std::unordered_set<uint64_t> newly_linked_ssts_;
+ std::unordered_set<uint64_t> newly_unlinked_ssts_;
+ };
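+
+ // Example of the reconciliation performed by LinkSst()/UnlinkSst() above
+ // (illustrative): if a trivial move causes an SST, say #7, to be unlinked
+ // from a blob file and then linked to it again within the same series of
+ // edits, the two calls cancel out, the delta stays empty, and no metadata
+ // update is needed for that blob file.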
+
+ // A class that represents the state of a blob file after applying a series of
+ // VersionEdits. In addition to the resulting state, it also contains the
+ // delta (see BlobFileMetaDataDelta above). The resulting state can be used to
+ // identify obsolete blob files, while the delta makes it possible to
+ // efficiently detect trivial moves.
+ class MutableBlobFileMetaData {
+ public:
+ // To be used for brand new blob files
+ explicit MutableBlobFileMetaData(
+ std::shared_ptr<SharedBlobFileMetaData>&& shared_meta)
+ : shared_meta_(std::move(shared_meta)) {}
+
+ // To be used for pre-existing blob files
+ explicit MutableBlobFileMetaData(
+ const std::shared_ptr<BlobFileMetaData>& meta)
+ : shared_meta_(meta->GetSharedMeta()),
+ linked_ssts_(meta->GetLinkedSsts()),
+ garbage_blob_count_(meta->GetGarbageBlobCount()),
+ garbage_blob_bytes_(meta->GetGarbageBlobBytes()) {}
+
+ const std::shared_ptr<SharedBlobFileMetaData>& GetSharedMeta() const {
+ return shared_meta_;
+ }
+
+ uint64_t GetBlobFileNumber() const {
+ assert(shared_meta_);
+ return shared_meta_->GetBlobFileNumber();
+ }
+
+ bool HasDelta() const { return !delta_.IsEmpty(); }
+
+ const std::unordered_set<uint64_t>& GetLinkedSsts() const {
+ return linked_ssts_;
+ }
+
+ uint64_t GetGarbageBlobCount() const { return garbage_blob_count_; }
+
+ uint64_t GetGarbageBlobBytes() const { return garbage_blob_bytes_; }
+
+ bool AddGarbage(uint64_t count, uint64_t bytes) {
+ assert(shared_meta_);
+
+ if (garbage_blob_count_ + count > shared_meta_->GetTotalBlobCount() ||
+ garbage_blob_bytes_ + bytes > shared_meta_->GetTotalBlobBytes()) {
+ return false;
+ }
+
+ delta_.AddGarbage(count, bytes);
+
+ garbage_blob_count_ += count;
+ garbage_blob_bytes_ += bytes;
+
+ return true;
+ }
+
+ void LinkSst(uint64_t sst_file_number) {
+ delta_.LinkSst(sst_file_number);
+
+ assert(linked_ssts_.find(sst_file_number) == linked_ssts_.end());
+ linked_ssts_.emplace(sst_file_number);
+ }
+
+ void UnlinkSst(uint64_t sst_file_number) {
+ delta_.UnlinkSst(sst_file_number);
+
+ assert(linked_ssts_.find(sst_file_number) != linked_ssts_.end());
+ linked_ssts_.erase(sst_file_number);
+ }
+
+ private:
+ std::shared_ptr<SharedBlobFileMetaData> shared_meta_;
+ // Accumulated changes
+ BlobFileMetaDataDelta delta_;
+ // Resulting state after applying the changes
+ BlobFileMetaData::LinkedSsts linked_ssts_;
+ uint64_t garbage_blob_count_ = 0;
+ uint64_t garbage_blob_bytes_ = 0;
+ };
+
+ const FileOptions& file_options_;
+ const ImmutableCFOptions* const ioptions_;
+ TableCache* table_cache_;
+ VersionStorageInfo* base_vstorage_;
+ VersionSet* version_set_;
+ int num_levels_;
+ LevelState* levels_;
+ // Store sizes of levels larger than num_levels_. We do this instead of
+ // storing them in levels_ to avoid regression in case there are no files
+ // on invalid levels. The version is not consistent if in the end the files
+ // on invalid levels don't cancel out.
+ std::unordered_map<int, size_t> invalid_level_sizes_;
+ // Whether there are invalid new files or invalid deletion on levels larger
+ // than num_levels_.
+ bool has_invalid_levels_;
+ // Current levels of table files affected by additions/deletions.
+ std::unordered_map<uint64_t, int> table_file_levels_;
+ // Current compact cursors that should be changed after the last compaction
+ std::unordered_map<int, InternalKey> updated_compact_cursors_;
+ NewestFirstBySeqNo level_zero_cmp_;
+ BySmallestKey level_nonzero_cmp_;
+
+ // Mutable metadata objects for all blob files affected by the series of
+ // version edits.
+ std::map<uint64_t, MutableBlobFileMetaData> mutable_blob_file_metas_;
+
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_;
+
+ public:
+ Rep(const FileOptions& file_options, const ImmutableCFOptions* ioptions,
+ TableCache* table_cache, VersionStorageInfo* base_vstorage,
+ VersionSet* version_set,
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
+ : file_options_(file_options),
+ ioptions_(ioptions),
+ table_cache_(table_cache),
+ base_vstorage_(base_vstorage),
+ version_set_(version_set),
+ num_levels_(base_vstorage->num_levels()),
+ has_invalid_levels_(false),
+ level_nonzero_cmp_(base_vstorage_->InternalComparator()),
+ file_metadata_cache_res_mgr_(file_metadata_cache_res_mgr) {
+ assert(ioptions_);
+
+ levels_ = new LevelState[num_levels_];
+ }
+
+ ~Rep() {
+ for (int level = 0; level < num_levels_; level++) {
+ const auto& added = levels_[level].added_files;
+ for (auto& pair : added) {
+ UnrefFile(pair.second);
+ }
+ }
+
+ delete[] levels_;
+ }
+
+ void UnrefFile(FileMetaData* f) {
+ f->refs--;
+ if (f->refs <= 0) {
+ if (f->table_reader_handle) {
+ assert(table_cache_ != nullptr);
+ table_cache_->ReleaseHandle(f->table_reader_handle);
+ f->table_reader_handle = nullptr;
+ }
+
+ if (file_metadata_cache_res_mgr_) {
+ Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation(
+ f->ApproximateMemoryUsage(), false /* increase */);
+ s.PermitUncheckedError();
+ }
+ delete f;
+ }
+ }
+
+ // Mapping used for checking the consistency of links between SST files and
+ // blob files. It is built using the forward links (table file -> blob file),
+ // and is subsequently compared with the inverse mapping stored in the
+ // BlobFileMetaData objects.
+ using ExpectedLinkedSsts =
+ std::unordered_map<uint64_t, BlobFileMetaData::LinkedSsts>;
+
+ static void UpdateExpectedLinkedSsts(
+ uint64_t table_file_number, uint64_t blob_file_number,
+ ExpectedLinkedSsts* expected_linked_ssts) {
+ assert(expected_linked_ssts);
+
+ if (blob_file_number == kInvalidBlobFileNumber) {
+ return;
+ }
+
+ (*expected_linked_ssts)[blob_file_number].emplace(table_file_number);
+ }
+
+ template <typename Checker>
+ Status CheckConsistencyDetailsForLevel(
+ const VersionStorageInfo* vstorage, int level, Checker checker,
+ const std::string& sync_point,
+ ExpectedLinkedSsts* expected_linked_ssts) const {
+#ifdef NDEBUG
+ (void)sync_point;
+#endif
+
+ assert(vstorage);
+ assert(level >= 0 && level < num_levels_);
+ assert(expected_linked_ssts);
+
+ const auto& level_files = vstorage->LevelFiles(level);
+
+ if (level_files.empty()) {
+ return Status::OK();
+ }
+
+ assert(level_files[0]);
+ UpdateExpectedLinkedSsts(level_files[0]->fd.GetNumber(),
+ level_files[0]->oldest_blob_file_number,
+ expected_linked_ssts);
+
+ for (size_t i = 1; i < level_files.size(); ++i) {
+ assert(level_files[i]);
+ UpdateExpectedLinkedSsts(level_files[i]->fd.GetNumber(),
+ level_files[i]->oldest_blob_file_number,
+ expected_linked_ssts);
+
+ auto lhs = level_files[i - 1];
+ auto rhs = level_files[i];
+
+#ifndef NDEBUG
+ auto pair = std::make_pair(&lhs, &rhs);
+ TEST_SYNC_POINT_CALLBACK(sync_point, &pair);
+#endif
+
+ const Status s = checker(lhs, rhs);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+ }
+
+ // Make sure table files are sorted correctly and that the links between
+ // table files and blob files are consistent.
+ Status CheckConsistencyDetails(const VersionStorageInfo* vstorage) const {
+ assert(vstorage);
+
+ ExpectedLinkedSsts expected_linked_ssts;
+
+ if (num_levels_ > 0) {
+ // Check L0
+ {
+ auto l0_checker = [this](const FileMetaData* lhs,
+ const FileMetaData* rhs) {
+ assert(lhs);
+ assert(rhs);
+
+ if (!level_zero_cmp_(lhs, rhs)) {
+ std::ostringstream oss;
+ oss << "L0 files are not sorted properly: files #"
+ << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ if (rhs->fd.smallest_seqno == rhs->fd.largest_seqno) {
+ // This is an external file that we ingested
+ const SequenceNumber external_file_seqno = rhs->fd.smallest_seqno;
+
+ if (!(external_file_seqno < lhs->fd.largest_seqno ||
+ external_file_seqno == 0)) {
+ std::ostringstream oss;
+ oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
+ << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
+ << " vs. file #" << rhs->fd.GetNumber()
+ << " with global_seqno " << external_file_seqno;
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+ } else if (lhs->fd.smallest_seqno <= rhs->fd.smallest_seqno) {
+ std::ostringstream oss;
+ oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno "
+ << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno
+ << " vs. file #" << rhs->fd.GetNumber() << " with seqno "
+ << rhs->fd.smallest_seqno << ' ' << rhs->fd.largest_seqno;
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ return Status::OK();
+ };
+
+ const Status s = CheckConsistencyDetailsForLevel(
+ vstorage, /* level */ 0, l0_checker,
+ "VersionBuilder::CheckConsistency0", &expected_linked_ssts);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Check L1 and up
+ const InternalKeyComparator* const icmp = vstorage->InternalComparator();
+ assert(icmp);
+
+ for (int level = 1; level < num_levels_; ++level) {
+ auto checker = [this, level, icmp](const FileMetaData* lhs,
+ const FileMetaData* rhs) {
+ assert(lhs);
+ assert(rhs);
+
+ if (!level_nonzero_cmp_(lhs, rhs)) {
+ std::ostringstream oss;
+ oss << 'L' << level << " files are not sorted properly: files #"
+ << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber();
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ // Make sure there is no overlap in level
+ if (icmp->Compare(lhs->largest, rhs->smallest) >= 0) {
+ std::ostringstream oss;
+ oss << 'L' << level << " has overlapping ranges: file #"
+ << lhs->fd.GetNumber()
+ << " largest key: " << lhs->largest.DebugString(true)
+ << " vs. file #" << rhs->fd.GetNumber()
+ << " smallest key: " << rhs->smallest.DebugString(true);
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ return Status::OK();
+ };
+
+ const Status s = CheckConsistencyDetailsForLevel(
+ vstorage, level, checker, "VersionBuilder::CheckConsistency1",
+ &expected_linked_ssts);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ // Make sure that all blob files in the version have non-garbage data and
+ // the links between them and the table files are consistent.
+ const auto& blob_files = vstorage->GetBlobFiles();
+ for (const auto& blob_file_meta : blob_files) {
+ assert(blob_file_meta);
+
+ const uint64_t blob_file_number = blob_file_meta->GetBlobFileNumber();
+
+ if (blob_file_meta->GetGarbageBlobCount() >=
+ blob_file_meta->GetTotalBlobCount()) {
+ std::ostringstream oss;
+ oss << "Blob file #" << blob_file_number
+ << " consists entirely of garbage";
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ if (blob_file_meta->GetLinkedSsts() !=
+ expected_linked_ssts[blob_file_number]) {
+ std::ostringstream oss;
+ oss << "Links are inconsistent between table files and blob file #"
+ << blob_file_number;
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+ }
+
+ Status ret_s;
+ TEST_SYNC_POINT_CALLBACK("VersionBuilder::CheckConsistencyBeforeReturn",
+ &ret_s);
+ return ret_s;
+ }
+
+ Status CheckConsistency(const VersionStorageInfo* vstorage) const {
+ assert(vstorage);
+
+ // Always run consistency checks in debug build
+#ifdef NDEBUG
+ if (!vstorage->force_consistency_checks()) {
+ return Status::OK();
+ }
+#endif
+ Status s = CheckConsistencyDetails(vstorage);
+ if (s.IsCorruption() && s.getState()) {
+ // Make it clear the error is due to force_consistency_checks = 1 or
+ // debug build
+#ifdef NDEBUG
+ auto prefix = "force_consistency_checks";
+#else
+ auto prefix = "force_consistency_checks(DEBUG)";
+#endif
+ s = Status::Corruption(prefix, s.getState());
+ } else {
+    // We only expect a corruption status with a message, or OK
+ assert(s.ok());
+ }
+ return s;
+ }
+
+ bool CheckConsistencyForNumLevels() const {
+ // Make sure there are no files on or beyond num_levels().
+ if (has_invalid_levels_) {
+ return false;
+ }
+
+ for (const auto& pair : invalid_level_sizes_) {
+ const size_t level_size = pair.second;
+ if (level_size != 0) {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ bool IsBlobFileInVersion(uint64_t blob_file_number) const {
+ auto mutable_it = mutable_blob_file_metas_.find(blob_file_number);
+ if (mutable_it != mutable_blob_file_metas_.end()) {
+ return true;
+ }
+
+ assert(base_vstorage_);
+ const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number);
+
+ return !!meta;
+ }
+
+ MutableBlobFileMetaData* GetOrCreateMutableBlobFileMetaData(
+ uint64_t blob_file_number) {
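+    // Return the mutable metadata if it already exists; otherwise, create it
+    // lazily from the base version's metadata (provided the blob file is
+    // present there).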
+ auto mutable_it = mutable_blob_file_metas_.find(blob_file_number);
+ if (mutable_it != mutable_blob_file_metas_.end()) {
+ return &mutable_it->second;
+ }
+
+ assert(base_vstorage_);
+ const auto meta = base_vstorage_->GetBlobFileMetaData(blob_file_number);
+
+ if (meta) {
+ mutable_it = mutable_blob_file_metas_
+ .emplace(blob_file_number, MutableBlobFileMetaData(meta))
+ .first;
+ return &mutable_it->second;
+ }
+
+ return nullptr;
+ }
+
+ Status ApplyBlobFileAddition(const BlobFileAddition& blob_file_addition) {
+ const uint64_t blob_file_number = blob_file_addition.GetBlobFileNumber();
+
+ if (IsBlobFileInVersion(blob_file_number)) {
+ std::ostringstream oss;
+ oss << "Blob file #" << blob_file_number << " already added";
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ // Note: we use C++11 for now but in C++14, this could be done in a more
+ // elegant way using generalized lambda capture.
+ VersionSet* const vs = version_set_;
+ const ImmutableCFOptions* const ioptions = ioptions_;
+
+ auto deleter = [vs, ioptions](SharedBlobFileMetaData* shared_meta) {
+ if (vs) {
+ assert(ioptions);
+ assert(!ioptions->cf_paths.empty());
+ assert(shared_meta);
+
+ vs->AddObsoleteBlobFile(shared_meta->GetBlobFileNumber(),
+ ioptions->cf_paths.front().path);
+ }
+
+ delete shared_meta;
+ };
+
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, blob_file_addition.GetTotalBlobCount(),
+ blob_file_addition.GetTotalBlobBytes(),
+ blob_file_addition.GetChecksumMethod(),
+ blob_file_addition.GetChecksumValue(), deleter);
+
+ mutable_blob_file_metas_.emplace(
+ blob_file_number, MutableBlobFileMetaData(std::move(shared_meta)));
+
+ return Status::OK();
+ }
+
+ Status ApplyBlobFileGarbage(const BlobFileGarbage& blob_file_garbage) {
+ const uint64_t blob_file_number = blob_file_garbage.GetBlobFileNumber();
+
+ MutableBlobFileMetaData* const mutable_meta =
+ GetOrCreateMutableBlobFileMetaData(blob_file_number);
+
+ if (!mutable_meta) {
+ std::ostringstream oss;
+ oss << "Blob file #" << blob_file_number << " not found";
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ if (!mutable_meta->AddGarbage(blob_file_garbage.GetGarbageBlobCount(),
+ blob_file_garbage.GetGarbageBlobBytes())) {
+ std::ostringstream oss;
+ oss << "Garbage overflow for blob file #" << blob_file_number;
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
+ return Status::OK();
+ }
+
+ int GetCurrentLevelForTableFile(uint64_t file_number) const {
+ auto it = table_file_levels_.find(file_number);
+ if (it != table_file_levels_.end()) {
+ return it->second;
+ }
+
+ assert(base_vstorage_);
+ return base_vstorage_->GetFileLocation(file_number).GetLevel();
+ }
+
+ uint64_t GetOldestBlobFileNumberForTableFile(int level,
+ uint64_t file_number) const {
+ assert(level < num_levels_);
+
+ const auto& added_files = levels_[level].added_files;
+
+ auto it = added_files.find(file_number);
+ if (it != added_files.end()) {
+ const FileMetaData* const meta = it->second;
+ assert(meta);
+
+ return meta->oldest_blob_file_number;
+ }
+
+ assert(base_vstorage_);
+ const FileMetaData* const meta =
+ base_vstorage_->GetFileMetaDataByNumber(file_number);
+ assert(meta);
+
+ return meta->oldest_blob_file_number;
+ }
+
+ Status ApplyFileDeletion(int level, uint64_t file_number) {
+ assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel());
+
+ const int current_level = GetCurrentLevelForTableFile(file_number);
+
+ if (level != current_level) {
+ if (level >= num_levels_) {
+ has_invalid_levels_ = true;
+ }
+
+ std::ostringstream oss;
+ oss << "Cannot delete table file #" << file_number << " from level "
+ << level << " since it is ";
+ if (current_level ==
+ VersionStorageInfo::FileLocation::Invalid().GetLevel()) {
+ oss << "not in the LSM tree";
+ } else {
+ oss << "on level " << current_level;
+ }
+
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
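+    // Deletions from levels beyond num_levels_ only update the bookkeeping in
+    // invalid_level_sizes_ and table_file_levels_.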
+ if (level >= num_levels_) {
+ assert(invalid_level_sizes_[level] > 0);
+ --invalid_level_sizes_[level];
+
+ table_file_levels_[file_number] =
+ VersionStorageInfo::FileLocation::Invalid().GetLevel();
+
+ return Status::OK();
+ }
+
+ const uint64_t blob_file_number =
+ GetOldestBlobFileNumberForTableFile(level, file_number);
+
+ if (blob_file_number != kInvalidBlobFileNumber) {
+ MutableBlobFileMetaData* const mutable_meta =
+ GetOrCreateMutableBlobFileMetaData(blob_file_number);
+ if (mutable_meta) {
+ mutable_meta->UnlinkSst(file_number);
+ }
+ }
+
+ auto& level_state = levels_[level];
+
+ auto& add_files = level_state.added_files;
+ auto add_it = add_files.find(file_number);
+ if (add_it != add_files.end()) {
+ UnrefFile(add_it->second);
+ add_files.erase(add_it);
+ }
+
+ auto& del_files = level_state.deleted_files;
+ assert(del_files.find(file_number) == del_files.end());
+ del_files.emplace(file_number);
+
+ table_file_levels_[file_number] =
+ VersionStorageInfo::FileLocation::Invalid().GetLevel();
+
+ return Status::OK();
+ }
+
+ Status ApplyFileAddition(int level, const FileMetaData& meta) {
+ assert(level != VersionStorageInfo::FileLocation::Invalid().GetLevel());
+
+ const uint64_t file_number = meta.fd.GetNumber();
+
+ const int current_level = GetCurrentLevelForTableFile(file_number);
+
+ if (current_level !=
+ VersionStorageInfo::FileLocation::Invalid().GetLevel()) {
+ if (level >= num_levels_) {
+ has_invalid_levels_ = true;
+ }
+
+ std::ostringstream oss;
+ oss << "Cannot add table file #" << file_number << " to level " << level
+ << " since it is already in the LSM tree on level " << current_level;
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+
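+    // Additions to levels beyond num_levels_ are only tracked in
+    // invalid_level_sizes_ and table_file_levels_; the FileMetaData itself is
+    // not copied.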
+ if (level >= num_levels_) {
+ ++invalid_level_sizes_[level];
+ table_file_levels_[file_number] = level;
+
+ return Status::OK();
+ }
+
+ auto& level_state = levels_[level];
+
+ auto& del_files = level_state.deleted_files;
+ auto del_it = del_files.find(file_number);
+ if (del_it != del_files.end()) {
+ del_files.erase(del_it);
+ }
+
+ FileMetaData* const f = new FileMetaData(meta);
+ f->refs = 1;
+
+ if (file_metadata_cache_res_mgr_) {
+ Status s = file_metadata_cache_res_mgr_->UpdateCacheReservation(
+ f->ApproximateMemoryUsage(), true /* increase */);
+ if (!s.ok()) {
+ delete f;
+ s = Status::MemoryLimit(
+ "Can't allocate " +
+ kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+ CacheEntryRole::kFileMetadata)] +
+ " due to exceeding the memory limit "
+ "based on "
+ "cache capacity");
+ return s;
+ }
+ }
+
+ auto& add_files = level_state.added_files;
+ assert(add_files.find(file_number) == add_files.end());
+ add_files.emplace(file_number, f);
+
+ const uint64_t blob_file_number = f->oldest_blob_file_number;
+
+ if (blob_file_number != kInvalidBlobFileNumber) {
+ MutableBlobFileMetaData* const mutable_meta =
+ GetOrCreateMutableBlobFileMetaData(blob_file_number);
+ if (mutable_meta) {
+ mutable_meta->LinkSst(file_number);
+ }
+ }
+
+ table_file_levels_[file_number] = level;
+
+ return Status::OK();
+ }
+
+ Status ApplyCompactCursors(int level,
+ const InternalKey& smallest_uncompacted_key) {
+ if (level < 0) {
+ std::ostringstream oss;
+ oss << "Cannot add compact cursor (" << level << ","
+ << smallest_uncompacted_key.Encode().ToString()
+ << " due to invalid level (level = " << level << ")";
+ return Status::Corruption("VersionBuilder", oss.str());
+ }
+ if (level < num_levels_) {
+      // Omit levels (>= num_levels_) when reopening with a smaller num_levels_
+ updated_compact_cursors_[level] = smallest_uncompacted_key;
+ }
+ return Status::OK();
+ }
+
+ // Apply all of the edits in *edit to the current state.
+ Status Apply(const VersionEdit* edit) {
+ {
+ const Status s = CheckConsistency(base_vstorage_);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Note: we process the blob file related changes first because the
+ // table file addition/deletion logic depends on the blob files
+ // already being there.
+
+ // Add new blob files
+ for (const auto& blob_file_addition : edit->GetBlobFileAdditions()) {
+ const Status s = ApplyBlobFileAddition(blob_file_addition);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Increase the amount of garbage for blob files affected by GC
+ for (const auto& blob_file_garbage : edit->GetBlobFileGarbages()) {
+ const Status s = ApplyBlobFileGarbage(blob_file_garbage);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Delete table files
+ for (const auto& deleted_file : edit->GetDeletedFiles()) {
+ const int level = deleted_file.first;
+ const uint64_t file_number = deleted_file.second;
+
+ const Status s = ApplyFileDeletion(level, file_number);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ // Add new table files
+ for (const auto& new_file : edit->GetNewFiles()) {
+ const int level = new_file.first;
+ const FileMetaData& meta = new_file.second;
+
+ const Status s = ApplyFileAddition(level, meta);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+    // Populate the compact cursors for round-robin compaction; a cursor left
+    // empty indicates that it is invalid
+ for (const auto& cursor : edit->GetCompactCursors()) {
+ const int level = cursor.first;
+ const InternalKey smallest_uncompacted_key = cursor.second;
+ const Status s = ApplyCompactCursors(level, smallest_uncompacted_key);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return Status::OK();
+ }
+
+ // Helper function template for merging the blob file metadata from the base
+ // version with the mutable metadata representing the state after applying the
+ // edits. The function objects process_base and process_mutable are
+ // respectively called to handle a base version object when there is no
+ // matching mutable object, and a mutable object when there is no matching
+ // base version object. process_both is called to perform the merge when a
+ // given blob file appears both in the base version and the mutable list. The
+ // helper stops processing objects if a function object returns false. Blob
+ // files with a file number below first_blob_file are not processed.
+ template <typename ProcessBase, typename ProcessMutable, typename ProcessBoth>
+ void MergeBlobFileMetas(uint64_t first_blob_file, ProcessBase process_base,
+ ProcessMutable process_mutable,
+ ProcessBoth process_both) const {
+ assert(base_vstorage_);
+
+ auto base_it = base_vstorage_->GetBlobFileMetaDataLB(first_blob_file);
+ const auto base_it_end = base_vstorage_->GetBlobFiles().end();
+
+ auto mutable_it = mutable_blob_file_metas_.lower_bound(first_blob_file);
+ const auto mutable_it_end = mutable_blob_file_metas_.end();
+
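+    // Two-way merge over the base version's blob files and the mutable
+    // overrides; both sequences are ordered by blob file number.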
+ while (base_it != base_it_end && mutable_it != mutable_it_end) {
+ const auto& base_meta = *base_it;
+ assert(base_meta);
+
+ const uint64_t base_blob_file_number = base_meta->GetBlobFileNumber();
+ const uint64_t mutable_blob_file_number = mutable_it->first;
+
+ if (base_blob_file_number < mutable_blob_file_number) {
+ if (!process_base(base_meta)) {
+ return;
+ }
+
+ ++base_it;
+ } else if (mutable_blob_file_number < base_blob_file_number) {
+ const auto& mutable_meta = mutable_it->second;
+
+ if (!process_mutable(mutable_meta)) {
+ return;
+ }
+
+ ++mutable_it;
+ } else {
+ assert(base_blob_file_number == mutable_blob_file_number);
+
+ const auto& mutable_meta = mutable_it->second;
+
+ if (!process_both(base_meta, mutable_meta)) {
+ return;
+ }
+
+ ++base_it;
+ ++mutable_it;
+ }
+ }
+
+ while (base_it != base_it_end) {
+ const auto& base_meta = *base_it;
+
+ if (!process_base(base_meta)) {
+ return;
+ }
+
+ ++base_it;
+ }
+
+ while (mutable_it != mutable_it_end) {
+ const auto& mutable_meta = mutable_it->second;
+
+ if (!process_mutable(mutable_meta)) {
+ return;
+ }
+
+ ++mutable_it;
+ }
+ }
+
+ // Helper function template for finding the first blob file that has linked
+ // SSTs.
+ template <typename Meta>
+ static bool CheckLinkedSsts(const Meta& meta,
+ uint64_t* min_oldest_blob_file_num) {
+ assert(min_oldest_blob_file_num);
+
+ if (!meta.GetLinkedSsts().empty()) {
+ assert(*min_oldest_blob_file_num == kInvalidBlobFileNumber);
+
+ *min_oldest_blob_file_num = meta.GetBlobFileNumber();
+
+ return false;
+ }
+
+ return true;
+ }
+
+ // Find the oldest blob file that has linked SSTs.
+ uint64_t GetMinOldestBlobFileNumber() const {
+ uint64_t min_oldest_blob_file_num = kInvalidBlobFileNumber;
+
+ auto process_base =
+ [&min_oldest_blob_file_num](
+ const std::shared_ptr<BlobFileMetaData>& base_meta) {
+ assert(base_meta);
+
+ return CheckLinkedSsts(*base_meta, &min_oldest_blob_file_num);
+ };
+
+ auto process_mutable = [&min_oldest_blob_file_num](
+ const MutableBlobFileMetaData& mutable_meta) {
+ return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
+ };
+
+ auto process_both = [&min_oldest_blob_file_num](
+ const std::shared_ptr<BlobFileMetaData>& base_meta,
+ const MutableBlobFileMetaData& mutable_meta) {
+#ifndef NDEBUG
+ assert(base_meta);
+ assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
+#else
+ (void)base_meta;
+#endif
+
+ // Look at mutable_meta since it supersedes *base_meta
+ return CheckLinkedSsts(mutable_meta, &min_oldest_blob_file_num);
+ };
+
+ MergeBlobFileMetas(kInvalidBlobFileNumber, process_base, process_mutable,
+ process_both);
+
+ return min_oldest_blob_file_num;
+ }
+
+ static std::shared_ptr<BlobFileMetaData> CreateBlobFileMetaData(
+ const MutableBlobFileMetaData& mutable_meta) {
+ return BlobFileMetaData::Create(
+ mutable_meta.GetSharedMeta(), mutable_meta.GetLinkedSsts(),
+ mutable_meta.GetGarbageBlobCount(), mutable_meta.GetGarbageBlobBytes());
+ }
+
+ // Add the blob file specified by meta to *vstorage if it is determined to
+ // contain valid data (blobs).
+ template <typename Meta>
+ static void AddBlobFileIfNeeded(VersionStorageInfo* vstorage, Meta&& meta) {
+ assert(vstorage);
+ assert(meta);
+
+ if (meta->GetLinkedSsts().empty() &&
+ meta->GetGarbageBlobCount() >= meta->GetTotalBlobCount()) {
+ return;
+ }
+
+ vstorage->AddBlobFile(std::forward<Meta>(meta));
+ }
+
+ // Merge the blob file metadata from the base version with the changes (edits)
+ // applied, and save the result into *vstorage.
+ void SaveBlobFilesTo(VersionStorageInfo* vstorage) const {
+ assert(vstorage);
+
+ assert(base_vstorage_);
+ vstorage->ReserveBlob(base_vstorage_->GetBlobFiles().size() +
+ mutable_blob_file_metas_.size());
+
+ const uint64_t oldest_blob_file_with_linked_ssts =
+ GetMinOldestBlobFileNumber();
+
+ auto process_base =
+ [vstorage](const std::shared_ptr<BlobFileMetaData>& base_meta) {
+ assert(base_meta);
+
+ AddBlobFileIfNeeded(vstorage, base_meta);
+
+ return true;
+ };
+
+ auto process_mutable =
+ [vstorage](const MutableBlobFileMetaData& mutable_meta) {
+ AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
+
+ return true;
+ };
+
+ auto process_both = [vstorage](
+ const std::shared_ptr<BlobFileMetaData>& base_meta,
+ const MutableBlobFileMetaData& mutable_meta) {
+ assert(base_meta);
+ assert(base_meta->GetSharedMeta() == mutable_meta.GetSharedMeta());
+
+ if (!mutable_meta.HasDelta()) {
+ assert(base_meta->GetGarbageBlobCount() ==
+ mutable_meta.GetGarbageBlobCount());
+ assert(base_meta->GetGarbageBlobBytes() ==
+ mutable_meta.GetGarbageBlobBytes());
+ assert(base_meta->GetLinkedSsts() == mutable_meta.GetLinkedSsts());
+
+ AddBlobFileIfNeeded(vstorage, base_meta);
+
+ return true;
+ }
+
+ AddBlobFileIfNeeded(vstorage, CreateBlobFileMetaData(mutable_meta));
+
+ return true;
+ };
+
+ MergeBlobFileMetas(oldest_blob_file_with_linked_ssts, process_base,
+ process_mutable, process_both);
+ }
+
+ void MaybeAddFile(VersionStorageInfo* vstorage, int level,
+ FileMetaData* f) const {
+ const uint64_t file_number = f->fd.GetNumber();
+
+ const auto& level_state = levels_[level];
+
+ const auto& del_files = level_state.deleted_files;
+ const auto del_it = del_files.find(file_number);
+
+ if (del_it != del_files.end()) {
+ // f is to-be-deleted table file
+ vstorage->RemoveCurrentStats(f);
+ } else {
+ const auto& add_files = level_state.added_files;
+ const auto add_it = add_files.find(file_number);
+
+ // Note: if the file appears both in the base version and in the added
+ // list, the added FileMetaData supersedes the one in the base version.
+ if (add_it != add_files.end() && add_it->second != f) {
+ vstorage->RemoveCurrentStats(f);
+ } else {
+ vstorage->AddFile(level, f);
+ }
+ }
+ }
+
+ template <typename Cmp>
+ void SaveSSTFilesTo(VersionStorageInfo* vstorage, int level, Cmp cmp) const {
+ // Merge the set of added files with the set of pre-existing files.
+ // Drop any deleted files. Store the result in *vstorage.
+ const auto& base_files = base_vstorage_->LevelFiles(level);
+ const auto& unordered_added_files = levels_[level].added_files;
+ vstorage->Reserve(level, base_files.size() + unordered_added_files.size());
+
+ // Sort added files for the level.
+ std::vector<FileMetaData*> added_files;
+ added_files.reserve(unordered_added_files.size());
+ for (const auto& pair : unordered_added_files) {
+ added_files.push_back(pair.second);
+ }
+ std::sort(added_files.begin(), added_files.end(), cmp);
+
+ auto base_iter = base_files.begin();
+ auto base_end = base_files.end();
+ auto added_iter = added_files.begin();
+ auto added_end = added_files.end();
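+    // Two-way merge of the sorted added files with the sorted base files;
+    // MaybeAddFile() drops files that have been deleted or superseded.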
+ while (added_iter != added_end || base_iter != base_end) {
+ if (base_iter == base_end ||
+ (added_iter != added_end && cmp(*added_iter, *base_iter))) {
+ MaybeAddFile(vstorage, level, *added_iter++);
+ } else {
+ MaybeAddFile(vstorage, level, *base_iter++);
+ }
+ }
+ }
+
+ void SaveSSTFilesTo(VersionStorageInfo* vstorage) const {
+ assert(vstorage);
+
+ if (!num_levels_) {
+ return;
+ }
+
+ SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_);
+
+ for (int level = 1; level < num_levels_; ++level) {
+ SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_);
+ }
+ }
+
+ void SaveCompactCursorsTo(VersionStorageInfo* vstorage) const {
+ for (auto iter = updated_compact_cursors_.begin();
+ iter != updated_compact_cursors_.end(); iter++) {
+ vstorage->AddCursorForOneLevel(iter->first, iter->second);
+ }
+ }
+
+ // Save the current state in *vstorage.
+ Status SaveTo(VersionStorageInfo* vstorage) const {
+ Status s;
+
+#ifndef NDEBUG
+ // The same check is done within Apply() so we skip it in release mode.
+ s = CheckConsistency(base_vstorage_);
+ if (!s.ok()) {
+ return s;
+ }
+#endif // NDEBUG
+
+ s = CheckConsistency(vstorage);
+ if (!s.ok()) {
+ return s;
+ }
+
+ SaveSSTFilesTo(vstorage);
+
+ SaveBlobFilesTo(vstorage);
+
+ SaveCompactCursorsTo(vstorage);
+
+ s = CheckConsistency(vstorage);
+ return s;
+ }
+
+ Status LoadTableHandlers(
+ InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ size_t max_file_size_for_l0_meta_pin) {
+ assert(table_cache_ != nullptr);
+
+ size_t table_cache_capacity = table_cache_->get_cache()->GetCapacity();
+ bool always_load = (table_cache_capacity == TableCache::kInfiniteCapacity);
+ size_t max_load = std::numeric_limits<size_t>::max();
+
+ if (!always_load) {
+      // If this is the initial load and we are not configured to always load
+      // all the files, we only load up to kInitialLoadLimit files to limit
+      // the time spent reopening the DB.
+ const size_t kInitialLoadLimit = 16;
+ size_t load_limit;
+      // If the table cache is less than 1/4 full, we pin the table handle in
+      // the file metadata to avoid the cache read costs when reading the
+      // file. The downside of pinning those handles is that LRU won't be
+      // followed for those files. This doesn't matter much because if the
+      // number of files in the DB exceeds the table cache capacity,
+      // eventually no table reader will be pinned and LRU will be followed.
+ if (is_initial_load) {
+ load_limit = std::min(kInitialLoadLimit, table_cache_capacity / 4);
+ } else {
+ load_limit = table_cache_capacity / 4;
+ }
+
+ size_t table_cache_usage = table_cache_->get_cache()->GetUsage();
+ if (table_cache_usage >= load_limit) {
+ // TODO (yanqin) find a suitable status code.
+ return Status::OK();
+ } else {
+ max_load = load_limit - table_cache_usage;
+ }
+ }
+
+ // <file metadata, level>
+ std::vector<std::pair<FileMetaData*, int>> files_meta;
+ std::vector<Status> statuses;
+ for (int level = 0; level < num_levels_; level++) {
+ for (auto& file_meta_pair : levels_[level].added_files) {
+ auto* file_meta = file_meta_pair.second;
+ // If the file has been opened before, just skip it.
+ if (!file_meta->table_reader_handle) {
+ files_meta.emplace_back(file_meta, level);
+ statuses.emplace_back(Status::OK());
+ }
+ if (files_meta.size() >= max_load) {
+ break;
+ }
+ }
+ if (files_meta.size() >= max_load) {
+ break;
+ }
+ }
+
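+    // Open the collected table files in parallel: each worker thread claims
+    // the next index atomically and loads the corresponding table reader.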
+ std::atomic<size_t> next_file_meta_idx(0);
+ std::function<void()> load_handlers_func([&]() {
+ while (true) {
+ size_t file_idx = next_file_meta_idx.fetch_add(1);
+ if (file_idx >= files_meta.size()) {
+ break;
+ }
+
+ auto* file_meta = files_meta[file_idx].first;
+ int level = files_meta[file_idx].second;
+ statuses[file_idx] = table_cache_->FindTable(
+ ReadOptions(), file_options_,
+ *(base_vstorage_->InternalComparator()), *file_meta,
+ &file_meta->table_reader_handle, prefix_extractor, false /*no_io */,
+ true /* record_read_stats */,
+ internal_stats->GetFileReadHist(level), false, level,
+ prefetch_index_and_filter_in_cache, max_file_size_for_l0_meta_pin,
+ file_meta->temperature);
+ if (file_meta->table_reader_handle != nullptr) {
+ // Load table_reader
+ file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle(
+ file_meta->table_reader_handle);
+ }
+ }
+ });
+
+ std::vector<port::Thread> threads;
+ for (int i = 1; i < max_threads; i++) {
+ threads.emplace_back(load_handlers_func);
+ }
+ load_handlers_func();
+ for (auto& t : threads) {
+ t.join();
+ }
+ Status ret;
+ for (const auto& s : statuses) {
+ if (!s.ok()) {
+ if (ret.ok()) {
+ ret = s;
+ }
+ }
+ }
+ return ret;
+ }
+};
+
+VersionBuilder::VersionBuilder(
+ const FileOptions& file_options, const ImmutableCFOptions* ioptions,
+ TableCache* table_cache, VersionStorageInfo* base_vstorage,
+ VersionSet* version_set,
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr)
+ : rep_(new Rep(file_options, ioptions, table_cache, base_vstorage,
+ version_set, file_metadata_cache_res_mgr)) {}
+
+VersionBuilder::~VersionBuilder() = default;
+
+bool VersionBuilder::CheckConsistencyForNumLevels() {
+ return rep_->CheckConsistencyForNumLevels();
+}
+
+Status VersionBuilder::Apply(const VersionEdit* edit) {
+ return rep_->Apply(edit);
+}
+
+Status VersionBuilder::SaveTo(VersionStorageInfo* vstorage) const {
+ return rep_->SaveTo(vstorage);
+}
+
+Status VersionBuilder::LoadTableHandlers(
+ InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ size_t max_file_size_for_l0_meta_pin) {
+ return rep_->LoadTableHandlers(
+ internal_stats, max_threads, prefetch_index_and_filter_in_cache,
+ is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin);
+}
+
+uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const {
+ return rep_->GetMinOldestBlobFileNumber();
+}
+
+BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
+ ColumnFamilyData* cfd)
+ : version_builder_(new VersionBuilder(
+ cfd->current()->version_set()->file_options(), cfd->ioptions(),
+ cfd->table_cache(), cfd->current()->storage_info(),
+ cfd->current()->version_set(),
+ cfd->GetFileMetadataCacheReservationManager())),
+ version_(cfd->current()) {
+ version_->Ref();
+}
+
+BaseReferencedVersionBuilder::BaseReferencedVersionBuilder(
+ ColumnFamilyData* cfd, Version* v)
+ : version_builder_(new VersionBuilder(
+ cfd->current()->version_set()->file_options(), cfd->ioptions(),
+ cfd->table_cache(), v->storage_info(), v->version_set(),
+ cfd->GetFileMetadataCacheReservationManager())),
+ version_(v) {
+ assert(version_ != cfd->current());
+}
+
+BaseReferencedVersionBuilder::~BaseReferencedVersionBuilder() {
+ version_->Unref();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder.h b/src/rocksdb/db/version_builder.h
new file mode 100644
index 000000000..1c022832a
--- /dev/null
+++ b/src/rocksdb/db/version_builder.h
@@ -0,0 +1,72 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+#pragma once
+
+#include <memory>
+
+#include "rocksdb/file_system.h"
+#include "rocksdb/slice_transform.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ImmutableCFOptions;
+class TableCache;
+class VersionStorageInfo;
+class VersionEdit;
+struct FileMetaData;
+class InternalStats;
+class Version;
+class VersionSet;
+class ColumnFamilyData;
+class CacheReservationManager;
+
+// A helper class so we can efficiently apply a whole sequence
+// of edits to a particular state without creating intermediate
+// Versions that contain full copies of the intermediate state.
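+//
+// Typical usage: construct the builder on top of a base VersionStorageInfo,
+// call Apply() for each VersionEdit, then call SaveTo() to materialize the
+// resulting state into a new VersionStorageInfo (see version_builder_test.cc
+// for examples).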
+class VersionBuilder {
+ public:
+ VersionBuilder(const FileOptions& file_options,
+ const ImmutableCFOptions* ioptions, TableCache* table_cache,
+ VersionStorageInfo* base_vstorage, VersionSet* version_set,
+ std::shared_ptr<CacheReservationManager>
+ file_metadata_cache_res_mgr = nullptr);
+ ~VersionBuilder();
+
+ bool CheckConsistencyForNumLevels();
+ Status Apply(const VersionEdit* edit);
+ Status SaveTo(VersionStorageInfo* vstorage) const;
+ Status LoadTableHandlers(
+ InternalStats* internal_stats, int max_threads,
+ bool prefetch_index_and_filter_in_cache, bool is_initial_load,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ size_t max_file_size_for_l0_meta_pin);
+ uint64_t GetMinOldestBlobFileNumber() const;
+
+ private:
+ class Rep;
+ std::unique_ptr<Rep> rep_;
+};
+
+// A wrapper around VersionBuilder that references the current version in its
+// constructor and unreferences it in its destructor.
+// Both the constructor and the destructor need to be called while holding the
+// DB mutex.
+class BaseReferencedVersionBuilder {
+ public:
+ explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd);
+ BaseReferencedVersionBuilder(ColumnFamilyData* cfd, Version* v);
+ ~BaseReferencedVersionBuilder();
+ VersionBuilder* version_builder() const { return version_builder_.get(); }
+
+ private:
+ std::unique_ptr<VersionBuilder> version_builder_;
+ Version* version_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_builder_test.cc b/src/rocksdb/db/version_builder_test.cc
new file mode 100644
index 000000000..ee5c3f2e3
--- /dev/null
+++ b/src/rocksdb/db/version_builder_test.cc
@@ -0,0 +1,1695 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <cstring>
+#include <iomanip>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "db/version_edit.h"
+#include "db/version_set.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class VersionBuilderTest : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ Options options_;
+ ImmutableOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ VersionStorageInfo vstorage_;
+ uint32_t file_num_;
+ CompactionOptionsFIFO fifo_options_;
+ std::vector<uint64_t> size_being_compacted_;
+
+ VersionBuilderTest()
+ : ucmp_(BytewiseComparator()),
+ icmp_(ucmp_),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel,
+ nullptr, false),
+ file_num_(1) {
+ mutable_cf_options_.RefreshDerivedOptions(ioptions_);
+ size_being_compacted_.resize(options_.num_levels);
+ }
+
+ ~VersionBuilderTest() override {
+ for (int i = 0; i < vstorage_.num_levels(); i++) {
+ for (auto* f : vstorage_.LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+ }
+
+ InternalKey GetInternalKey(const char* ukey,
+ SequenceNumber smallest_seq = 100) {
+ return InternalKey(ukey, smallest_seq, kTypeValue);
+ }
+
+ void Add(int level, uint64_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 0, uint32_t path_id = 0,
+ SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100,
+ uint64_t num_entries = 0, uint64_t num_deletions = 0,
+ bool sampled = false, SequenceNumber smallest_seqno = 0,
+ SequenceNumber largest_seqno = 0,
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+ assert(level < vstorage_.num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq),
+ GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno,
+ /* marked_for_compact */ false, Temperature::kUnknown,
+ oldest_blob_file_number, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ f->compensated_file_size = file_size;
+ f->num_entries = num_entries;
+ f->num_deletions = num_deletions;
+ vstorage_.AddFile(level, f);
+ if (sampled) {
+ f->init_stats_from_file = true;
+ vstorage_.UpdateAccumulatedStats(f);
+ }
+ }
+
+ void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value,
+ BlobFileMetaData::LinkedSsts linked_ssts,
+ uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) {
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, total_blob_count, total_blob_bytes,
+ std::move(checksum_method), std::move(checksum_value));
+ auto meta =
+ BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts),
+ garbage_blob_count, garbage_blob_bytes);
+
+ vstorage_.AddBlobFile(std::move(meta));
+ }
+
+ void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number) {
+ constexpr int level = 0;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr uint64_t file_size = 100;
+ constexpr uint32_t path_id = 0;
+ constexpr SequenceNumber smallest_seq = 0;
+ constexpr SequenceNumber largest_seq = 0;
+ constexpr uint64_t num_entries = 0;
+ constexpr uint64_t num_deletions = 0;
+ constexpr bool sampled = false;
+
+ Add(level, table_file_number, smallest, largest, file_size, path_id,
+ smallest_seq, largest_seq, num_entries, num_deletions, sampled,
+ smallest_seq, largest_seq, blob_file_number);
+ }
+
+ void AddDummyFileToEdit(VersionEdit* edit, uint64_t table_file_number,
+ uint64_t blob_file_number) {
+ assert(edit);
+
+ constexpr int level = 0;
+ constexpr uint32_t path_id = 0;
+ constexpr uint64_t file_size = 100;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr SequenceNumber smallest_seqno = 100;
+ constexpr SequenceNumber largest_seqno = 300;
+ constexpr bool marked_for_compaction = false;
+
+ edit->AddFile(
+ level, table_file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown, blob_file_number,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ }
+
+ void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) {
+ assert(vstorage);
+
+ vstorage->PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+ vstorage->SetFinalized();
+ }
+
+ void UpdateVersionStorageInfo() { UpdateVersionStorageInfo(&vstorage_); }
+};
+
+void UnrefFilesInVersion(VersionStorageInfo* new_vstorage) {
+ for (int i = 0; i < new_vstorage->num_levels(); i++) {
+ for (auto* f : new_vstorage->LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveTo) {
+ Add(0, 1U, "150", "200", 100U);
+
+ Add(1, 66U, "150", "200", 100U);
+ Add(1, 88U, "201", "300", 100U);
+
+ Add(2, 6U, "150", "179", 100U);
+ Add(2, 7U, "180", "220", 100U);
+ Add(2, 8U, "221", "300", 100U);
+
+ Add(3, 26U, "150", "170", 100U);
+ Add(3, 27U, "171", "179", 100U);
+ Add(3, 28U, "191", "220", 100U);
+ Add(3, 29U, "221", "300", 100U);
+
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(
+ 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.DeleteFile(3, 27U);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2));
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+
+ Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+ Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+ Add(4, 6U, "150", "179", 100U);
+ Add(4, 7U, "180", "220", 100U);
+ Add(4, 8U, "221", "300", 100U);
+
+ Add(5, 26U, "150", "170", 100U);
+ Add(5, 27U, "171", "179", 100U);
+
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(
+ 3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.DeleteFile(0, 1U);
+ version_edit.DeleteFile(0, 88U);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+ ASSERT_EQ(100U, new_vstorage.NumLevelBytes(3));
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(4));
+ ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+
+ Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U);
+ Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U);
+
+ Add(4, 6U, "150", "179", 100U);
+ Add(4, 7U, "180", "220", 100U);
+ Add(4, 8U, "221", "300", 100U);
+
+ Add(5, 26U, "150", "170", 100U);
+ Add(5, 27U, "171", "179", 100U);
+
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(
+ 4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.DeleteFile(0, 1U);
+ version_edit.DeleteFile(0, 88U);
+ version_edit.DeleteFile(4, 6U);
+ version_edit.DeleteFile(4, 7U);
+ version_edit.DeleteFile(4, 8U);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(0U, new_vstorage.NumLevelBytes(0));
+ ASSERT_EQ(100U, new_vstorage.NumLevelBytes(4));
+ ASSERT_EQ(200U, new_vstorage.NumLevelBytes(5));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) {
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.AddFile(
+ 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) {
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr, false);
+
+ VersionEdit version_edit;
+ version_edit.AddFile(
+ 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit.AddFile(
+ 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+
+ VersionEdit version_edit2;
+ version_edit.AddFile(
+ 2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ version_edit2.DeleteFile(2, 616);
+ version_edit2.DeleteFile(2, 636);
+ version_edit.AddFile(
+ 2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200,
+ false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ ASSERT_OK(version_builder.Apply(&version_edit2));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyFileDeletionIncorrectLevel) {
+ constexpr int level = 1;
+ constexpr uint64_t file_number = 2345;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr uint64_t file_size = 100;
+
+ Add(level, file_number, smallest, largest, file_size);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr int incorrect_level = 3;
+
+ edit.DeleteFile(incorrect_level, file_number);
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(),
+ "Cannot delete table file #2345 from level 3 since "
+ "it is on level 1"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileDeletionNotInLSMTree) {
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr int level = 3;
+ constexpr uint64_t file_number = 1234;
+
+ edit.DeleteFile(level, file_number);
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(),
+ "Cannot delete table file #1234 from level 3 since "
+ "it is not in the LSM tree"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) {
+ constexpr int level = 1;
+ constexpr uint64_t file_number = 2345;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr uint64_t file_size = 10000;
+ constexpr uint32_t path_id = 0;
+ constexpr SequenceNumber smallest_seq = 100;
+ constexpr SequenceNumber largest_seq = 500;
+ constexpr uint64_t num_entries = 0;
+ constexpr uint64_t num_deletions = 0;
+ constexpr bool sampled = false;
+ constexpr SequenceNumber smallest_seqno = 1;
+ constexpr SequenceNumber largest_seqno = 1000;
+
+ Add(level, file_number, smallest, largest, file_size, path_id, smallest_seq,
+ largest_seq, num_entries, num_deletions, sampled, smallest_seqno,
+ largest_seqno);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit deletion;
+
+ deletion.DeleteFile(level, file_number);
+
+ ASSERT_OK(builder.Apply(&deletion));
+
+ VersionEdit addition;
+
+ constexpr bool marked_for_compaction = false;
+
+ addition.AddFile(level, file_number, path_id, file_size,
+ GetInternalKey(smallest, smallest_seq),
+ GetInternalKey(largest, largest_seq), smallest_seqno,
+ largest_seqno, marked_for_compaction, Temperature::kUnknown,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ ASSERT_OK(builder.Apply(&addition));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_EQ(new_vstorage.GetFileLocation(file_number).GetLevel(), level);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) {
+ constexpr int level = 1;
+ constexpr uint64_t file_number = 2345;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr uint64_t file_size = 10000;
+
+ Add(level, file_number, smallest, largest, file_size);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr int new_level = 2;
+ constexpr uint32_t path_id = 0;
+ constexpr SequenceNumber smallest_seqno = 100;
+ constexpr SequenceNumber largest_seqno = 1000;
+ constexpr bool marked_for_compaction = false;
+
+ edit.AddFile(
+ new_level, file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(),
+ "Cannot add table file #2345 to level 2 since it is "
+ "already in the LSM tree on level 1"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) {
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr int level = 3;
+ constexpr uint64_t file_number = 2345;
+ constexpr uint32_t path_id = 0;
+ constexpr uint64_t file_size = 10000;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr SequenceNumber smallest_seqno = 100;
+ constexpr SequenceNumber largest_seqno = 1000;
+ constexpr bool marked_for_compaction = false;
+
+ edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ VersionEdit other_edit;
+
+ constexpr int new_level = 2;
+
+ other_edit.AddFile(
+ new_level, file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ const Status s = builder.Apply(&other_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(),
+ "Cannot add table file #2345 to level 2 since it is "
+ "already in the LSM tree on level 3"));
+}
+
+TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) {
+ UpdateVersionStorageInfo();
+
+ constexpr int level = 1;
+ constexpr uint64_t file_number = 2345;
+ constexpr uint32_t path_id = 0;
+ constexpr uint64_t file_size = 10000;
+ constexpr char smallest[] = "bar";
+ constexpr char largest[] = "foo";
+ constexpr SequenceNumber smallest_seqno = 100;
+ constexpr SequenceNumber largest_seqno = 1000;
+ constexpr bool marked_for_compaction = false;
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit addition;
+
+ addition.AddFile(
+ level, file_number, path_id, file_size, GetInternalKey(smallest),
+ GetInternalKey(largest), smallest_seqno, largest_seqno,
+ marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ ASSERT_OK(builder.Apply(&addition));
+
+ VersionEdit deletion;
+
+ deletion.DeleteFile(level, file_number);
+
+ ASSERT_OK(builder.Apply(&deletion));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ ASSERT_FALSE(new_vstorage.GetFileLocation(file_number).IsValid());
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileAddition) {
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ // Add dummy table file to ensure the blob file is referenced.
+ constexpr uint64_t table_file_number = 1;
+ AddDummyFileToEdit(&edit, table_file_number, blob_file_number);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ const auto& new_blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(new_blob_files.size(), 1);
+
+ const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+ ASSERT_NE(new_meta, nullptr);
+ ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count);
+ ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes);
+ ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value);
+ ASSERT_EQ(new_meta->GetLinkedSsts(),
+ BlobFileMetaData::LinkedSsts{table_file_number});
+ ASSERT_EQ(new_meta->GetGarbageBlobCount(), 0);
+ ASSERT_EQ(new_meta->GetGarbageBlobBytes(), 0);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyInBase) {
+ // Attempt to add a blob file that is already present in the base version.
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+ constexpr uint64_t garbage_blob_count = 123;
+ constexpr uint64_t garbage_blob_bytes = 456789;
+
+ AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+ checksum_value, BlobFileMetaData::LinkedSsts(), garbage_blob_count,
+ garbage_blob_bytes);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added"));
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileAdditionAlreadyApplied) {
+ // Attempt to add the same blob file twice using version edits.
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 already added"));
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileInBase) {
+ // Increase the amount of garbage for a blob file present in the base version.
+
+ constexpr uint64_t table_file_number = 1;
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+ constexpr uint64_t garbage_blob_count = 123;
+ constexpr uint64_t garbage_blob_bytes = 456789;
+
+ AddBlob(blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+ checksum_value, BlobFileMetaData::LinkedSsts{table_file_number},
+ garbage_blob_count, garbage_blob_bytes);
+
+ const auto meta = vstorage_.GetBlobFileMetaData(blob_file_number);
+ ASSERT_NE(meta, nullptr);
+
+ // Add dummy table file to ensure the blob file is referenced.
+ AddDummyFile(table_file_number, blob_file_number);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr uint64_t new_garbage_blob_count = 456;
+ constexpr uint64_t new_garbage_blob_bytes = 111111;
+
+ edit.AddBlobFileGarbage(blob_file_number, new_garbage_blob_count,
+ new_garbage_blob_bytes);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ const auto& new_blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(new_blob_files.size(), 1);
+
+ const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+ ASSERT_NE(new_meta, nullptr);
+ ASSERT_EQ(new_meta->GetSharedMeta(), meta->GetSharedMeta());
+ ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count);
+ ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes);
+ ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value);
+ ASSERT_EQ(new_meta->GetLinkedSsts(),
+ BlobFileMetaData::LinkedSsts{table_file_number});
+ ASSERT_EQ(new_meta->GetGarbageBlobCount(),
+ garbage_blob_count + new_garbage_blob_count);
+ ASSERT_EQ(new_meta->GetGarbageBlobBytes(),
+ garbage_blob_bytes + new_garbage_blob_bytes);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileAdditionApplied) {
+ // Increase the amount of garbage for a blob file added using a version edit.
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit addition;
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+
+ addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ // Add dummy table file to ensure the blob file is referenced.
+ constexpr uint64_t table_file_number = 1;
+ AddDummyFileToEdit(&addition, table_file_number, blob_file_number);
+
+ ASSERT_OK(builder.Apply(&addition));
+
+ constexpr uint64_t garbage_blob_count = 123;
+ constexpr uint64_t garbage_blob_bytes = 456789;
+
+ VersionEdit garbage;
+
+ garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ ASSERT_OK(builder.Apply(&garbage));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ const auto& new_blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(new_blob_files.size(), 1);
+
+ const auto new_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+ ASSERT_NE(new_meta, nullptr);
+ ASSERT_EQ(new_meta->GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(new_meta->GetTotalBlobCount(), total_blob_count);
+ ASSERT_EQ(new_meta->GetTotalBlobBytes(), total_blob_bytes);
+ ASSERT_EQ(new_meta->GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(new_meta->GetChecksumValue(), checksum_value);
+ ASSERT_EQ(new_meta->GetLinkedSsts(),
+ BlobFileMetaData::LinkedSsts{table_file_number});
+ ASSERT_EQ(new_meta->GetGarbageBlobCount(), garbage_blob_count);
+ ASSERT_EQ(new_meta->GetGarbageBlobBytes(), garbage_blob_bytes);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileNotFound) {
+ // Attempt to increase the amount of garbage for a blob file that is
+ // neither in the base version nor added using a version edit.
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t garbage_blob_count = 5678;
+ constexpr uint64_t garbage_blob_bytes = 999999;
+
+ edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ const Status s = builder.Apply(&edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "Blob file #1234 not found"));
+}
+
+TEST_F(VersionBuilderTest, BlobFileGarbageOverflow) {
+ // Test that VersionEdits that would result in the count/total size of garbage
+ // exceeding the count/total size of all blobs are rejected.
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit addition;
+
+ constexpr uint64_t blob_file_number = 1234;
+ constexpr uint64_t total_blob_count = 5678;
+ constexpr uint64_t total_blob_bytes = 999999;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c\x52"
+ "\x5c\xbd";
+
+ addition.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ // Add dummy table file to ensure the blob file is referenced.
+ constexpr uint64_t table_file_number = 1;
+ AddDummyFileToEdit(&addition, table_file_number, blob_file_number);
+
+ ASSERT_OK(builder.Apply(&addition));
+
+ {
+ // Garbage blob count overflow
+ constexpr uint64_t garbage_blob_count = 5679;
+ constexpr uint64_t garbage_blob_bytes = 999999;
+
+ VersionEdit garbage;
+
+ garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ const Status s = builder.Apply(&garbage);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(
+ std::strstr(s.getState(), "Garbage overflow for blob file #1234"));
+ }
+
+ {
+ // Garbage blob bytes overflow
+ constexpr uint64_t garbage_blob_count = 5678;
+ constexpr uint64_t garbage_blob_bytes = 1000000;
+
+ VersionEdit garbage;
+
+ garbage.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+
+ const Status s = builder.Apply(&garbage);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(
+ std::strstr(s.getState(), "Garbage overflow for blob file #1234"));
+ }
+}
+
+TEST_F(VersionBuilderTest, SaveBlobFilesTo) {
+ // Add three blob files to base version.
+ for (uint64_t i = 1; i <= 3; ++i) {
+ const uint64_t table_file_number = 2 * i;
+ const uint64_t blob_file_number = 2 * i + 1;
+ const uint64_t total_blob_count = i * 1000;
+ const uint64_t total_blob_bytes = i * 1000000;
+ const uint64_t garbage_blob_count = i * 100;
+ const uint64_t garbage_blob_bytes = i * 20000;
+
+ AddBlob(blob_file_number, total_blob_count, total_blob_bytes,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(),
+ BlobFileMetaData::LinkedSsts{table_file_number}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ // Add dummy table files to ensure the blob files are referenced.
+ // Note: files are added to L0, so they have to be added in reverse order
+ // (newest first).
+ for (uint64_t i = 3; i >= 1; --i) {
+ const uint64_t table_file_number = 2 * i;
+ const uint64_t blob_file_number = 2 * i + 1;
+
+ AddDummyFile(table_file_number, blob_file_number);
+ }
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ // Add some garbage to the second and third blob files. The second blob file
+ // remains valid since it does not consist entirely of garbage yet. The third
+ // blob file is all garbage after the edit and will not be part of the new
+ // version. The corresponding dummy table file is also removed for
+ // consistency.
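+ // (For blob file #7, the base version has 3000 blobs / 3000000 bytes with
+ // 300 / 60000 of garbage, so the extra 2700 / 2940000 brings garbage up to
+ // the totals; for blob file #5, garbage becomes 400 / 140000 out of
+ // 2000 / 2000000.)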
+ edit.AddBlobFileGarbage(/* blob_file_number */ 5,
+ /* garbage_blob_count */ 200,
+ /* garbage_blob_bytes */ 100000);
+ edit.AddBlobFileGarbage(/* blob_file_number */ 7,
+ /* garbage_blob_count */ 2700,
+ /* garbage_blob_bytes */ 2940000);
+ edit.DeleteFile(/* level */ 0, /* file_number */ 6);
+
+ // Add a fourth blob file.
+ edit.AddBlobFile(/* blob_file_number */ 9, /* total_blob_count */ 4000,
+ /* total_blob_bytes */ 4000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string());
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ constexpr bool force_consistency_checks = false;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ const auto& new_blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(new_blob_files.size(), 3);
+
+ const auto meta3 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3);
+
+ ASSERT_NE(meta3, nullptr);
+ ASSERT_EQ(meta3->GetBlobFileNumber(), 3);
+ ASSERT_EQ(meta3->GetTotalBlobCount(), 1000);
+ ASSERT_EQ(meta3->GetTotalBlobBytes(), 1000000);
+ ASSERT_EQ(meta3->GetGarbageBlobCount(), 100);
+ ASSERT_EQ(meta3->GetGarbageBlobBytes(), 20000);
+
+ const auto meta5 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 5);
+
+ ASSERT_NE(meta5, nullptr);
+ ASSERT_EQ(meta5->GetBlobFileNumber(), 5);
+ ASSERT_EQ(meta5->GetTotalBlobCount(), 2000);
+ ASSERT_EQ(meta5->GetTotalBlobBytes(), 2000000);
+ ASSERT_EQ(meta5->GetGarbageBlobCount(), 400);
+ ASSERT_EQ(meta5->GetGarbageBlobBytes(), 140000);
+
+ const auto meta9 = new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 9);
+
+ ASSERT_NE(meta9, nullptr);
+ ASSERT_EQ(meta9->GetBlobFileNumber(), 9);
+ ASSERT_EQ(meta9->GetTotalBlobCount(), 4000);
+ ASSERT_EQ(meta9->GetTotalBlobBytes(), 4000000);
+ ASSERT_EQ(meta9->GetGarbageBlobCount(), 0);
+ ASSERT_EQ(meta9->GetGarbageBlobBytes(), 0);
+
+ // Delete the first table file, which makes the first blob file obsolete
+ // since it's at the head and unreferenced.
+ VersionBuilder second_builder(env_options, &ioptions_, table_cache,
+ &new_vstorage, version_set);
+
+ VersionEdit second_edit;
+ second_edit.DeleteFile(/* level */ 0, /* file_number */ 2);
+
+ ASSERT_OK(second_builder.Apply(&second_edit));
+
+ VersionStorageInfo newer_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &new_vstorage,
+ force_consistency_checks);
+
+ ASSERT_OK(second_builder.SaveTo(&newer_vstorage));
+
+ UpdateVersionStorageInfo(&newer_vstorage);
+
+ const auto& newer_blob_files = newer_vstorage.GetBlobFiles();
+ ASSERT_EQ(newer_blob_files.size(), 2);
+
+ const auto newer_meta3 =
+ newer_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3);
+
+ ASSERT_EQ(newer_meta3, nullptr);
+
+ UnrefFilesInVersion(&newer_vstorage);
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) {
+ // When multiple background jobs (flushes/compactions) are executing in
+ // parallel, it is possible for the VersionEdit adding blob file K to be
+ // applied *after* the VersionEdit adding blob file N (for N > K). This test
+ // case makes sure this is handled correctly.
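+ // Here, the edit applied below adds blob file #2, which has a smaller file
+ // number than blob file #4 already present in the base version.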
+
+ // Add blob file #4 (referenced by table file #3) to base version.
+ constexpr uint64_t base_table_file_number = 3;
+ constexpr uint64_t base_blob_file_number = 4;
+ constexpr uint64_t base_total_blob_count = 100;
+ constexpr uint64_t base_total_blob_bytes = 1 << 20;
+
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] = "\xfa\xce\xb0\x0c";
+ constexpr uint64_t garbage_blob_count = 0;
+ constexpr uint64_t garbage_blob_bytes = 0;
+
+ AddDummyFile(base_table_file_number, base_blob_file_number);
+ AddBlob(base_blob_file_number, base_total_blob_count, base_total_blob_bytes,
+ checksum_method, checksum_value,
+ BlobFileMetaData::LinkedSsts{base_table_file_number},
+ garbage_blob_count, garbage_blob_bytes);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ // Add blob file #2 (referenced by table file #1).
+ constexpr int level = 0;
+ constexpr uint64_t table_file_number = 1;
+ constexpr uint32_t path_id = 0;
+ constexpr uint64_t file_size = 1 << 12;
+ constexpr char smallest[] = "key1";
+ constexpr char largest[] = "key987";
+ constexpr SequenceNumber smallest_seqno = 0;
+ constexpr SequenceNumber largest_seqno = 0;
+ constexpr bool marked_for_compaction = false;
+
+ constexpr uint64_t blob_file_number = 2;
+ static_assert(blob_file_number < base_blob_file_number,
+ "Added blob file should have a smaller file number");
+
+ constexpr uint64_t total_blob_count = 234;
+ constexpr uint64_t total_blob_bytes = 1 << 22;
+
+ edit.AddFile(level, table_file_number, path_id, file_size,
+ GetInternalKey(smallest), GetInternalKey(largest),
+ smallest_seqno, largest_seqno, marked_for_compaction,
+ Temperature::kUnknown, blob_file_number,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ checksum_value, checksum_method, kNullUniqueId64x2);
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ const auto& new_blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(new_blob_files.size(), 2);
+
+ const auto base_meta =
+ new_vstorage.GetBlobFileMetaData(base_blob_file_number);
+
+ ASSERT_NE(base_meta, nullptr);
+ ASSERT_EQ(base_meta->GetBlobFileNumber(), base_blob_file_number);
+ ASSERT_EQ(base_meta->GetTotalBlobCount(), base_total_blob_count);
+ ASSERT_EQ(base_meta->GetTotalBlobBytes(), base_total_blob_bytes);
+ ASSERT_EQ(base_meta->GetGarbageBlobCount(), garbage_blob_count);
+ ASSERT_EQ(base_meta->GetGarbageBlobBytes(), garbage_blob_bytes);
+ ASSERT_EQ(base_meta->GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(base_meta->GetChecksumValue(), checksum_value);
+
+ const auto added_meta = new_vstorage.GetBlobFileMetaData(blob_file_number);
+
+ ASSERT_NE(added_meta, nullptr);
+ ASSERT_EQ(added_meta->GetBlobFileNumber(), blob_file_number);
+ ASSERT_EQ(added_meta->GetTotalBlobCount(), total_blob_count);
+ ASSERT_EQ(added_meta->GetTotalBlobBytes(), total_blob_bytes);
+ ASSERT_EQ(added_meta->GetGarbageBlobCount(), garbage_blob_count);
+ ASSERT_EQ(added_meta->GetGarbageBlobBytes(), garbage_blob_bytes);
+ ASSERT_EQ(added_meta->GetChecksumMethod(), checksum_method);
+ ASSERT_EQ(added_meta->GetChecksumValue(), checksum_value);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) {
+ // Initialize base version. The first table file points to a valid blob file
+ // in this version; the second one does not refer to any blob files.
+
+ Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+ /* largest */ "200", /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+ /* oldest_blob_file_number */ 16);
+ Add(/* level */ 1, /* file_number */ 23, /* smallest */ "201",
+ /* largest */ "300", /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ 200, /* largest_seq */ 200,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ 200, /* largest_seqno */ 200,
+ kInvalidBlobFileNumber);
+
+ AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+ /* total_blob_bytes */ 1000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+ /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000);
+
+ UpdateVersionStorageInfo();
+
+ // Add a new table file that points to the existing blob file, as well as a
+ // new table file that references a newly added blob file.
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ edit.AddFile(/* level */ 1, /* file_number */ 606, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("701"),
+ /* largest */ GetInternalKey("750"), /* smallest_seqno */ 200,
+ /* largest_seqno */ 200, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("801"),
+ /* largest */ GetInternalKey("850"), /* smallest_seqno */ 200,
+ /* largest_seqno */ 200, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000,
+ /* total_blob_bytes */ 200000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string());
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ // Save to a new version in order to trigger consistency checks.
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesInconsistentLinks) {
+ // Initialize base version. Links between the table file and the blob file
+ // are inconsistent.
+
+ Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+ /* largest */ "200", /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+ /* oldest_blob_file_number */ 256);
+
+ AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+ /* total_blob_bytes */ 1000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+ /* garbage_blob_count */ 500, /* garbage_blob_bytes */ 300000);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ // Save to a new version in order to trigger consistency checks.
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ const Status s = builder.SaveTo(&new_vstorage);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(
+ s.getState(),
+ "Links are inconsistent between table files and blob file #16"));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbage) {
+ // Initialize base version. The table file points to a blob file that is
+ // all garbage.
+
+ Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+ /* largest */ "200", /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+ /* oldest_blob_file_number */ 16);
+
+ AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+ /* total_blob_bytes */ 1000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+ /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000);
+
+ UpdateVersionStorageInfo();
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ // Save to a new version in order to trigger consistency checks.
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ const Status s = builder.SaveTo(&new_vstorage);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(
+ std::strstr(s.getState(), "Blob file #16 consists entirely of garbage"));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForBlobFilesAllGarbageLinkedSsts) {
+ // Initialize base version, with a table file pointing to a blob file
+ // that has no garbage at this point.
+
+ Add(/* level */ 1, /* file_number */ 1, /* smallest */ "150",
+ /* largest */ "200", /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ 100, /* largest_seq */ 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ 100, /* largest_seqno */ 100,
+ /* oldest_blob_file_number */ 16);
+
+ AddBlob(/* blob_file_number */ 16, /* total_blob_count */ 1000,
+ /* total_blob_bytes */ 1000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(), BlobFileMetaData::LinkedSsts{1},
+ /* garbage_blob_count */ 0, /* garbage_blob_bytes */ 0);
+
+ UpdateVersionStorageInfo();
+
+ // Mark the entire blob file garbage but do not remove the linked SST.
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ VersionEdit edit;
+
+ edit.AddBlobFileGarbage(/* blob_file_number */ 16,
+ /* garbage_blob_count */ 1000,
+ /* garbage_blob_bytes */ 1000000);
+
+ ASSERT_OK(builder.Apply(&edit));
+
+ // Save to a new version in order to trigger consistency checks.
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ const Status s = builder.SaveTo(&new_vstorage);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(
+ std::strstr(s.getState(), "Blob file #16 consists entirely of garbage"));
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) {
+ // Initialize base version. Table files 1..10 are linked to blob files 1..5,
+ // while table files 11..20 are not linked to any blob files.
+
+ for (uint64_t i = 1; i <= 10; ++i) {
+ std::ostringstream oss;
+ oss << std::setw(2) << std::setfill('0') << i;
+
+ const std::string key = oss.str();
+
+ Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(),
+ /* largest */ key.c_str(), /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ i * 100,
+ /* largest_seqno */ i * 100,
+ /* oldest_blob_file_number */ ((i - 1) % 5) + 1);
+ }
+
+ for (uint64_t i = 1; i <= 5; ++i) {
+ AddBlob(/* blob_file_number */ i, /* total_blob_count */ 2000,
+ /* total_blob_bytes */ 2000000,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string(),
+ BlobFileMetaData::LinkedSsts{i, i + 5},
+ /* garbage_blob_count */ 1000, /* garbage_blob_bytes */ 1000000);
+ }
+
+ for (uint64_t i = 11; i <= 20; ++i) {
+ std::ostringstream oss;
+ oss << std::setw(2) << std::setfill('0') << i;
+
+ const std::string key = oss.str();
+
+ Add(/* level */ 1, /* file_number */ i, /* smallest */ key.c_str(),
+ /* largest */ key.c_str(), /* file_size */ 100,
+ /* path_id */ 0, /* smallest_seq */ i * 100, /* largest_seq */ i * 100,
+ /* num_entries */ 0, /* num_deletions */ 0,
+ /* sampled */ false, /* smallest_seqno */ i * 100,
+ /* largest_seqno */ i * 100, kInvalidBlobFileNumber);
+ }
+
+ UpdateVersionStorageInfo();
+
+ {
+ const auto& blob_files = vstorage_.GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 5);
+
+ const std::vector<BlobFileMetaData::LinkedSsts> expected_linked_ssts{
+ {1, 6}, {2, 7}, {3, 8}, {4, 9}, {5, 10}};
+
+ for (size_t i = 0; i < 5; ++i) {
+ const auto meta =
+ vstorage_.GetBlobFileMetaData(/* blob_file_number */ i + 1);
+ ASSERT_NE(meta, nullptr);
+ ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]);
+ }
+ }
+
+ VersionEdit edit;
+
+ // Add an SST that references a blob file.
+ edit.AddFile(
+ /* level */ 1, /* file_number */ 21, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("21", 2100),
+ /* largest */ GetInternalKey("21", 2100), /* smallest_seqno */ 2100,
+ /* largest_seqno */ 2100, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ // Add an SST that does not reference any blob files.
+ edit.AddFile(
+ /* level */ 1, /* file_number */ 22, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("22", 2200),
+ /* largest */ GetInternalKey("22", 2200), /* smallest_seqno */ 2200,
+ /* largest_seqno */ 2200, /* marked_for_compaction */ false,
+ Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ // Delete a file that references a blob file.
+ edit.DeleteFile(/* level */ 1, /* file_number */ 6);
+
+ // Delete a file that does not reference any blob files.
+ edit.DeleteFile(/* level */ 1, /* file_number */ 16);
+
+ // Trivially move a file that references a blob file. Note that we save
+ // the original BlobFileMetaData object so we can check that no new object
+ // gets created.
+ auto meta3 = vstorage_.GetBlobFileMetaData(/* blob_file_number */ 3);
+
+ edit.DeleteFile(/* level */ 1, /* file_number */ 3);
+ edit.AddFile(/* level */ 2, /* file_number */ 3, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("03", 300),
+ /* largest */ GetInternalKey("03", 300),
+ /* smallest_seqno */ 300,
+ /* largest_seqno */ 300, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ // Trivially move a file that does not reference any blob files.
+ edit.DeleteFile(/* level */ 1, /* file_number */ 13);
+ edit.AddFile(/* level */ 2, /* file_number */ 13, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("13", 1300),
+ /* largest */ GetInternalKey("13", 1300),
+ /* smallest_seqno */ 1300,
+ /* largest_seqno */ 1300, /* marked_for_compaction */ false,
+ Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+
+ // Add one more SST file that references a blob file, then promptly
+ // delete it in a second version edit before the new version gets saved.
+ // This file should not show up as linked to the blob file in the new version.
+ edit.AddFile(/* level */ 1, /* file_number */ 23, /* path_id */ 0,
+ /* file_size */ 100, /* smallest */ GetInternalKey("23", 2300),
+ /* largest */ GetInternalKey("23", 2300),
+ /* smallest_seqno */ 2300,
+ /* largest_seqno */ 2300, /* marked_for_compaction */ false,
+ Temperature::kUnknown,
+ /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+
+ VersionEdit edit2;
+
+ edit2.DeleteFile(/* level */ 1, /* file_number */ 23);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder builder(env_options, &ioptions_, table_cache, &vstorage_,
+ version_set);
+
+ ASSERT_OK(builder.Apply(&edit));
+ ASSERT_OK(builder.Apply(&edit2));
+
+ constexpr bool force_consistency_checks = true;
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, &vstorage_,
+ force_consistency_checks);
+
+ ASSERT_OK(builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ {
+ const auto& blob_files = new_vstorage.GetBlobFiles();
+ ASSERT_EQ(blob_files.size(), 5);
+
+ const std::vector<BlobFileMetaData::LinkedSsts> expected_linked_ssts{
+ {1, 21}, {2, 7}, {3, 8}, {4, 9}, {5, 10}};
+
+ for (size_t i = 0; i < 5; ++i) {
+ const auto meta =
+ new_vstorage.GetBlobFileMetaData(/* blob_file_number */ i + 1);
+ ASSERT_NE(meta, nullptr);
+ ASSERT_EQ(meta->GetLinkedSsts(), expected_linked_ssts[i]);
+ }
+
+ // Make sure that no new BlobFileMetaData got created for the blob file
+ // affected by the trivial move.
+ ASSERT_EQ(new_vstorage.GetBlobFileMetaData(/* blob_file_number */ 3),
+ meta3);
+ }
+
+ UnrefFilesInVersion(&new_vstorage);
+}
+
+TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) {
+ Add(0, 1U, "150", "200", 100U);
+
+ UpdateVersionStorageInfo();
+
+ VersionEdit version_edit;
+ version_edit.DeleteFile(0, 1U);
+
+ EnvOptions env_options;
+ constexpr TableCache* table_cache = nullptr;
+ constexpr VersionSet* version_set = nullptr;
+
+ VersionBuilder version_builder(env_options, &ioptions_, table_cache,
+ &vstorage_, version_set);
+ VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr,
+ true /* force_consistency_checks */);
+ ASSERT_OK(version_builder.Apply(&version_edit));
+ ASSERT_OK(version_builder.SaveTo(&new_vstorage));
+
+ UpdateVersionStorageInfo(&new_vstorage);
+
+ VersionBuilder version_builder2(env_options, &ioptions_, table_cache,
+ &new_vstorage, version_set);
+ VersionStorageInfo new_vstorage2(&icmp_, ucmp_, options_.num_levels,
+ kCompactionStyleLevel, nullptr,
+ true /* force_consistency_checks */);
+ ASSERT_NOK(version_builder2.Apply(&version_edit));
+
+ UnrefFilesInVersion(&new_vstorage);
+ UnrefFilesInVersion(&new_vstorage2);
+}
+
+TEST_F(VersionBuilderTest, EstimatedActiveKeys) {
+ const uint32_t kTotalSamples = 20;
+ const uint32_t kNumLevels = 5;
+ const uint32_t kFilesPerLevel = 8;
+ const uint32_t kNumFiles = kNumLevels * kFilesPerLevel;
+ const uint32_t kEntriesPerFile = 1000;
+ const uint32_t kDeletionsPerFile = 100;
+ for (uint32_t i = 0; i < kNumFiles; ++i) {
+ Add(static_cast<int>(i / kFilesPerLevel), i + 1,
+ std::to_string((i + 100) * 1000).c_str(),
+ std::to_string((i + 100) * 1000 + 999).c_str(), 100U, 0, 100, 100,
+ kEntriesPerFile, kDeletionsPerFile, (i < kTotalSamples));
+ }
+ // Subtract 2x the number of deletion entries because:
+ // 1x: a deletion entry does not count as a data entry itself.
+ // 1x: each deletion entry removes one existing data entry.
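+ // For example, with 1000 entries and 100 deletions per file, each file is
+ // expected to contribute 1000 - 2 * 100 = 800 active keys.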
+ ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(),
+ (kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_edit.cc b/src/rocksdb/db/version_edit.cc
new file mode 100644
index 000000000..e4e02fe25
--- /dev/null
+++ b/src/rocksdb/db/version_edit.cc
@@ -0,0 +1,1043 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/blob/blob_index.h"
+#include "db/version_set.h"
+#include "logging/event_logger.h"
+#include "rocksdb/slice.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {} // anonymous namespace
+
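+ // Packs the path id into the bits above kFileNumberMask; the assertion
+ // guarantees the file number fits entirely within the mask.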
+uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
+ assert(number <= kFileNumberMask);
+ return number | (path_id * (kFileNumberMask + 1));
+}
+
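+ // Extends the file's key range and sequence number bounds with the new
+ // entry and, for blob references, tracks the oldest blob file referenced.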
+Status FileMetaData::UpdateBoundaries(const Slice& key, const Slice& value,
+ SequenceNumber seqno,
+ ValueType value_type) {
+ if (value_type == kTypeBlobIndex) {
+ BlobIndex blob_index;
+ const Status s = blob_index.DecodeFrom(value);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (!blob_index.IsInlined() && !blob_index.HasTTL()) {
+ if (blob_index.file_number() == kInvalidBlobFileNumber) {
+ return Status::Corruption("Invalid blob file number");
+ }
+
+ if (oldest_blob_file_number == kInvalidBlobFileNumber ||
+ oldest_blob_file_number > blob_index.file_number()) {
+ oldest_blob_file_number = blob_index.file_number();
+ }
+ }
+ }
+
+ if (smallest.size() == 0) {
+ smallest.DecodeFrom(key);
+ }
+ largest.DecodeFrom(key);
+ fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+ fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+
+ return Status::OK();
+}
+
+void VersionEdit::Clear() {
+ max_level_ = 0;
+ db_id_.clear();
+ comparator_.clear();
+ log_number_ = 0;
+ prev_log_number_ = 0;
+ next_file_number_ = 0;
+ max_column_family_ = 0;
+ min_log_number_to_keep_ = 0;
+ last_sequence_ = 0;
+ has_db_id_ = false;
+ has_comparator_ = false;
+ has_log_number_ = false;
+ has_prev_log_number_ = false;
+ has_next_file_number_ = false;
+ has_max_column_family_ = false;
+ has_min_log_number_to_keep_ = false;
+ has_last_sequence_ = false;
+ compact_cursors_.clear();
+ deleted_files_.clear();
+ new_files_.clear();
+ blob_file_additions_.clear();
+ blob_file_garbages_.clear();
+ wal_additions_.clear();
+ wal_deletion_.Reset();
+ column_family_ = 0;
+ is_column_family_add_ = false;
+ is_column_family_drop_ = false;
+ column_family_name_.clear();
+ is_in_atomic_group_ = false;
+ remaining_entries_ = 0;
+ full_history_ts_low_.clear();
+}
+
+bool VersionEdit::EncodeTo(std::string* dst) const {
+ if (has_db_id_) {
+ PutVarint32(dst, kDbId);
+ PutLengthPrefixedSlice(dst, db_id_);
+ }
+ if (has_comparator_) {
+ PutVarint32(dst, kComparator);
+ PutLengthPrefixedSlice(dst, comparator_);
+ }
+ if (has_log_number_) {
+ PutVarint32Varint64(dst, kLogNumber, log_number_);
+ }
+ if (has_prev_log_number_) {
+ PutVarint32Varint64(dst, kPrevLogNumber, prev_log_number_);
+ }
+ if (has_next_file_number_) {
+ PutVarint32Varint64(dst, kNextFileNumber, next_file_number_);
+ }
+ if (has_max_column_family_) {
+ PutVarint32Varint32(dst, kMaxColumnFamily, max_column_family_);
+ }
+ if (has_min_log_number_to_keep_) {
+ PutVarint32Varint64(dst, kMinLogNumberToKeep, min_log_number_to_keep_);
+ }
+ if (has_last_sequence_) {
+ PutVarint32Varint64(dst, kLastSequence, last_sequence_);
+ }
+ for (size_t i = 0; i < compact_cursors_.size(); i++) {
+ if (compact_cursors_[i].second.Valid()) {
+ PutVarint32(dst, kCompactCursor);
+ PutVarint32(dst, compact_cursors_[i].first); // level
+ PutLengthPrefixedSlice(dst, compact_cursors_[i].second.Encode());
+ }
+ }
+ for (const auto& deleted : deleted_files_) {
+ PutVarint32Varint32Varint64(dst, kDeletedFile, deleted.first /* level */,
+ deleted.second /* file number */);
+ }
+
+ bool min_log_num_written = false;
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ const FileMetaData& f = new_files_[i].second;
+ if (!f.smallest.Valid() || !f.largest.Valid()) {
+ return false;
+ }
+ PutVarint32(dst, kNewFile4);
+ PutVarint32Varint64(dst, new_files_[i].first /* level */, f.fd.GetNumber());
+ PutVarint64(dst, f.fd.GetFileSize());
+ PutLengthPrefixedSlice(dst, f.smallest.Encode());
+ PutLengthPrefixedSlice(dst, f.largest.Encode());
+ PutVarint64Varint64(dst, f.fd.smallest_seqno, f.fd.largest_seqno);
+ // Customized fields' format:
+ // +-----------------------------+
+ // | 1st field's tag (varint32) |
+ // +-----------------------------+
+ // | 1st field's size (varint32) |
+ // +-----------------------------+
+ // | bytes for 1st field |
+ // | (based on size decoded) |
+ // +-----------------------------+
+ // | |
+ // | ...... |
+ // | |
+ // +-----------------------------+
+ // | last field's size (varint32)|
+ // +-----------------------------+
+ // | bytes for last field |
+ // | (based on size decoded) |
+ // +-----------------------------+
+ // | terminating tag (varint32) |
+ // +-----------------------------+
+ //
+ // Customized encoding for fields:
+ // tag kPathId: 1 byte as path_id
+ // tag kNeedCompaction:
+ //      currently can only take one char value, 1, indicating need-compaction
+ //
+ PutVarint32(dst, NewFileCustomTag::kOldestAncesterTime);
+ std::string varint_oldest_ancester_time;
+ PutVarint64(&varint_oldest_ancester_time, f.oldest_ancester_time);
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintOldestAncesterTime",
+ &varint_oldest_ancester_time);
+ PutLengthPrefixedSlice(dst, Slice(varint_oldest_ancester_time));
+
+ PutVarint32(dst, NewFileCustomTag::kFileCreationTime);
+ std::string varint_file_creation_time;
+ PutVarint64(&varint_file_creation_time, f.file_creation_time);
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:VarintFileCreationTime",
+ &varint_file_creation_time);
+ PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time));
+
+ PutVarint32(dst, NewFileCustomTag::kFileChecksum);
+ PutLengthPrefixedSlice(dst, Slice(f.file_checksum));
+
+ PutVarint32(dst, NewFileCustomTag::kFileChecksumFuncName);
+ PutLengthPrefixedSlice(dst, Slice(f.file_checksum_func_name));
+
+ if (f.fd.GetPathId() != 0) {
+ PutVarint32(dst, NewFileCustomTag::kPathId);
+ char p = static_cast<char>(f.fd.GetPathId());
+ PutLengthPrefixedSlice(dst, Slice(&p, 1));
+ }
+ if (f.temperature != Temperature::kUnknown) {
+ PutVarint32(dst, NewFileCustomTag::kTemperature);
+ char p = static_cast<char>(f.temperature);
+ PutLengthPrefixedSlice(dst, Slice(&p, 1));
+ }
+ if (f.marked_for_compaction) {
+ PutVarint32(dst, NewFileCustomTag::kNeedCompaction);
+ char p = static_cast<char>(1);
+ PutLengthPrefixedSlice(dst, Slice(&p, 1));
+ }
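+ // Piggyback min_log_number_to_keep on the first table file entry so that
+ // readers unaware of the field can skip it safely (see the matching
+ // kMinLogNumberToKeepHack handling on the decode side).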
+ if (has_min_log_number_to_keep_ && !min_log_num_written) {
+ PutVarint32(dst, NewFileCustomTag::kMinLogNumberToKeepHack);
+ std::string varint_log_number;
+ PutFixed64(&varint_log_number, min_log_number_to_keep_);
+ PutLengthPrefixedSlice(dst, Slice(varint_log_number));
+ min_log_num_written = true;
+ }
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ PutVarint32(dst, NewFileCustomTag::kOldestBlobFileNumber);
+ std::string oldest_blob_file_number;
+ PutVarint64(&oldest_blob_file_number, f.oldest_blob_file_number);
+ PutLengthPrefixedSlice(dst, Slice(oldest_blob_file_number));
+ }
+ UniqueId64x2 unique_id = f.unique_id;
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:UniqueId", &unique_id);
+ if (unique_id != kNullUniqueId64x2) {
+ PutVarint32(dst, NewFileCustomTag::kUniqueId);
+ std::string unique_id_str = EncodeUniqueIdBytes(&unique_id);
+ PutLengthPrefixedSlice(dst, Slice(unique_id_str));
+ }
+
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
+ dst);
+
+ PutVarint32(dst, NewFileCustomTag::kTerminate);
+ }
+
+ for (const auto& blob_file_addition : blob_file_additions_) {
+ PutVarint32(dst, kBlobFileAddition);
+ blob_file_addition.EncodeTo(dst);
+ }
+
+ for (const auto& blob_file_garbage : blob_file_garbages_) {
+ PutVarint32(dst, kBlobFileGarbage);
+ blob_file_garbage.EncodeTo(dst);
+ }
+
+ for (const auto& wal_addition : wal_additions_) {
+ PutVarint32(dst, kWalAddition2);
+ std::string encoded;
+ wal_addition.EncodeTo(&encoded);
+ PutLengthPrefixedSlice(dst, encoded);
+ }
+
+ if (!wal_deletion_.IsEmpty()) {
+ PutVarint32(dst, kWalDeletion2);
+ std::string encoded;
+ wal_deletion_.EncodeTo(&encoded);
+ PutLengthPrefixedSlice(dst, encoded);
+ }
+
+ // 0 is default and does not need to be explicitly written
+ if (column_family_ != 0) {
+ PutVarint32Varint32(dst, kColumnFamily, column_family_);
+ }
+
+ if (is_column_family_add_) {
+ PutVarint32(dst, kColumnFamilyAdd);
+ PutLengthPrefixedSlice(dst, Slice(column_family_name_));
+ }
+
+ if (is_column_family_drop_) {
+ PutVarint32(dst, kColumnFamilyDrop);
+ }
+
+ if (is_in_atomic_group_) {
+ PutVarint32(dst, kInAtomicGroup);
+ PutVarint32(dst, remaining_entries_);
+ }
+
+ if (HasFullHistoryTsLow()) {
+ PutVarint32(dst, kFullHistoryTsLow);
+ PutLengthPrefixedSlice(dst, full_history_ts_low_);
+ }
+ return true;
+}
+
+static bool GetInternalKey(Slice* input, InternalKey* dst) {
+ Slice str;
+ if (GetLengthPrefixedSlice(input, &str)) {
+ dst->DecodeFrom(str);
+ return dst->Valid();
+ } else {
+ return false;
+ }
+}
+
+bool VersionEdit::GetLevel(Slice* input, int* level, const char** /*msg*/) {
+ uint32_t v = 0;
+ if (GetVarint32(input, &v)) {
+ *level = v;
+ if (max_level_ < *level) {
+ max_level_ = *level;
+ }
+ return true;
+ } else {
+ return false;
+ }
+}
+
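+ // Detects the placeholder kNewFile4 record (level 0, file number 0, size 0,
+ // "dummy_key" bounds) that merely carries min_log_number_to_keep; the caller
+ // returns early without adding it to new_files_.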
+static bool is_pseudo_new_file_record_pr3488(
+ const int level,
+ const uint64_t number,
+ const uint64_t file_size,
+ InternalKey& smallest,
+ InternalKey& largest,
+ const bool has_min_log_number_to_keep_) {
+
+ if (level == 0 && number == 0 && file_size == 0 &&
+ has_min_log_number_to_keep_) {
+ InternalKey dummy_key(Slice("dummy_key"), 0ull, ValueType::kTypeValue);
+ return (*smallest.rep() == *dummy_key.rep() &&
+ *largest.rep() == *dummy_key.rep());
+ } else {
+ return false;
+ }
+}
+
+const char* VersionEdit::DecodeNewFile4From(Slice* input) {
+ const char* msg = nullptr;
+ int level = 0;
+ FileMetaData f;
+ uint64_t number = 0;
+ uint32_t path_id = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+ if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
+ GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
+ GetInternalKey(input, &f.largest) &&
+ GetVarint64(input, &smallest_seqno) &&
+ GetVarint64(input, &largest_seqno)) {
+ // See comments in VersionEdit::EncodeTo() for format of customized fields
+ while (true) {
+ uint32_t custom_tag = 0;
+ Slice field;
+ if (!GetVarint32(input, &custom_tag)) {
+ return "new-file4 custom field";
+ }
+ if (custom_tag == kTerminate) {
+ break;
+ }
+ if (!GetLengthPrefixedSlice(input, &field)) {
+ return "new-file4 custom field length prefixed slice error";
+ }
+ switch (custom_tag) {
+ case kPathId:
+ if (field.size() != 1) {
+ return "path_id field wrong size";
+ }
+ path_id = field[0];
+ if (path_id > 3) {
+ return "path_id wrong vaue";
+ }
+ break;
+ case kOldestAncesterTime:
+ if (!GetVarint64(&field, &f.oldest_ancester_time)) {
+ return "invalid oldest ancester time";
+ }
+ break;
+ case kFileCreationTime:
+ if (!GetVarint64(&field, &f.file_creation_time)) {
+ return "invalid file creation time";
+ }
+ break;
+ case kFileChecksum:
+ f.file_checksum = field.ToString();
+ break;
+ case kFileChecksumFuncName:
+ f.file_checksum_func_name = field.ToString();
+ break;
+ case kNeedCompaction:
+ if (field.size() != 1) {
+ return "need_compaction field wrong size";
+ }
+ f.marked_for_compaction = (field[0] == 1);
+ break;
+ case kMinLogNumberToKeepHack:
+ // This is a hack to encode kMinLogNumberToKeep in a
+ // forward-compatible fashion.
+ if (!GetFixed64(&field, &min_log_number_to_keep_)) {
+ return "deleted log number malformatted";
+ }
+ has_min_log_number_to_keep_ = true;
+ break;
+ case kOldestBlobFileNumber:
+ if (!GetVarint64(&field, &f.oldest_blob_file_number)) {
+ return "invalid oldest blob file number";
+ }
+ break;
+ case kTemperature:
+ if (field.size() != 1) {
+ return "temperature field wrong size";
+ } else {
+ Temperature casted_field = static_cast<Temperature>(field[0]);
+ if (casted_field <= Temperature::kCold) {
+ f.temperature = casted_field;
+ }
+ }
+ break;
+ case kUniqueId:
+ if (!DecodeUniqueIdBytes(field.ToString(), &f.unique_id).ok()) {
+ f.unique_id = kNullUniqueId64x2;
+ return "invalid unique id";
+ }
+ break;
+ default:
+ if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
+ // Should not proceed if we cannot understand the field
+ return "new-file4 custom field not supported";
+ }
+ break;
+ }
+ }
+ } else {
+ return "new-file4 entry";
+ }
+ if (is_pseudo_new_file_record_pr3488(level, number, file_size,
+ f.smallest, f.largest,
+ has_min_log_number_to_keep_)) {
+ // Since this has nothing to do with NewFile, return immediately.
+ return nullptr;
+ }
+ f.fd =
+ FileDescriptor(number, path_id, file_size, smallest_seqno, largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ return nullptr;
+}
+
+Status VersionEdit::DecodeFrom(const Slice& src) {
+ Clear();
+#ifndef NDEBUG
+ bool ignore_ignorable_tags = false;
+ TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:IgnoreIgnorableTags",
+ &ignore_ignorable_tags);
+#endif
+ Slice input = src;
+ const char* msg = nullptr;
+ uint32_t tag = 0;
+
+ // Temporary storage for parsing
+ int level = 0;
+ FileMetaData f;
+ Slice str;
+ InternalKey key;
+ while (msg == nullptr && GetVarint32(&input, &tag)) {
+#ifndef NDEBUG
+ if (ignore_ignorable_tags && tag > kTagSafeIgnoreMask) {
+ tag = kTagSafeIgnoreMask;
+ }
+#endif
+ switch (tag) {
+ case kDbId:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ db_id_ = str.ToString();
+ has_db_id_ = true;
+ } else {
+ msg = "db id";
+ }
+ break;
+ case kComparator:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ comparator_ = str.ToString();
+ has_comparator_ = true;
+ } else {
+ msg = "comparator name";
+ }
+ break;
+
+ case kLogNumber:
+ if (GetVarint64(&input, &log_number_)) {
+ has_log_number_ = true;
+ } else {
+ msg = "log number";
+ }
+ break;
+
+ case kPrevLogNumber:
+ if (GetVarint64(&input, &prev_log_number_)) {
+ has_prev_log_number_ = true;
+ } else {
+ msg = "previous log number";
+ }
+ break;
+
+ case kNextFileNumber:
+ if (GetVarint64(&input, &next_file_number_)) {
+ has_next_file_number_ = true;
+ } else {
+ msg = "next file number";
+ }
+ break;
+
+ case kMaxColumnFamily:
+ if (GetVarint32(&input, &max_column_family_)) {
+ has_max_column_family_ = true;
+ } else {
+ msg = "max column family";
+ }
+ break;
+
+ case kMinLogNumberToKeep:
+ if (GetVarint64(&input, &min_log_number_to_keep_)) {
+ has_min_log_number_to_keep_ = true;
+ } else {
+ msg = "min log number to kee";
+ }
+ break;
+
+ case kLastSequence:
+ if (GetVarint64(&input, &last_sequence_)) {
+ has_last_sequence_ = true;
+ } else {
+ msg = "last sequence number";
+ }
+ break;
+
+ case kCompactCursor:
+ if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) {
+ // Here we re-use the output format of compact pointer in LevelDB
+ // to persist compact_cursors_
+ compact_cursors_.push_back(std::make_pair(level, key));
+ } else {
+ if (!msg) {
+ msg = "compaction cursor";
+ }
+ }
+ break;
+
+ case kDeletedFile: {
+ uint64_t number = 0;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) {
+ deleted_files_.insert(std::make_pair(level, number));
+ } else {
+ if (!msg) {
+ msg = "deleted file";
+ }
+ }
+ break;
+ }
+
+ case kNewFile: {
+ uint64_t number = 0;
+ uint64_t file_size = 0;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest)) {
+ f.fd = FileDescriptor(number, 0, file_size);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file entry";
+ }
+ }
+ break;
+ }
+ case kNewFile2: {
+ uint64_t number = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest) &&
+ GetVarint64(&input, &smallest_seqno) &&
+ GetVarint64(&input, &largest_seqno)) {
+ f.fd = FileDescriptor(number, 0, file_size, smallest_seqno,
+ largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file2 entry";
+ }
+ }
+ break;
+ }
+
+ case kNewFile3: {
+ uint64_t number = 0;
+ uint32_t path_id = 0;
+ uint64_t file_size = 0;
+ SequenceNumber smallest_seqno = 0;
+ SequenceNumber largest_seqno = kMaxSequenceNumber;
+ if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) &&
+ GetVarint32(&input, &path_id) && GetVarint64(&input, &file_size) &&
+ GetInternalKey(&input, &f.smallest) &&
+ GetInternalKey(&input, &f.largest) &&
+ GetVarint64(&input, &smallest_seqno) &&
+ GetVarint64(&input, &largest_seqno)) {
+ f.fd = FileDescriptor(number, path_id, file_size, smallest_seqno,
+ largest_seqno);
+ new_files_.push_back(std::make_pair(level, f));
+ } else {
+ if (!msg) {
+ msg = "new-file3 entry";
+ }
+ }
+ break;
+ }
+
+ case kNewFile4: {
+ msg = DecodeNewFile4From(&input);
+ break;
+ }
+
+ case kBlobFileAddition:
+ case kBlobFileAddition_DEPRECATED: {
+ BlobFileAddition blob_file_addition;
+ const Status s = blob_file_addition.DecodeFrom(&input);
+ if (!s.ok()) {
+ return s;
+ }
+
+ AddBlobFile(std::move(blob_file_addition));
+ break;
+ }
+
+ case kBlobFileGarbage:
+ case kBlobFileGarbage_DEPRECATED: {
+ BlobFileGarbage blob_file_garbage;
+ const Status s = blob_file_garbage.DecodeFrom(&input);
+ if (!s.ok()) {
+ return s;
+ }
+
+ AddBlobFileGarbage(std::move(blob_file_garbage));
+ break;
+ }
+
+ case kWalAddition: {
+ WalAddition wal_addition;
+ const Status s = wal_addition.DecodeFrom(&input);
+ if (!s.ok()) {
+ return s;
+ }
+
+ wal_additions_.emplace_back(std::move(wal_addition));
+ break;
+ }
+
+ case kWalAddition2: {
+ Slice encoded;
+ if (!GetLengthPrefixedSlice(&input, &encoded)) {
+ msg = "WalAddition not prefixed by length";
+ break;
+ }
+
+ WalAddition wal_addition;
+ const Status s = wal_addition.DecodeFrom(&encoded);
+ if (!s.ok()) {
+ return s;
+ }
+
+ wal_additions_.emplace_back(std::move(wal_addition));
+ break;
+ }
+
+ case kWalDeletion: {
+ WalDeletion wal_deletion;
+ const Status s = wal_deletion.DecodeFrom(&input);
+ if (!s.ok()) {
+ return s;
+ }
+
+ wal_deletion_ = std::move(wal_deletion);
+ break;
+ }
+
+ case kWalDeletion2: {
+ Slice encoded;
+ if (!GetLengthPrefixedSlice(&input, &encoded)) {
+ msg = "WalDeletion not prefixed by length";
+ break;
+ }
+
+ WalDeletion wal_deletion;
+ const Status s = wal_deletion.DecodeFrom(&encoded);
+ if (!s.ok()) {
+ return s;
+ }
+
+ wal_deletion_ = std::move(wal_deletion);
+ break;
+ }
+
+ case kColumnFamily:
+ if (!GetVarint32(&input, &column_family_)) {
+ if (!msg) {
+ msg = "set column family id";
+ }
+ }
+ break;
+
+ case kColumnFamilyAdd:
+ if (GetLengthPrefixedSlice(&input, &str)) {
+ is_column_family_add_ = true;
+ column_family_name_ = str.ToString();
+ } else {
+ if (!msg) {
+ msg = "column family add";
+ }
+ }
+ break;
+
+ case kColumnFamilyDrop:
+ is_column_family_drop_ = true;
+ break;
+
+ case kInAtomicGroup:
+ is_in_atomic_group_ = true;
+ if (!GetVarint32(&input, &remaining_entries_)) {
+ if (!msg) {
+ msg = "remaining entries";
+ }
+ }
+ break;
+
+ case kFullHistoryTsLow:
+ if (!GetLengthPrefixedSlice(&input, &str)) {
+ msg = "full_history_ts_low";
+ } else if (str.empty()) {
+ msg = "full_history_ts_low: empty";
+ } else {
+ full_history_ts_low_.assign(str.data(), str.size());
+ }
+ break;
+
+ default:
+ if (tag & kTagSafeIgnoreMask) {
+ // Tag from the future that can be safely ignored.
+ // The next field must be the length of the entry.
+ uint32_t field_len;
+ if (!GetVarint32(&input, &field_len) ||
+ static_cast<size_t>(field_len) > input.size()) {
+ if (!msg) {
+ msg = "safely ignoreable tag length error";
+ }
+ } else {
+ input.remove_prefix(static_cast<size_t>(field_len));
+ }
+ } else {
+ msg = "unknown tag";
+ }
+ break;
+ }
+ }
+
+ if (msg == nullptr && !input.empty()) {
+ msg = "invalid tag";
+ }
+
+ Status result;
+ if (msg != nullptr) {
+ result = Status::Corruption("VersionEdit", msg);
+ }
+ return result;
+}
+
+std::string VersionEdit::DebugString(bool hex_key) const {
+ std::string r;
+ r.append("VersionEdit {");
+ if (has_db_id_) {
+ r.append("\n DB ID: ");
+ r.append(db_id_);
+ }
+ if (has_comparator_) {
+ r.append("\n Comparator: ");
+ r.append(comparator_);
+ }
+ if (has_log_number_) {
+ r.append("\n LogNumber: ");
+ AppendNumberTo(&r, log_number_);
+ }
+ if (has_prev_log_number_) {
+ r.append("\n PrevLogNumber: ");
+ AppendNumberTo(&r, prev_log_number_);
+ }
+ if (has_next_file_number_) {
+ r.append("\n NextFileNumber: ");
+ AppendNumberTo(&r, next_file_number_);
+ }
+ if (has_max_column_family_) {
+ r.append("\n MaxColumnFamily: ");
+ AppendNumberTo(&r, max_column_family_);
+ }
+ if (has_min_log_number_to_keep_) {
+ r.append("\n MinLogNumberToKeep: ");
+ AppendNumberTo(&r, min_log_number_to_keep_);
+ }
+ if (has_last_sequence_) {
+ r.append("\n LastSeq: ");
+ AppendNumberTo(&r, last_sequence_);
+ }
+ for (const auto& level_and_compact_cursor : compact_cursors_) {
+ r.append("\n CompactCursor: ");
+ AppendNumberTo(&r, level_and_compact_cursor.first);
+ r.append(" ");
+ r.append(level_and_compact_cursor.second.DebugString(hex_key));
+ }
+ for (const auto& deleted_file : deleted_files_) {
+ r.append("\n DeleteFile: ");
+ AppendNumberTo(&r, deleted_file.first);
+ r.append(" ");
+ AppendNumberTo(&r, deleted_file.second);
+ }
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ const FileMetaData& f = new_files_[i].second;
+ r.append("\n AddFile: ");
+ AppendNumberTo(&r, new_files_[i].first);
+ r.append(" ");
+ AppendNumberTo(&r, f.fd.GetNumber());
+ r.append(" ");
+ AppendNumberTo(&r, f.fd.GetFileSize());
+ r.append(" ");
+ r.append(f.smallest.DebugString(hex_key));
+ r.append(" .. ");
+ r.append(f.largest.DebugString(hex_key));
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ r.append(" blob_file:");
+ AppendNumberTo(&r, f.oldest_blob_file_number);
+ }
+ r.append(" oldest_ancester_time:");
+ AppendNumberTo(&r, f.oldest_ancester_time);
+ r.append(" file_creation_time:");
+ AppendNumberTo(&r, f.file_creation_time);
+ r.append(" file_checksum:");
+ r.append(Slice(f.file_checksum).ToString(true));
+ r.append(" file_checksum_func_name: ");
+ r.append(f.file_checksum_func_name);
+ if (f.temperature != Temperature::kUnknown) {
+ r.append(" temperature: ");
+      // Maybe change to a human-readable format when the feature becomes
+      // permanent.
+ r.append(std::to_string(static_cast<int>(f.temperature)));
+ }
+ if (f.unique_id != kNullUniqueId64x2) {
+ r.append(" unique_id(internal): ");
+ UniqueId64x2 id = f.unique_id;
+ r.append(InternalUniqueIdToHumanString(&id));
+ r.append(" public_unique_id: ");
+ InternalUniqueIdToExternal(&id);
+ r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id)));
+ }
+ }
+
+ for (const auto& blob_file_addition : blob_file_additions_) {
+ r.append("\n BlobFileAddition: ");
+ r.append(blob_file_addition.DebugString());
+ }
+
+ for (const auto& blob_file_garbage : blob_file_garbages_) {
+ r.append("\n BlobFileGarbage: ");
+ r.append(blob_file_garbage.DebugString());
+ }
+
+ for (const auto& wal_addition : wal_additions_) {
+ r.append("\n WalAddition: ");
+ r.append(wal_addition.DebugString());
+ }
+
+ if (!wal_deletion_.IsEmpty()) {
+ r.append("\n WalDeletion: ");
+ r.append(wal_deletion_.DebugString());
+ }
+
+ r.append("\n ColumnFamily: ");
+ AppendNumberTo(&r, column_family_);
+ if (is_column_family_add_) {
+ r.append("\n ColumnFamilyAdd: ");
+ r.append(column_family_name_);
+ }
+ if (is_column_family_drop_) {
+ r.append("\n ColumnFamilyDrop");
+ }
+ if (is_in_atomic_group_) {
+ r.append("\n AtomicGroup: ");
+ AppendNumberTo(&r, remaining_entries_);
+    r.append(" entries remain");
+ }
+ if (HasFullHistoryTsLow()) {
+ r.append("\n FullHistoryTsLow: ");
+ r.append(Slice(full_history_ts_low_).ToString(hex_key));
+ }
+ r.append("\n}\n");
+ return r;
+}
+
+std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const {
+ JSONWriter jw;
+ jw << "EditNumber" << edit_num;
+
+ if (has_db_id_) {
+ jw << "DB ID" << db_id_;
+ }
+ if (has_comparator_) {
+ jw << "Comparator" << comparator_;
+ }
+ if (has_log_number_) {
+ jw << "LogNumber" << log_number_;
+ }
+ if (has_prev_log_number_) {
+ jw << "PrevLogNumber" << prev_log_number_;
+ }
+ if (has_next_file_number_) {
+ jw << "NextFileNumber" << next_file_number_;
+ }
+ if (has_max_column_family_) {
+ jw << "MaxColumnFamily" << max_column_family_;
+ }
+ if (has_min_log_number_to_keep_) {
+ jw << "MinLogNumberToKeep" << min_log_number_to_keep_;
+ }
+ if (has_last_sequence_) {
+ jw << "LastSeq" << last_sequence_;
+ }
+
+ if (!deleted_files_.empty()) {
+ jw << "DeletedFiles";
+ jw.StartArray();
+
+ for (const auto& deleted_file : deleted_files_) {
+ jw.StartArrayedObject();
+ jw << "Level" << deleted_file.first;
+ jw << "FileNumber" << deleted_file.second;
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!new_files_.empty()) {
+ jw << "AddedFiles";
+ jw.StartArray();
+
+ for (size_t i = 0; i < new_files_.size(); i++) {
+ jw.StartArrayedObject();
+ jw << "Level" << new_files_[i].first;
+ const FileMetaData& f = new_files_[i].second;
+ jw << "FileNumber" << f.fd.GetNumber();
+ jw << "FileSize" << f.fd.GetFileSize();
+ jw << "SmallestIKey" << f.smallest.DebugString(hex_key);
+ jw << "LargestIKey" << f.largest.DebugString(hex_key);
+ jw << "OldestAncesterTime" << f.oldest_ancester_time;
+ jw << "FileCreationTime" << f.file_creation_time;
+ jw << "FileChecksum" << Slice(f.file_checksum).ToString(true);
+ jw << "FileChecksumFuncName" << f.file_checksum_func_name;
+ if (f.temperature != Temperature::kUnknown) {
+ jw << "temperature" << std::to_string(static_cast<int>(f.temperature));
+ }
+ if (f.oldest_blob_file_number != kInvalidBlobFileNumber) {
+ jw << "OldestBlobFile" << f.oldest_blob_file_number;
+ }
+ if (f.temperature != Temperature::kUnknown) {
+        // Maybe change to a human-readable format when the feature becomes
+        // permanent.
+ jw << "Temperature" << static_cast<int>(f.temperature);
+ }
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!blob_file_additions_.empty()) {
+ jw << "BlobFileAdditions";
+
+ jw.StartArray();
+
+ for (const auto& blob_file_addition : blob_file_additions_) {
+ jw.StartArrayedObject();
+ jw << blob_file_addition;
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!blob_file_garbages_.empty()) {
+ jw << "BlobFileGarbages";
+
+ jw.StartArray();
+
+ for (const auto& blob_file_garbage : blob_file_garbages_) {
+ jw.StartArrayedObject();
+ jw << blob_file_garbage;
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!wal_additions_.empty()) {
+ jw << "WalAdditions";
+
+ jw.StartArray();
+
+ for (const auto& wal_addition : wal_additions_) {
+ jw.StartArrayedObject();
+ jw << wal_addition;
+ jw.EndArrayedObject();
+ }
+
+ jw.EndArray();
+ }
+
+ if (!wal_deletion_.IsEmpty()) {
+ jw << "WalDeletion";
+ jw.StartObject();
+ jw << wal_deletion_;
+ jw.EndObject();
+ }
+
+ jw << "ColumnFamily" << column_family_;
+
+ if (is_column_family_add_) {
+ jw << "ColumnFamilyAdd" << column_family_name_;
+ }
+ if (is_column_family_drop_) {
+ jw << "ColumnFamilyDrop" << column_family_name_;
+ }
+ if (is_in_atomic_group_) {
+ jw << "AtomicGroup" << remaining_entries_;
+ }
+
+ if (HasFullHistoryTsLow()) {
+ jw << "FullHistoryTsLow" << Slice(full_history_ts_low_).ToString(hex_key);
+ }
+
+ jw.EndObject();
+
+ return jw.Get();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit.h b/src/rocksdb/db/version_edit.h
new file mode 100644
index 000000000..c9800a3c0
--- /dev/null
+++ b/src/rocksdb/db/version_edit.h
@@ -0,0 +1,669 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <algorithm>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/blob/blob_file_addition.h"
+#include "db/blob/blob_file_garbage.h"
+#include "db/dbformat.h"
+#include "db/wal_edit.h"
+#include "memory/arena.h"
+#include "port/malloc.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/cache.h"
+#include "table/table_reader.h"
+#include "table/unique_id_impl.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Tag numbers for serialized VersionEdit. These numbers are written to
+// disk and should not be changed. The numbering should remain forward
+// compatible so that users can downgrade RocksDB safely. A tag from a future
+// version is recognized as safely ignorable by ANDing it with
+// kTagSafeIgnoreMask (a nonzero result means the record can be skipped).
+enum Tag : uint32_t {
+ kComparator = 1,
+ kLogNumber = 2,
+ kNextFileNumber = 3,
+ kLastSequence = 4,
+ kCompactCursor = 5,
+ kDeletedFile = 6,
+ kNewFile = 7,
+ // 8 was used for large value refs
+ kPrevLogNumber = 9,
+ kMinLogNumberToKeep = 10,
+
+ // these are new formats divergent from open source leveldb
+ kNewFile2 = 100,
+ kNewFile3 = 102,
+ kNewFile4 = 103, // 4th (the latest) format version of adding files
+ kColumnFamily = 200, // specify column family for version edit
+ kColumnFamilyAdd = 201,
+ kColumnFamilyDrop = 202,
+ kMaxColumnFamily = 203,
+
+ kInAtomicGroup = 300,
+
+ kBlobFileAddition = 400,
+ kBlobFileGarbage,
+
+ // Mask for an unidentified tag from the future which can be safely ignored.
+ kTagSafeIgnoreMask = 1 << 13,
+
+ // Forward compatible (aka ignorable) records
+ kDbId,
+ kBlobFileAddition_DEPRECATED,
+ kBlobFileGarbage_DEPRECATED,
+ kWalAddition,
+ kWalDeletion,
+ kFullHistoryTsLow,
+ kWalAddition2,
+ kWalDeletion2,
+};
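The forward-compatibility rule documented above can be summarized in a small standalone sketch (illustrative only, not part of this change): a tag that the current binary does not recognize may be skipped if and only if it carries the kTagSafeIgnoreMask bit, which is how DecodeFrom() treats unknown tags.

#include <cstdint>
#include <iostream>

// Mirrors kTagSafeIgnoreMask from the Tag enum above.
constexpr uint32_t kSafeIgnoreMask = 1u << 13;

// An unrecognized tag is skippable iff the safe-ignore bit is set; such
// records are encoded with a length-prefixed payload so the reader can
// advance past them.
bool IsSafelyIgnorable(uint32_t tag) { return (tag & kSafeIgnoreMask) != 0; }

int main() {
  std::cout << IsSafelyIgnorable(7) << "\n";                // kNewFile -> 0
  std::cout << IsSafelyIgnorable((1u << 13) + 42) << "\n";  // future tag -> 1
  return 0;
}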
+
+enum NewFileCustomTag : uint32_t {
+ kTerminate = 1, // The end of customized fields
+ kNeedCompaction = 2,
+ // Since Manifest is not entirely forward-compatible, we currently encode
+ // kMinLogNumberToKeep as part of NewFile as a hack. This should be removed
+ // when manifest becomes forward-compatible.
+ kMinLogNumberToKeepHack = 3,
+ kOldestBlobFileNumber = 4,
+ kOldestAncesterTime = 5,
+ kFileCreationTime = 6,
+ kFileChecksum = 7,
+ kFileChecksumFuncName = 8,
+ kTemperature = 9,
+ kMinTimestamp = 10,
+ kMaxTimestamp = 11,
+ kUniqueId = 12,
+
+ // If this bit for the custom tag is set, opening DB should fail if
+ // we don't know this field.
+ kCustomTagNonSafeIgnoreMask = 1 << 6,
+
+ // Forward incompatible (aka unignorable) fields
+ kPathId,
+};
+
+class VersionSet;
+
+constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF;
+constexpr uint64_t kUnknownOldestAncesterTime = 0;
+constexpr uint64_t kUnknownFileCreationTime = 0;
+
+extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id);
+
+// A copyable structure that contains the information needed to read data from
+// an SST file. It can contain a pointer to a table reader opened for the file,
+// or the file number and size, which can be used to create a new table reader
+// for it. The behavior is undefined when a copy of the structure is used after
+// the file is no longer part of any live version.
+struct FileDescriptor {
+ // Table reader in table_reader_handle
+ TableReader* table_reader;
+ uint64_t packed_number_and_path_id;
+ uint64_t file_size; // File size in bytes
+ SequenceNumber smallest_seqno; // The smallest seqno in this file
+ SequenceNumber largest_seqno; // The largest seqno in this file
+
+ FileDescriptor() : FileDescriptor(0, 0, 0) {}
+
+ FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size)
+ : FileDescriptor(number, path_id, _file_size, kMaxSequenceNumber, 0) {}
+
+ FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size,
+ SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno)
+ : table_reader(nullptr),
+ packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)),
+ file_size(_file_size),
+ smallest_seqno(_smallest_seqno),
+ largest_seqno(_largest_seqno) {}
+
+ FileDescriptor(const FileDescriptor& fd) { *this = fd; }
+
+ FileDescriptor& operator=(const FileDescriptor& fd) {
+ table_reader = fd.table_reader;
+ packed_number_and_path_id = fd.packed_number_and_path_id;
+ file_size = fd.file_size;
+ smallest_seqno = fd.smallest_seqno;
+ largest_seqno = fd.largest_seqno;
+ return *this;
+ }
+
+ uint64_t GetNumber() const {
+ return packed_number_and_path_id & kFileNumberMask;
+ }
+ uint32_t GetPathId() const {
+ return static_cast<uint32_t>(packed_number_and_path_id /
+ (kFileNumberMask + 1));
+ }
+ uint64_t GetFileSize() const { return file_size; }
+};
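PackFileNumberAndPathId() is only declared in this hunk, so the standalone sketch below reconstructs the packing scheme implied by kFileNumberMask, GetNumber() and GetPathId(); the Pack() helper here is a hypothetical stand-in for illustration, not the actual RocksDB implementation.

#include <cassert>
#include <cstdint>

// Copied from the constant above (file number occupies the low 62 bits).
constexpr uint64_t kMask = 0x3FFFFFFFFFFFFFFF;

// Hypothetical packing: path_id is stored in the bits above the file number,
// consistent with GetNumber() and GetPathId() above.
uint64_t Pack(uint64_t number, uint64_t path_id) {
  assert(number <= kMask);
  return number + path_id * (kMask + 1);
}

int main() {
  const uint64_t packed = Pack(/*number=*/123, /*path_id=*/2);
  assert((packed & kMask) == 123);    // what GetNumber() computes
  assert(packed / (kMask + 1) == 2);  // what GetPathId() computes
  return 0;
}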
+
+struct FileSampledStats {
+ FileSampledStats() : num_reads_sampled(0) {}
+ FileSampledStats(const FileSampledStats& other) { *this = other; }
+ FileSampledStats& operator=(const FileSampledStats& other) {
+ num_reads_sampled = other.num_reads_sampled.load();
+ return *this;
+ }
+
+ // number of user reads to this file.
+ mutable std::atomic<uint64_t> num_reads_sampled;
+};
+
+struct FileMetaData {
+ FileDescriptor fd;
+ InternalKey smallest; // Smallest internal key served by table
+ InternalKey largest; // Largest internal key served by table
+
+ // Needs to be disposed when refs becomes 0.
+ Cache::Handle* table_reader_handle = nullptr;
+
+ FileSampledStats stats;
+
+ // Stats for compensating deletion entries during compaction
+
+ // File size compensated by deletion entry.
+ // This is updated in Version::UpdateAccumulatedStats() first time when the
+ // file is created or loaded. After it is updated (!= 0), it is immutable.
+ uint64_t compensated_file_size = 0;
+ // These values can mutate, but they can only be read or written from
+ // single-threaded LogAndApply thread
+ uint64_t num_entries = 0; // the number of entries.
+ uint64_t num_deletions = 0; // the number of deletion entries.
+ uint64_t raw_key_size = 0; // total uncompressed key size.
+ uint64_t raw_value_size = 0; // total uncompressed value size.
+
+ int refs = 0; // Reference count
+
+ bool being_compacted = false; // Is this file undergoing compaction?
+ bool init_stats_from_file = false; // true if the data-entry stats of this
+ // file has initialized from file.
+
+ bool marked_for_compaction = false; // True if client asked us nicely to
+ // compact this file.
+ Temperature temperature = Temperature::kUnknown;
+
+ // Used only in BlobDB. The file number of the oldest blob file this SST file
+ // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1.
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber;
+
+  // The file could be the compaction output of other SST files, which could
+  // in turn be the outputs of compacting even older SST files. We track the
+  // memtable flush timestamp of the oldest SST file that eventually
+  // contributed data to this file. 0 means the information is not available.
+ uint64_t oldest_ancester_time = kUnknownOldestAncesterTime;
+
+ // Unix time when the SST file is created.
+ uint64_t file_creation_time = kUnknownFileCreationTime;
+
+ // File checksum
+ std::string file_checksum = kUnknownFileChecksum;
+
+ // File checksum function name
+ std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
+
+ // SST unique id
+ UniqueId64x2 unique_id{};
+
+ FileMetaData() = default;
+
+ FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,
+ const InternalKey& smallest_key, const InternalKey& largest_key,
+ const SequenceNumber& smallest_seq,
+ const SequenceNumber& largest_seq, bool marked_for_compact,
+ Temperature _temperature, uint64_t oldest_blob_file,
+ uint64_t _oldest_ancester_time, uint64_t _file_creation_time,
+ const std::string& _file_checksum,
+ const std::string& _file_checksum_func_name,
+ UniqueId64x2 _unique_id)
+ : fd(file, file_path_id, file_size, smallest_seq, largest_seq),
+ smallest(smallest_key),
+ largest(largest_key),
+ marked_for_compaction(marked_for_compact),
+ temperature(_temperature),
+ oldest_blob_file_number(oldest_blob_file),
+ oldest_ancester_time(_oldest_ancester_time),
+ file_creation_time(_file_creation_time),
+ file_checksum(_file_checksum),
+ file_checksum_func_name(_file_checksum_func_name),
+ unique_id(std::move(_unique_id)) {
+ TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this);
+ }
+
+ // REQUIRED: Keys must be given to the function in sorted order (it expects
+ // the last key to be the largest).
+ Status UpdateBoundaries(const Slice& key, const Slice& value,
+ SequenceNumber seqno, ValueType value_type);
+
+ // Unlike UpdateBoundaries, ranges do not need to be presented in any
+ // particular order.
+ void UpdateBoundariesForRange(const InternalKey& start,
+ const InternalKey& end, SequenceNumber seqno,
+ const InternalKeyComparator& icmp) {
+ if (smallest.size() == 0 || icmp.Compare(start, smallest) < 0) {
+ smallest = start;
+ }
+ if (largest.size() == 0 || icmp.Compare(largest, end) < 0) {
+ largest = end;
+ }
+ fd.smallest_seqno = std::min(fd.smallest_seqno, seqno);
+ fd.largest_seqno = std::max(fd.largest_seqno, seqno);
+ }
+
+ // Try to get oldest ancester time from the class itself or table properties
+ // if table reader is already pinned.
+ // 0 means the information is not available.
+ uint64_t TryGetOldestAncesterTime() {
+ if (oldest_ancester_time != kUnknownOldestAncesterTime) {
+ return oldest_ancester_time;
+ } else if (fd.table_reader != nullptr &&
+ fd.table_reader->GetTableProperties() != nullptr) {
+ return fd.table_reader->GetTableProperties()->creation_time;
+ }
+ return kUnknownOldestAncesterTime;
+ }
+
+ uint64_t TryGetFileCreationTime() {
+ if (file_creation_time != kUnknownFileCreationTime) {
+ return file_creation_time;
+ } else if (fd.table_reader != nullptr &&
+ fd.table_reader->GetTableProperties() != nullptr) {
+ return fd.table_reader->GetTableProperties()->file_creation_time;
+ }
+ return kUnknownFileCreationTime;
+ }
+
+ // WARNING: manual update to this function is needed
+ // whenever a new string property is added to FileMetaData
+ // to reduce approximation error.
+ //
+ // TODO: eliminate the need of manually updating this function
+ // for new string properties
+ size_t ApproximateMemoryUsage() const {
+ size_t usage = 0;
+#ifdef ROCKSDB_MALLOC_USABLE_SIZE
+ usage += malloc_usable_size(const_cast<FileMetaData*>(this));
+#else
+ usage += sizeof(*this);
+#endif // ROCKSDB_MALLOC_USABLE_SIZE
+ usage += smallest.size() + largest.size() + file_checksum.size() +
+ file_checksum_func_name.size();
+ return usage;
+ }
+};
+
+// A compressed copy of file metadata that contains just the minimum data
+// needed to serve read operations, while still keeping a pointer to the full
+// metadata of the file in case it is needed.
+struct FdWithKeyRange {
+ FileDescriptor fd;
+  FileMetaData* file_metadata;  // Points to the full metadata of the file
+  Slice smallest_key;           // Slice that contains the smallest key
+  Slice largest_key;            // Slice that contains the largest key
+
+ FdWithKeyRange()
+ : fd(), file_metadata(nullptr), smallest_key(), largest_key() {}
+
+ FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key,
+ FileMetaData* _file_metadata)
+ : fd(_fd),
+ file_metadata(_file_metadata),
+ smallest_key(_smallest_key),
+ largest_key(_largest_key) {}
+};
+
+// Data structure to store an array of FdWithKeyRange in one level.
+// The actual data is guaranteed to be stored close together in memory.
+struct LevelFilesBrief {
+ size_t num_files;
+ FdWithKeyRange* files;
+ LevelFilesBrief() {
+ num_files = 0;
+ files = nullptr;
+ }
+};
+
+// The state of a DB at any given time is referred to as a Version.
+// Any modification to the Version is considered a Version Edit. A Version is
+// constructed by joining a sequence of Version Edits. Version Edits are written
+// to the MANIFEST file.
+class VersionEdit {
+ public:
+ void Clear();
+
+ void SetDBId(const std::string& db_id) {
+ has_db_id_ = true;
+ db_id_ = db_id;
+ }
+ bool HasDbId() const { return has_db_id_; }
+ const std::string& GetDbId() const { return db_id_; }
+
+ void SetComparatorName(const Slice& name) {
+ has_comparator_ = true;
+ comparator_ = name.ToString();
+ }
+ bool HasComparatorName() const { return has_comparator_; }
+ const std::string& GetComparatorName() const { return comparator_; }
+
+ void SetLogNumber(uint64_t num) {
+ has_log_number_ = true;
+ log_number_ = num;
+ }
+ bool HasLogNumber() const { return has_log_number_; }
+ uint64_t GetLogNumber() const { return log_number_; }
+
+ void SetPrevLogNumber(uint64_t num) {
+ has_prev_log_number_ = true;
+ prev_log_number_ = num;
+ }
+ bool HasPrevLogNumber() const { return has_prev_log_number_; }
+ uint64_t GetPrevLogNumber() const { return prev_log_number_; }
+
+ void SetNextFile(uint64_t num) {
+ has_next_file_number_ = true;
+ next_file_number_ = num;
+ }
+ bool HasNextFile() const { return has_next_file_number_; }
+ uint64_t GetNextFile() const { return next_file_number_; }
+
+ void SetMaxColumnFamily(uint32_t max_column_family) {
+ has_max_column_family_ = true;
+ max_column_family_ = max_column_family;
+ }
+ bool HasMaxColumnFamily() const { return has_max_column_family_; }
+ uint32_t GetMaxColumnFamily() const { return max_column_family_; }
+
+ void SetMinLogNumberToKeep(uint64_t num) {
+ has_min_log_number_to_keep_ = true;
+ min_log_number_to_keep_ = num;
+ }
+ bool HasMinLogNumberToKeep() const { return has_min_log_number_to_keep_; }
+ uint64_t GetMinLogNumberToKeep() const { return min_log_number_to_keep_; }
+
+ void SetLastSequence(SequenceNumber seq) {
+ has_last_sequence_ = true;
+ last_sequence_ = seq;
+ }
+ bool HasLastSequence() const { return has_last_sequence_; }
+ SequenceNumber GetLastSequence() const { return last_sequence_; }
+
+ // Delete the specified table file from the specified level.
+ void DeleteFile(int level, uint64_t file) {
+ deleted_files_.emplace(level, file);
+ }
+
+ // Retrieve the table files deleted as well as their associated levels.
+ using DeletedFiles = std::set<std::pair<int, uint64_t>>;
+ const DeletedFiles& GetDeletedFiles() const { return deleted_files_; }
+
+ // Add the specified table file at the specified level.
+ // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
+ // REQUIRES: "oldest_blob_file_number" is the number of the oldest blob file
+ // referred to by this file if any, kInvalidBlobFileNumber otherwise.
+ void AddFile(int level, uint64_t file, uint32_t file_path_id,
+ uint64_t file_size, const InternalKey& smallest,
+ const InternalKey& largest, const SequenceNumber& smallest_seqno,
+ const SequenceNumber& largest_seqno, bool marked_for_compaction,
+ Temperature temperature, uint64_t oldest_blob_file_number,
+ uint64_t oldest_ancester_time, uint64_t file_creation_time,
+ const std::string& file_checksum,
+ const std::string& file_checksum_func_name,
+ const UniqueId64x2& unique_id) {
+ assert(smallest_seqno <= largest_seqno);
+ new_files_.emplace_back(
+ level,
+ FileMetaData(file, file_path_id, file_size, smallest, largest,
+ smallest_seqno, largest_seqno, marked_for_compaction,
+ temperature, oldest_blob_file_number, oldest_ancester_time,
+ file_creation_time, file_checksum, file_checksum_func_name,
+ unique_id));
+ if (!HasLastSequence() || largest_seqno > GetLastSequence()) {
+ SetLastSequence(largest_seqno);
+ }
+ }
+
+ void AddFile(int level, const FileMetaData& f) {
+ assert(f.fd.smallest_seqno <= f.fd.largest_seqno);
+ new_files_.emplace_back(level, f);
+ if (!HasLastSequence() || f.fd.largest_seqno > GetLastSequence()) {
+ SetLastSequence(f.fd.largest_seqno);
+ }
+ }
+
+ // Retrieve the table files added as well as their associated levels.
+ using NewFiles = std::vector<std::pair<int, FileMetaData>>;
+ const NewFiles& GetNewFiles() const { return new_files_; }
+
+ // Retrieve all the compact cursors
+ using CompactCursors = std::vector<std::pair<int, InternalKey>>;
+ const CompactCursors& GetCompactCursors() const { return compact_cursors_; }
+ void AddCompactCursor(int level, const InternalKey& cursor) {
+ compact_cursors_.push_back(std::make_pair(level, cursor));
+ }
+ void SetCompactCursors(
+ const std::vector<InternalKey>& compact_cursors_by_level) {
+ compact_cursors_.clear();
+ compact_cursors_.reserve(compact_cursors_by_level.size());
+ for (int i = 0; i < (int)compact_cursors_by_level.size(); i++) {
+ if (compact_cursors_by_level[i].Valid()) {
+ compact_cursors_.push_back(
+ std::make_pair(i, compact_cursors_by_level[i]));
+ }
+ }
+ }
+
+ // Add a new blob file.
+ void AddBlobFile(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes, std::string checksum_method,
+ std::string checksum_value) {
+ blob_file_additions_.emplace_back(
+ blob_file_number, total_blob_count, total_blob_bytes,
+ std::move(checksum_method), std::move(checksum_value));
+ }
+
+ void AddBlobFile(BlobFileAddition blob_file_addition) {
+ blob_file_additions_.emplace_back(std::move(blob_file_addition));
+ }
+
+ // Retrieve all the blob files added.
+ using BlobFileAdditions = std::vector<BlobFileAddition>;
+ const BlobFileAdditions& GetBlobFileAdditions() const {
+ return blob_file_additions_;
+ }
+
+ void SetBlobFileAdditions(BlobFileAdditions blob_file_additions) {
+ assert(blob_file_additions_.empty());
+ blob_file_additions_ = std::move(blob_file_additions);
+ }
+
+  // Add garbage for an existing blob file.
+ void AddBlobFileGarbage(uint64_t blob_file_number,
+ uint64_t garbage_blob_count,
+ uint64_t garbage_blob_bytes) {
+ blob_file_garbages_.emplace_back(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ void AddBlobFileGarbage(BlobFileGarbage blob_file_garbage) {
+ blob_file_garbages_.emplace_back(std::move(blob_file_garbage));
+ }
+
+ // Retrieve all the blob file garbage added.
+ using BlobFileGarbages = std::vector<BlobFileGarbage>;
+ const BlobFileGarbages& GetBlobFileGarbages() const {
+ return blob_file_garbages_;
+ }
+
+ void SetBlobFileGarbages(BlobFileGarbages blob_file_garbages) {
+ assert(blob_file_garbages_.empty());
+ blob_file_garbages_ = std::move(blob_file_garbages);
+ }
+
+ // Add a WAL (either just created or closed).
+ // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
+ void AddWal(WalNumber number, WalMetadata metadata = WalMetadata()) {
+ assert(NumEntries() == wal_additions_.size());
+ wal_additions_.emplace_back(number, std::move(metadata));
+ }
+
+ // Retrieve all the added WALs.
+ const WalAdditions& GetWalAdditions() const { return wal_additions_; }
+
+ bool IsWalAddition() const { return !wal_additions_.empty(); }
+
+ // Delete a WAL (either directly deleted or archived).
+ // AddWal and DeleteWalsBefore cannot be called on the same VersionEdit.
+ void DeleteWalsBefore(WalNumber number) {
+ assert((NumEntries() == 1) == !wal_deletion_.IsEmpty());
+ wal_deletion_ = WalDeletion(number);
+ }
+
+ const WalDeletion& GetWalDeletion() const { return wal_deletion_; }
+
+ bool IsWalDeletion() const { return !wal_deletion_.IsEmpty(); }
+
+ bool IsWalManipulation() const {
+ size_t entries = NumEntries();
+ return (entries > 0) && ((entries == wal_additions_.size()) ||
+ (entries == !wal_deletion_.IsEmpty()));
+ }
+
+ // Number of edits
+ size_t NumEntries() const {
+ return new_files_.size() + deleted_files_.size() +
+ blob_file_additions_.size() + blob_file_garbages_.size() +
+ wal_additions_.size() + !wal_deletion_.IsEmpty();
+ }
+
+ void SetColumnFamily(uint32_t column_family_id) {
+ column_family_ = column_family_id;
+ }
+ uint32_t GetColumnFamily() const { return column_family_; }
+
+ // set column family ID by calling SetColumnFamily()
+ void AddColumnFamily(const std::string& name) {
+ assert(!is_column_family_drop_);
+ assert(!is_column_family_add_);
+ assert(NumEntries() == 0);
+ is_column_family_add_ = true;
+ column_family_name_ = name;
+ }
+
+ // set column family ID by calling SetColumnFamily()
+ void DropColumnFamily() {
+ assert(!is_column_family_drop_);
+ assert(!is_column_family_add_);
+ assert(NumEntries() == 0);
+ is_column_family_drop_ = true;
+ }
+
+ bool IsColumnFamilyManipulation() const {
+ return is_column_family_add_ || is_column_family_drop_;
+ }
+
+ bool IsColumnFamilyAdd() const { return is_column_family_add_; }
+
+ bool IsColumnFamilyDrop() const { return is_column_family_drop_; }
+
+ void MarkAtomicGroup(uint32_t remaining_entries) {
+ is_in_atomic_group_ = true;
+ remaining_entries_ = remaining_entries;
+ }
+ bool IsInAtomicGroup() const { return is_in_atomic_group_; }
+ uint32_t GetRemainingEntries() const { return remaining_entries_; }
+
+ bool HasFullHistoryTsLow() const { return !full_history_ts_low_.empty(); }
+ const std::string& GetFullHistoryTsLow() const {
+ assert(HasFullHistoryTsLow());
+ return full_history_ts_low_;
+ }
+ void SetFullHistoryTsLow(std::string full_history_ts_low) {
+ assert(!full_history_ts_low.empty());
+ full_history_ts_low_ = std::move(full_history_ts_low);
+ }
+
+ // return true on success.
+ bool EncodeTo(std::string* dst) const;
+ Status DecodeFrom(const Slice& src);
+
+ std::string DebugString(bool hex_key = false) const;
+ std::string DebugJSON(int edit_num, bool hex_key = false) const;
+
+ private:
+ friend class ReactiveVersionSet;
+ friend class VersionEditHandlerBase;
+ friend class ListColumnFamiliesHandler;
+ friend class VersionEditHandler;
+ friend class VersionEditHandlerPointInTime;
+ friend class DumpManifestHandler;
+ friend class VersionSet;
+ friend class Version;
+ friend class AtomicGroupReadBuffer;
+
+ bool GetLevel(Slice* input, int* level, const char** msg);
+
+ const char* DecodeNewFile4From(Slice* input);
+
+ int max_level_ = 0;
+ std::string db_id_;
+ std::string comparator_;
+ uint64_t log_number_ = 0;
+ uint64_t prev_log_number_ = 0;
+ uint64_t next_file_number_ = 0;
+ uint32_t max_column_family_ = 0;
+ // The most recent WAL log number that is deleted
+ uint64_t min_log_number_to_keep_ = 0;
+ SequenceNumber last_sequence_ = 0;
+ bool has_db_id_ = false;
+ bool has_comparator_ = false;
+ bool has_log_number_ = false;
+ bool has_prev_log_number_ = false;
+ bool has_next_file_number_ = false;
+ bool has_max_column_family_ = false;
+ bool has_min_log_number_to_keep_ = false;
+ bool has_last_sequence_ = false;
+
+ // Compaction cursors for round-robin compaction policy
+ CompactCursors compact_cursors_;
+
+ DeletedFiles deleted_files_;
+ NewFiles new_files_;
+
+ BlobFileAdditions blob_file_additions_;
+ BlobFileGarbages blob_file_garbages_;
+
+ WalAdditions wal_additions_;
+ WalDeletion wal_deletion_;
+
+ // Each version edit record should have column_family_ set
+ // If it's not set, it is default (0)
+ uint32_t column_family_ = 0;
+ // a version edit can be either column_family add or
+ // column_family drop. If it's column family add,
+ // it also includes column family name.
+ bool is_column_family_drop_ = false;
+ bool is_column_family_add_ = false;
+ std::string column_family_name_;
+
+ bool is_in_atomic_group_ = false;
+ uint32_t remaining_entries_ = 0;
+
+ std::string full_history_ts_low_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
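A typical write/read round trip through the class above, as relied on by the MANIFEST code, might look like the sketch below. It assumes compilation inside the RocksDB source tree and is illustrative rather than a standalone program; the calls used (SetComparatorName, SetLogNumber, DeleteFile, EncodeTo, DecodeFrom, DebugString) are all declared in the header above.

#include <cassert>
#include <iostream>
#include <string>

#include "db/version_edit.h"

int main() {
  ROCKSDB_NAMESPACE::VersionEdit edit;
  edit.SetComparatorName("leveldb.BytewiseComparator");
  edit.SetLogNumber(42);
  edit.DeleteFile(/*level=*/1, /*file=*/100);

  std::string record;
  const bool encoded = edit.EncodeTo(&record);  // one serialized MANIFEST record
  assert(encoded);

  ROCKSDB_NAMESPACE::VersionEdit decoded;
  const ROCKSDB_NAMESPACE::Status s = decoded.DecodeFrom(record);
  assert(s.ok());
  std::cout << decoded.DebugString(/*hex_key=*/false);
  return 0;
}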
diff --git a/src/rocksdb/db/version_edit_handler.cc b/src/rocksdb/db/version_edit_handler.cc
new file mode 100644
index 000000000..145e78789
--- /dev/null
+++ b/src/rocksdb/db/version_edit_handler.cc
@@ -0,0 +1,1002 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit_handler.h"
+
+#include <cinttypes>
+#include <sstream>
+
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_source.h"
+#include "logging/logging.h"
+#include "monitoring/persistent_stats_history.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void VersionEditHandlerBase::Iterate(log::Reader& reader,
+ Status* log_read_status) {
+ Slice record;
+ std::string scratch;
+ assert(log_read_status);
+ assert(log_read_status->ok());
+
+ size_t recovered_edits = 0;
+ Status s = Initialize();
+ while (reader.LastRecordEnd() < max_manifest_read_size_ && s.ok() &&
+ reader.ReadRecord(&record, &scratch) && log_read_status->ok()) {
+ VersionEdit edit;
+ s = edit.DecodeFrom(record);
+ if (!s.ok()) {
+ break;
+ }
+
+ s = read_buffer_.AddEdit(&edit);
+ if (!s.ok()) {
+ break;
+ }
+ ColumnFamilyData* cfd = nullptr;
+ if (edit.is_in_atomic_group_) {
+ if (read_buffer_.IsFull()) {
+ for (auto& e : read_buffer_.replay_buffer()) {
+ s = ApplyVersionEdit(e, &cfd);
+ if (!s.ok()) {
+ break;
+ }
+ ++recovered_edits;
+ }
+ if (!s.ok()) {
+ break;
+ }
+ read_buffer_.Clear();
+ }
+ } else {
+ s = ApplyVersionEdit(edit, &cfd);
+ if (s.ok()) {
+ ++recovered_edits;
+ }
+ }
+ }
+ if (!log_read_status->ok()) {
+ s = *log_read_status;
+ }
+
+ CheckIterationResult(reader, &s);
+
+ if (!s.ok()) {
+ if (s.IsCorruption()) {
+      // When we find a Corruption error, something is
+      // wrong with the underlying file. In this case we
+      // want to report the filename, so we append
+      // the filename to the Corruption message.
+ assert(reader.file());
+
+ // build a new error message
+ std::stringstream message;
+ // append previous dynamic state message
+ const char* state = s.getState();
+ if (state != nullptr) {
+ message << state;
+ message << ' ';
+ }
+ // append the filename to the corruption message
+ message << "in file " << reader.file()->file_name();
+ // overwrite the status with the extended status
+ s = Status(s.code(), s.subcode(), s.severity(), message.str());
+ }
+ status_ = s;
+ }
+ TEST_SYNC_POINT_CALLBACK("VersionEditHandlerBase::Iterate:Finish",
+ &recovered_edits);
+}
+
+Status ListColumnFamiliesHandler::ApplyVersionEdit(
+ VersionEdit& edit, ColumnFamilyData** /*unused*/) {
+ Status s;
+ if (edit.is_column_family_add_) {
+ if (column_family_names_.find(edit.column_family_) !=
+ column_family_names_.end()) {
+ s = Status::Corruption("Manifest adding the same column family twice");
+ } else {
+ column_family_names_.insert(
+ {edit.column_family_, edit.column_family_name_});
+ }
+ } else if (edit.is_column_family_drop_) {
+ if (column_family_names_.find(edit.column_family_) ==
+ column_family_names_.end()) {
+ s = Status::Corruption("Manifest - dropping non-existing column family");
+ } else {
+ column_family_names_.erase(edit.column_family_);
+ }
+ }
+ return s;
+}
+
+Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** /*unused*/) {
+ for (const auto& deleted_file : edit.GetDeletedFiles()) {
+ Status s = file_checksum_list_.RemoveOneFileChecksum(deleted_file.second);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ for (const auto& new_file : edit.GetNewFiles()) {
+ Status s = file_checksum_list_.InsertOneFileChecksum(
+ new_file.second.fd.GetNumber(), new_file.second.file_checksum,
+ new_file.second.file_checksum_func_name);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ for (const auto& new_blob_file : edit.GetBlobFileAdditions()) {
+ std::string checksum_value = new_blob_file.GetChecksumValue();
+ std::string checksum_method = new_blob_file.GetChecksumMethod();
+ assert(checksum_value.empty() == checksum_method.empty());
+ if (checksum_method.empty()) {
+ checksum_value = kUnknownFileChecksum;
+ checksum_method = kUnknownFileChecksumFuncName;
+ }
+ Status s = file_checksum_list_.InsertOneFileChecksum(
+ new_blob_file.GetBlobFileNumber(), checksum_value, checksum_method);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ return Status::OK();
+}
+
+VersionEditHandler::VersionEditHandler(
+ bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set, bool track_missing_files,
+ bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
+ bool skip_load_table_files)
+ : VersionEditHandlerBase(),
+ read_only_(read_only),
+ column_families_(std::move(column_families)),
+ version_set_(version_set),
+ track_missing_files_(track_missing_files),
+ no_error_if_files_missing_(no_error_if_files_missing),
+ io_tracer_(io_tracer),
+ skip_load_table_files_(skip_load_table_files),
+ initialized_(false) {
+ assert(version_set_ != nullptr);
+}
+
+Status VersionEditHandler::Initialize() {
+ Status s;
+ if (!initialized_) {
+ for (const auto& cf_desc : column_families_) {
+ name_to_options_.emplace(cf_desc.name, cf_desc.options);
+ }
+ auto default_cf_iter = name_to_options_.find(kDefaultColumnFamilyName);
+ if (default_cf_iter == name_to_options_.end()) {
+ s = Status::InvalidArgument("Default column family not specified");
+ }
+ if (s.ok()) {
+ VersionEdit default_cf_edit;
+ default_cf_edit.AddColumnFamily(kDefaultColumnFamilyName);
+ default_cf_edit.SetColumnFamily(0);
+ ColumnFamilyData* cfd =
+ CreateCfAndInit(default_cf_iter->second, default_cf_edit);
+ assert(cfd != nullptr);
+#ifdef NDEBUG
+ (void)cfd;
+#endif
+ initialized_ = true;
+ }
+ }
+ return s;
+}
+
+Status VersionEditHandler::ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ Status s;
+ if (edit.is_column_family_add_) {
+ s = OnColumnFamilyAdd(edit, cfd);
+ } else if (edit.is_column_family_drop_) {
+ s = OnColumnFamilyDrop(edit, cfd);
+ } else if (edit.IsWalAddition()) {
+ s = OnWalAddition(edit);
+ } else if (edit.IsWalDeletion()) {
+ s = OnWalDeletion(edit);
+ } else {
+ s = OnNonCfOperation(edit, cfd);
+ }
+ if (s.ok()) {
+ assert(cfd != nullptr);
+ s = ExtractInfoFromVersionEdit(*cfd, edit);
+ }
+ return s;
+}
+
+Status VersionEditHandler::OnColumnFamilyAdd(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ bool cf_in_not_found = false;
+ bool cf_in_builders = false;
+ CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+ assert(cfd != nullptr);
+ *cfd = nullptr;
+ Status s;
+ if (cf_in_builders || cf_in_not_found) {
+ s = Status::Corruption("MANIFEST adding the same column family twice: " +
+ edit.column_family_name_);
+ }
+ if (s.ok()) {
+ auto cf_options = name_to_options_.find(edit.column_family_name_);
+    // Implicitly add the persistent_stats column family without requiring the
+    // user to specify it.
+ ColumnFamilyData* tmp_cfd = nullptr;
+ bool is_persistent_stats_column_family =
+ edit.column_family_name_.compare(kPersistentStatsColumnFamilyName) == 0;
+ if (cf_options == name_to_options_.end() &&
+ !is_persistent_stats_column_family) {
+ column_families_not_found_.emplace(edit.column_family_,
+ edit.column_family_name_);
+ } else {
+ if (is_persistent_stats_column_family) {
+ ColumnFamilyOptions cfo;
+ OptimizeForPersistentStats(&cfo);
+ tmp_cfd = CreateCfAndInit(cfo, edit);
+ } else {
+ tmp_cfd = CreateCfAndInit(cf_options->second, edit);
+ }
+ *cfd = tmp_cfd;
+ }
+ }
+ return s;
+}
+
+Status VersionEditHandler::OnColumnFamilyDrop(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ bool cf_in_not_found = false;
+ bool cf_in_builders = false;
+ CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+ assert(cfd != nullptr);
+ *cfd = nullptr;
+ ColumnFamilyData* tmp_cfd = nullptr;
+ Status s;
+ if (cf_in_builders) {
+ tmp_cfd = DestroyCfAndCleanup(edit);
+ } else if (cf_in_not_found) {
+ column_families_not_found_.erase(edit.column_family_);
+ } else {
+ s = Status::Corruption("MANIFEST - dropping non-existing column family");
+ }
+ *cfd = tmp_cfd;
+ return s;
+}
+
+Status VersionEditHandler::OnWalAddition(VersionEdit& edit) {
+ assert(edit.IsWalAddition());
+ return version_set_->wals_.AddWals(edit.GetWalAdditions());
+}
+
+Status VersionEditHandler::OnWalDeletion(VersionEdit& edit) {
+ assert(edit.IsWalDeletion());
+ return version_set_->wals_.DeleteWalsBefore(
+ edit.GetWalDeletion().GetLogNumber());
+}
+
+Status VersionEditHandler::OnNonCfOperation(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ bool cf_in_not_found = false;
+ bool cf_in_builders = false;
+ CheckColumnFamilyId(edit, &cf_in_not_found, &cf_in_builders);
+
+ assert(cfd != nullptr);
+ *cfd = nullptr;
+ Status s;
+ if (!cf_in_not_found) {
+ if (!cf_in_builders) {
+ s = Status::Corruption(
+ "MANIFEST record referencing unknown column family");
+ }
+ ColumnFamilyData* tmp_cfd = nullptr;
+ if (s.ok()) {
+ auto builder_iter = builders_.find(edit.column_family_);
+ assert(builder_iter != builders_.end());
+ tmp_cfd = version_set_->GetColumnFamilySet()->GetColumnFamily(
+ edit.column_family_);
+ assert(tmp_cfd != nullptr);
+ s = MaybeCreateVersion(edit, tmp_cfd, /*force_create_version=*/false);
+ if (s.ok()) {
+ s = builder_iter->second->version_builder()->Apply(&edit);
+ }
+ }
+ *cfd = tmp_cfd;
+ }
+ return s;
+}
+
+// TODO maybe cache the computation result
+bool VersionEditHandler::HasMissingFiles() const {
+ bool ret = false;
+ for (const auto& elem : cf_to_missing_files_) {
+ const auto& missing_files = elem.second;
+ if (!missing_files.empty()) {
+ ret = true;
+ break;
+ }
+ }
+ if (!ret) {
+ for (const auto& elem : cf_to_missing_blob_files_high_) {
+ if (elem.second != kInvalidBlobFileNumber) {
+ ret = true;
+ break;
+ }
+ }
+ }
+ return ret;
+}
+
+void VersionEditHandler::CheckColumnFamilyId(const VersionEdit& edit,
+ bool* cf_in_not_found,
+ bool* cf_in_builders) const {
+ assert(cf_in_not_found != nullptr);
+ assert(cf_in_builders != nullptr);
+  // Not found means that the user didn't supply that column
+  // family option AND we encountered a column family add
+  // record. Once we encounter a column family drop record,
+  // we delete the column family from
+  // column_families_not_found.
+ bool in_not_found = column_families_not_found_.find(edit.column_family_) !=
+ column_families_not_found_.end();
+ // in builders means that user supplied that column family
+ // option AND that we encountered column family add record
+ bool in_builders = builders_.find(edit.column_family_) != builders_.end();
+ // They cannot both be true
+ assert(!(in_not_found && in_builders));
+ *cf_in_not_found = in_not_found;
+ *cf_in_builders = in_builders;
+}
+
+void VersionEditHandler::CheckIterationResult(const log::Reader& reader,
+ Status* s) {
+ assert(s != nullptr);
+ if (!s->ok()) {
+ // Do nothing here.
+ } else if (!version_edit_params_.has_log_number_ ||
+ !version_edit_params_.has_next_file_number_ ||
+ !version_edit_params_.has_last_sequence_) {
+ std::string msg("no ");
+ if (!version_edit_params_.has_log_number_) {
+ msg.append("log_file_number, ");
+ }
+ if (!version_edit_params_.has_next_file_number_) {
+ msg.append("next_file_number, ");
+ }
+ if (!version_edit_params_.has_last_sequence_) {
+ msg.append("last_sequence, ");
+ }
+ msg = msg.substr(0, msg.size() - 2);
+ msg.append(" entry in MANIFEST");
+ *s = Status::Corruption(msg);
+ }
+ // There were some column families in the MANIFEST that weren't specified
+ // in the argument. This is OK in read_only mode
+ if (s->ok() && MustOpenAllColumnFamilies() &&
+ !column_families_not_found_.empty()) {
+ std::string msg;
+ for (const auto& cf : column_families_not_found_) {
+ msg.append(", ");
+ msg.append(cf.second);
+ }
+ msg = msg.substr(2);
+ *s = Status::InvalidArgument("Column families not opened: " + msg);
+ }
+ if (s->ok()) {
+ version_set_->GetColumnFamilySet()->UpdateMaxColumnFamily(
+ version_edit_params_.max_column_family_);
+ version_set_->MarkMinLogNumberToKeep(
+ version_edit_params_.min_log_number_to_keep_);
+ version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_);
+ version_set_->MarkFileNumberUsed(version_edit_params_.log_number_);
+ for (auto* cfd : *(version_set_->GetColumnFamilySet())) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ auto builder_iter = builders_.find(cfd->GetID());
+ assert(builder_iter != builders_.end());
+ auto* builder = builder_iter->second->version_builder();
+ if (!builder->CheckConsistencyForNumLevels()) {
+ *s = Status::InvalidArgument(
+ "db has more levels than options.num_levels");
+ break;
+ }
+ }
+ }
+ if (s->ok()) {
+ for (auto* cfd : *(version_set_->GetColumnFamilySet())) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ if (read_only_) {
+ cfd->table_cache()->SetTablesAreImmortal();
+ }
+ *s = LoadTables(cfd, /*prefetch_index_and_filter_in_cache=*/false,
+ /*is_initial_load=*/true);
+ if (!s->ok()) {
+ // If s is IOError::PathNotFound, then we mark the db as corrupted.
+ if (s->IsPathNotFound()) {
+ *s = Status::Corruption("Corruption: " + s->ToString());
+ }
+ break;
+ }
+ }
+ }
+ if (s->ok()) {
+ for (auto* cfd : *(version_set_->column_family_set_)) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ assert(cfd->initialized());
+ VersionEdit edit;
+ *s = MaybeCreateVersion(edit, cfd, /*force_create_version=*/true);
+ if (!s->ok()) {
+ break;
+ }
+ }
+ }
+ if (s->ok()) {
+ version_set_->manifest_file_size_ = reader.GetReadOffset();
+ assert(version_set_->manifest_file_size_ > 0);
+ version_set_->next_file_number_.store(
+ version_edit_params_.next_file_number_ + 1);
+ SequenceNumber last_seq = version_edit_params_.last_sequence_;
+ assert(last_seq != kMaxSequenceNumber);
+ if (last_seq != kMaxSequenceNumber &&
+ last_seq > version_set_->last_allocated_sequence_.load()) {
+ version_set_->last_allocated_sequence_.store(last_seq);
+ }
+ if (last_seq != kMaxSequenceNumber &&
+ last_seq > version_set_->last_published_sequence_.load()) {
+ version_set_->last_published_sequence_.store(last_seq);
+ }
+ if (last_seq != kMaxSequenceNumber &&
+ last_seq > version_set_->last_sequence_.load()) {
+ version_set_->last_sequence_.store(last_seq);
+ }
+ if (last_seq != kMaxSequenceNumber &&
+ last_seq > version_set_->descriptor_last_sequence_) {
+ // This is the maximum last sequence of all `VersionEdit`s iterated. It
+ // may be greater than the maximum `largest_seqno` of all files in case
+ // the newest data referred to by the MANIFEST has been dropped or had its
+ // sequence number zeroed through compaction.
+ version_set_->descriptor_last_sequence_ = last_seq;
+ }
+ version_set_->prev_log_number_ = version_edit_params_.prev_log_number_;
+ }
+}
+
+ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
+ const ColumnFamilyOptions& cf_options, const VersionEdit& edit) {
+ ColumnFamilyData* cfd = version_set_->CreateColumnFamily(cf_options, &edit);
+ assert(cfd != nullptr);
+ cfd->set_initialized();
+ assert(builders_.find(edit.column_family_) == builders_.end());
+ builders_.emplace(edit.column_family_,
+ VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd)));
+ if (track_missing_files_) {
+ cf_to_missing_files_.emplace(edit.column_family_,
+ std::unordered_set<uint64_t>());
+ cf_to_missing_blob_files_high_.emplace(edit.column_family_,
+ kInvalidBlobFileNumber);
+ }
+ return cfd;
+}
+
+ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
+ const VersionEdit& edit) {
+ auto builder_iter = builders_.find(edit.column_family_);
+ assert(builder_iter != builders_.end());
+ builders_.erase(builder_iter);
+ if (track_missing_files_) {
+ auto missing_files_iter = cf_to_missing_files_.find(edit.column_family_);
+ assert(missing_files_iter != cf_to_missing_files_.end());
+ cf_to_missing_files_.erase(missing_files_iter);
+
+ auto missing_blob_files_high_iter =
+ cf_to_missing_blob_files_high_.find(edit.column_family_);
+ assert(missing_blob_files_high_iter !=
+ cf_to_missing_blob_files_high_.end());
+ cf_to_missing_blob_files_high_.erase(missing_blob_files_high_iter);
+ }
+ ColumnFamilyData* ret =
+ version_set_->GetColumnFamilySet()->GetColumnFamily(edit.column_family_);
+ assert(ret != nullptr);
+ ret->SetDropped();
+ ret->UnrefAndTryDelete();
+ ret = nullptr;
+ return ret;
+}
+
+Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/,
+ ColumnFamilyData* cfd,
+ bool force_create_version) {
+ assert(cfd->initialized());
+ Status s;
+ if (force_create_version) {
+ auto builder_iter = builders_.find(cfd->GetID());
+ assert(builder_iter != builders_.end());
+ auto* builder = builder_iter->second->version_builder();
+ auto* v = new Version(cfd, version_set_, version_set_->file_options_,
+ *cfd->GetLatestMutableCFOptions(), io_tracer_,
+ version_set_->current_version_number_++);
+ s = builder->SaveTo(v->storage_info());
+ if (s.ok()) {
+ // Install new version
+ v->PrepareAppend(
+ *cfd->GetLatestMutableCFOptions(),
+ !(version_set_->db_options_->skip_stats_update_on_db_open));
+ version_set_->AppendVersion(cfd, v);
+ } else {
+ delete v;
+ }
+ }
+ return s;
+}
+
+Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd,
+ bool prefetch_index_and_filter_in_cache,
+ bool is_initial_load) {
+ bool skip_load_table_files = skip_load_table_files_;
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionEditHandler::LoadTables:skip_load_table_files",
+ &skip_load_table_files);
+ if (skip_load_table_files) {
+ return Status::OK();
+ }
+ assert(cfd != nullptr);
+ assert(!cfd->IsDropped());
+ auto builder_iter = builders_.find(cfd->GetID());
+ assert(builder_iter != builders_.end());
+ assert(builder_iter->second != nullptr);
+ VersionBuilder* builder = builder_iter->second->version_builder();
+ assert(builder);
+ Status s = builder->LoadTableHandlers(
+ cfd->internal_stats(),
+ version_set_->db_options_->max_file_opening_threads,
+ prefetch_index_and_filter_in_cache, is_initial_load,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor,
+ MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
+ if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) {
+ s = Status::OK();
+ }
+ if (!s.ok() && !version_set_->db_options_->paranoid_checks) {
+ s = Status::OK();
+ }
+ return s;
+}
+
+Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
+ const VersionEdit& edit) {
+ Status s;
+ if (edit.has_db_id_) {
+ version_set_->db_id_ = edit.GetDbId();
+ version_edit_params_.SetDBId(edit.db_id_);
+ }
+ if (cfd != nullptr) {
+ if (edit.has_log_number_) {
+ if (cfd->GetLogNumber() > edit.log_number_) {
+ ROCKS_LOG_WARN(
+ version_set_->db_options()->info_log,
+ "MANIFEST corruption detected, but ignored - Log numbers in "
+ "records NOT monotonically increasing");
+ } else {
+ cfd->SetLogNumber(edit.log_number_);
+ version_edit_params_.SetLogNumber(edit.log_number_);
+ }
+ }
+ if (edit.has_comparator_ &&
+ edit.comparator_ != cfd->user_comparator()->Name()) {
+ if (!cf_to_cmp_names_) {
+ s = Status::InvalidArgument(
+ cfd->user_comparator()->Name(),
+ "does not match existing comparator " + edit.comparator_);
+ } else {
+ cf_to_cmp_names_->emplace(cfd->GetID(), edit.comparator_);
+ }
+ }
+ if (edit.HasFullHistoryTsLow()) {
+ const std::string& new_ts = edit.GetFullHistoryTsLow();
+ cfd->SetFullHistoryTsLow(new_ts);
+ }
+ }
+
+ if (s.ok()) {
+ if (edit.has_prev_log_number_) {
+ version_edit_params_.SetPrevLogNumber(edit.prev_log_number_);
+ }
+ if (edit.has_next_file_number_) {
+ version_edit_params_.SetNextFile(edit.next_file_number_);
+ }
+ if (edit.has_max_column_family_) {
+ version_edit_params_.SetMaxColumnFamily(edit.max_column_family_);
+ }
+ if (edit.has_min_log_number_to_keep_) {
+ version_edit_params_.min_log_number_to_keep_ =
+ std::max(version_edit_params_.min_log_number_to_keep_,
+ edit.min_log_number_to_keep_);
+ }
+ if (edit.has_last_sequence_) {
+ // `VersionEdit::last_sequence_`s are assumed to be non-decreasing. This
+ // is legacy behavior that cannot change without breaking downgrade
+ // compatibility.
+ assert(!version_edit_params_.has_last_sequence_ ||
+ version_edit_params_.last_sequence_ <= edit.last_sequence_);
+ version_edit_params_.SetLastSequence(edit.last_sequence_);
+ }
+ if (!version_edit_params_.has_prev_log_number_) {
+ version_edit_params_.SetPrevLogNumber(0);
+ }
+ }
+ return s;
+}
+
+VersionEditHandlerPointInTime::VersionEditHandlerPointInTime(
+ bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer)
+ : VersionEditHandler(read_only, column_families, version_set,
+ /*track_missing_files=*/true,
+ /*no_error_if_files_missing=*/true, io_tracer) {}
+
+VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() {
+ for (const auto& elem : versions_) {
+ delete elem.second;
+ }
+ versions_.clear();
+}
+
+void VersionEditHandlerPointInTime::CheckIterationResult(
+ const log::Reader& reader, Status* s) {
+ VersionEditHandler::CheckIterationResult(reader, s);
+ assert(s != nullptr);
+ if (s->ok()) {
+ for (auto* cfd : *(version_set_->column_family_set_)) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ assert(cfd->initialized());
+ auto v_iter = versions_.find(cfd->GetID());
+ if (v_iter != versions_.end()) {
+ assert(v_iter->second != nullptr);
+
+ version_set_->AppendVersion(cfd, v_iter->second);
+ versions_.erase(v_iter);
+ }
+ }
+ } else {
+ for (const auto& elem : versions_) {
+ delete elem.second;
+ }
+ versions_.clear();
+ }
+}
+
+ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup(
+ const VersionEdit& edit) {
+ ColumnFamilyData* cfd = VersionEditHandler::DestroyCfAndCleanup(edit);
+ auto v_iter = versions_.find(edit.column_family_);
+ if (v_iter != versions_.end()) {
+ delete v_iter->second;
+ versions_.erase(v_iter);
+ }
+ return cfd;
+}
+
+Status VersionEditHandlerPointInTime::MaybeCreateVersion(
+ const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
+ assert(cfd != nullptr);
+ if (!force_create_version) {
+ assert(edit.column_family_ == cfd->GetID());
+ }
+ auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID());
+ assert(missing_files_iter != cf_to_missing_files_.end());
+ std::unordered_set<uint64_t>& missing_files = missing_files_iter->second;
+
+ auto missing_blob_files_high_iter =
+ cf_to_missing_blob_files_high_.find(cfd->GetID());
+ assert(missing_blob_files_high_iter != cf_to_missing_blob_files_high_.end());
+ const uint64_t prev_missing_blob_file_high =
+ missing_blob_files_high_iter->second;
+
+ VersionBuilder* builder = nullptr;
+
+ if (prev_missing_blob_file_high != kInvalidBlobFileNumber) {
+ auto builder_iter = builders_.find(cfd->GetID());
+ assert(builder_iter != builders_.end());
+ builder = builder_iter->second->version_builder();
+ assert(builder != nullptr);
+ }
+
+ // At this point, we have not yet applied the new version edits read from the
+ // MANIFEST. We check whether we have any missing table and blob files.
+ const bool prev_has_missing_files =
+ !missing_files.empty() ||
+ (prev_missing_blob_file_high != kInvalidBlobFileNumber &&
+ prev_missing_blob_file_high >= builder->GetMinOldestBlobFileNumber());
+
+ for (const auto& file : edit.GetDeletedFiles()) {
+ uint64_t file_num = file.second;
+ auto fiter = missing_files.find(file_num);
+ if (fiter != missing_files.end()) {
+ missing_files.erase(fiter);
+ }
+ }
+
+ assert(!cfd->ioptions()->cf_paths.empty());
+ Status s;
+ for (const auto& elem : edit.GetNewFiles()) {
+ int level = elem.first;
+ const FileMetaData& meta = elem.second;
+ const FileDescriptor& fd = meta.fd;
+ uint64_t file_num = fd.GetNumber();
+ const std::string fpath =
+ MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num);
+ s = VerifyFile(cfd, fpath, level, meta);
+ if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
+ missing_files.insert(file_num);
+ s = Status::OK();
+ } else if (!s.ok()) {
+ break;
+ }
+ }
+
+ uint64_t missing_blob_file_num = prev_missing_blob_file_high;
+ for (const auto& elem : edit.GetBlobFileAdditions()) {
+ uint64_t file_num = elem.GetBlobFileNumber();
+ s = VerifyBlobFile(cfd, file_num, elem);
+ if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
+ missing_blob_file_num = std::max(missing_blob_file_num, file_num);
+ s = Status::OK();
+ } else if (!s.ok()) {
+ break;
+ }
+ }
+
+ bool has_missing_blob_files = false;
+ if (missing_blob_file_num != kInvalidBlobFileNumber &&
+ missing_blob_file_num >= prev_missing_blob_file_high) {
+ missing_blob_files_high_iter->second = missing_blob_file_num;
+ has_missing_blob_files = true;
+ } else if (missing_blob_file_num < prev_missing_blob_file_high) {
+ assert(false);
+ }
+
+ // We still have not applied the new version edit, but have tried to add new
+ // table and blob files after verifying their presence and consistency.
+ // Therefore, we know whether we will see new missing table and blob files
+ // later after actually applying the version edit. We perform the check here
+ // and record the result.
+ const bool has_missing_files =
+ !missing_files.empty() || has_missing_blob_files;
+
+ bool missing_info = !version_edit_params_.has_log_number_ ||
+ !version_edit_params_.has_next_file_number_ ||
+ !version_edit_params_.has_last_sequence_;
+
+  // Create the version before applying the edit. The version will represent
+  // the state before the version edit is applied.
+  // A new version will be created if:
+  // 1) no error has occurred so far, and
+  // 2) log_number_, next_file_number_ and last_sequence_ are known, and
+  // 3) any of the following holds:
+  //   a) there was no missing file before, but there will be missing file(s)
+  //      after applying this version edit, or
+  //   b) there is no missing file after applying the version edit, and the
+  //      caller explicitly requests that a new version be created.
+ if (s.ok() && !missing_info &&
+ ((has_missing_files && !prev_has_missing_files) ||
+ (!has_missing_files && force_create_version))) {
+ if (!builder) {
+ auto builder_iter = builders_.find(cfd->GetID());
+ assert(builder_iter != builders_.end());
+ builder = builder_iter->second->version_builder();
+ assert(builder);
+ }
+
+ auto* version = new Version(cfd, version_set_, version_set_->file_options_,
+ *cfd->GetLatestMutableCFOptions(), io_tracer_,
+ version_set_->current_version_number_++);
+ s = builder->LoadTableHandlers(
+ cfd->internal_stats(),
+ version_set_->db_options_->max_file_opening_threads, false, true,
+ cfd->GetLatestMutableCFOptions()->prefix_extractor,
+ MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
+ if (!s.ok()) {
+ delete version;
+ if (s.IsCorruption()) {
+ s = Status::OK();
+ }
+ return s;
+ }
+ s = builder->SaveTo(version->storage_info());
+ if (s.ok()) {
+ version->PrepareAppend(
+ *cfd->GetLatestMutableCFOptions(),
+ !version_set_->db_options_->skip_stats_update_on_db_open);
+ auto v_iter = versions_.find(cfd->GetID());
+ if (v_iter != versions_.end()) {
+ delete v_iter->second;
+ v_iter->second = version;
+ } else {
+ versions_.emplace(cfd->GetID(), version);
+ }
+ } else {
+ delete version;
+ }
+ }
+ return s;
+}
+
+Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd,
+ const std::string& fpath,
+ int level,
+ const FileMetaData& fmeta) {
+ return version_set_->VerifyFileMetadata(cfd, fpath, level, fmeta);
+}
+
+Status VersionEditHandlerPointInTime::VerifyBlobFile(
+ ColumnFamilyData* cfd, uint64_t blob_file_num,
+ const BlobFileAddition& blob_addition) {
+ BlobSource* blob_source = cfd->blob_source();
+ assert(blob_source);
+ CacheHandleGuard<BlobFileReader> blob_file_reader;
+ Status s = blob_source->GetBlobFileReader(blob_file_num, &blob_file_reader);
+ if (!s.ok()) {
+ return s;
+ }
+ // TODO: verify checksum
+ (void)blob_addition;
+ return s;
+}
+
+Status VersionEditHandlerPointInTime::LoadTables(
+ ColumnFamilyData* /*cfd*/, bool /*prefetch_index_and_filter_in_cache*/,
+ bool /*is_initial_load*/) {
+ return Status::OK();
+}
+
+Status ManifestTailer::Initialize() {
+ if (Mode::kRecovery == mode_) {
+ return VersionEditHandler::Initialize();
+ }
+ assert(Mode::kCatchUp == mode_);
+ Status s;
+ if (!initialized_) {
+ ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet();
+ assert(cfd_set);
+ ColumnFamilyData* default_cfd = cfd_set->GetDefault();
+ assert(default_cfd);
+ auto builder_iter = builders_.find(default_cfd->GetID());
+ assert(builder_iter != builders_.end());
+
+ Version* dummy_version = default_cfd->dummy_versions();
+ assert(dummy_version);
+ Version* base_version = dummy_version->Next();
+ assert(base_version);
+ base_version->Ref();
+ VersionBuilderUPtr new_builder(
+ new BaseReferencedVersionBuilder(default_cfd, base_version));
+ builder_iter->second = std::move(new_builder);
+
+ initialized_ = true;
+ }
+ return s;
+}
+
+Status ManifestTailer::ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ Status s = VersionEditHandler::ApplyVersionEdit(edit, cfd);
+ if (s.ok()) {
+ assert(cfd);
+ if (*cfd) {
+ cfds_changed_.insert(*cfd);
+ }
+ }
+ return s;
+}
+
+Status ManifestTailer::OnColumnFamilyAdd(VersionEdit& edit,
+ ColumnFamilyData** cfd) {
+ if (Mode::kRecovery == mode_) {
+ return VersionEditHandler::OnColumnFamilyAdd(edit, cfd);
+ }
+ assert(Mode::kCatchUp == mode_);
+ ColumnFamilySet* cfd_set = version_set_->GetColumnFamilySet();
+ assert(cfd_set);
+ ColumnFamilyData* tmp_cfd = cfd_set->GetColumnFamily(edit.GetColumnFamily());
+ assert(cfd);
+ *cfd = tmp_cfd;
+ if (!tmp_cfd) {
+ // For now, ignore new column families created after Recover() succeeds.
+ return Status::OK();
+ }
+ auto builder_iter = builders_.find(edit.GetColumnFamily());
+ assert(builder_iter != builders_.end());
+
+ Version* dummy_version = tmp_cfd->dummy_versions();
+ assert(dummy_version);
+ Version* base_version = dummy_version->Next();
+ assert(base_version);
+ base_version->Ref();
+ VersionBuilderUPtr new_builder(
+ new BaseReferencedVersionBuilder(tmp_cfd, base_version));
+ builder_iter->second = std::move(new_builder);
+
+#ifndef NDEBUG
+ auto version_iter = versions_.find(edit.GetColumnFamily());
+ assert(version_iter == versions_.end());
+#endif // !NDEBUG
+ return Status::OK();
+}
+
+void ManifestTailer::CheckIterationResult(const log::Reader& reader,
+ Status* s) {
+ VersionEditHandlerPointInTime::CheckIterationResult(reader, s);
+ assert(s);
+ if (s->ok()) {
+ if (Mode::kRecovery == mode_) {
+ mode_ = Mode::kCatchUp;
+ } else {
+ assert(Mode::kCatchUp == mode_);
+ }
+ }
+}
+
+Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd,
+ const std::string& fpath, int level,
+ const FileMetaData& fmeta) {
+ Status s =
+ VersionEditHandlerPointInTime::VerifyFile(cfd, fpath, level, fmeta);
+ // TODO: Open file or create hard link to prevent the file from being
+ // deleted.
+ return s;
+}
+
+void DumpManifestHandler::CheckIterationResult(const log::Reader& reader,
+ Status* s) {
+ VersionEditHandler::CheckIterationResult(reader, s);
+ if (!s->ok()) {
+ fprintf(stdout, "%s\n", s->ToString().c_str());
+ return;
+ }
+ assert(cf_to_cmp_names_);
+ for (auto* cfd : *(version_set_->column_family_set_)) {
+ fprintf(stdout,
+ "--------------- Column family \"%s\" (ID %" PRIu32
+ ") --------------\n",
+ cfd->GetName().c_str(), cfd->GetID());
+ fprintf(stdout, "log number: %" PRIu64 "\n", cfd->GetLogNumber());
+ auto it = cf_to_cmp_names_->find(cfd->GetID());
+ if (it != cf_to_cmp_names_->end()) {
+ fprintf(stdout,
+ "comparator: <%s>, but the comparator object is not available.\n",
+ it->second.c_str());
+ } else {
+ fprintf(stdout, "comparator: %s\n", cfd->user_comparator()->Name());
+ }
+ assert(cfd->current());
+
+ // Print out DebugStrings. Can include non-terminating null characters.
+ fwrite(cfd->current()->DebugString(hex_).data(), sizeof(char),
+ cfd->current()->DebugString(hex_).size(), stdout);
+ }
+ fprintf(stdout,
+ "next_file_number %" PRIu64 " last_sequence %" PRIu64
+ " prev_log_number %" PRIu64 " max_column_family %" PRIu32
+ " min_log_number_to_keep %" PRIu64 "\n",
+ version_set_->current_next_file_number(),
+ version_set_->LastSequence(), version_set_->prev_log_number(),
+ version_set_->column_family_set_->GetMaxColumnFamily(),
+ version_set_->min_log_number_to_keep());
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit_handler.h b/src/rocksdb/db/version_edit_handler.h
new file mode 100644
index 000000000..fd2379b07
--- /dev/null
+++ b/src/rocksdb/db/version_edit_handler.h
@@ -0,0 +1,313 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct FileMetaData;
+
+class VersionEditHandlerBase {
+ public:
+ explicit VersionEditHandlerBase()
+ : max_manifest_read_size_(std::numeric_limits<uint64_t>::max()) {}
+
+ virtual ~VersionEditHandlerBase() {}
+
+ void Iterate(log::Reader& reader, Status* log_read_status);
+
+ const Status& status() const { return status_; }
+
+ AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; }
+
+ protected:
+ explicit VersionEditHandlerBase(uint64_t max_read_size)
+ : max_manifest_read_size_(max_read_size) {}
+ virtual Status Initialize() { return Status::OK(); }
+
+ virtual Status ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** cfd) = 0;
+
+ virtual void CheckIterationResult(const log::Reader& /*reader*/,
+ Status* /*s*/) {}
+
+ void ClearReadBuffer() { read_buffer_.Clear(); }
+
+ Status status_;
+
+ private:
+ AtomicGroupReadBuffer read_buffer_;
+ const uint64_t max_manifest_read_size_;
+};
+
+class ListColumnFamiliesHandler : public VersionEditHandlerBase {
+ public:
+ ListColumnFamiliesHandler() : VersionEditHandlerBase() {}
+
+ ~ListColumnFamiliesHandler() override {}
+
+ const std::map<uint32_t, std::string> GetColumnFamilyNames() const {
+ return column_family_names_;
+ }
+
+ protected:
+ Status ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** /*unused*/) override;
+
+ private:
+ // default column family is always implicitly there
+ std::map<uint32_t, std::string> column_family_names_{
+ {0, kDefaultColumnFamilyName}};
+};
+
+class FileChecksumRetriever : public VersionEditHandlerBase {
+ public:
+ FileChecksumRetriever(uint64_t max_read_size,
+ FileChecksumList& file_checksum_list)
+ : VersionEditHandlerBase(max_read_size),
+ file_checksum_list_(file_checksum_list) {}
+
+ ~FileChecksumRetriever() override {}
+
+ protected:
+ Status ApplyVersionEdit(VersionEdit& edit,
+ ColumnFamilyData** /*unused*/) override;
+
+ private:
+ FileChecksumList& file_checksum_list_;
+};
+
+using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
+
+// A class used for scanning a MANIFEST file.
+// VersionEditHandler reads a MANIFEST file, parses the version edits, and
+// builds the version set's in-memory state, e.g. the version storage info for
+// the versions of column families.
+// To use this class and its subclasses,
+// 1. Create an object of VersionEditHandler or its subclasses.
+// VersionEditHandler handler(read_only, column_families, version_set,
+// track_missing_files,
+// no_error_if_files_missing, io_tracer);
+// 2. handler.Iterate(reader, &log_read_status);
+// 3. Check handler.status() and log_read_status, and handle possible errors.
+//
+// Not thread-safe, external synchronization is necessary if an object of
+// VersionEditHandler is shared by multiple threads.
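+//
+// A fuller sketch of the sequence above (illustrative only: construction of
+// the MANIFEST log::Reader is elided, and local names such as
+// `manifest_reader` are placeholders rather than part of the API):
+//
+// VersionEditHandler handler(read_only, column_families, version_set,
+// track_missing_files,
+// no_error_if_files_missing, io_tracer);
+// Status log_read_status;
+// handler.Iterate(manifest_reader, &log_read_status);
+// Status s = handler.status();
+// if (s.ok() && log_read_status.ok()) {
+// std::string db_id;
+// handler.GetDbId(&db_id); // e.g. retrieve the DB id recorded in the MANIFEST
+// }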
+class VersionEditHandler : public VersionEditHandlerBase {
+ public:
+ explicit VersionEditHandler(
+ bool read_only,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ VersionSet* version_set, bool track_missing_files,
+ bool no_error_if_files_missing,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : VersionEditHandler(read_only, column_families, version_set,
+ track_missing_files, no_error_if_files_missing,
+ io_tracer, /*skip_load_table_files=*/false) {}
+
+ ~VersionEditHandler() override {}
+
+ const VersionEditParams& GetVersionEditParams() const {
+ return version_edit_params_;
+ }
+
+ bool HasMissingFiles() const;
+
+ void GetDbId(std::string* db_id) const {
+ if (db_id && version_edit_params_.has_db_id_) {
+ *db_id = version_edit_params_.db_id_;
+ }
+ }
+
+ protected:
+ explicit VersionEditHandler(
+ bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set, bool track_missing_files,
+ bool no_error_if_files_missing,
+ const std::shared_ptr<IOTracer>& io_tracer, bool skip_load_table_files);
+
+ Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+ virtual Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd);
+
+ Status OnColumnFamilyDrop(VersionEdit& edit, ColumnFamilyData** cfd);
+
+ Status OnNonCfOperation(VersionEdit& edit, ColumnFamilyData** cfd);
+
+ Status OnWalAddition(VersionEdit& edit);
+
+ Status OnWalDeletion(VersionEdit& edit);
+
+ Status Initialize() override;
+
+ void CheckColumnFamilyId(const VersionEdit& edit, bool* cf_in_not_found,
+ bool* cf_in_builders) const;
+
+ void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+ ColumnFamilyData* CreateCfAndInit(const ColumnFamilyOptions& cf_options,
+ const VersionEdit& edit);
+
+ virtual ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit);
+
+ virtual Status MaybeCreateVersion(const VersionEdit& edit,
+ ColumnFamilyData* cfd,
+ bool force_create_version);
+
+ virtual Status LoadTables(ColumnFamilyData* cfd,
+ bool prefetch_index_and_filter_in_cache,
+ bool is_initial_load);
+
+ virtual bool MustOpenAllColumnFamilies() const { return !read_only_; }
+
+ const bool read_only_;
+ std::vector<ColumnFamilyDescriptor> column_families_;
+ VersionSet* version_set_;
+ std::unordered_map<uint32_t, VersionBuilderUPtr> builders_;
+ std::unordered_map<std::string, ColumnFamilyOptions> name_to_options_;
+ // Keeps track of column families in the manifest that were not found in
+ // the column_families parameter. If those column families are not dropped
+ // by subsequent manifest records, Recover() will return a failure status.
+ std::unordered_map<uint32_t, std::string> column_families_not_found_;
+ VersionEditParams version_edit_params_;
+ const bool track_missing_files_;
+ std::unordered_map<uint32_t, std::unordered_set<uint64_t>>
+ cf_to_missing_files_;
+ std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_;
+ bool no_error_if_files_missing_;
+ std::shared_ptr<IOTracer> io_tracer_;
+ bool skip_load_table_files_;
+ bool initialized_;
+ std::unique_ptr<std::unordered_map<uint32_t, std::string>> cf_to_cmp_names_;
+
+ private:
+ Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd,
+ const VersionEdit& edit);
+};
+
+// A class similar to its base class, i.e. VersionEditHandler.
+// VersionEditHandlerPointInTime restores the versions to the most recent point
+// in time such that at this point, the version does not have missing files.
+//
+// Not thread-safe, external synchronization is necessary if an object of
+// VersionEditHandlerPointInTime is shared by multiple threads.
+class VersionEditHandlerPointInTime : public VersionEditHandler {
+ public:
+ VersionEditHandlerPointInTime(
+ bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set, const std::shared_ptr<IOTracer>& io_tracer);
+ ~VersionEditHandlerPointInTime() override;
+
+ protected:
+ void CheckIterationResult(const log::Reader& reader, Status* s) override;
+ ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
+ Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd,
+ bool force_create_version) override;
+ virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
+ int level, const FileMetaData& fmeta);
+ virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
+ const BlobFileAddition& blob_addition);
+
+ Status LoadTables(ColumnFamilyData* cfd,
+ bool prefetch_index_and_filter_in_cache,
+ bool is_initial_load) override;
+
+ std::unordered_map<uint32_t, Version*> versions_;
+};
+
+class ManifestTailer : public VersionEditHandlerPointInTime {
+ public:
+ explicit ManifestTailer(std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : VersionEditHandlerPointInTime(/*read_only=*/false, column_families,
+ version_set, io_tracer),
+ mode_(Mode::kRecovery) {}
+
+ void PrepareToReadNewManifest() {
+ initialized_ = false;
+ ClearReadBuffer();
+ }
+
+ std::unordered_set<ColumnFamilyData*>& GetUpdatedColumnFamilies() {
+ return cfds_changed_;
+ }
+
+ protected:
+ Status Initialize() override;
+
+ bool MustOpenAllColumnFamilies() const override { return false; }
+
+ Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+ Status OnColumnFamilyAdd(VersionEdit& edit, ColumnFamilyData** cfd) override;
+
+ void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+ Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
+ const FileMetaData& fmeta) override;
+
+ enum Mode : uint8_t {
+ kRecovery = 0,
+ kCatchUp = 1,
+ };
+
+ Mode mode_;
+ std::unordered_set<ColumnFamilyData*> cfds_changed_;
+};
+
+class DumpManifestHandler : public VersionEditHandler {
+ public:
+ DumpManifestHandler(std::vector<ColumnFamilyDescriptor> column_families,
+ VersionSet* version_set,
+ const std::shared_ptr<IOTracer>& io_tracer, bool verbose,
+ bool hex, bool json)
+ : VersionEditHandler(
+ /*read_only=*/true, column_families, version_set,
+ /*track_missing_files=*/false,
+ /*no_error_if_files_missing=*/false, io_tracer,
+ /*skip_load_table_files=*/true),
+ verbose_(verbose),
+ hex_(hex),
+ json_(json),
+ count_(0) {
+ cf_to_cmp_names_.reset(new std::unordered_map<uint32_t, std::string>());
+ }
+
+ ~DumpManifestHandler() override {}
+
+ Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override {
+ // Write out each individual edit
+ if (verbose_ && !json_) {
+ // Print out DebugStrings. Can include non-terminating null characters.
+ fwrite(edit.DebugString(hex_).data(), sizeof(char),
+ edit.DebugString(hex_).size(), stdout);
+ } else if (json_) {
+ // Print out DebugJSON. Can include non-terminating null characters.
+ fwrite(edit.DebugJSON(count_, hex_).data(), sizeof(char),
+ edit.DebugJSON(count_, hex_).size(), stdout);
+ }
+ ++count_;
+ return VersionEditHandler::ApplyVersionEdit(edit, cfd);
+ }
+
+ void CheckIterationResult(const log::Reader& reader, Status* s) override;
+
+ private:
+ const bool verbose_;
+ const bool hex_;
+ const bool json_;
+ int count_;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_edit_test.cc b/src/rocksdb/db/version_edit_test.cc
new file mode 100644
index 000000000..c7f271d83
--- /dev/null
+++ b/src/rocksdb/db/version_edit_test.cc
@@ -0,0 +1,730 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_edit.h"
+
+#include "db/blob/blob_index.h"
+#include "rocksdb/advanced_options.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/coding.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static void TestEncodeDecode(const VersionEdit& edit) {
+ std::string encoded, encoded2;
+ edit.EncodeTo(&encoded);
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ parsed.EncodeTo(&encoded2);
+ ASSERT_EQ(encoded, encoded2);
+}
+
+class VersionEditTest : public testing::Test {};
+
+TEST_F(VersionEditTest, EncodeDecode) {
+ static const uint64_t kBig = 1ull << 50;
+ static const uint32_t kBig32Bit = 1ull << 30;
+
+ VersionEdit edit;
+ for (int i = 0; i < 4; i++) {
+ TestEncodeDecode(edit);
+ edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0,
+ InternalKey("foo", kBig + 500 + i, kTypeValue),
+ InternalKey("zoo", kBig + 600 + i, kTypeDeletion),
+ kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown,
+ kInvalidBlobFileNumber, 888, 678, "234", "crc32c",
+ kNullUniqueId64x2);
+ edit.DeleteFile(4, kBig + 700 + i);
+ }
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
+ static const uint64_t kBig = 1ull << 50;
+
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
+ InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
+ kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
+ InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
+ kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+ 666, 888, kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex),
+ InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503,
+ kBig + 603, true, Temperature::kUnknown, 1001,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+
+ edit.DeleteFile(4, 700);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+ TestEncodeDecode(edit);
+
+ std::string encoded, encoded2;
+ edit.EncodeTo(&encoded);
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ auto& new_files = parsed.GetNewFiles();
+ ASSERT_TRUE(new_files[0].second.marked_for_compaction);
+ ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
+ ASSERT_TRUE(new_files[2].second.marked_for_compaction);
+ ASSERT_TRUE(new_files[3].second.marked_for_compaction);
+ ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
+ ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
+ ASSERT_EQ(0u, new_files[2].second.fd.GetPathId());
+ ASSERT_EQ(0u, new_files[3].second.fd.GetPathId());
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[0].second.oldest_blob_file_number);
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[1].second.oldest_blob_file_number);
+ ASSERT_EQ(kInvalidBlobFileNumber,
+ new_files[2].second.oldest_blob_file_number);
+ ASSERT_EQ(1001, new_files[3].second.oldest_blob_file_number);
+}
+
+TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
+ static const uint64_t kBig = 1ull << 50;
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
+ InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
+ kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber,
+ 686, 868, "234", "crc32c", kNullUniqueId64x2);
+ edit.DeleteFile(4, 700);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+
+ std::string encoded;
+
+ // Callback function to add extra customized fields.
+ bool first = true;
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+ std::string* str = reinterpret_cast<std::string*>(arg);
+ PutVarint32(str, 33);
+ const std::string str1 = "random_string";
+ PutLengthPrefixedSlice(str, str1);
+ if (first) {
+ first = false;
+ PutVarint32(str, 22);
+ const std::string str2 = "s";
+ PutLengthPrefixedSlice(str, str2);
+ }
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ edit.EncodeTo(&encoded);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_TRUE(s.ok()) << s.ToString();
+ ASSERT_TRUE(!first);
+ auto& new_files = parsed.GetNewFiles();
+ ASSERT_TRUE(new_files[0].second.marked_for_compaction);
+ ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
+ ASSERT_EQ(3u, new_files[0].second.fd.GetPathId());
+ ASSERT_EQ(3u, new_files[1].second.fd.GetPathId());
+ ASSERT_EQ(1u, parsed.GetDeletedFiles().size());
+}
+
+TEST_F(VersionEditTest, NewFile4NotSupportedField) {
+ static const uint64_t kBig = 1ull << 50;
+ VersionEdit edit;
+ edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
+ InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
+ kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+
+ edit.SetComparatorName("foo");
+ edit.SetLogNumber(kBig + 100);
+ edit.SetNextFile(kBig + 200);
+ edit.SetLastSequence(kBig + 1000);
+
+ std::string encoded;
+
+ // Callback function to add extra customized fields.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
+ std::string* str = reinterpret_cast<std::string*>(arg);
+ const std::string str1 = "s";
+ PutLengthPrefixedSlice(str, str1);
+ });
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+ edit.EncodeTo(&encoded);
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ VersionEdit parsed;
+ Status s = parsed.DecodeFrom(encoded);
+ ASSERT_NOK(s);
+}
+
+TEST_F(VersionEditTest, EncodeEmptyFile) {
+ VersionEdit edit;
+ edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false,
+ Temperature::kUnknown, kInvalidBlobFileNumber,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName,
+ kNullUniqueId64x2);
+ std::string buffer;
+ ASSERT_TRUE(!edit.EncodeTo(&buffer));
+}
+
+TEST_F(VersionEditTest, ColumnFamilyTest) {
+ VersionEdit edit;
+ edit.SetColumnFamily(2);
+ edit.AddColumnFamily("column_family");
+ edit.SetMaxColumnFamily(5);
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetColumnFamily(3);
+ edit.DropColumnFamily();
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, MinLogNumberToKeep) {
+ VersionEdit edit;
+ edit.SetMinLogNumberToKeep(13);
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetMinLogNumberToKeep(23);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, AtomicGroupTest) {
+ VersionEdit edit;
+ edit.MarkAtomicGroup(1);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, IgnorableField) {
+ VersionEdit ve;
+ std::string encoded;
+
+ // Size of ignorable field is too large
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+ // This is a customized ignorable tag
+ PutVarint32Varint64(&encoded,
+ 0x2710 /* A field with kTagSafeIgnoreMask set */,
+ 5 /* fieldlength 5 */);
+ encoded += "abc"; // Only fills 3 bytes,
+ ASSERT_NOK(ve.DecodeFrom(encoded));
+
+ encoded.clear();
+ // Error when seeing unidentified tag that is not ignorable
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+ // This is a customized ignorable tag
+ PutVarint32Varint64(&encoded, 666 /* A field with kTagSafeIgnoreMask unset */,
+ 3 /* fieldlength 3 */);
+ encoded += "abc"; // Fill 3 bytes
+ PutVarint32Varint64(&encoded, 3 /* next file number */, 88);
+ ASSERT_NOK(ve.DecodeFrom(encoded));
+
+ // Safely ignore an identified but safely ignorable entry
+ encoded.clear();
+ PutVarint32Varint64(&encoded, 2 /* kLogNumber */, 66);
+ // This is a customized ignorable tag
+ PutVarint32Varint64(&encoded,
+ 0x2710 /* A field with kTagSafeIgnoreMask set */,
+ 3 /* fieldlength 3 */);
+ encoded += "abc"; // Fill 3 bytes
+ PutVarint32Varint64(&encoded, 3 /* kNextFileNumber */, 88);
+
+ ASSERT_OK(ve.DecodeFrom(encoded));
+
+ ASSERT_TRUE(ve.HasLogNumber());
+ ASSERT_TRUE(ve.HasNextFile());
+ ASSERT_EQ(66, ve.GetLogNumber());
+ ASSERT_EQ(88, ve.GetNextFile());
+}
+
+TEST_F(VersionEditTest, DbId) {
+ VersionEdit edit;
+ edit.SetDBId("ab34-cd12-435f-er00");
+ TestEncodeDecode(edit);
+
+ edit.Clear();
+ edit.SetDBId("34ba-cd12-435f-er01");
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, BlobFileAdditionAndGarbage) {
+ VersionEdit edit;
+
+ const std::string checksum_method_prefix = "Hash";
+ const std::string checksum_value_prefix = "Value";
+
+ for (uint64_t blob_file_number = 1; blob_file_number <= 10;
+ ++blob_file_number) {
+ const uint64_t total_blob_count = blob_file_number << 10;
+ const uint64_t total_blob_bytes = blob_file_number << 20;
+
+ std::string checksum_method(checksum_method_prefix);
+ AppendNumberTo(&checksum_method, blob_file_number);
+
+ std::string checksum_value(checksum_value_prefix);
+ AppendNumberTo(&checksum_value, blob_file_number);
+
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ const uint64_t garbage_blob_count = total_blob_count >> 2;
+ const uint64_t garbage_blob_bytes = total_blob_bytes >> 1;
+
+ edit.AddBlobFileGarbage(blob_file_number, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, AddWalEncodeDecode) {
+ VersionEdit edit;
+ for (uint64_t log_number = 1; log_number <= 20; log_number++) {
+ WalMetadata meta;
+ bool has_size = rand() % 2 == 0;
+ if (has_size) {
+ meta.SetSyncedSizeInBytes(rand() % 1000);
+ }
+ edit.AddWal(log_number, meta);
+ }
+ TestEncodeDecode(edit);
+}
+
+static std::string PrefixEncodedWalAdditionWithLength(
+ const std::string& encoded) {
+ std::string ret;
+ PutVarint32(&ret, Tag::kWalAddition2);
+ PutLengthPrefixedSlice(&ret, encoded);
+ return ret;
+}
+
+TEST_F(VersionEditTest, AddWalDecodeBadLogNumber) {
+ std::string encoded;
+
+ {
+ // No log number.
+ std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") !=
+ std::string::npos)
+ << s.ToString();
+ }
+
+ {
+ // log number should be varint64,
+ // but we only encode 128 which is not a valid representation of varint64.
+ char c = 0;
+ unsigned char* ptr = reinterpret_cast<unsigned char*>(&c);
+ *ptr = 128;
+ encoded.append(1, c);
+
+ std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("Error decoding WAL log number") !=
+ std::string::npos)
+ << s.ToString();
+ }
+}
+
+TEST_F(VersionEditTest, AddWalDecodeBadTag) {
+ constexpr WalNumber kLogNumber = 100;
+ constexpr uint64_t kSizeInBytes = 100;
+
+ std::string encoded;
+ PutVarint64(&encoded, kLogNumber);
+
+ {
+ // No tag.
+ std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+ << s.ToString();
+ }
+
+ {
+ // Only has size tag, no terminate tag.
+ std::string encoded_with_size = encoded;
+ PutVarint32(&encoded_with_size,
+ static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+ PutVarint64(&encoded_with_size, kSizeInBytes);
+
+ std::string encoded_edit =
+ PrefixEncodedWalAdditionWithLength(encoded_with_size);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+ << s.ToString();
+ }
+
+ {
+ // Only has terminate tag.
+ std::string encoded_with_terminate = encoded;
+ PutVarint32(&encoded_with_terminate,
+ static_cast<uint32_t>(WalAdditionTag::kTerminate));
+
+ std::string encoded_edit =
+ PrefixEncodedWalAdditionWithLength(encoded_with_terminate);
+ VersionEdit edit;
+ ASSERT_OK(edit.DecodeFrom(encoded_edit));
+ auto& wal_addition = edit.GetWalAdditions()[0];
+ ASSERT_EQ(wal_addition.GetLogNumber(), kLogNumber);
+ ASSERT_FALSE(wal_addition.GetMetadata().HasSyncedSize());
+ }
+}
+
+TEST_F(VersionEditTest, AddWalDecodeNoSize) {
+ constexpr WalNumber kLogNumber = 100;
+
+ std::string encoded;
+ PutVarint64(&encoded, kLogNumber);
+ PutVarint32(&encoded, static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+ // No real size after the size tag.
+
+ {
+ // Without terminate tag.
+ std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("Error decoding WAL file size") !=
+ std::string::npos)
+ << s.ToString();
+ }
+
+ {
+ // With terminate tag.
+ PutVarint32(&encoded, static_cast<uint32_t>(WalAdditionTag::kTerminate));
+
+ std::string encoded_edit = PrefixEncodedWalAdditionWithLength(encoded);
+ VersionEdit edit;
+ Status s = edit.DecodeFrom(encoded_edit);
+ ASSERT_TRUE(s.IsCorruption());
+ // The terminate tag is misunderstood as the size.
+ ASSERT_TRUE(s.ToString().find("Error decoding tag") != std::string::npos)
+ << s.ToString();
+ }
+}
+
+TEST_F(VersionEditTest, AddWalDebug) {
+ constexpr int n = 2;
+ constexpr std::array<uint64_t, n> kLogNumbers{{10, 20}};
+ constexpr std::array<uint64_t, n> kSizeInBytes{{100, 200}};
+
+ VersionEdit edit;
+ for (int i = 0; i < n; i++) {
+ edit.AddWal(kLogNumbers[i], WalMetadata(kSizeInBytes[i]));
+ }
+
+ const WalAdditions& wals = edit.GetWalAdditions();
+
+ ASSERT_TRUE(edit.IsWalAddition());
+ ASSERT_EQ(wals.size(), n);
+ for (int i = 0; i < n; i++) {
+ const WalAddition& wal = wals[i];
+ ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[i]);
+ ASSERT_EQ(wal.GetMetadata().GetSyncedSizeInBytes(), kSizeInBytes[i]);
+ }
+
+ std::string expected_str = "VersionEdit {\n";
+ for (int i = 0; i < n; i++) {
+ std::stringstream ss;
+ ss << " WalAddition: log_number: " << kLogNumbers[i]
+ << " synced_size_in_bytes: " << kSizeInBytes[i] << "\n";
+ expected_str += ss.str();
+ }
+ expected_str += " ColumnFamily: 0\n}\n";
+ ASSERT_EQ(edit.DebugString(true), expected_str);
+
+ std::string expected_json = "{\"EditNumber\": 4, \"WalAdditions\": [";
+ for (int i = 0; i < n; i++) {
+ std::stringstream ss;
+ ss << "{\"LogNumber\": " << kLogNumbers[i] << ", "
+ << "\"SyncedSizeInBytes\": " << kSizeInBytes[i] << "}";
+ if (i < n - 1) ss << ", ";
+ expected_json += ss.str();
+ }
+ expected_json += "], \"ColumnFamily\": 0}";
+ ASSERT_EQ(edit.DebugJSON(4, true), expected_json);
+}
+
+TEST_F(VersionEditTest, DeleteWalEncodeDecode) {
+ VersionEdit edit;
+ edit.DeleteWalsBefore(rand() % 100);
+ TestEncodeDecode(edit);
+}
+
+TEST_F(VersionEditTest, DeleteWalDebug) {
+ constexpr int n = 2;
+ constexpr std::array<uint64_t, n> kLogNumbers{{10, 20}};
+
+ VersionEdit edit;
+ edit.DeleteWalsBefore(kLogNumbers[n - 1]);
+
+ const WalDeletion& wal = edit.GetWalDeletion();
+
+ ASSERT_TRUE(edit.IsWalDeletion());
+ ASSERT_EQ(wal.GetLogNumber(), kLogNumbers[n - 1]);
+
+ std::string expected_str = "VersionEdit {\n";
+ {
+ std::stringstream ss;
+ ss << " WalDeletion: log_number: " << kLogNumbers[n - 1] << "\n";
+ expected_str += ss.str();
+ }
+ expected_str += " ColumnFamily: 0\n}\n";
+ ASSERT_EQ(edit.DebugString(true), expected_str);
+
+ std::string expected_json = "{\"EditNumber\": 4, \"WalDeletion\": ";
+ {
+ std::stringstream ss;
+ ss << "{\"LogNumber\": " << kLogNumbers[n - 1] << "}";
+ expected_json += ss.str();
+ }
+ expected_json += ", \"ColumnFamily\": 0}";
+ ASSERT_EQ(edit.DebugJSON(4, true), expected_json);
+}
+
+TEST_F(VersionEditTest, FullHistoryTsLow) {
+ VersionEdit edit;
+ ASSERT_FALSE(edit.HasFullHistoryTsLow());
+ std::string ts = test::EncodeInt(0);
+ edit.SetFullHistoryTsLow(ts);
+ TestEncodeDecode(edit);
+}
+
+// Tests that if RocksDB is downgraded, the new types of VersionEdits
+// that have a tag larger than kTagSafeIgnoreMask can be safely ignored.
+TEST_F(VersionEditTest, IgnorableTags) {
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionEdit::EncodeTo:IgnoreIgnorableTags", [&](void* arg) {
+ bool* ignore = static_cast<bool*>(arg);
+ *ignore = true;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ constexpr uint64_t kPrevLogNumber = 100;
+ constexpr uint64_t kLogNumber = 200;
+ constexpr uint64_t kNextFileNumber = 300;
+ constexpr uint64_t kColumnFamilyId = 400;
+
+ VersionEdit edit;
+ // Add some ignorable entries.
+ for (int i = 0; i < 2; i++) {
+ edit.AddWal(i + 1, WalMetadata(i + 2));
+ }
+ edit.SetDBId("db_id");
+ // Add unignorable entries.
+ edit.SetPrevLogNumber(kPrevLogNumber);
+ edit.SetLogNumber(kLogNumber);
+ // Add more ignorable entries.
+ edit.DeleteWalsBefore(100);
+ // Add unignorable entry.
+ edit.SetNextFile(kNextFileNumber);
+ // Add more ignorable entries.
+ edit.SetFullHistoryTsLow("ts");
+ // Add unignorable entry.
+ edit.SetColumnFamily(kColumnFamilyId);
+
+ std::string encoded;
+ ASSERT_TRUE(edit.EncodeTo(&encoded));
+
+ VersionEdit decoded;
+ ASSERT_OK(decoded.DecodeFrom(encoded));
+
+ // Check that all ignorable entries are ignored.
+ ASSERT_FALSE(decoded.HasDbId());
+ ASSERT_FALSE(decoded.HasFullHistoryTsLow());
+ ASSERT_FALSE(decoded.IsWalAddition());
+ ASSERT_FALSE(decoded.IsWalDeletion());
+ ASSERT_TRUE(decoded.GetWalAdditions().empty());
+ ASSERT_TRUE(decoded.GetWalDeletion().IsEmpty());
+
+ // Check that unignorable entries are still present after decoding.
+ ASSERT_EQ(decoded.GetPrevLogNumber(), kPrevLogNumber);
+ ASSERT_EQ(decoded.GetLogNumber(), kLogNumber);
+ ASSERT_EQ(decoded.GetNextFile(), kNextFileNumber);
+ ASSERT_EQ(decoded.GetColumnFamily(), kColumnFamilyId);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+}
+
+TEST(FileMetaDataTest, UpdateBoundariesBlobIndex) {
+ FileMetaData meta;
+
+ {
+ constexpr uint64_t file_number = 10;
+ constexpr uint32_t path_id = 0;
+ constexpr uint64_t file_size = 0;
+
+ meta.fd = FileDescriptor(file_number, path_id, file_size);
+ }
+
+ constexpr char key[] = "foo";
+
+ constexpr uint64_t expected_oldest_blob_file_number = 20;
+
+ // Plain old value (does not affect oldest_blob_file_number)
+ {
+ constexpr char value[] = "value";
+ constexpr SequenceNumber seq = 200;
+
+ ASSERT_OK(meta.UpdateBoundaries(key, value, seq, kTypeValue));
+ ASSERT_EQ(meta.oldest_blob_file_number, kInvalidBlobFileNumber);
+ }
+
+ // Non-inlined, non-TTL blob index (sets oldest_blob_file_number)
+ {
+ constexpr uint64_t blob_file_number = 25;
+ static_assert(blob_file_number > expected_oldest_blob_file_number,
+ "unexpected");
+
+ constexpr uint64_t offset = 1000;
+ constexpr uint64_t size = 100;
+
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, blob_file_number, offset, size,
+ kNoCompression);
+
+ constexpr SequenceNumber seq = 201;
+
+ ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+ ASSERT_EQ(meta.oldest_blob_file_number, blob_file_number);
+ }
+
+ // Another one, with the oldest blob file number (updates
+ // oldest_blob_file_number)
+ {
+ constexpr uint64_t offset = 2000;
+ constexpr uint64_t size = 300;
+
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, expected_oldest_blob_file_number, offset,
+ size, kNoCompression);
+
+ constexpr SequenceNumber seq = 202;
+
+ ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+ ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+ }
+
+ // Inlined TTL blob index (does not affect oldest_blob_file_number)
+ {
+ constexpr uint64_t expiration = 9876543210;
+ constexpr char value[] = "value";
+
+ std::string blob_index;
+ BlobIndex::EncodeInlinedTTL(&blob_index, expiration, value);
+
+ constexpr SequenceNumber seq = 203;
+
+ ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+ ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+ }
+
+ // Non-inlined TTL blob index (does not affect oldest_blob_file_number, even
+ // though file number is smaller)
+ {
+ constexpr uint64_t expiration = 9876543210;
+ constexpr uint64_t blob_file_number = 15;
+ static_assert(blob_file_number < expected_oldest_blob_file_number,
+ "unexpected");
+
+ constexpr uint64_t offset = 2000;
+ constexpr uint64_t size = 500;
+
+ std::string blob_index;
+ BlobIndex::EncodeBlobTTL(&blob_index, expiration, blob_file_number, offset,
+ size, kNoCompression);
+
+ constexpr SequenceNumber seq = 204;
+
+ ASSERT_OK(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex));
+ ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+ }
+
+ // Corrupt blob index
+ {
+ constexpr char corrupt_blob_index[] = "!corrupt!";
+ constexpr SequenceNumber seq = 205;
+
+ ASSERT_TRUE(
+ meta.UpdateBoundaries(key, corrupt_blob_index, seq, kTypeBlobIndex)
+ .IsCorruption());
+ ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+ }
+
+ // Invalid blob file number
+ {
+ constexpr uint64_t offset = 10000;
+ constexpr uint64_t size = 1000;
+
+ std::string blob_index;
+ BlobIndex::EncodeBlob(&blob_index, kInvalidBlobFileNumber, offset, size,
+ kNoCompression);
+
+ constexpr SequenceNumber seq = 206;
+
+ ASSERT_TRUE(meta.UpdateBoundaries(key, blob_index, seq, kTypeBlobIndex)
+ .IsCorruption());
+ ASSERT_EQ(meta.oldest_blob_file_number, expected_oldest_blob_file_number);
+ }
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/version_set.cc b/src/rocksdb/db/version_set.cc
new file mode 100644
index 000000000..427af6e25
--- /dev/null
+++ b/src/rocksdb/db/version_set.cc
@@ -0,0 +1,6903 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+#include <array>
+#include <cinttypes>
+#include <cstdio>
+#include <list>
+#include <map>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "db/blob/blob_fetcher.h"
+#include "db/blob/blob_file_cache.h"
+#include "db/blob/blob_file_reader.h"
+#include "db/blob/blob_index.h"
+#include "db/blob/blob_log_format.h"
+#include "db/blob/blob_source.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/file_pri.h"
+#include "db/dbformat.h"
+#include "db/internal_stats.h"
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/merge_helper.h"
+#include "db/pinned_iterators_manager.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "db/version_edit_handler.h"
+#if USE_COROUTINES
+#include "folly/experimental/coro/BlockingWait.h"
+#include "folly/experimental/coro/Collect.h"
+#endif
+#include "file/filename.h"
+#include "file/random_access_file_reader.h"
+#include "file/read_write_util.h"
+#include "file/writable_file_writer.h"
+#include "logging/logging.h"
+#include "monitoring/file_read_sample.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/persistent_stats_history.h"
+#include "options/options_helper.h"
+#include "rocksdb/env.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/format.h"
+#include "table/get_context.h"
+#include "table/internal_iterator.h"
+#include "table/merging_iterator.h"
+#include "table/meta_blocks.h"
+#include "table/multiget_context.h"
+#include "table/plain/plain_table_factory.h"
+#include "table/table_reader.h"
+#include "table/two_level_iterator.h"
+#include "table/unique_id_impl.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/coro_utils.h"
+#include "util/stop_watch.h"
+#include "util/string_util.h"
+#include "util/user_comparator_wrapper.h"
+
+// Generate the regular and coroutine versions of some methods by
+// including version_set_sync_and_async.h twice
+// Macros in the header will expand differently based on whether
+// WITH_COROUTINES or WITHOUT_COROUTINES is defined
+// clang-format off
+#define WITHOUT_COROUTINES
+#include "db/version_set_sync_and_async.h"
+#undef WITHOUT_COROUTINES
+#define WITH_COROUTINES
+#include "db/version_set_sync_and_async.h"
+#undef WITH_COROUTINES
+// clang-format on
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+
+// Find File in LevelFilesBrief data structure
+// Within an index range defined by left and right
+int FindFileInRange(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level, const Slice& key,
+ uint32_t left, uint32_t right) {
+ auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
+ return icmp.InternalKeyComparator::Compare(f.largest_key, k) < 0;
+ };
+ const auto& b = file_level.files;
+ return static_cast<int>(std::lower_bound(b + left, b + right, key, cmp) - b);
+}
+
+Status OverlapWithIterator(const Comparator* ucmp,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ InternalIterator* iter, bool* overlap) {
+ InternalKey range_start(smallest_user_key, kMaxSequenceNumber,
+ kValueTypeForSeek);
+ iter->Seek(range_start.Encode());
+ if (!iter->status().ok()) {
+ return iter->status();
+ }
+
+ *overlap = false;
+ if (iter->Valid()) {
+ ParsedInternalKey seek_result;
+ Status s = ParseInternalKey(iter->key(), &seek_result,
+ false /* log_err_key */); // TODO
+ if (!s.ok()) return s;
+
+ if (ucmp->CompareWithoutTimestamp(seek_result.user_key, largest_user_key) <=
+ 0) {
+ *overlap = true;
+ }
+ }
+
+ return iter->status();
+}
+
+// Class to help choose the next file to search for the particular key.
+// Searches and returns files level by level.
+// We can search level-by-level since entries never hop across
+// levels. Therefore we are guaranteed that if we find data
+// in a smaller level, later levels are irrelevant (unless we
+// are MergeInProgress).
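+//
+// Illustrative call pattern (a sketch only; the surrounding key-lookup
+// plumbing is omitted and `fp` / `f` are placeholder names):
+//
+// FilePicker fp(user_key, ikey, &file_levels, num_levels, &file_indexer,
+// user_comparator, internal_comparator);
+// for (FdWithKeyRange* f = fp.GetNextFile(); f != nullptr;
+// f = fp.GetNextFile()) {
+// // Probe the table referenced by f->fd; stop as soon as the lookup of
+// // user_key is resolved, otherwise continue with the next candidate file.
+// }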
+class FilePicker {
+ public:
+ FilePicker(const Slice& user_key, const Slice& ikey,
+ autovector<LevelFilesBrief>* file_levels, unsigned int num_levels,
+ FileIndexer* file_indexer, const Comparator* user_comparator,
+ const InternalKeyComparator* internal_comparator)
+ : num_levels_(num_levels),
+ curr_level_(static_cast<unsigned int>(-1)),
+ returned_file_level_(static_cast<unsigned int>(-1)),
+ hit_file_level_(static_cast<unsigned int>(-1)),
+ search_left_bound_(0),
+ search_right_bound_(FileIndexer::kLevelMaxIndex),
+ level_files_brief_(file_levels),
+ is_hit_file_last_in_level_(false),
+ curr_file_level_(nullptr),
+ user_key_(user_key),
+ ikey_(ikey),
+ file_indexer_(file_indexer),
+ user_comparator_(user_comparator),
+ internal_comparator_(internal_comparator) {
+ // Setup member variables to search first level.
+ search_ended_ = !PrepareNextLevel();
+ if (!search_ended_) {
+ // Prefetch Level 0 table data to avoid cache miss if possible.
+ for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+ auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+ if (r) {
+ r->Prepare(ikey);
+ }
+ }
+ }
+ }
+
+ int GetCurrentLevel() const { return curr_level_; }
+
+ FdWithKeyRange* GetNextFile() {
+ while (!search_ended_) { // Loops over different levels.
+ while (curr_index_in_curr_level_ < curr_file_level_->num_files) {
+ // Loops over all files in current level.
+ FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_];
+ hit_file_level_ = curr_level_;
+ is_hit_file_last_in_level_ =
+ curr_index_in_curr_level_ == curr_file_level_->num_files - 1;
+ int cmp_largest = -1;
+
+ // Do key range filtering of files and/or fractional cascading if:
+ // (1) not all the files are in level 0, or
+ // (2) there are more than 3 files in the current level
+ // If there are only 3 or fewer files in the current level, we skip the
+ // key range filtering. In that case, the system is more likely highly
+ // tuned to minimize the number of tables queried by each query, so it
+ // is unlikely that key range filtering is more efficient than simply
+ // querying the files.
+ if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+ // Check if the key is within a file's range. If the search left bound
+ // and right bound point to the same file, we are sure the key falls
+ // within its range.
+ assert(curr_level_ == 0 ||
+ curr_index_in_curr_level_ == start_index_in_curr_level_ ||
+ user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->smallest_key)) <= 0);
+
+ int cmp_smallest = user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->smallest_key));
+ if (cmp_smallest >= 0) {
+ cmp_largest = user_comparator_->CompareWithoutTimestamp(
+ user_key_, ExtractUserKey(f->largest_key));
+ }
+
+ // Setup file search bound for the next level based on the
+ // comparison results
+ if (curr_level_ > 0) {
+ file_indexer_->GetNextLevelIndex(
+ curr_level_, curr_index_in_curr_level_, cmp_smallest,
+ cmp_largest, &search_left_bound_, &search_right_bound_);
+ }
+ // Key falls out of current file's range
+ if (cmp_smallest < 0 || cmp_largest > 0) {
+ if (curr_level_ == 0) {
+ ++curr_index_in_curr_level_;
+ continue;
+ } else {
+ // Search next level.
+ break;
+ }
+ }
+ }
+
+ returned_file_level_ = curr_level_;
+ if (curr_level_ > 0 && cmp_largest < 0) {
+ // No more files to search in this level.
+ search_ended_ = !PrepareNextLevel();
+ } else {
+ ++curr_index_in_curr_level_;
+ }
+ return f;
+ }
+ // Start searching next level.
+ search_ended_ = !PrepareNextLevel();
+ }
+ // Search ended.
+ return nullptr;
+ }
+
+ // getter for current file level
+ // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts
+ unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+ // Returns true if the most recent "hit file" (i.e., one returned by
+ // GetNextFile()) is at the last index in its level.
+ bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+ private:
+ unsigned int num_levels_;
+ unsigned int curr_level_;
+ unsigned int returned_file_level_;
+ unsigned int hit_file_level_;
+ int32_t search_left_bound_;
+ int32_t search_right_bound_;
+ autovector<LevelFilesBrief>* level_files_brief_;
+ bool search_ended_;
+ bool is_hit_file_last_in_level_;
+ LevelFilesBrief* curr_file_level_;
+ unsigned int curr_index_in_curr_level_;
+ unsigned int start_index_in_curr_level_;
+ Slice user_key_;
+ Slice ikey_;
+ FileIndexer* file_indexer_;
+ const Comparator* user_comparator_;
+ const InternalKeyComparator* internal_comparator_;
+
+ // Setup local variables to search next level.
+ // Returns false if there are no more levels to search.
+ bool PrepareNextLevel() {
+ curr_level_++;
+ while (curr_level_ < num_levels_) {
+ curr_file_level_ = &(*level_files_brief_)[curr_level_];
+ if (curr_file_level_->num_files == 0) {
+ // When current level is empty, the search bound generated from upper
+ // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+ // also empty.
+ assert(search_left_bound_ == 0);
+ assert(search_right_bound_ == -1 ||
+ search_right_bound_ == FileIndexer::kLevelMaxIndex);
+ // Since current level is empty, it will need to search all files in
+ // the next level
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+
+ // Some files may overlap each other. We find
+ // all files that overlap user_key and process them in order from
+ // newest to oldest. In the context of merge-operator, this can occur at
+ // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+ // are always compacted into a single entry).
+ int32_t start_index;
+ if (curr_level_ == 0) {
+ // On Level-0, we read through all files to check for overlap.
+ start_index = 0;
+ } else {
+ // On Level-n (n>=1), files are sorted. Binary search to find the
+ // earliest file whose largest key >= ikey. Search left bound and
+ // right bound are used to narrow the range.
+ if (search_left_bound_ <= search_right_bound_) {
+ if (search_right_bound_ == FileIndexer::kLevelMaxIndex) {
+ search_right_bound_ =
+ static_cast<int32_t>(curr_file_level_->num_files) - 1;
+ }
+ // `search_right_bound_` is an inclusive upper-bound, but since it was
+ // determined based on user key, it is still possible the lookup key
+ // falls to the right of `search_right_bound_`'s corresponding file.
+ // So, pass a limit one higher, which allows us to detect this case.
+ start_index =
+ FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_,
+ static_cast<uint32_t>(search_left_bound_),
+ static_cast<uint32_t>(search_right_bound_) + 1);
+ if (start_index == search_right_bound_ + 1) {
+ // `ikey_` comes after `search_right_bound_`. The lookup key does
+ // not exist on this level, so let's skip this level and do a full
+ // binary search on the next level.
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+ } else {
+ // search_left_bound > search_right_bound, key does not exist in
+ // this level. Since no comparison is done in this level, it will
+ // need to search all files in the next level.
+ search_left_bound_ = 0;
+ search_right_bound_ = FileIndexer::kLevelMaxIndex;
+ curr_level_++;
+ continue;
+ }
+ }
+ start_index_in_curr_level_ = start_index;
+ curr_index_in_curr_level_ = start_index;
+
+ return true;
+ }
+ // curr_level_ = num_levels_. So, no more levels to search.
+ return false;
+ }
+};
+} // anonymous namespace
+
+class FilePickerMultiGet {
+ private:
+ struct FilePickerContext;
+
+ public:
+ FilePickerMultiGet(MultiGetRange* range,
+ autovector<LevelFilesBrief>* file_levels,
+ unsigned int num_levels, FileIndexer* file_indexer,
+ const Comparator* user_comparator,
+ const InternalKeyComparator* internal_comparator)
+ : num_levels_(num_levels),
+ curr_level_(static_cast<unsigned int>(-1)),
+ returned_file_level_(static_cast<unsigned int>(-1)),
+ hit_file_level_(static_cast<unsigned int>(-1)),
+ range_(*range, range->begin(), range->end()),
+ maybe_repeat_key_(false),
+ current_level_range_(*range, range->begin(), range->end()),
+ current_file_range_(*range, range->begin(), range->end()),
+ batch_iter_(range->begin()),
+ batch_iter_prev_(range->begin()),
+ upper_key_(range->begin()),
+ level_files_brief_(file_levels),
+ is_hit_file_last_in_level_(false),
+ curr_file_level_(nullptr),
+ file_indexer_(file_indexer),
+ user_comparator_(user_comparator),
+ internal_comparator_(internal_comparator),
+ hit_file_(nullptr) {
+ for (auto iter = range_.begin(); iter != range_.end(); ++iter) {
+ fp_ctx_array_[iter.index()] =
+ FilePickerContext(0, FileIndexer::kLevelMaxIndex);
+ }
+
+ // Setup member variables to search first level.
+ search_ended_ = !PrepareNextLevel();
+ if (!search_ended_) {
+ // REVISIT
+ // Prefetch Level 0 table data to avoid cache miss if possible.
+ // As of now, only PlainTableReader and CuckooTableReader do any
+ // prefetching. This may not be necessary anymore once we implement
+ // batching in those table readers
+ for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) {
+ auto* r = (*level_files_brief_)[0].files[i].fd.table_reader;
+ if (r) {
+ for (auto iter = range_.begin(); iter != range_.end(); ++iter) {
+ r->Prepare(iter->ikey);
+ }
+ }
+ }
+ }
+ }
+
+ FilePickerMultiGet(MultiGetRange* range, const FilePickerMultiGet& other)
+ : num_levels_(other.num_levels_),
+ curr_level_(other.curr_level_),
+ returned_file_level_(other.returned_file_level_),
+ hit_file_level_(other.hit_file_level_),
+ fp_ctx_array_(other.fp_ctx_array_),
+ range_(*range, range->begin(), range->end()),
+ maybe_repeat_key_(false),
+ current_level_range_(*range, range->begin(), range->end()),
+ current_file_range_(*range, range->begin(), range->end()),
+ batch_iter_(range->begin()),
+ batch_iter_prev_(range->begin()),
+ upper_key_(range->begin()),
+ level_files_brief_(other.level_files_brief_),
+ is_hit_file_last_in_level_(false),
+ curr_file_level_(other.curr_file_level_),
+ file_indexer_(other.file_indexer_),
+ user_comparator_(other.user_comparator_),
+ internal_comparator_(other.internal_comparator_),
+ hit_file_(nullptr) {
+ PrepareNextLevelForSearch();
+ }
+
+ int GetCurrentLevel() const { return curr_level_; }
+
+ void PrepareNextLevelForSearch() { search_ended_ = !PrepareNextLevel(); }
+
+ FdWithKeyRange* GetNextFileInLevel() {
+ if (batch_iter_ == current_level_range_.end() || search_ended_) {
+ hit_file_ = nullptr;
+ return nullptr;
+ } else {
+ if (maybe_repeat_key_) {
+ maybe_repeat_key_ = false;
+ // Check if we found the final value for the last key in the
+ // previous lookup range. If we did, then there's no need to look
+ // any further for that key, so advance batch_iter_. Else, keep
+ // batch_iter_ positioned on that key so we look it up again in
+ // the next file.
+ // For L0, always advance the key, because we will look in the next
+ // file regardless for every key that has not been found yet.
+ if (current_level_range_.CheckKeyDone(batch_iter_) ||
+ curr_level_ == 0) {
+ batch_iter_ = upper_key_;
+ }
+ }
+ // batch_iter_prev_ will become the start key for the next file
+ // lookup
+ batch_iter_prev_ = batch_iter_;
+ }
+
+ MultiGetRange next_file_range(current_level_range_, batch_iter_prev_,
+ current_level_range_.end());
+ size_t curr_file_index =
+ (batch_iter_ != current_level_range_.end())
+ ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
+ : curr_file_level_->num_files;
+ FdWithKeyRange* f;
+ bool is_last_key_in_file;
+ if (!GetNextFileInLevelWithKeys(&next_file_range, &curr_file_index, &f,
+ &is_last_key_in_file)) {
+ hit_file_ = nullptr;
+ return nullptr;
+ } else {
+ if (is_last_key_in_file) {
+ // Since cmp_largest is 0, batch_iter_ still points to the last key
+ // that falls in this file, instead of the next one. Increment
+ // the file index for all keys between batch_iter_ and upper_key_
+ auto tmp_iter = batch_iter_;
+ while (tmp_iter != upper_key_) {
+ ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level);
+ ++tmp_iter;
+ }
+ maybe_repeat_key_ = true;
+ }
+ // Set the range for this file
+ current_file_range_ =
+ MultiGetRange(next_file_range, batch_iter_prev_, upper_key_);
+ returned_file_level_ = curr_level_;
+ hit_file_level_ = curr_level_;
+ is_hit_file_last_in_level_ =
+ curr_file_index == curr_file_level_->num_files - 1;
+ hit_file_ = f;
+ return f;
+ }
+ }
+
+  // Returns the level of the file most recently returned by
+  // GetNextFileInLevel(), used for the GET_HIT_L0, GET_HIT_L1 &
+  // GET_HIT_L2_AND_UP counters
+ unsigned int GetHitFileLevel() { return hit_file_level_; }
+
+ FdWithKeyRange* GetHitFile() { return hit_file_; }
+
+  // Returns true if the most recent "hit file" (i.e., one returned by
+  // GetNextFileInLevel()) is at the last index in its level.
+ bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; }
+
+ bool KeyMaySpanNextFile() { return maybe_repeat_key_; }
+
+ bool IsSearchEnded() { return search_ended_; }
+
+ const MultiGetRange& CurrentFileRange() { return current_file_range_; }
+
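+  // Returns true if the current level range still contains keys beyond the
+  // range covered by the most recently returned file, i.e. more files in
+  // this level may need to be examined for this batch.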
+ bool RemainingOverlapInLevel() {
+ return !current_level_range_.Suffix(current_file_range_).empty();
+ }
+
+ MultiGetRange& GetRange() { return range_; }
+
+ void ReplaceRange(const MultiGetRange& other) {
+ assert(hit_file_ == nullptr);
+ range_ = other;
+ current_level_range_ = other;
+ }
+
+ FilePickerMultiGet(FilePickerMultiGet&& other)
+ : num_levels_(other.num_levels_),
+ curr_level_(other.curr_level_),
+ returned_file_level_(other.returned_file_level_),
+ hit_file_level_(other.hit_file_level_),
+ fp_ctx_array_(std::move(other.fp_ctx_array_)),
+ range_(std::move(other.range_)),
+ maybe_repeat_key_(other.maybe_repeat_key_),
+ current_level_range_(std::move(other.current_level_range_)),
+ current_file_range_(std::move(other.current_file_range_)),
+ batch_iter_(other.batch_iter_, &current_level_range_),
+ batch_iter_prev_(other.batch_iter_prev_, &current_level_range_),
+ upper_key_(other.upper_key_, &current_level_range_),
+ level_files_brief_(other.level_files_brief_),
+ search_ended_(other.search_ended_),
+ is_hit_file_last_in_level_(other.is_hit_file_last_in_level_),
+ curr_file_level_(other.curr_file_level_),
+ file_indexer_(other.file_indexer_),
+ user_comparator_(other.user_comparator_),
+ internal_comparator_(other.internal_comparator_),
+ hit_file_(other.hit_file_) {}
+
+ private:
+ unsigned int num_levels_;
+ unsigned int curr_level_;
+ unsigned int returned_file_level_;
+ unsigned int hit_file_level_;
+
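+  // Per-key search state: the binary search bounds for the current level
+  // (maintained via FileIndexer's fractional cascading) and the index of the
+  // file currently being probed for this key in the current level.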
+ struct FilePickerContext {
+ int32_t search_left_bound;
+ int32_t search_right_bound;
+ unsigned int curr_index_in_curr_level;
+ unsigned int start_index_in_curr_level;
+
+ FilePickerContext(int32_t left, int32_t right)
+ : search_left_bound(left),
+ search_right_bound(right),
+ curr_index_in_curr_level(0),
+ start_index_in_curr_level(0) {}
+
+ FilePickerContext() = default;
+ };
+ std::array<FilePickerContext, MultiGetContext::MAX_BATCH_SIZE> fp_ctx_array_;
+ MultiGetRange range_;
+ bool maybe_repeat_key_;
+ MultiGetRange current_level_range_;
+ MultiGetRange current_file_range_;
+  // Iterator over the keys in a MultiGet batch; it is reset at the beginning
+  // of each level. Each call to GetNextFileInLevel() positions batch_iter_ at
+  // or right after the last key that was found in the returned SST file
+ MultiGetRange::Iterator batch_iter_;
+  // An iterator that records the previous position of batch_iter_, i.e., the
+  // last key found in the previous SST file, in order to serve as the start
+  // of the batch key range for the next SST file
+ MultiGetRange::Iterator batch_iter_prev_;
+ MultiGetRange::Iterator upper_key_;
+ autovector<LevelFilesBrief>* level_files_brief_;
+ bool search_ended_;
+ bool is_hit_file_last_in_level_;
+ LevelFilesBrief* curr_file_level_;
+ FileIndexer* file_indexer_;
+ const Comparator* user_comparator_;
+ const InternalKeyComparator* internal_comparator_;
+ FdWithKeyRange* hit_file_;
+
+ // Iterates through files in the current level until it finds a file that
+ // contains at least one key from the MultiGet batch
+ bool GetNextFileInLevelWithKeys(MultiGetRange* next_file_range,
+ size_t* file_index, FdWithKeyRange** fd,
+ bool* is_last_key_in_file) {
+ size_t curr_file_index = *file_index;
+ FdWithKeyRange* f = nullptr;
+ bool file_hit = false;
+ int cmp_largest = -1;
+ if (curr_file_index >= curr_file_level_->num_files) {
+ // In the unlikely case the next key is a duplicate of the current key,
+ // and the current key is the last in the level and the internal key
+ // was not found, we need to skip lookup for the remaining keys and
+ // reset the search bounds
+ if (batch_iter_ != current_level_range_.end()) {
+ ++batch_iter_;
+ for (; batch_iter_ != current_level_range_.end(); ++batch_iter_) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ }
+ }
+ return false;
+ }
+    // Loops over keys in the MultiGet batch until it finds a file with
+    // at least one of the keys. Then it keeps moving forward until it has
+    // covered the last key in the batch that falls in that file
+ while (batch_iter_ != current_level_range_.end() &&
+ (fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level ==
+ curr_file_index ||
+ !file_hit)) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[batch_iter_.index()];
+ f = &curr_file_level_->files[fp_ctx.curr_index_in_curr_level];
+ Slice& user_key = batch_iter_->ukey_without_ts;
+
+      // Do key range filtering of files and/or fractional cascading if:
+      // (1) not all the files are in level 0, or
+      // (2) there are more than 3 files in the current level.
+      // If there are only 3 or fewer current level files in the system, we
+      // skip the key range filtering. In this case, the system is most
+      // likely highly tuned to minimize the number of tables queried by
+      // each query, so it is unlikely that key range filtering is more
+      // efficient than querying the files.
+ if (num_levels_ > 1 || curr_file_level_->num_files > 3) {
+        // Check if the key is within the file's range. If the search left
+        // bound and right bound point to the same file, we are sure the key
+        // falls in range.
+ int cmp_smallest = user_comparator_->CompareWithoutTimestamp(
+ user_key, false, ExtractUserKey(f->smallest_key), true);
+
+ assert(curr_level_ == 0 ||
+ fp_ctx.curr_index_in_curr_level ==
+ fp_ctx.start_index_in_curr_level ||
+ cmp_smallest <= 0);
+
+ if (cmp_smallest >= 0) {
+ cmp_largest = user_comparator_->CompareWithoutTimestamp(
+ user_key, false, ExtractUserKey(f->largest_key), true);
+ } else {
+ cmp_largest = -1;
+ }
+
+ // Setup file search bound for the next level based on the
+ // comparison results
+ if (curr_level_ > 0) {
+ file_indexer_->GetNextLevelIndex(
+ curr_level_, fp_ctx.curr_index_in_curr_level, cmp_smallest,
+ cmp_largest, &fp_ctx.search_left_bound,
+ &fp_ctx.search_right_bound);
+ }
+ // Key falls out of current file's range
+ if (cmp_smallest < 0 || cmp_largest > 0) {
+ next_file_range->SkipKey(batch_iter_);
+ } else {
+ file_hit = true;
+ }
+ } else {
+ file_hit = true;
+ }
+ if (cmp_largest == 0) {
+ // cmp_largest is 0, which means the next key will not be in this
+        // file, so stop looking further. However, it's possible there are
+ // duplicates in the batch, so find the upper bound for the batch
+ // in this file (upper_key_) by skipping past the duplicates. We
+ // leave batch_iter_ as is since we may have to pick up from there
+ // for the next file, if this file has a merge value rather than
+ // final value
+ upper_key_ = batch_iter_;
+ ++upper_key_;
+ while (upper_key_ != current_level_range_.end() &&
+ user_comparator_->CompareWithoutTimestamp(
+ batch_iter_->ukey_without_ts, false,
+ upper_key_->ukey_without_ts, false) == 0) {
+ ++upper_key_;
+ }
+ break;
+ } else {
+ if (curr_level_ == 0) {
+ // We need to look through all files in level 0
+ ++fp_ctx.curr_index_in_curr_level;
+ }
+ ++batch_iter_;
+ }
+ if (!file_hit) {
+ curr_file_index =
+ (batch_iter_ != current_level_range_.end())
+ ? fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level
+ : curr_file_level_->num_files;
+ }
+ }
+
+ *fd = f;
+ *file_index = curr_file_index;
+ *is_last_key_in_file = cmp_largest == 0;
+ if (!*is_last_key_in_file) {
+ // If the largest key in the batch overlapping the file is not the
+      // largest key in the file, upper_key_ would not have been updated, so
+ // update it here
+ upper_key_ = batch_iter_;
+ }
+ return file_hit;
+ }
+
+ // Setup local variables to search next level.
+ // Returns false if there are no more levels to search.
+ bool PrepareNextLevel() {
+ if (curr_level_ == 0) {
+ MultiGetRange::Iterator mget_iter = current_level_range_.begin();
+ if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level <
+ curr_file_level_->num_files) {
+ batch_iter_prev_ = current_level_range_.begin();
+ upper_key_ = batch_iter_ = current_level_range_.begin();
+ return true;
+ }
+ }
+
+ curr_level_++;
+ // Reset key range to saved value
+ while (curr_level_ < num_levels_) {
+ bool level_contains_keys = false;
+ curr_file_level_ = &(*level_files_brief_)[curr_level_];
+ if (curr_file_level_->num_files == 0) {
+ // When current level is empty, the search bound generated from upper
+ // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is
+ // also empty.
+
+ for (auto mget_iter = current_level_range_.begin();
+ mget_iter != current_level_range_.end(); ++mget_iter) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+
+ assert(fp_ctx.search_left_bound == 0);
+ assert(fp_ctx.search_right_bound == -1 ||
+ fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex);
+ // Since current level is empty, it will need to search all files in
+ // the next level
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ }
+ // Skip all subsequent empty levels
+ do {
+ ++curr_level_;
+ } while ((curr_level_ < num_levels_) &&
+ (*level_files_brief_)[curr_level_].num_files == 0);
+ continue;
+ }
+
+ // Some files may overlap each other. We find
+ // all files that overlap user_key and process them in order from
+ // newest to oldest. In the context of merge-operator, this can occur at
+ // any level. Otherwise, it only occurs at Level-0 (since Put/Deletes
+ // are always compacted into a single entry).
+ int32_t start_index = -1;
+ current_level_range_ =
+ MultiGetRange(range_, range_.begin(), range_.end());
+ for (auto mget_iter = current_level_range_.begin();
+ mget_iter != current_level_range_.end(); ++mget_iter) {
+ struct FilePickerContext& fp_ctx = fp_ctx_array_[mget_iter.index()];
+ if (curr_level_ == 0) {
+ // On Level-0, we read through all files to check for overlap.
+ start_index = 0;
+ level_contains_keys = true;
+ } else {
+ // On Level-n (n>=1), files are sorted. Binary search to find the
+ // earliest file whose largest key >= ikey. Search left bound and
+ // right bound are used to narrow the range.
+ if (fp_ctx.search_left_bound <= fp_ctx.search_right_bound) {
+ if (fp_ctx.search_right_bound == FileIndexer::kLevelMaxIndex) {
+ fp_ctx.search_right_bound =
+ static_cast<int32_t>(curr_file_level_->num_files) - 1;
+ }
+            // `search_right_bound` is an inclusive upper bound, but since it
+            // was determined based on the user key, it is still possible the
+            // lookup key falls to the right of `search_right_bound`'s
+            // corresponding file. So, pass a limit one higher, which allows
+            // us to detect this case.
+ Slice& ikey = mget_iter->ikey;
+ start_index = FindFileInRange(
+ *internal_comparator_, *curr_file_level_, ikey,
+ static_cast<uint32_t>(fp_ctx.search_left_bound),
+ static_cast<uint32_t>(fp_ctx.search_right_bound) + 1);
+ if (start_index == fp_ctx.search_right_bound + 1) {
+              // `ikey` comes after `search_right_bound`. The lookup key does
+ // not exist on this level, so let's skip this level and do a full
+ // binary search on the next level.
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ current_level_range_.SkipKey(mget_iter);
+ continue;
+ } else {
+ level_contains_keys = true;
+ }
+ } else {
+ // search_left_bound > search_right_bound, key does not exist in
+ // this level. Since no comparison is done in this level, it will
+ // need to search all files in the next level.
+ fp_ctx.search_left_bound = 0;
+ fp_ctx.search_right_bound = FileIndexer::kLevelMaxIndex;
+ current_level_range_.SkipKey(mget_iter);
+ continue;
+ }
+ }
+ fp_ctx.start_index_in_curr_level = start_index;
+ fp_ctx.curr_index_in_curr_level = start_index;
+ }
+ if (level_contains_keys) {
+ batch_iter_prev_ = current_level_range_.begin();
+ upper_key_ = batch_iter_ = current_level_range_.begin();
+ return true;
+ }
+ curr_level_++;
+ }
+  // curr_level_ == num_levels_, so there are no more levels to search.
+ return false;
+ }
+};
+
+VersionStorageInfo::~VersionStorageInfo() { delete[] files_; }
+
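+// Unlinks this Version from the VersionSet's doubly-linked list and drops its
+// references to table files; any file whose reference count reaches zero is
+// queued in obsolete_files_ for later deletion.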
+Version::~Version() {
+ assert(refs_ == 0);
+
+ // Remove from linked list
+ prev_->next_ = next_;
+ next_->prev_ = prev_;
+
+ // Drop references to files
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (size_t i = 0; i < storage_info_.files_[level].size(); i++) {
+ FileMetaData* f = storage_info_.files_[level][i];
+ assert(f->refs > 0);
+ f->refs--;
+ if (f->refs <= 0) {
+ assert(cfd_ != nullptr);
+ uint32_t path_id = f->fd.GetPathId();
+ assert(path_id < cfd_->ioptions()->cf_paths.size());
+ vset_->obsolete_files_.push_back(
+ ObsoleteFileInfo(f, cfd_->ioptions()->cf_paths[path_id].path,
+ cfd_->GetFileMetadataCacheReservationManager()));
+ }
+ }
+ }
+}
+
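+// Returns the index of the earliest file in `file_level` whose largest key is
+// >= `key` (an internal key), or file_level.num_files if no such file exists.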
+int FindFile(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level, const Slice& key) {
+ return FindFileInRange(icmp, file_level, key, 0,
+ static_cast<uint32_t>(file_level.num_files));
+}
+
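+// Builds a flat, arena-allocated summary of one level's files. The smallest
+// and largest keys of each file are copied into contiguous arena memory so
+// that later lookups can avoid dereferencing FileMetaData.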
+void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+ const std::vector<FileMetaData*>& files,
+ Arena* arena) {
+ assert(file_level);
+ assert(arena);
+
+ size_t num = files.size();
+ file_level->num_files = num;
+ char* mem = arena->AllocateAligned(num * sizeof(FdWithKeyRange));
+ file_level->files = new (mem) FdWithKeyRange[num];
+
+ for (size_t i = 0; i < num; i++) {
+ Slice smallest_key = files[i]->smallest.Encode();
+ Slice largest_key = files[i]->largest.Encode();
+
+ // Copy key slice to sequential memory
+ size_t smallest_size = smallest_key.size();
+ size_t largest_size = largest_key.size();
+ mem = arena->AllocateAligned(smallest_size + largest_size);
+ memcpy(mem, smallest_key.data(), smallest_size);
+ memcpy(mem + smallest_size, largest_key.data(), largest_size);
+
+ FdWithKeyRange& f = file_level->files[i];
+ f.fd = files[i]->fd;
+ f.file_metadata = files[i];
+ f.smallest_key = Slice(mem, smallest_size);
+ f.largest_key = Slice(mem + smallest_size, largest_size);
+ }
+}
+
+static bool AfterFile(const Comparator* ucmp, const Slice* user_key,
+ const FdWithKeyRange* f) {
+ // nullptr user_key occurs before all keys and is therefore never after *f
+ return (user_key != nullptr &&
+ ucmp->CompareWithoutTimestamp(*user_key,
+ ExtractUserKey(f->largest_key)) > 0);
+}
+
+static bool BeforeFile(const Comparator* ucmp, const Slice* user_key,
+ const FdWithKeyRange* f) {
+ // nullptr user_key occurs after all keys and is therefore never before *f
+ return (user_key != nullptr &&
+ ucmp->CompareWithoutTimestamp(*user_key,
+ ExtractUserKey(f->smallest_key)) < 0);
+}
+
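+// Returns true if some file in `file_level` may overlap the user key range
+// [*smallest_user_key, *largest_user_key]. A nullptr bound means the range is
+// unbounded on that side. For disjoint sorted files (levels > 0) a binary
+// search is used; otherwise every file is checked.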
+bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+ bool disjoint_sorted_files,
+ const LevelFilesBrief& file_level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key) {
+ const Comparator* ucmp = icmp.user_comparator();
+ if (!disjoint_sorted_files) {
+ // Need to check against all files
+ for (size_t i = 0; i < file_level.num_files; i++) {
+ const FdWithKeyRange* f = &(file_level.files[i]);
+ if (AfterFile(ucmp, smallest_user_key, f) ||
+ BeforeFile(ucmp, largest_user_key, f)) {
+ // No overlap
+ } else {
+ return true; // Overlap
+ }
+ }
+ return false;
+ }
+
+ // Binary search over file list
+ uint32_t index = 0;
+ if (smallest_user_key != nullptr) {
+ // Find the leftmost possible internal key for smallest_user_key
+ InternalKey small;
+ small.SetMinPossibleForUserKey(*smallest_user_key);
+ index = FindFile(icmp, file_level, small.Encode());
+ }
+
+ if (index >= file_level.num_files) {
+ // beginning of range is after all files, so no overlap.
+ return false;
+ }
+
+ return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]);
+}
+
+namespace {
+
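+// An internal iterator over the keys of all SST files in a LevelFilesBrief
+// (a sorted run of non-overlapping files, e.g. a level greater than 0).
+// Table iterators for the individual files are opened lazily through the
+// table cache as the iteration moves from file to file.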
+class LevelIterator final : public InternalIterator {
+ public:
+ // @param read_options Must outlive this iterator.
+ LevelIterator(
+ TableCache* table_cache, const ReadOptions& read_options,
+ const FileOptions& file_options, const InternalKeyComparator& icomparator,
+ const LevelFilesBrief* flevel,
+ const std::shared_ptr<const SliceTransform>& prefix_extractor,
+ bool should_sample, HistogramImpl* file_read_hist,
+ TableReaderCaller caller, bool skip_filters, int level,
+ RangeDelAggregator* range_del_agg,
+ const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries =
+ nullptr,
+ bool allow_unprepared_value = false,
+ TruncatedRangeDelIterator**** range_tombstone_iter_ptr_ = nullptr)
+ : table_cache_(table_cache),
+ read_options_(read_options),
+ file_options_(file_options),
+ icomparator_(icomparator),
+ user_comparator_(icomparator.user_comparator()),
+ flevel_(flevel),
+ prefix_extractor_(prefix_extractor),
+ file_read_hist_(file_read_hist),
+ should_sample_(should_sample),
+ caller_(caller),
+ skip_filters_(skip_filters),
+ allow_unprepared_value_(allow_unprepared_value),
+ file_index_(flevel_->num_files),
+ level_(level),
+ range_del_agg_(range_del_agg),
+ pinned_iters_mgr_(nullptr),
+ compaction_boundaries_(compaction_boundaries),
+ is_next_read_sequential_(false),
+ range_tombstone_iter_(nullptr),
+ to_return_sentinel_(false) {
+ // Empty level is not supported.
+ assert(flevel_ != nullptr && flevel_->num_files > 0);
+ if (range_tombstone_iter_ptr_) {
+ *range_tombstone_iter_ptr_ = &range_tombstone_iter_;
+ }
+ }
+
+ ~LevelIterator() override { delete file_iter_.Set(nullptr); }
+
+ // Seek to the first file with a key >= target.
+ // If range_tombstone_iter_ is not nullptr, then we pretend that file
+ // boundaries are fake keys (sentinel keys). These keys are used to keep range
+ // tombstones alive even when all point keys in an SST file are exhausted.
+ // These sentinel keys will be skipped in merging iterator.
+ void Seek(const Slice& target) override;
+ void SeekForPrev(const Slice& target) override;
+ void SeekToFirst() override;
+ void SeekToLast() override;
+ void Next() final override;
+ bool NextAndGetResult(IterateResult* result) override;
+ void Prev() override;
+
+  // In addition to the valid state and the invalid state (!file_iter_.Valid()
+  // && status().ok()), the iterator has a third state: !file_iter_.Valid() &&
+  // to_return_sentinel_. This means we are at the end of a file, and a
+  // sentinel key (the file boundary that we pretend is a key) is to be
+  // returned next. file_iter_.Valid() and to_return_sentinel_ should not both
+  // be true.
+ bool Valid() const override {
+ assert(!(file_iter_.Valid() && to_return_sentinel_));
+ return file_iter_.Valid() || to_return_sentinel_;
+ }
+ Slice key() const override {
+ assert(Valid());
+ if (to_return_sentinel_) {
+ // Sentinel should be returned after file_iter_ reaches the end of the
+ // file
+ assert(!file_iter_.Valid());
+ return sentinel_;
+ }
+ return file_iter_.key();
+ }
+
+ Slice value() const override {
+ assert(Valid());
+ assert(!to_return_sentinel_);
+ return file_iter_.value();
+ }
+
+ Status status() const override {
+ return file_iter_.iter() ? file_iter_.status() : Status::OK();
+ }
+
+ bool PrepareValue() override { return file_iter_.PrepareValue(); }
+
+ inline bool MayBeOutOfLowerBound() override {
+ assert(Valid());
+ return may_be_out_of_lower_bound_ && file_iter_.MayBeOutOfLowerBound();
+ }
+
+ inline IterBoundCheck UpperBoundCheckResult() override {
+ if (Valid()) {
+ return file_iter_.UpperBoundCheckResult();
+ } else {
+ return IterBoundCheck::kUnknown;
+ }
+ }
+
+ void SetPinnedItersMgr(PinnedIteratorsManager* pinned_iters_mgr) override {
+ pinned_iters_mgr_ = pinned_iters_mgr;
+ if (file_iter_.iter()) {
+ file_iter_.SetPinnedItersMgr(pinned_iters_mgr);
+ }
+ }
+
+ bool IsKeyPinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_.iter() && file_iter_.IsKeyPinned();
+ }
+
+ bool IsValuePinned() const override {
+ return pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled() &&
+ file_iter_.iter() && file_iter_.IsValuePinned();
+ }
+
+ bool IsDeleteRangeSentinelKey() const override { return to_return_sentinel_; }
+
+ private:
+ // Return true if at least one invalid file is seen and skipped.
+ bool SkipEmptyFileForward();
+ void SkipEmptyFileBackward();
+ void SetFileIterator(InternalIterator* iter);
+ void InitFileIterator(size_t new_file_index);
+
+ const Slice& file_smallest_key(size_t file_index) {
+ assert(file_index < flevel_->num_files);
+ return flevel_->files[file_index].smallest_key;
+ }
+
+ const Slice& file_largest_key(size_t file_index) {
+ assert(file_index < flevel_->num_files);
+ return flevel_->files[file_index].largest_key;
+ }
+
+ bool KeyReachedUpperBound(const Slice& internal_key) {
+ return read_options_.iterate_upper_bound != nullptr &&
+ user_comparator_.CompareWithoutTimestamp(
+ ExtractUserKey(internal_key), /*a_has_ts=*/true,
+ *read_options_.iterate_upper_bound, /*b_has_ts=*/false) >= 0;
+ }
+
+ void ClearRangeTombstoneIter() {
+ if (range_tombstone_iter_ && *range_tombstone_iter_) {
+ delete *range_tombstone_iter_;
+ *range_tombstone_iter_ = nullptr;
+ }
+ }
+
+  // Creates an iterator for the file at file_index_.
+  // range_tombstone_iter_ is updated with a range tombstone iterator
+  // for the new file; the old range tombstone iterator is cleared.
+ InternalIterator* NewFileIterator() {
+ assert(file_index_ < flevel_->num_files);
+ auto file_meta = flevel_->files[file_index_];
+ if (should_sample_) {
+ sample_file_read_inc(file_meta.file_metadata);
+ }
+
+ const InternalKey* smallest_compaction_key = nullptr;
+ const InternalKey* largest_compaction_key = nullptr;
+ if (compaction_boundaries_ != nullptr) {
+ smallest_compaction_key = (*compaction_boundaries_)[file_index_].smallest;
+ largest_compaction_key = (*compaction_boundaries_)[file_index_].largest;
+ }
+ CheckMayBeOutOfLowerBound();
+ ClearRangeTombstoneIter();
+ return table_cache_->NewIterator(
+ read_options_, file_options_, icomparator_, *file_meta.file_metadata,
+ range_del_agg_, prefix_extractor_,
+ nullptr /* don't need reference to table */, file_read_hist_, caller_,
+ /*arena=*/nullptr, skip_filters_, level_,
+ /*max_file_size_for_l0_meta_pin=*/0, smallest_compaction_key,
+ largest_compaction_key, allow_unprepared_value_, range_tombstone_iter_);
+ }
+
+  // Check whether the current file lies fully within iterate_lower_bound.
+  //
+  // Note that MyRocks may update the iterate bounds between seeks. To work
+  // around this, we need to check and update may_be_out_of_lower_bound_
+  // accordingly.
+ void CheckMayBeOutOfLowerBound() {
+ if (read_options_.iterate_lower_bound != nullptr &&
+ file_index_ < flevel_->num_files) {
+ may_be_out_of_lower_bound_ =
+ user_comparator_.CompareWithoutTimestamp(
+ ExtractUserKey(file_smallest_key(file_index_)), /*a_has_ts=*/true,
+ *read_options_.iterate_lower_bound, /*b_has_ts=*/false) < 0;
+ }
+ }
+
+ TableCache* table_cache_;
+ const ReadOptions& read_options_;
+ const FileOptions& file_options_;
+ const InternalKeyComparator& icomparator_;
+ const UserComparatorWrapper user_comparator_;
+ const LevelFilesBrief* flevel_;
+ mutable FileDescriptor current_value_;
+ // `prefix_extractor_` may be non-null even for total order seek. Checking
+ // this variable is not the right way to identify whether prefix iterator
+ // is used.
+ const std::shared_ptr<const SliceTransform>& prefix_extractor_;
+
+ HistogramImpl* file_read_hist_;
+ bool should_sample_;
+ TableReaderCaller caller_;
+ bool skip_filters_;
+ bool allow_unprepared_value_;
+ bool may_be_out_of_lower_bound_ = true;
+ size_t file_index_;
+ int level_;
+ RangeDelAggregator* range_del_agg_;
+ IteratorWrapper file_iter_; // May be nullptr
+ PinnedIteratorsManager* pinned_iters_mgr_;
+
+ // To be propagated to RangeDelAggregator in order to safely truncate range
+ // tombstones.
+ const std::vector<AtomicCompactionUnitBoundary>* compaction_boundaries_;
+
+ bool is_next_read_sequential_;
+
+ // This is set when this level iterator is used under a merging iterator
+ // that processes range tombstones. range_tombstone_iter_ points to where the
+ // merging iterator stores the range tombstones iterator for this level. When
+ // this level iterator moves to a new SST file, it updates the range
+ // tombstones accordingly through this pointer. So the merging iterator always
+ // has access to the current SST file's range tombstones.
+ //
+ // The level iterator treats file boundary as fake keys (sentinel keys) to
+ // keep range tombstones alive if needed and make upper level, i.e. merging
+ // iterator, aware of file changes (when level iterator moves to a new SST
+ // file, there is some bookkeeping work that needs to be done at merging
+ // iterator end).
+ //
+ // *range_tombstone_iter_ points to range tombstones of the current SST file
+ TruncatedRangeDelIterator** range_tombstone_iter_;
+
+ // Whether next/prev key is a sentinel key.
+ bool to_return_sentinel_ = false;
+ // The sentinel key to be returned
+ Slice sentinel_;
+  // Sets the flag for whether we should return the sentinel key next.
+  // The condition for returning the sentinel is reaching the end of the
+  // current file_iter_: !Valid() && status().ok().
+ void TrySetDeleteRangeSentinel(const Slice& boundary_key);
+ void ClearSentinel() { to_return_sentinel_ = false; }
+
+ // Set in Seek() when a prefix seek reaches end of the current file,
+ // and the next file has a different prefix. SkipEmptyFileForward()
+ // will not move to next file when this flag is set.
+ bool prefix_exhausted_ = false;
+};
+
+void LevelIterator::TrySetDeleteRangeSentinel(const Slice& boundary_key) {
+ assert(range_tombstone_iter_);
+ if (file_iter_.iter() != nullptr && !file_iter_.Valid() &&
+ file_iter_.status().ok()) {
+ to_return_sentinel_ = true;
+ sentinel_ = boundary_key;
+ }
+}
+
+void LevelIterator::Seek(const Slice& target) {
+ prefix_exhausted_ = false;
+ ClearSentinel();
+  // Check whether the seek key falls within the current file's key range
+ bool need_to_reseek = true;
+ if (file_iter_.iter() != nullptr && file_index_ < flevel_->num_files) {
+ const FdWithKeyRange& cur_file = flevel_->files[file_index_];
+ if (icomparator_.InternalKeyComparator::Compare(
+ target, cur_file.largest_key) <= 0 &&
+ icomparator_.InternalKeyComparator::Compare(
+ target, cur_file.smallest_key) >= 0) {
+ need_to_reseek = false;
+ assert(static_cast<size_t>(FindFile(icomparator_, *flevel_, target)) ==
+ file_index_);
+ }
+ }
+ if (need_to_reseek) {
+ TEST_SYNC_POINT("LevelIterator::Seek:BeforeFindFile");
+ size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+ InitFileIterator(new_file_index);
+ }
+
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.Seek(target);
+    // Status::TryAgain indicates that an asynchronous request to retrieve
+    // data blocks has been submitted. Return at this point; Seek should be
+    // called again to retrieve the requested block and execute the remaining
+    // code.
+ if (file_iter_.status() == Status::TryAgain()) {
+ return;
+ }
+ if (!file_iter_.Valid() && file_iter_.status().ok() &&
+ prefix_extractor_ != nullptr && !read_options_.total_order_seek &&
+ !read_options_.auto_prefix_mode &&
+ file_index_ < flevel_->num_files - 1) {
+ size_t ts_sz = user_comparator_.user_comparator()->timestamp_size();
+ Slice target_user_key_without_ts =
+ ExtractUserKeyAndStripTimestamp(target, ts_sz);
+ Slice next_file_first_user_key_without_ts =
+ ExtractUserKeyAndStripTimestamp(file_smallest_key(file_index_ + 1),
+ ts_sz);
+ if (prefix_extractor_->InDomain(target_user_key_without_ts) &&
+ (!prefix_extractor_->InDomain(next_file_first_user_key_without_ts) ||
+ user_comparator_.CompareWithoutTimestamp(
+ prefix_extractor_->Transform(target_user_key_without_ts), false,
+ prefix_extractor_->Transform(
+ next_file_first_user_key_without_ts),
+ false) != 0)) {
+ // SkipEmptyFileForward() will not advance to next file when this flag
+          // is set, for the reasons detailed below.
+ //
+ // The file we initially positioned to has no keys under the target
+ // prefix, and the next file's smallest key has a different prefix than
+          // target. In a prefix seek, once the keys for one prefix have been
+          // exhausted, the iterator is allowed to jump to any key that is
+          // larger. Here we enforce a stricter contract than that, in order
+          // to make it easier for higher layers (merging and DB iterator) to
+          // reason about correctness:
+          // 1. Within the prefix, the result should be accurate.
+          // 2. If the keys for the prefix are exhausted, the iterator is
+          // either positioned at the next key after the prefix, or made
+          // invalid.
+          // A side benefit is that it invalidates the iterator earlier so
+          // that the upper level merging iterator can merge fewer child
+          // iterators.
+ //
+ // The flag is cleared in Seek*() calls. There is no need to clear the
+ // flag in Prev() since Prev() will not be called when the flag is set
+ // for reasons explained below. If range_tombstone_iter_ is nullptr,
+ // then there is no file boundary sentinel key. Since
+ // !file_iter_.Valid() from the if condition above, this level iterator
+ // is !Valid(), so Prev() will not be called. If range_tombstone_iter_
+          // is not nullptr, there are two cases depending on whether this
+          // level iterator reaches the top of the heap in the merging
+          // iterator (the upper layer).
+          // If so, the merging iterator will see the sentinel key, call
+          // NextAndGetResult(), and that call will skip the sentinel key and
+          // make this level iterator invalid. If not, then it
+ // could be because the upper layer is done before any method of this
+ // level iterator is called or another Seek*() call is invoked. Either
+ // way, Prev() is never called before Seek*().
+ // The flag should not be cleared at the beginning of
+ // Next/NextAndGetResult() since it is used in SkipEmptyFileForward()
+ // called in Next/NextAndGetResult().
+ prefix_exhausted_ = true;
+ }
+ }
+
+ if (range_tombstone_iter_) {
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+ }
+ }
+ SkipEmptyFileForward();
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekForPrev(const Slice& target) {
+ prefix_exhausted_ = false;
+ ClearSentinel();
+ size_t new_file_index = FindFile(icomparator_, *flevel_, target);
+ // Seek beyond this level's smallest key
+ if (new_file_index == 0 &&
+ icomparator_.Compare(target, file_smallest_key(0)) < 0) {
+ SetFileIterator(nullptr);
+ ClearRangeTombstoneIter();
+ CheckMayBeOutOfLowerBound();
+ return;
+ }
+ if (new_file_index >= flevel_->num_files) {
+ new_file_index = flevel_->num_files - 1;
+ }
+
+ InitFileIterator(new_file_index);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekForPrev(target);
+ if (range_tombstone_iter_ &&
+ icomparator_.Compare(target, file_smallest_key(file_index_)) >= 0) {
+      // In the SeekForPrev() case, it is possible that the target is less
+      // than the file's lower boundary, since the largest key is used to
+      // determine the file index (FindFile()). When the target is less than
+      // the file's lower boundary, the sentinel key should not be set, so
+      // that SeekForPrev() does not result in a key larger than the target.
+      // This is correct in that there is no need to keep the range
+      // tombstones in this file alive, as they only cover keys starting from
+      // the file's lower boundary, which is after `target`.
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+ }
+ SkipEmptyFileBackward();
+ }
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToFirst() {
+ prefix_exhausted_ = false;
+ ClearSentinel();
+ InitFileIterator(0);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToFirst();
+ if (range_tombstone_iter_) {
+ // We do this in SeekToFirst() and SeekToLast() since
+ // we could have an empty file with only range tombstones.
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+ }
+ }
+ SkipEmptyFileForward();
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::SeekToLast() {
+ prefix_exhausted_ = false;
+ ClearSentinel();
+ InitFileIterator(flevel_->num_files - 1);
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToLast();
+ if (range_tombstone_iter_) {
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+ }
+ }
+ SkipEmptyFileBackward();
+ CheckMayBeOutOfLowerBound();
+}
+
+void LevelIterator::Next() {
+ assert(Valid());
+ if (to_return_sentinel_) {
+ // file_iter_ is at EOF already when to_return_sentinel_
+ ClearSentinel();
+ } else {
+ file_iter_.Next();
+ if (range_tombstone_iter_) {
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+ }
+ }
+ SkipEmptyFileForward();
+}
+
+bool LevelIterator::NextAndGetResult(IterateResult* result) {
+ assert(Valid());
+ // file_iter_ is at EOF already when to_return_sentinel_
+ bool is_valid = !to_return_sentinel_ && file_iter_.NextAndGetResult(result);
+ if (!is_valid) {
+ if (to_return_sentinel_) {
+ ClearSentinel();
+ } else if (range_tombstone_iter_) {
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+ }
+ is_next_read_sequential_ = true;
+ SkipEmptyFileForward();
+ is_next_read_sequential_ = false;
+ is_valid = Valid();
+ if (is_valid) {
+ // This could be set in TrySetDeleteRangeSentinel() or
+ // SkipEmptyFileForward() above.
+ if (to_return_sentinel_) {
+ result->key = sentinel_;
+ result->bound_check_result = IterBoundCheck::kUnknown;
+ result->value_prepared = true;
+ } else {
+ result->key = key();
+ result->bound_check_result = file_iter_.UpperBoundCheckResult();
+        // Ideally, we should return the real file_iter_.value_prepared, but
+        // that information is not available here. It would cause an extra
+        // PrepareValue() for the first key of a file.
+ result->value_prepared = !allow_unprepared_value_;
+ }
+ }
+ }
+ return is_valid;
+}
+
+void LevelIterator::Prev() {
+ assert(Valid());
+ if (to_return_sentinel_) {
+ ClearSentinel();
+ } else {
+ file_iter_.Prev();
+ if (range_tombstone_iter_) {
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+ }
+ }
+ SkipEmptyFileBackward();
+}
+
+bool LevelIterator::SkipEmptyFileForward() {
+ bool seen_empty_file = false;
+ // Pause at sentinel key
+ while (!to_return_sentinel_ &&
+ (file_iter_.iter() == nullptr ||
+ (!file_iter_.Valid() && file_iter_.status().ok() &&
+ file_iter_.iter()->UpperBoundCheckResult() !=
+ IterBoundCheck::kOutOfBound))) {
+ seen_empty_file = true;
+ // Move to next file
+ if (file_index_ >= flevel_->num_files - 1 ||
+ KeyReachedUpperBound(file_smallest_key(file_index_ + 1)) ||
+ prefix_exhausted_) {
+ SetFileIterator(nullptr);
+ ClearRangeTombstoneIter();
+ break;
+ }
+ // may init a new *range_tombstone_iter
+ InitFileIterator(file_index_ + 1);
+ // We moved to a new SST file
+ // Seek range_tombstone_iter_ to reset its !Valid() default state.
+ // We do not need to call range_tombstone_iter_.Seek* in
+ // LevelIterator::Seek* since when the merging iterator calls
+ // LevelIterator::Seek*, it should also call Seek* into the corresponding
+ // range tombstone iterator.
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToFirst();
+ if (range_tombstone_iter_) {
+ if (*range_tombstone_iter_) {
+ (*range_tombstone_iter_)->SeekToFirst();
+ }
+ TrySetDeleteRangeSentinel(file_largest_key(file_index_));
+ }
+ }
+ }
+ return seen_empty_file;
+}
+
+void LevelIterator::SkipEmptyFileBackward() {
+ // Pause at sentinel key
+ while (!to_return_sentinel_ &&
+ (file_iter_.iter() == nullptr ||
+ (!file_iter_.Valid() && file_iter_.status().ok()))) {
+ // Move to previous file
+ if (file_index_ == 0) {
+ // Already the first file
+ SetFileIterator(nullptr);
+ ClearRangeTombstoneIter();
+ return;
+ }
+ InitFileIterator(file_index_ - 1);
+ // We moved to a new SST file
+ // Seek range_tombstone_iter_ to reset its !Valid() default state.
+ if (file_iter_.iter() != nullptr) {
+ file_iter_.SeekToLast();
+ if (range_tombstone_iter_) {
+ if (*range_tombstone_iter_) {
+ (*range_tombstone_iter_)->SeekToLast();
+ }
+ TrySetDeleteRangeSentinel(file_smallest_key(file_index_));
+ if (to_return_sentinel_) {
+ break;
+ }
+ }
+ }
+ }
+}
+
+void LevelIterator::SetFileIterator(InternalIterator* iter) {
+ if (pinned_iters_mgr_ && iter) {
+ iter->SetPinnedItersMgr(pinned_iters_mgr_);
+ }
+
+ InternalIterator* old_iter = file_iter_.Set(iter);
+
+ // Update the read pattern for PrefetchBuffer.
+ if (is_next_read_sequential_) {
+ file_iter_.UpdateReadaheadState(old_iter);
+ }
+
+ if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) {
+ pinned_iters_mgr_->PinIterator(old_iter);
+ } else {
+ delete old_iter;
+ }
+}
+
+void LevelIterator::InitFileIterator(size_t new_file_index) {
+ if (new_file_index >= flevel_->num_files) {
+ file_index_ = new_file_index;
+ SetFileIterator(nullptr);
+ ClearRangeTombstoneIter();
+ return;
+ } else {
+    // If the previous file iterator returned an Incomplete status, we retry
+    // when the user seeks into the same file again, since this time we may
+    // land on a different data block that is cached in the block cache.
+ if (file_iter_.iter() != nullptr && !file_iter_.status().IsIncomplete() &&
+ new_file_index == file_index_) {
+ // file_iter_ is already constructed with this iterator, so
+ // no need to change anything
+ } else {
+ file_index_ = new_file_index;
+ InternalIterator* iter = NewFileIterator();
+ SetFileIterator(iter);
+ }
+ }
+}
+} // anonymous namespace
+
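+// Loads the table properties for `file_meta`. It first tries the table cache
+// without doing any I/O; if the table is not already cached, it falls back to
+// reading the properties block directly from the SST file.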
+Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+ const FileMetaData* file_meta,
+ const std::string* fname) const {
+ auto table_cache = cfd_->table_cache();
+ auto ioptions = cfd_->ioptions();
+ Status s = table_cache->GetTableProperties(
+ file_options_, cfd_->internal_comparator(), *file_meta, tp,
+ mutable_cf_options_.prefix_extractor, true /* no io */);
+ if (s.ok()) {
+ return s;
+ }
+
+  // We only ignore the `Incomplete` error type since it is by design that we
+  // disallow any I/O here when the table is not already in the table cache.
+ if (!s.IsIncomplete()) {
+ return s;
+ }
+
+  // 2. The table is not present in the table cache; read the table
+  // properties directly from the properties block in the file.
+ std::unique_ptr<FSRandomAccessFile> file;
+ std::string file_name;
+ if (fname != nullptr) {
+ file_name = *fname;
+ } else {
+ file_name = TableFileName(ioptions->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+ }
+ s = ioptions->fs->NewRandomAccessFile(file_name, file_options_, &file,
+ nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // By setting the magic number to kNullTableMagicNumber, we can bypass
+ // the magic number check in the footer.
+ std::unique_ptr<RandomAccessFileReader> file_reader(
+ new RandomAccessFileReader(
+ std::move(file), file_name, nullptr /* env */, io_tracer_,
+ nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */,
+ nullptr /* rate_limiter */, ioptions->listeners));
+ std::unique_ptr<TableProperties> props;
+ s = ReadTableProperties(
+ file_reader.get(), file_meta->fd.GetFileSize(),
+ Footer::kNullTableMagicNumber /* table's magic number */, *ioptions,
+ &props);
+ if (!s.ok()) {
+ return s;
+ }
+ *tp = std::move(props);
+ RecordTick(ioptions->stats, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
+ return s;
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
+ Status s;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ s = GetPropertiesOfAllTables(props, level);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Version::TablesRangeTombstoneSummary(int max_entries_to_print,
+ std::string* out_str) {
+ if (max_entries_to_print <= 0) {
+ return Status::OK();
+ }
+ int num_entries_left = max_entries_to_print;
+
+ std::stringstream ss;
+
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (const auto& file_meta : storage_info_.files_[level]) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+
+ ss << "=== file : " << fname << " ===\n";
+
+ TableCache* table_cache = cfd_->table_cache();
+ std::unique_ptr<FragmentedRangeTombstoneIterator> tombstone_iter;
+
+ Status s = table_cache->GetRangeTombstoneIterator(
+ ReadOptions(), cfd_->internal_comparator(), *file_meta,
+ &tombstone_iter);
+ if (!s.ok()) {
+ return s;
+ }
+ if (tombstone_iter) {
+ tombstone_iter->SeekToFirst();
+
+ // TODO: print timestamp
+ while (tombstone_iter->Valid() && num_entries_left > 0) {
+ ss << "start: " << tombstone_iter->start_key().ToString(true)
+ << " end: " << tombstone_iter->end_key().ToString(true)
+ << " seq: " << tombstone_iter->seq() << '\n';
+ tombstone_iter->Next();
+ num_entries_left--;
+ }
+ if (num_entries_left <= 0) {
+ break;
+ }
+ }
+ }
+ if (num_entries_left <= 0) {
+ break;
+ }
+ }
+ assert(num_entries_left >= 0);
+ if (num_entries_left <= 0) {
+ ss << "(results may not be complete)\n";
+ }
+
+ *out_str = ss.str();
+ return Status::OK();
+}
+
+Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props,
+ int level) {
+ for (const auto& file_meta : storage_info_.files_[level]) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+ // 1. If the table is already present in table cache, load table
+ // properties from there.
+ std::shared_ptr<const TableProperties> table_properties;
+ Status s = GetTableProperties(&table_properties, file_meta, &fname);
+ if (s.ok()) {
+ props->insert({fname, table_properties});
+ } else {
+ return s;
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Version::GetPropertiesOfTablesInRange(
+ const Range* range, std::size_t n, TablePropertiesCollection* props) const {
+ for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+ for (decltype(n) i = 0; i < n; i++) {
+ // Convert user_key into a corresponding internal key.
+ InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
+ InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
+ std::vector<FileMetaData*> files;
+ storage_info_.GetOverlappingInputs(level, &k1, &k2, &files, -1, nullptr,
+ false);
+ for (const auto& file_meta : files) {
+ auto fname =
+ TableFileName(cfd_->ioptions()->cf_paths, file_meta->fd.GetNumber(),
+ file_meta->fd.GetPathId());
+ if (props->count(fname) == 0) {
+ // 1. If the table is already present in table cache, load table
+ // properties from there.
+ std::shared_ptr<const TableProperties> table_properties;
+ Status s = GetTableProperties(&table_properties, file_meta, &fname);
+ if (s.ok()) {
+ props->insert({fname, table_properties});
+ } else {
+ return s;
+ }
+ }
+ }
+ }
+ }
+
+ return Status::OK();
+}
+
+Status Version::GetAggregatedTableProperties(
+ std::shared_ptr<const TableProperties>* tp, int level) {
+ TablePropertiesCollection props;
+ Status s;
+ if (level < 0) {
+ s = GetPropertiesOfAllTables(&props);
+ } else {
+ s = GetPropertiesOfAllTables(&props, level);
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ auto* new_tp = new TableProperties();
+ for (const auto& item : props) {
+ new_tp->Add(*item.second);
+ }
+ tp->reset(new_tp);
+ return Status::OK();
+}
+
+size_t Version::GetMemoryUsageByTableReaders() {
+ size_t total_usage = 0;
+ for (auto& file_level : storage_info_.level_files_brief_) {
+ for (size_t i = 0; i < file_level.num_files; i++) {
+ total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
+ file_options_, cfd_->internal_comparator(),
+ *file_level.files[i].file_metadata,
+ mutable_cf_options_.prefix_extractor);
+ }
+ }
+ return total_usage;
+}
+
+void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
+ assert(cf_meta);
+ assert(cfd_);
+
+ cf_meta->name = cfd_->GetName();
+ cf_meta->size = 0;
+ cf_meta->file_count = 0;
+ cf_meta->levels.clear();
+
+ cf_meta->blob_file_size = 0;
+ cf_meta->blob_file_count = 0;
+ cf_meta->blob_files.clear();
+
+ auto* ioptions = cfd_->ioptions();
+ auto* vstorage = storage_info();
+
+ for (int level = 0; level < cfd_->NumberLevels(); level++) {
+ uint64_t level_size = 0;
+ cf_meta->file_count += vstorage->LevelFiles(level).size();
+ std::vector<SstFileMetaData> files;
+ for (const auto& file : vstorage->LevelFiles(level)) {
+ uint32_t path_id = file->fd.GetPathId();
+ std::string file_path;
+ if (path_id < ioptions->cf_paths.size()) {
+ file_path = ioptions->cf_paths[path_id].path;
+ } else {
+ assert(!ioptions->cf_paths.empty());
+ file_path = ioptions->cf_paths.back().path;
+ }
+ const uint64_t file_number = file->fd.GetNumber();
+ files.emplace_back(
+ MakeTableFileName("", file_number), file_number, file_path,
+ file->fd.GetFileSize(), file->fd.smallest_seqno,
+ file->fd.largest_seqno, file->smallest.user_key().ToString(),
+ file->largest.user_key().ToString(),
+ file->stats.num_reads_sampled.load(std::memory_order_relaxed),
+ file->being_compacted, file->temperature,
+ file->oldest_blob_file_number, file->TryGetOldestAncesterTime(),
+ file->TryGetFileCreationTime(), file->file_checksum,
+ file->file_checksum_func_name);
+ files.back().num_entries = file->num_entries;
+ files.back().num_deletions = file->num_deletions;
+ level_size += file->fd.GetFileSize();
+ }
+ cf_meta->levels.emplace_back(level, level_size, std::move(files));
+ cf_meta->size += level_size;
+ }
+ for (const auto& meta : vstorage->GetBlobFiles()) {
+ assert(meta);
+
+ cf_meta->blob_files.emplace_back(
+ meta->GetBlobFileNumber(), BlobFileName("", meta->GetBlobFileNumber()),
+ ioptions->cf_paths.front().path, meta->GetBlobFileSize(),
+ meta->GetTotalBlobCount(), meta->GetTotalBlobBytes(),
+ meta->GetGarbageBlobCount(), meta->GetGarbageBlobBytes(),
+ meta->GetChecksumMethod(), meta->GetChecksumValue());
+ ++cf_meta->blob_file_count;
+ cf_meta->blob_file_size += meta->GetBlobFileSize();
+ }
+}
+
+uint64_t Version::GetSstFilesSize() {
+ uint64_t sst_files_size = 0;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ for (const auto& file_meta : storage_info_.LevelFiles(level)) {
+ sst_files_size += file_meta->fd.GetFileSize();
+ }
+ }
+ return sst_files_size;
+}
+
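+// Sets *creation_time to the oldest file creation time across all non-empty
+// levels, or to 0 if the creation time of any file is unknown.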
+void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
+ uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
+ for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
+ for (FileMetaData* meta : storage_info_.LevelFiles(level)) {
+ assert(meta->fd.table_reader != nullptr);
+ uint64_t file_creation_time = meta->TryGetFileCreationTime();
+ if (file_creation_time == kUnknownFileCreationTime) {
+ *creation_time = 0;
+ return;
+ }
+ if (file_creation_time < oldest_time) {
+ oldest_time = file_creation_time;
+ }
+ }
+ }
+ *creation_time = oldest_time;
+}
+
+InternalIterator* Version::TEST_GetLevelIterator(
+ const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
+ int level, bool allow_unprepared_value) {
+ auto* arena = merge_iter_builder->GetArena();
+ auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+ TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
+ auto level_iter = new (mem) LevelIterator(
+ cfd_->table_cache(), read_options, file_options_,
+ cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+ mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+ cfd_->internal_stats()->GetFileReadHist(level),
+ TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+ nullptr /* range_del_agg */, nullptr /* compaction_boundaries */,
+ allow_unprepared_value, &tombstone_iter_ptr);
+ if (read_options.ignore_range_deletions) {
+ merge_iter_builder->AddIterator(level_iter);
+ } else {
+ merge_iter_builder->AddPointAndTombstoneIterator(
+ level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
+ }
+ return level_iter;
+}
+
+uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const {
+  // The estimate will be inaccurate when:
+  // (1) there exist merge keys
+  // (2) keys are directly overwritten
+  // (3) there are deletions of non-existing keys
+  // (4) the number of samples is low
+ if (current_num_samples_ == 0) {
+ return 0;
+ }
+
+ if (current_num_non_deletions_ <= current_num_deletions_) {
+ return 0;
+ }
+
+ uint64_t est = current_num_non_deletions_ - current_num_deletions_;
+
+ uint64_t file_count = 0;
+ for (int level = 0; level < num_levels_; ++level) {
+ file_count += files_[level].size();
+ }
+
+ if (current_num_samples_ < file_count) {
+ // casting to avoid overflowing
+ return static_cast<uint64_t>(
+ (est * static_cast<double>(file_count) / current_num_samples_));
+ } else {
+ return est;
+ }
+}
+
+double VersionStorageInfo::GetEstimatedCompressionRatioAtLevel(
+ int level) const {
+ assert(level < num_levels_);
+ uint64_t sum_file_size_bytes = 0;
+ uint64_t sum_data_size_bytes = 0;
+ for (auto* file_meta : files_[level]) {
+ sum_file_size_bytes += file_meta->fd.GetFileSize();
+ sum_data_size_bytes += file_meta->raw_key_size + file_meta->raw_value_size;
+ }
+ if (sum_file_size_bytes == 0) {
+ return -1.0;
+ }
+ return static_cast<double>(sum_data_size_bytes) / sum_file_size_bytes;
+}
+
+void Version::AddIterators(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merge_iter_builder,
+ bool allow_unprepared_value) {
+ assert(storage_info_.finalized_);
+
+ for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) {
+ AddIteratorsForLevel(read_options, soptions, merge_iter_builder, level,
+ allow_unprepared_value);
+ }
+}
+
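+// Adds the iterators needed to read `level` to the merging iterator being
+// built: one table iterator per file for L0 (whose files may overlap), or a
+// single LevelIterator that lazily opens the non-overlapping files of a
+// higher level. Range tombstone iterators are wired up alongside the point
+// iterators unless read_options.ignore_range_deletions is set.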
+void Version::AddIteratorsForLevel(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merge_iter_builder,
+ int level, bool allow_unprepared_value) {
+ assert(storage_info_.finalized_);
+ if (level >= storage_info_.num_non_empty_levels()) {
+ // This is an empty level
+ return;
+ } else if (storage_info_.LevelFilesBrief(level).num_files == 0) {
+ // No files in this level
+ return;
+ }
+
+ bool should_sample = should_sample_file_read();
+
+ auto* arena = merge_iter_builder->GetArena();
+ if (level == 0) {
+ // Merge all level zero files together since they may overlap
+ TruncatedRangeDelIterator* tombstone_iter = nullptr;
+ for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+ const auto& file = storage_info_.LevelFilesBrief(0).files[i];
+ auto table_iter = cfd_->table_cache()->NewIterator(
+ read_options, soptions, cfd_->internal_comparator(),
+ *file.file_metadata, /*range_del_agg=*/nullptr,
+ mutable_cf_options_.prefix_extractor, nullptr,
+ cfd_->internal_stats()->GetFileReadHist(0),
+ TableReaderCaller::kUserIterator, arena,
+ /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr, allow_unprepared_value,
+ &tombstone_iter);
+ if (read_options.ignore_range_deletions) {
+ merge_iter_builder->AddIterator(table_iter);
+ } else {
+ merge_iter_builder->AddPointAndTombstoneIterator(table_iter,
+ tombstone_iter);
+ }
+ }
+ if (should_sample) {
+      // Count one sample for every L0 file. This is done per iterator
+      // creation rather than per Seek(), while files in other levels are
+      // recorded per seek. If users execute one range query per iterator,
+      // there may be some discrepancy here.
+ for (FileMetaData* meta : storage_info_.LevelFiles(0)) {
+ sample_file_read_inc(meta);
+ }
+ }
+ } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+ // For levels > 0, we can use a concatenating iterator that sequentially
+ // walks through the non-overlapping files in the level, opening them
+ // lazily.
+ auto* mem = arena->AllocateAligned(sizeof(LevelIterator));
+ TruncatedRangeDelIterator*** tombstone_iter_ptr = nullptr;
+ auto level_iter = new (mem) LevelIterator(
+ cfd_->table_cache(), read_options, soptions,
+ cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+ mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+ cfd_->internal_stats()->GetFileReadHist(level),
+ TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+ /*range_del_agg=*/nullptr, /*compaction_boundaries=*/nullptr,
+ allow_unprepared_value, &tombstone_iter_ptr);
+ if (read_options.ignore_range_deletions) {
+ merge_iter_builder->AddIterator(level_iter);
+ } else {
+ merge_iter_builder->AddPointAndTombstoneIterator(
+ level_iter, nullptr /* tombstone_iter */, tombstone_iter_ptr);
+ }
+ }
+}
+
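+// Sets *overlap to true if any point key or range tombstone in `level`
+// overlaps the user key range [smallest_user_key, largest_user_key].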
+Status Version::OverlapWithLevelIterator(const ReadOptions& read_options,
+ const FileOptions& file_options,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int level, bool* overlap) {
+ assert(storage_info_.finalized_);
+
+ auto icmp = cfd_->internal_comparator();
+ auto ucmp = icmp.user_comparator();
+
+ Arena arena;
+ Status status;
+ ReadRangeDelAggregator range_del_agg(&icmp,
+ kMaxSequenceNumber /* upper_bound */);
+
+ *overlap = false;
+
+ if (level == 0) {
+ for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) {
+ const auto file = &storage_info_.LevelFilesBrief(0).files[i];
+ if (AfterFile(ucmp, &smallest_user_key, file) ||
+ BeforeFile(ucmp, &largest_user_key, file)) {
+ continue;
+ }
+ ScopedArenaIterator iter(cfd_->table_cache()->NewIterator(
+ read_options, file_options, cfd_->internal_comparator(),
+ *file->file_metadata, &range_del_agg,
+ mutable_cf_options_.prefix_extractor, nullptr,
+ cfd_->internal_stats()->GetFileReadHist(0),
+ TableReaderCaller::kUserIterator, &arena,
+ /*skip_filters=*/false, /*level=*/0, max_file_size_for_l0_meta_pin_,
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr,
+ /*allow_unprepared_value=*/false));
+ status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
+ iter.get(), overlap);
+ if (!status.ok() || *overlap) {
+ break;
+ }
+ }
+ } else if (storage_info_.LevelFilesBrief(level).num_files > 0) {
+ auto mem = arena.AllocateAligned(sizeof(LevelIterator));
+ ScopedArenaIterator iter(new (mem) LevelIterator(
+ cfd_->table_cache(), read_options, file_options,
+ cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level),
+ mutable_cf_options_.prefix_extractor, should_sample_file_read(),
+ cfd_->internal_stats()->GetFileReadHist(level),
+ TableReaderCaller::kUserIterator, IsFilterSkipped(level), level,
+ &range_del_agg));
+ status = OverlapWithIterator(ucmp, smallest_user_key, largest_user_key,
+ iter.get(), overlap);
+ }
+
+ if (status.ok() && *overlap == false &&
+ range_del_agg.IsRangeOverlapped(smallest_user_key, largest_user_key)) {
+ *overlap = true;
+ }
+ return status;
+}
+
+VersionStorageInfo::VersionStorageInfo(
+ const InternalKeyComparator* internal_comparator,
+ const Comparator* user_comparator, int levels,
+ CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage,
+ bool _force_consistency_checks)
+ : internal_comparator_(internal_comparator),
+ user_comparator_(user_comparator),
+ // cfd is nullptr if Version is dummy
+ num_levels_(levels),
+ num_non_empty_levels_(0),
+ file_indexer_(user_comparator),
+ compaction_style_(compaction_style),
+ files_(new std::vector<FileMetaData*>[num_levels_]),
+ base_level_(num_levels_ == 1 ? -1 : 1),
+ level_multiplier_(0.0),
+ files_by_compaction_pri_(num_levels_),
+ level0_non_overlapping_(false),
+ next_file_to_compact_by_size_(num_levels_),
+ compaction_score_(num_levels_),
+ compaction_level_(num_levels_),
+ l0_delay_trigger_count_(0),
+ compact_cursor_(num_levels_),
+ accumulated_file_size_(0),
+ accumulated_raw_key_size_(0),
+ accumulated_raw_value_size_(0),
+ accumulated_num_non_deletions_(0),
+ accumulated_num_deletions_(0),
+ current_num_non_deletions_(0),
+ current_num_deletions_(0),
+ current_num_samples_(0),
+ estimated_compaction_needed_bytes_(0),
+ finalized_(false),
+ force_consistency_checks_(_force_consistency_checks) {
+ if (ref_vstorage != nullptr) {
+ accumulated_file_size_ = ref_vstorage->accumulated_file_size_;
+ accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_;
+ accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_;
+ accumulated_num_non_deletions_ =
+ ref_vstorage->accumulated_num_non_deletions_;
+ accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_;
+ current_num_non_deletions_ = ref_vstorage->current_num_non_deletions_;
+ current_num_deletions_ = ref_vstorage->current_num_deletions_;
+ current_num_samples_ = ref_vstorage->current_num_samples_;
+ oldest_snapshot_seqnum_ = ref_vstorage->oldest_snapshot_seqnum_;
+ compact_cursor_ = ref_vstorage->compact_cursor_;
+ compact_cursor_.resize(num_levels_);
+ }
+}
+
+Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset,
+ const FileOptions& file_opt,
+ const MutableCFOptions mutable_cf_options,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ uint64_t version_number)
+ : env_(vset->env_),
+ clock_(vset->clock_),
+ cfd_(column_family_data),
+ info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->logger),
+ db_statistics_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->stats),
+ table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()),
+ blob_source_(cfd_ ? cfd_->blob_source() : nullptr),
+ merge_operator_(
+ (cfd_ == nullptr) ? nullptr : cfd_->ioptions()->merge_operator.get()),
+ storage_info_(
+ (cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(),
+ (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(),
+ cfd_ == nullptr ? 0 : cfd_->NumberLevels(),
+ cfd_ == nullptr ? kCompactionStyleLevel
+ : cfd_->ioptions()->compaction_style,
+ (cfd_ == nullptr || cfd_->current() == nullptr)
+ ? nullptr
+ : cfd_->current()->storage_info(),
+ cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks),
+ vset_(vset),
+ next_(this),
+ prev_(this),
+ refs_(0),
+ file_options_(file_opt),
+ mutable_cf_options_(mutable_cf_options),
+ max_file_size_for_l0_meta_pin_(
+ MaxFileSizeForL0MetaPin(mutable_cf_options_)),
+ version_number_(version_number),
+ io_tracer_(io_tracer) {}
+
+Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ const Slice& blob_index_slice,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* value, uint64_t* bytes_read) const {
+ BlobIndex blob_index;
+
+ {
+ Status s = blob_index.DecodeFrom(blob_index_slice);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+
+ return GetBlob(read_options, user_key, blob_index, prefetch_buffer, value,
+ bytes_read);
+}
+
+Status Version::GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ const BlobIndex& blob_index,
+ FilePrefetchBuffer* prefetch_buffer,
+ PinnableSlice* value, uint64_t* bytes_read) const {
+ assert(value);
+
+ if (blob_index.HasTTL() || blob_index.IsInlined()) {
+ return Status::Corruption("Unexpected TTL/inlined blob index");
+ }
+
+ const uint64_t blob_file_number = blob_index.file_number();
+
+ auto blob_file_meta = storage_info_.GetBlobFileMetaData(blob_file_number);
+ if (!blob_file_meta) {
+ return Status::Corruption("Invalid blob file number");
+ }
+
+ assert(blob_source_);
+ value->Reset();
+ const Status s = blob_source_->GetBlob(
+ read_options, user_key, blob_file_number, blob_index.offset(),
+ blob_file_meta->GetBlobFileSize(), blob_index.size(),
+ blob_index.compression(), prefetch_buffer, value, bytes_read);
+
+ return s;
+}
+
+void Version::MultiGetBlob(
+ const ReadOptions& read_options, MultiGetRange& range,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs) {
+ assert(!blob_ctxs.empty());
+
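+  // Convert the per-file contexts in blob_ctxs into batched read requests,
+  // one BlobFileReadRequests entry per blob file, so that everything can be
+  // handed to BlobSource::MultiGetBlob in a single call below. Per-key
+  // statuses and values are reported back through the KeyContext pointers
+  // captured in each request.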
+ autovector<BlobFileReadRequests> blob_reqs;
+
+ for (auto& ctx : blob_ctxs) {
+ const auto file_number = ctx.first;
+ const auto blob_file_meta = storage_info_.GetBlobFileMetaData(file_number);
+
+ autovector<BlobReadRequest> blob_reqs_in_file;
+ BlobReadContexts& blobs_in_file = ctx.second;
+ for (const auto& blob : blobs_in_file) {
+ const BlobIndex& blob_index = blob.first;
+ const KeyContext& key_context = blob.second;
+
+ if (!blob_file_meta) {
+ *key_context.s = Status::Corruption("Invalid blob file number");
+ continue;
+ }
+
+ if (blob_index.HasTTL() || blob_index.IsInlined()) {
+ *key_context.s =
+ Status::Corruption("Unexpected TTL/inlined blob index");
+ continue;
+ }
+
+ key_context.value->Reset();
+ blob_reqs_in_file.emplace_back(
+ key_context.ukey_with_ts, blob_index.offset(), blob_index.size(),
+ blob_index.compression(), key_context.value, key_context.s);
+ }
+ if (blob_reqs_in_file.size() > 0) {
+ const auto file_size = blob_file_meta->GetBlobFileSize();
+ blob_reqs.emplace_back(file_number, file_size, blob_reqs_in_file);
+ }
+ }
+
+ if (blob_reqs.size() > 0) {
+ blob_source_->MultiGetBlob(read_options, blob_reqs, /*bytes_read=*/nullptr);
+ }
+
+ for (auto& ctx : blob_ctxs) {
+ BlobReadContexts& blobs_in_file = ctx.second;
+ for (const auto& blob : blobs_in_file) {
+ const KeyContext& key_context = blob.second;
+ if (key_context.s->ok()) {
+ range.AddValueSize(key_context.value->size());
+ if (range.GetValueSize() > read_options.value_size_soft_limit) {
+ *key_context.s = Status::Aborted();
+ }
+ } else if (key_context.s->IsIncomplete()) {
+ // read_options.read_tier == kBlockCacheTier
+ // Cannot read blob(s): no disk I/O allowed
+ assert(key_context.get_context);
+ auto& get_context = *(key_context.get_context);
+ get_context.MarkKeyMayExist();
+ }
+ }
+ }
+}
+
+void Version::Get(const ReadOptions& read_options, const LookupKey& k,
+ PinnableSlice* value, PinnableWideColumns* columns,
+ std::string* timestamp, Status* status,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ PinnedIteratorsManager* pinned_iters_mgr, bool* value_found,
+ bool* key_exists, SequenceNumber* seq, ReadCallback* callback,
+ bool* is_blob, bool do_merge) {
+ Slice ikey = k.internal_key();
+ Slice user_key = k.user_key();
+
+ assert(status->ok() || status->IsMergeInProgress());
+
+ if (key_exists != nullptr) {
+    // Will be set to false below if the key is not found
+ *key_exists = true;
+ }
+
+ uint64_t tracing_get_id = BlockCacheTraceHelper::kReservedGetId;
+ if (vset_ && vset_->block_cache_tracer_ &&
+ vset_->block_cache_tracer_->is_tracing_enabled()) {
+ tracing_get_id = vset_->block_cache_tracer_->NextGetId();
+ }
+
+ // Note: the old StackableDB-based BlobDB passes in
+ // GetImplOptions::is_blob_index; for the integrated BlobDB implementation, we
+ // need to provide it here.
+ bool is_blob_index = false;
+ bool* const is_blob_to_use = is_blob ? is_blob : &is_blob_index;
+ BlobFetcher blob_fetcher(this, read_options);
+
+ assert(pinned_iters_mgr);
+ GetContext get_context(
+ user_comparator(), merge_operator_, info_log_, db_statistics_,
+ status->ok() ? GetContext::kNotFound : GetContext::kMerge, user_key,
+ do_merge ? value : nullptr, do_merge ? columns : nullptr,
+ do_merge ? timestamp : nullptr, value_found, merge_context, do_merge,
+ max_covering_tombstone_seq, clock_, seq,
+ merge_operator_ ? pinned_iters_mgr : nullptr, callback, is_blob_to_use,
+ tracing_get_id, &blob_fetcher);
+
+ // Pin blocks that we read to hold merge operands
+ if (merge_operator_) {
+ pinned_iters_mgr->StartPinning();
+ }
+
+ FilePicker fp(user_key, ikey, &storage_info_.level_files_brief_,
+ storage_info_.num_non_empty_levels_,
+ &storage_info_.file_indexer_, user_comparator(),
+ internal_comparator());
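+  // FilePicker hands back candidate files in the order newer data shadows
+  // older data: the overlapping L0 files from newest to oldest first, then at
+  // most one file per deeper level, so the first definitive result found in
+  // the loop below comes from the most recent version of the key.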
+ FdWithKeyRange* f = fp.GetNextFile();
+
+ while (f != nullptr) {
+ if (*max_covering_tombstone_seq > 0) {
+ // The remaining files we look at will only contain covered keys, so we
+ // stop here.
+ break;
+ }
+ if (get_context.sample()) {
+ sample_file_read_inc(f->file_metadata);
+ }
+
+ bool timer_enabled =
+ GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+ get_perf_context()->per_level_perf_context_enabled;
+ StopWatchNano timer(clock_, timer_enabled /* auto_start */);
+ *status = table_cache_->Get(
+ read_options, *internal_comparator(), *f->file_metadata, ikey,
+ &get_context, mutable_cf_options_.prefix_extractor,
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel()),
+ fp.GetHitFileLevel(), max_file_size_for_l0_meta_pin_);
+ // TODO: examine the behavior for corrupted key
+ if (timer_enabled) {
+ PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+ fp.GetHitFileLevel());
+ }
+ if (!status->ok()) {
+ if (db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ return;
+ }
+
+ // report the counters before returning
+ if (get_context.State() != GetContext::kNotFound &&
+ get_context.State() != GetContext::kMerge &&
+ db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ switch (get_context.State()) {
+ case GetContext::kNotFound:
+ // Keep searching in other files
+ break;
+ case GetContext::kMerge:
+ // TODO: update per-level perfcontext user_key_return_count for kMerge
+ break;
+ case GetContext::kFound:
+ if (fp.GetHitFileLevel() == 0) {
+ RecordTick(db_statistics_, GET_HIT_L0);
+ } else if (fp.GetHitFileLevel() == 1) {
+ RecordTick(db_statistics_, GET_HIT_L1);
+ } else if (fp.GetHitFileLevel() >= 2) {
+ RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+ }
+
+ PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1,
+ fp.GetHitFileLevel());
+
+ if (is_blob_index) {
+ if (do_merge && value) {
+ TEST_SYNC_POINT_CALLBACK("Version::Get::TamperWithBlobIndex",
+ value);
+
+ constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
+ constexpr uint64_t* bytes_read = nullptr;
+
+ *status = GetBlob(read_options, user_key, *value, prefetch_buffer,
+ value, bytes_read);
+ if (!status->ok()) {
+ if (status->IsIncomplete()) {
+ get_context.MarkKeyMayExist();
+ }
+ return;
+ }
+ }
+ }
+
+ return;
+ case GetContext::kDeleted:
+ // Use empty error message for speed
+ *status = Status::NotFound();
+ return;
+ case GetContext::kCorrupt:
+ *status = Status::Corruption("corrupted key for ", user_key);
+ return;
+ case GetContext::kUnexpectedBlobIndex:
+ ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+ *status = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ return;
+ }
+ f = fp.GetNextFile();
+ }
+ if (db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ if (GetContext::kMerge == get_context.State()) {
+ if (!do_merge) {
+ *status = Status::OK();
+ return;
+ }
+ if (!merge_operator_) {
+ *status = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ return;
+ }
+    // The merge operands are in the saver and we hit the beginning of the
+    // key's history; do a final merge of nullptr and the operands.
+ if (value || columns) {
+ std::string result;
+ *status = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key, nullptr, merge_context->GetOperands(),
+ &result, info_log_, db_statistics_, clock_,
+ /* result_operand */ nullptr, /* update_num_ops_stats */ true);
+ if (status->ok()) {
+ if (LIKELY(value != nullptr)) {
+ *(value->GetSelf()) = std::move(result);
+ value->PinSelf();
+ } else {
+ assert(columns != nullptr);
+ columns->SetPlainValue(result);
+ }
+ }
+ }
+ } else {
+ if (key_exists != nullptr) {
+ *key_exists = false;
+ }
+ *status = Status::NotFound(); // Use an empty error message for speed
+ }
+}
+
+void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
+ ReadCallback* callback) {
+ PinnedIteratorsManager pinned_iters_mgr;
+
+ // Pin blocks that we read to hold merge operands
+ if (merge_operator_) {
+ pinned_iters_mgr.StartPinning();
+ }
+ uint64_t tracing_mget_id = BlockCacheTraceHelper::kReservedGetId;
+
+ if (vset_ && vset_->block_cache_tracer_ &&
+ vset_->block_cache_tracer_->is_tracing_enabled()) {
+ tracing_mget_id = vset_->block_cache_tracer_->NextGetId();
+ }
+ // Even though we know the batch size won't be > MAX_BATCH_SIZE,
+ // use autovector in order to avoid unnecessary construction of GetContext
+ // objects, which is expensive
+ autovector<GetContext, 16> get_ctx;
+ BlobFetcher blob_fetcher(this, read_options);
+ for (auto iter = range->begin(); iter != range->end(); ++iter) {
+ assert(iter->s->ok() || iter->s->IsMergeInProgress());
+ get_ctx.emplace_back(
+ user_comparator(), merge_operator_, info_log_, db_statistics_,
+ iter->s->ok() ? GetContext::kNotFound : GetContext::kMerge,
+ iter->ukey_with_ts, iter->value, /*columns=*/nullptr, iter->timestamp,
+ nullptr, &(iter->merge_context), true,
+ &iter->max_covering_tombstone_seq, clock_, nullptr,
+ merge_operator_ ? &pinned_iters_mgr : nullptr, callback,
+ &iter->is_blob_index, tracing_mget_id, &blob_fetcher);
+ // MergeInProgress status, if set, has been transferred to the get_context
+ // state, so we set status to ok here. From now on, the iter status will
+ // be used for IO errors, and get_context state will be used for any
+ // key level errors
+ *(iter->s) = Status::OK();
+ }
+ int get_ctx_index = 0;
+ for (auto iter = range->begin(); iter != range->end();
+ ++iter, get_ctx_index++) {
+ iter->get_context = &(get_ctx[get_ctx_index]);
+ }
+
+ Status s;
+ // blob_file => [[blob_idx, it], ...]
+ std::unordered_map<uint64_t, BlobReadContexts> blob_ctxs;
+ MultiGetRange keys_with_blobs_range(*range, range->begin(), range->end());
+#if USE_COROUTINES
+ if (read_options.async_io && read_options.optimize_multiget_for_io &&
+ using_coroutines()) {
+ s = MultiGetAsync(read_options, range, &blob_ctxs);
+ } else
+#endif // USE_COROUTINES
+ {
+ MultiGetRange file_picker_range(*range, range->begin(), range->end());
+ FilePickerMultiGet fp(&file_picker_range, &storage_info_.level_files_brief_,
+ storage_info_.num_non_empty_levels_,
+ &storage_info_.file_indexer_, user_comparator(),
+ internal_comparator());
+ FdWithKeyRange* f = fp.GetNextFileInLevel();
+ uint64_t num_index_read = 0;
+ uint64_t num_filter_read = 0;
+ uint64_t num_sst_read = 0;
+ uint64_t num_level_read = 0;
+
+ int prev_level = -1;
+
+ while (!fp.IsSearchEnded()) {
+      // This will be set to true later if we actually look up in an L0 file.
+      // For per-level stats purposes, each L0 file is treated as a separate
+      // level.
+ bool dump_stats_for_l0_file = false;
+
+      // Avoid using the coroutine version if we're looking in an L0 file, since
+ // L0 files won't be parallelized anyway. The regular synchronous version
+ // is faster.
+ if (!read_options.async_io || !using_coroutines() ||
+ fp.GetHitFileLevel() == 0 || !fp.RemainingOverlapInLevel()) {
+ if (f) {
+ bool skip_filters =
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel());
+ // Call MultiGetFromSST for looking up a single file
+ s = MultiGetFromSST(read_options, fp.CurrentFileRange(),
+ fp.GetHitFileLevel(), skip_filters,
+ /*skip_range_deletions=*/false, f, blob_ctxs,
+ /*table_handle=*/nullptr, num_filter_read,
+ num_index_read, num_sst_read);
+ if (fp.GetHitFileLevel() == 0) {
+ dump_stats_for_l0_file = true;
+ }
+ }
+ if (s.ok()) {
+ f = fp.GetNextFileInLevel();
+ }
+#if USE_COROUTINES
+ } else {
+ std::vector<folly::coro::Task<Status>> mget_tasks;
+ while (f != nullptr) {
+ MultiGetRange file_range = fp.CurrentFileRange();
+ Cache::Handle* table_handle = nullptr;
+ bool skip_filters =
+ IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel());
+ bool skip_range_deletions = false;
+ if (!skip_filters) {
+ Status status = table_cache_->MultiGetFilter(
+ read_options, *internal_comparator(), *f->file_metadata,
+ mutable_cf_options_.prefix_extractor,
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+ fp.GetHitFileLevel(), &file_range, &table_handle);
+ skip_range_deletions = true;
+ if (status.ok()) {
+ skip_filters = true;
+ } else if (!status.IsNotSupported()) {
+ s = status;
+ }
+ }
+
+ if (!s.ok()) {
+ break;
+ }
+
+ if (!file_range.empty()) {
+ mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
+ read_options, file_range, fp.GetHitFileLevel(), skip_filters,
+ skip_range_deletions, f, blob_ctxs, table_handle,
+ num_filter_read, num_index_read, num_sst_read));
+ }
+ if (fp.KeyMaySpanNextFile()) {
+ break;
+ }
+ f = fp.GetNextFileInLevel();
+ }
+ if (mget_tasks.size() > 0) {
+ RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT,
+ mget_tasks.size());
+ // Collect all results so far
+ std::vector<Status> statuses = folly::coro::blockingWait(
+ folly::coro::collectAllRange(std::move(mget_tasks))
+ .scheduleOn(&range->context()->executor()));
+ if (s.ok()) {
+ for (Status stat : statuses) {
+ if (!stat.ok()) {
+ s = std::move(stat);
+ break;
+ }
+ }
+ }
+
+ if (s.ok() && fp.KeyMaySpanNextFile()) {
+ f = fp.GetNextFileInLevel();
+ }
+ }
+#endif // USE_COROUTINES
+ }
+ // If bad status or we found final result for all the keys
+ if (!s.ok() || file_picker_range.empty()) {
+ break;
+ }
+ if (!f) {
+ // Reached the end of this level. Prepare the next level
+ fp.PrepareNextLevelForSearch();
+ if (!fp.IsSearchEnded()) {
+          // It's possible there is no overlap on this level and f is nullptr
+ f = fp.GetNextFileInLevel();
+ }
+ if (dump_stats_for_l0_file ||
+ (prev_level != 0 && prev_level != (int)fp.GetHitFileLevel())) {
+ // Dump the stats if the search has moved to the next level and
+ // reset for next level.
+ if (num_filter_read + num_index_read) {
+ RecordInHistogram(db_statistics_,
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+ num_index_read + num_filter_read);
+ }
+ if (num_sst_read) {
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL,
+ num_sst_read);
+ num_level_read++;
+ }
+ num_filter_read = 0;
+ num_index_read = 0;
+ num_sst_read = 0;
+ }
+ prev_level = fp.GetHitFileLevel();
+ }
+ }
+
+ // Dump stats for most recent level
+ if (num_filter_read + num_index_read) {
+ RecordInHistogram(db_statistics_,
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+ num_index_read + num_filter_read);
+ }
+ if (num_sst_read) {
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_read);
+ num_level_read++;
+ }
+ if (num_level_read) {
+ RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET,
+ num_level_read);
+ }
+ }
+
+ if (s.ok() && !blob_ctxs.empty()) {
+ MultiGetBlob(read_options, keys_with_blobs_range, blob_ctxs);
+ }
+
+  // Process any leftover keys
+ for (auto iter = range->begin(); s.ok() && iter != range->end(); ++iter) {
+ GetContext& get_context = *iter->get_context;
+ Status* status = iter->s;
+ Slice user_key = iter->lkey->user_key();
+
+ if (db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ }
+ if (GetContext::kMerge == get_context.State()) {
+ if (!merge_operator_) {
+ *status = Status::InvalidArgument(
+ "merge_operator is not properly initialized.");
+ range->MarkKeyDone(iter);
+ continue;
+ }
+      // The merge operands are in the saver and we hit the beginning of the
+      // key's history; do a final merge of nullptr and the operands.
+ std::string* str_value =
+ iter->value != nullptr ? iter->value->GetSelf() : nullptr;
+ *status = MergeHelper::TimedFullMerge(
+ merge_operator_, user_key, nullptr, iter->merge_context.GetOperands(),
+ str_value, info_log_, db_statistics_, clock_,
+ /* result_operand */ nullptr, /* update_num_ops_stats */ true);
+ if (LIKELY(iter->value != nullptr)) {
+ iter->value->PinSelf();
+ range->AddValueSize(iter->value->size());
+ range->MarkKeyDone(iter);
+ if (range->GetValueSize() > read_options.value_size_soft_limit) {
+ s = Status::Aborted();
+ break;
+ }
+ }
+ } else {
+ range->MarkKeyDone(iter);
+ *status = Status::NotFound(); // Use an empty error message for speed
+ }
+ }
+
+ for (auto iter = range->begin(); iter != range->end(); ++iter) {
+ range->MarkKeyDone(iter);
+ *(iter->s) = s;
+ }
+}
+
+#ifdef USE_COROUTINES
+Status Version::ProcessBatch(
+ const ReadOptions& read_options, FilePickerMultiGet* batch,
+ std::vector<folly::coro::Task<Status>>& mget_tasks,
+ std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
+ autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
+ std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
+ std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
+ mget_stats) {
+ FilePickerMultiGet& fp = *batch;
+ MultiGetRange range = fp.GetRange();
+ // Initialize a new empty range. Any keys that are not in this level will
+ // eventually become part of the new range.
+ MultiGetRange leftover(range, range.begin(), range.begin());
+ FdWithKeyRange* f = nullptr;
+ Status s;
+
+ f = fp.GetNextFileInLevel();
+ while (!f) {
+ fp.PrepareNextLevelForSearch();
+ if (!fp.IsSearchEnded()) {
+ f = fp.GetNextFileInLevel();
+ } else {
+ break;
+ }
+ }
+ while (f) {
+ MultiGetRange file_range = fp.CurrentFileRange();
+ Cache::Handle* table_handle = nullptr;
+ bool skip_filters = IsFilterSkipped(static_cast<int>(fp.GetHitFileLevel()),
+ fp.IsHitFileLastInLevel());
+ bool skip_range_deletions = false;
+ if (!skip_filters) {
+ Status status = table_cache_->MultiGetFilter(
+ read_options, *internal_comparator(), *f->file_metadata,
+ mutable_cf_options_.prefix_extractor,
+ cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()),
+ fp.GetHitFileLevel(), &file_range, &table_handle);
+ if (status.ok()) {
+ skip_filters = true;
+ skip_range_deletions = true;
+ } else if (!status.IsNotSupported()) {
+ s = status;
+ }
+ }
+ if (!s.ok()) {
+ break;
+ }
+ // At this point, file_range contains any keys that are likely in this
+ // file. It may have false positives, but that's ok since higher level
+ // lookups for the key are dependent on this lookup anyway.
+ // Add the complement of file_range to leftover. That's the set of keys
+ // definitely not in this level.
+ // Subtract the complement of file_range from range, since they will be
+ // processed in a separate batch in parallel.
+ leftover += ~file_range;
+ range -= ~file_range;
+ if (!file_range.empty()) {
+ int level = fp.GetHitFileLevel();
+ auto stat = mget_stats.find(level);
+ if (stat == mget_stats.end()) {
+ auto entry = mget_stats.insert({level, {0, 0, 0}});
+ assert(entry.second);
+ stat = entry.first;
+ }
+
+ if (waiting.empty() && to_process.empty() &&
+ !fp.RemainingOverlapInLevel() && leftover.empty() &&
+ mget_tasks.empty()) {
+ // All keys are in one SST file, so take the fast path
+ s = MultiGetFromSST(read_options, file_range, fp.GetHitFileLevel(),
+ skip_filters, skip_range_deletions, f, *blob_ctxs,
+ table_handle, std::get<0>(stat->second),
+ std::get<1>(stat->second),
+ std::get<2>(stat->second));
+ } else {
+ mget_tasks.emplace_back(MultiGetFromSSTCoroutine(
+ read_options, file_range, fp.GetHitFileLevel(), skip_filters,
+ skip_range_deletions, f, *blob_ctxs, table_handle,
+ std::get<0>(stat->second), std::get<1>(stat->second),
+ std::get<2>(stat->second)));
+ ++num_tasks_queued;
+ }
+ }
+ if (fp.KeyMaySpanNextFile() && !file_range.empty()) {
+ break;
+ }
+ f = fp.GetNextFileInLevel();
+ }
+ // Split the current batch only if some keys are likely in this level and
+  // some are not. Only split if we're done with this level, i.e., f is null.
+ // Otherwise, it means there are more files in this level to look at.
+ if (s.ok() && !f && !leftover.empty() && !range.empty()) {
+ fp.ReplaceRange(range);
+ batches.emplace_back(&leftover, fp);
+ to_process.emplace_back(batches.size() - 1);
+ }
+ // 1. If f is non-null, that means we might not be done with this level.
+  // This can happen if one of the keys is the last key in the file, i.e.,
+ // fp.KeyMaySpanNextFile() is true.
+ // 2. If range is empty, then we're done with this range and no need to
+ // prepare the next level
+ // 3. If some tasks were queued for this range, then the next level will be
+ // prepared after executing those tasks
+ if (!f && !range.empty() && !num_tasks_queued) {
+ fp.PrepareNextLevelForSearch();
+ }
+ return s;
+}
+
+Status Version::MultiGetAsync(
+ const ReadOptions& options, MultiGetRange* range,
+ std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs) {
+ autovector<FilePickerMultiGet, 4> batches;
+ std::deque<size_t> waiting;
+ std::deque<size_t> to_process;
+ Status s;
+ std::vector<folly::coro::Task<Status>> mget_tasks;
+ std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>> mget_stats;
+
+ // Create the initial batch with the input range
+ batches.emplace_back(range, &storage_info_.level_files_brief_,
+ storage_info_.num_non_empty_levels_,
+ &storage_info_.file_indexer_, user_comparator(),
+ internal_comparator());
+ to_process.emplace_back(0);
+
+ while (!to_process.empty()) {
+ // As we process a batch, it may get split into two. So reserve space for
+ // an additional batch in the autovector in order to prevent later moves
+ // of elements in ProcessBatch().
+ batches.reserve(batches.size() + 1);
+
+ size_t idx = to_process.front();
+ FilePickerMultiGet* batch = &batches.at(idx);
+ unsigned int num_tasks_queued = 0;
+ to_process.pop_front();
+ if (batch->IsSearchEnded() || batch->GetRange().empty()) {
+      // If to_process is empty, i.e., no more batches to look at, then we need
+      // to schedule the enqueued coroutines and wait for them. Otherwise, we
+ // skip this batch and move to the next one in to_process.
+ if (!to_process.empty()) {
+ continue;
+ }
+ } else {
+ // Look through one level. This may split the batch and enqueue it to
+ // to_process
+ s = ProcessBatch(options, batch, mget_tasks, blob_ctxs, batches, waiting,
+ to_process, num_tasks_queued, mget_stats);
+ // If ProcessBatch didn't enqueue any coroutine tasks, it means all
+ // keys were filtered out. So put the batch back in to_process to
+      // look up in the next level
+ if (!num_tasks_queued && !batch->IsSearchEnded()) {
+ // Put this back in the processing queue
+ to_process.emplace_back(idx);
+ } else if (num_tasks_queued) {
+ waiting.emplace_back(idx);
+ }
+ }
+ // If ProcessBatch() returned an error, then schedule the enqueued
+ // coroutines and wait for them, then abort the MultiGet.
+ if (to_process.empty() || !s.ok()) {
+ if (mget_tasks.size() > 0) {
+ assert(waiting.size());
+ RecordTick(db_statistics_, MULTIGET_COROUTINE_COUNT, mget_tasks.size());
+ // Collect all results so far
+ std::vector<Status> statuses = folly::coro::blockingWait(
+ folly::coro::collectAllRange(std::move(mget_tasks))
+ .scheduleOn(&range->context()->executor()));
+ mget_tasks.clear();
+ if (s.ok()) {
+ for (Status stat : statuses) {
+ if (!stat.ok()) {
+ s = std::move(stat);
+ break;
+ }
+ }
+ }
+
+ if (!s.ok()) {
+ break;
+ }
+
+ for (size_t wait_idx : waiting) {
+ FilePickerMultiGet& fp = batches.at(wait_idx);
+ // 1. If fp.GetHitFile() is non-null, then there could be more
+ // overlap in this level. So skip preparing next level.
+ // 2. If fp.GetRange() is empty, then this batch is completed
+ // and no need to prepare the next level.
+ if (!fp.GetHitFile() && !fp.GetRange().empty()) {
+ fp.PrepareNextLevelForSearch();
+ }
+ }
+ to_process.swap(waiting);
+ } else {
+ assert(!s.ok() || waiting.size() == 0);
+ }
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+
+ uint64_t num_levels = 0;
+ for (auto& stat : mget_stats) {
+ if (stat.first == 0) {
+ num_levels += std::get<2>(stat.second);
+ } else {
+ num_levels++;
+ }
+
+ uint64_t num_meta_reads =
+ std::get<0>(stat.second) + std::get<1>(stat.second);
+ uint64_t num_sst_reads = std::get<2>(stat.second);
+ if (num_meta_reads > 0) {
+ RecordInHistogram(db_statistics_,
+ NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL,
+ num_meta_reads);
+ }
+ if (num_sst_reads > 0) {
+ RecordInHistogram(db_statistics_, NUM_SST_READ_PER_LEVEL, num_sst_reads);
+ }
+ }
+ if (num_levels > 0) {
+ RecordInHistogram(db_statistics_, NUM_LEVEL_READ_PER_MULTIGET, num_levels);
+ }
+
+ return s;
+}
+#endif  // USE_COROUTINES
+
+bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) {
+ // Reaching the bottom level implies misses at all upper levels, so we'll
+ // skip checking the filters when we predict a hit.
+ return cfd_->ioptions()->optimize_filters_for_hits &&
+ (level > 0 || is_file_last_in_level) &&
+ level == storage_info_.num_non_empty_levels() - 1;
+}
+
+void VersionStorageInfo::GenerateLevelFilesBrief() {
+ level_files_brief_.resize(num_non_empty_levels_);
+ for (int level = 0; level < num_non_empty_levels_; level++) {
+ DoGenerateLevelFilesBrief(&level_files_brief_[level], files_[level],
+ &arena_);
+ }
+}
+
+void VersionStorageInfo::PrepareForVersionAppend(
+ const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options) {
+ ComputeCompensatedSizes();
+ UpdateNumNonEmptyLevels();
+ CalculateBaseBytes(immutable_options, mutable_cf_options);
+ UpdateFilesByCompactionPri(immutable_options, mutable_cf_options);
+ GenerateFileIndexer();
+ GenerateLevelFilesBrief();
+ GenerateLevel0NonOverlapping();
+ if (!immutable_options.allow_ingest_behind) {
+ GenerateBottommostFiles();
+ }
+ GenerateFileLocationIndex();
+}
+
+void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options,
+ bool update_stats) {
+ TEST_SYNC_POINT_CALLBACK(
+ "Version::PrepareAppend:forced_check",
+ reinterpret_cast<void*>(&storage_info_.force_consistency_checks_));
+
+ if (update_stats) {
+ UpdateAccumulatedStats();
+ }
+
+ storage_info_.PrepareForVersionAppend(*cfd_->ioptions(), mutable_cf_options);
+}
+
+bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
+ if (file_meta->init_stats_from_file || file_meta->compensated_file_size > 0) {
+ return false;
+ }
+ std::shared_ptr<const TableProperties> tp;
+ Status s = GetTableProperties(&tp, file_meta);
+ file_meta->init_stats_from_file = true;
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(vset_->db_options_->info_log,
+ "Unable to load table properties for file %" PRIu64
+ " --- %s\n",
+ file_meta->fd.GetNumber(), s.ToString().c_str());
+ return false;
+ }
+ if (tp.get() == nullptr) return false;
+ file_meta->num_entries = tp->num_entries;
+ file_meta->num_deletions = tp->num_deletions;
+ file_meta->raw_value_size = tp->raw_value_size;
+ file_meta->raw_key_size = tp->raw_key_size;
+
+ return true;
+}
+
+void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) {
+ TEST_SYNC_POINT_CALLBACK("VersionStorageInfo::UpdateAccumulatedStats",
+ nullptr);
+
+ assert(file_meta->init_stats_from_file);
+ accumulated_file_size_ += file_meta->fd.GetFileSize();
+ accumulated_raw_key_size_ += file_meta->raw_key_size;
+ accumulated_raw_value_size_ += file_meta->raw_value_size;
+ accumulated_num_non_deletions_ +=
+ file_meta->num_entries - file_meta->num_deletions;
+ accumulated_num_deletions_ += file_meta->num_deletions;
+
+ current_num_non_deletions_ +=
+ file_meta->num_entries - file_meta->num_deletions;
+ current_num_deletions_ += file_meta->num_deletions;
+ current_num_samples_++;
+}
+
+void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) {
+ if (file_meta->init_stats_from_file) {
+ current_num_non_deletions_ -=
+ file_meta->num_entries - file_meta->num_deletions;
+ current_num_deletions_ -= file_meta->num_deletions;
+ current_num_samples_--;
+ }
+}
+
+void Version::UpdateAccumulatedStats() {
+ // maximum number of table properties loaded from files.
+ const int kMaxInitCount = 20;
+ int init_count = 0;
+  // Here, only the first kMaxInitCount files that haven't yet been
+  // initialized from their table properties will have num_deletions filled
+  // in. The motivation is to cap the maximum I/O per Version creation.
+  // The reason for choosing files from lower levels instead of higher ones
+  // is that this design propagates the initialization upward: once the
+  // num_deletions of lower-level files are updated, those files have
+  // accurate compensated_file_size values, which triggers lower-to-higher
+  // level compactions that in turn create higher-level files whose
+  // num_deletions will be updated here.
+ for (int level = 0;
+ level < storage_info_.num_levels_ && init_count < kMaxInitCount;
+ ++level) {
+ for (auto* file_meta : storage_info_.files_[level]) {
+ if (MaybeInitializeFileMetaData(file_meta)) {
+ // each FileMeta will be initialized only once.
+ storage_info_.UpdateAccumulatedStats(file_meta);
+ // when option "max_open_files" is -1, all the file metadata has
+ // already been read, so MaybeInitializeFileMetaData() won't incur
+ // any I/O cost. "max_open_files=-1" means that the table cache passed
+ // to the VersionSet and then to the ColumnFamilySet has a size of
+ // TableCache::kInfiniteCapacity
+ if (vset_->GetColumnFamilySet()->get_table_cache()->GetCapacity() ==
+ TableCache::kInfiniteCapacity) {
+ continue;
+ }
+ if (++init_count >= kMaxInitCount) {
+ break;
+ }
+ }
+ }
+ }
+  // If all the sampled files contain only deletion entries, we load the
+  // table properties of files from higher levels to initialize that value.
+ for (int level = storage_info_.num_levels_ - 1;
+ storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) {
+ for (int i = static_cast<int>(storage_info_.files_[level].size()) - 1;
+ storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) {
+ if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) {
+ storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]);
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeCompensatedSizes() {
+ static const int kDeletionWeightOnCompaction = 2;
+ uint64_t average_value_size = GetAverageValueSize();
+
+ // compute the compensated size
+ for (int level = 0; level < num_levels_; level++) {
+ for (auto* file_meta : files_[level]) {
+      // Here we only compute compensated_file_size for those file_meta
+      // whose compensated_file_size is uninitialized (== 0). This is only the
+      // case for files that have just been created and that no other thread
+      // has access to yet. That's why we can safely mutate
+      // compensated_file_size.
+ if (file_meta->compensated_file_size == 0) {
+ file_meta->compensated_file_size = file_meta->fd.GetFileSize();
+        // We only boost the size of a file's deletion entries when the number
+        // of deletion entries is greater than the number of non-deletion
+        // entries in the file. The motivation is that in a stable workload,
+        // the number of deletion entries should be roughly equal to the number
+        // of non-deletion entries. If we compensated the size of deletion
+        // entries in a stable workload, the deletion compensation logic might
+        // introduce unwanted effects that change the shape of the LSM tree.
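+        //
+        // As a purely hypothetical example of the boost below: a file with
+        // 100 entries, 80 of which are deletions, gets an extra
+        // (80 * 2 - 100) * average_value_size * kDeletionWeightOnCompaction =
+        // 120 * average_value_size bytes added to its compensated size.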
+ if (file_meta->num_deletions * 2 >= file_meta->num_entries) {
+ file_meta->compensated_file_size +=
+ (file_meta->num_deletions * 2 - file_meta->num_entries) *
+ average_value_size * kDeletionWeightOnCompaction;
+ }
+ }
+ }
+ }
+}
+
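+// The last level has no level beneath it to compact into, so for level-based
+// compaction the deepest level considered as a compaction input here is
+// num_levels() - 2.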
+int VersionStorageInfo::MaxInputLevel() const {
+ if (compaction_style_ == kCompactionStyleLevel) {
+ return num_levels() - 2;
+ }
+ return 0;
+}
+
+int VersionStorageInfo::MaxOutputLevel(bool allow_ingest_behind) const {
+ if (allow_ingest_behind) {
+ assert(num_levels() > 1);
+ return num_levels() - 2;
+ }
+ return num_levels() - 1;
+}
+
+void VersionStorageInfo::EstimateCompactionBytesNeeded(
+ const MutableCFOptions& mutable_cf_options) {
+ // Only implemented for level-based compaction
+ if (compaction_style_ != kCompactionStyleLevel) {
+ estimated_compaction_needed_bytes_ = 0;
+ return;
+ }
+
+  // Start from level 0: if level 0 qualifies for compaction to level 1, we
+  // estimate the size of that compaction. Then we move on to the next level
+  // and see whether it qualifies for compaction to the next level. The size
+  // of a level is estimated as the actual size on the level plus the input
+  // bytes from the previous level, if there are any. If it exceeds its
+  // target, take the excess bytes as compaction input and add the size of
+  // that compaction to the total. We keep doing this for level 2, 3, etc.,
+  // until the last level, and return the accumulated bytes.
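+  //
+  // A hypothetical illustration (all numbers invented): with a 100MB target
+  // for L1, 150MB actually in L1 and 40MB arriving from an L0 compaction, L1
+  // is estimated at 190MB, i.e. 90MB over target. Those 90MB become
+  // compaction input, and the estimate is charged 90MB multiplied by
+  // (L2 size / 190MB + 1), i.e. the input bytes plus their estimated fan-out
+  // into L2.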
+
+ uint64_t bytes_compact_to_next_level = 0;
+ uint64_t level_size = 0;
+ for (auto* f : files_[0]) {
+ level_size += f->fd.GetFileSize();
+ }
+ // Level 0
+ bool level0_compact_triggered = false;
+ if (static_cast<int>(files_[0].size()) >=
+ mutable_cf_options.level0_file_num_compaction_trigger ||
+ level_size >= mutable_cf_options.max_bytes_for_level_base) {
+ level0_compact_triggered = true;
+ estimated_compaction_needed_bytes_ = level_size;
+ bytes_compact_to_next_level = level_size;
+ } else {
+ estimated_compaction_needed_bytes_ = 0;
+ }
+
+ // Level 1 and up.
+ uint64_t bytes_next_level = 0;
+ for (int level = base_level(); level <= MaxInputLevel(); level++) {
+ level_size = 0;
+ if (bytes_next_level > 0) {
+#ifndef NDEBUG
+ uint64_t level_size2 = 0;
+ for (auto* f : files_[level]) {
+ level_size2 += f->fd.GetFileSize();
+ }
+ assert(level_size2 == bytes_next_level);
+#endif
+ level_size = bytes_next_level;
+ bytes_next_level = 0;
+ } else {
+ for (auto* f : files_[level]) {
+ level_size += f->fd.GetFileSize();
+ }
+ }
+ if (level == base_level() && level0_compact_triggered) {
+ // Add base level size to compaction if level0 compaction triggered.
+ estimated_compaction_needed_bytes_ += level_size;
+ }
+ // Add size added by previous compaction
+ level_size += bytes_compact_to_next_level;
+ bytes_compact_to_next_level = 0;
+ uint64_t level_target = MaxBytesForLevel(level);
+ if (level_size > level_target) {
+ bytes_compact_to_next_level = level_size - level_target;
+      // Estimate the actual compaction fan-out ratio as the size ratio
+      // between the two levels.
+
+ assert(bytes_next_level == 0);
+ if (level + 1 < num_levels_) {
+ for (auto* f : files_[level + 1]) {
+ bytes_next_level += f->fd.GetFileSize();
+ }
+ }
+ if (bytes_next_level > 0) {
+ assert(level_size > 0);
+ estimated_compaction_needed_bytes_ += static_cast<uint64_t>(
+ static_cast<double>(bytes_compact_to_next_level) *
+ (static_cast<double>(bytes_next_level) /
+ static_cast<double>(level_size) +
+ 1));
+ }
+ }
+ }
+}
+
+namespace {
+uint32_t GetExpiredTtlFilesCount(const ImmutableOptions& ioptions,
+ const MutableCFOptions& mutable_cf_options,
+ const std::vector<FileMetaData*>& files) {
+ uint32_t ttl_expired_files_count = 0;
+
+ int64_t _current_time;
+ auto status = ioptions.clock->GetCurrentTime(&_current_time);
+ if (status.ok()) {
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+ for (FileMetaData* f : files) {
+ if (!f->being_compacted) {
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time != 0 &&
+ oldest_ancester_time < (current_time - mutable_cf_options.ttl)) {
+ ttl_expired_files_count++;
+ }
+ }
+ }
+ }
+ return ttl_expired_files_count;
+}
+} // anonymous namespace
+
+void VersionStorageInfo::ComputeCompactionScore(
+ const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options) {
+ double total_downcompact_bytes = 0.0;
+ // Historically, score is defined as actual bytes in a level divided by
+ // the level's target size, and 1.0 is the threshold for triggering
+ // compaction. Higher score means higher prioritization.
+  // Now we keep the compaction triggering condition, but consider more
+  // factors for prioritization, while still keeping the 1.0 threshold.
+  // In order to provide flexibility for reducing the score while still
+  // keeping it over 1.0, we scale the original score by 10x if it is
+  // larger than 1.0.
+ const double kScoreScale = 10.0;
+ for (int level = 0; level <= MaxInputLevel(); level++) {
+ double score;
+ if (level == 0) {
+ // We treat level-0 specially by bounding the number of files
+ // instead of number of bytes for two reasons:
+ //
+ // (1) With larger write-buffer sizes, it is nice not to do too
+ // many level-0 compactions.
+ //
+ // (2) The files in level-0 are merged on every read and
+ // therefore we wish to avoid too many files when the individual
+ // file size is small (perhaps because of a small write-buffer
+ // setting, or very high compression ratios, or lots of
+ // overwrites/deletions).
+ int num_sorted_runs = 0;
+ uint64_t total_size = 0;
+ for (auto* f : files_[level]) {
+ total_downcompact_bytes += static_cast<double>(f->fd.GetFileSize());
+ if (!f->being_compacted) {
+ total_size += f->compensated_file_size;
+ num_sorted_runs++;
+ }
+ }
+ if (compaction_style_ == kCompactionStyleUniversal) {
+ // For universal compaction, we use level0 score to indicate
+ // compaction score for the whole DB. Adding other levels as if
+ // they are L0 files.
+ for (int i = 1; i < num_levels(); i++) {
+          // It's possible that a subset of the files in a level may be in a
+          // compaction, due to delete-triggered compaction or trivial move.
+ // In that case, the below check may not catch a level being
+ // compacted as it only checks the first file. The worst that can
+ // happen is a scheduled compaction thread will find nothing to do.
+ if (!files_[i].empty() && !files_[i][0]->being_compacted) {
+ num_sorted_runs++;
+ }
+ }
+ }
+
+ if (compaction_style_ == kCompactionStyleFIFO) {
+ score = static_cast<double>(total_size) /
+ mutable_cf_options.compaction_options_fifo.max_table_files_size;
+ if (mutable_cf_options.compaction_options_fifo.allow_compaction ||
+ mutable_cf_options.compaction_options_fifo.age_for_warm > 0) {
+          // Warm tier move can happen at any time. It's too expensive to
+          // check every file's timestamp now. For now, just trigger it
+ // slightly more frequently than FIFO compaction so that this
+ // happens first.
+ score = std::max(
+ static_cast<double>(num_sorted_runs) /
+ mutable_cf_options.level0_file_num_compaction_trigger,
+ score);
+ }
+ if (mutable_cf_options.ttl > 0) {
+ score = std::max(
+ static_cast<double>(GetExpiredTtlFilesCount(
+ immutable_options, mutable_cf_options, files_[level])),
+ score);
+ }
+ } else {
+ score = static_cast<double>(num_sorted_runs) /
+ mutable_cf_options.level0_file_num_compaction_trigger;
+ if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+          // Level-based compaction involves L0->L0 compactions that can lead
+          // to oversized L0 files. Take size into account as well to avoid
+          // later giant compactions to the base level.
+          // If the L0 score is always too high, L0->L1 will always be
+          // prioritized over L1->L2 compaction and L1 will grow too large.
+          // But if the L0 score isn't high enough, L0 will accumulate and
+          // data won't be moved to L1 fast enough. With potential L0->L0
+          // compactions, the number of L0 files isn't always an indication of
+          // L0 oversizing, so we also need to consider the total size of L0.
+ if (immutable_options.level_compaction_dynamic_level_bytes) {
+ if (total_size >= mutable_cf_options.max_bytes_for_level_base) {
+ // When calculating estimated_compaction_needed_bytes, we assume
+ // L0 is qualified as pending compactions. We will need to make
+ // sure that it qualifies for compaction.
+              // It might be guaranteed by the logic below anyway, but we are
+ // explicit here to make sure we don't stop writes with no
+ // compaction scheduled.
+ score = std::max(score, 1.01);
+ }
+ if (total_size > level_max_bytes_[base_level_]) {
+ // In this case, we compare L0 size with actual L1 size and make
+              // sure the score is more than 1.0 (10.0 after scaling) if L0 is
+              // larger
+ // than L1. Since in this case L1 score is lower than 10.0, L0->L1
+ // is prioritized over L1->L2.
+ uint64_t base_level_size = 0;
+ for (auto f : files_[base_level_]) {
+ base_level_size += f->compensated_file_size;
+ }
+ score = std::max(score, static_cast<double>(total_size) /
+ static_cast<double>(std::max(
+ base_level_size,
+ level_max_bytes_[base_level_])));
+ }
+ if (score > 1.0) {
+ score *= kScoreScale;
+ }
+ } else {
+ score = std::max(score,
+ static_cast<double>(total_size) /
+ mutable_cf_options.max_bytes_for_level_base);
+ }
+ }
+ }
+ } else {
+ // Compute the ratio of current size to size limit.
+ uint64_t level_bytes_no_compacting = 0;
+ uint64_t level_total_bytes = 0;
+ for (auto f : files_[level]) {
+ level_total_bytes += f->fd.GetFileSize();
+ if (!f->being_compacted) {
+ level_bytes_no_compacting += f->compensated_file_size;
+ }
+ }
+ if (!immutable_options.level_compaction_dynamic_level_bytes ||
+ level_bytes_no_compacting < MaxBytesForLevel(level)) {
+ score = static_cast<double>(level_bytes_no_compacting) /
+ MaxBytesForLevel(level);
+ } else {
+        // If a large amount of data will soon be compacted down to the
+        // current level, we de-prioritize compaction from a level where the
+        // incoming data would make up a large ratio. We do this by dividing
+        // the level size not by the target level size alone, but by the sum
+        // of the target size and the incoming compaction bytes.
+ score = static_cast<double>(level_bytes_no_compacting) /
+ (MaxBytesForLevel(level) + total_downcompact_bytes) *
+ kScoreScale;
+ }
+ if (level_total_bytes > MaxBytesForLevel(level)) {
+ total_downcompact_bytes +=
+ static_cast<double>(level_total_bytes - MaxBytesForLevel(level));
+ }
+ }
+ compaction_level_[level] = level;
+ compaction_score_[level] = score;
+ }
+
+  // Sort all the levels based on their score. Higher scores get listed
+  // first. Use bubble sort because the number of entries is small.
+ for (int i = 0; i < num_levels() - 2; i++) {
+ for (int j = i + 1; j < num_levels() - 1; j++) {
+ if (compaction_score_[i] < compaction_score_[j]) {
+ double score = compaction_score_[i];
+ int level = compaction_level_[i];
+ compaction_score_[i] = compaction_score_[j];
+ compaction_level_[i] = compaction_level_[j];
+ compaction_score_[j] = score;
+ compaction_level_[j] = level;
+ }
+ }
+ }
+ ComputeFilesMarkedForCompaction();
+ if (!immutable_options.allow_ingest_behind) {
+ ComputeBottommostFilesMarkedForCompaction();
+ }
+ if (mutable_cf_options.ttl > 0) {
+ ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
+ }
+ if (mutable_cf_options.periodic_compaction_seconds > 0) {
+ ComputeFilesMarkedForPeriodicCompaction(
+ immutable_options, mutable_cf_options.periodic_compaction_seconds);
+ }
+
+ if (mutable_cf_options.enable_blob_garbage_collection &&
+ mutable_cf_options.blob_garbage_collection_age_cutoff > 0.0 &&
+ mutable_cf_options.blob_garbage_collection_force_threshold < 1.0) {
+ ComputeFilesMarkedForForcedBlobGC(
+ mutable_cf_options.blob_garbage_collection_age_cutoff,
+ mutable_cf_options.blob_garbage_collection_force_threshold);
+ }
+
+ EstimateCompactionBytesNeeded(mutable_cf_options);
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForCompaction() {
+ files_marked_for_compaction_.clear();
+ int last_qualify_level = 0;
+
+  // Do not include files from the last level that contains data.
+  // Even if the table properties collector suggests a file on that level,
+  // we should not move it to a new level.
+ for (int level = num_levels() - 1; level >= 1; level--) {
+ if (!files_[level].empty()) {
+ last_qualify_level = level - 1;
+ break;
+ }
+ }
+
+ for (int level = 0; level <= last_qualify_level; level++) {
+ for (auto* f : files_[level]) {
+ if (!f->being_compacted && f->marked_for_compaction) {
+ files_marked_for_compaction_.emplace_back(level, f);
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeExpiredTtlFiles(
+ const ImmutableOptions& ioptions, const uint64_t ttl) {
+ assert(ttl > 0);
+
+ expired_ttl_files_.clear();
+
+ int64_t _current_time;
+ auto status = ioptions.clock->GetCurrentTime(&_current_time);
+ if (!status.ok()) {
+ return;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(_current_time);
+
+ for (int level = 0; level < num_levels() - 1; level++) {
+ for (FileMetaData* f : files_[level]) {
+ if (!f->being_compacted) {
+ uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
+ if (oldest_ancester_time > 0 &&
+ oldest_ancester_time < (current_time - ttl)) {
+ expired_ttl_files_.emplace_back(level, f);
+ }
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForPeriodicCompaction(
+ const ImmutableOptions& ioptions,
+ const uint64_t periodic_compaction_seconds) {
+ assert(periodic_compaction_seconds > 0);
+
+ files_marked_for_periodic_compaction_.clear();
+
+ int64_t temp_current_time;
+ auto status = ioptions.clock->GetCurrentTime(&temp_current_time);
+ if (!status.ok()) {
+ return;
+ }
+ const uint64_t current_time = static_cast<uint64_t>(temp_current_time);
+
+ // If periodic_compaction_seconds is larger than current time, periodic
+ // compaction can't possibly be triggered.
+ if (periodic_compaction_seconds > current_time) {
+ return;
+ }
+
+ const uint64_t allowed_time_limit =
+ current_time - periodic_compaction_seconds;
+
+ for (int level = 0; level < num_levels(); level++) {
+ for (auto f : files_[level]) {
+ if (!f->being_compacted) {
+ // Compute a file's modification time in the following order:
+ // 1. Use file_creation_time table property if it is > 0.
+ // 2. Use creation_time table property if it is > 0.
+        // 3. Use the file's mtime metadata if the above two table properties
+        //    are 0.
+ // Don't consider the file at all if the modification time cannot be
+ // correctly determined based on the above conditions.
+ uint64_t file_modification_time = f->TryGetFileCreationTime();
+ if (file_modification_time == kUnknownFileCreationTime) {
+ file_modification_time = f->TryGetOldestAncesterTime();
+ }
+ if (file_modification_time == kUnknownOldestAncesterTime) {
+ auto file_path = TableFileName(ioptions.cf_paths, f->fd.GetNumber(),
+ f->fd.GetPathId());
+ status = ioptions.env->GetFileModificationTime(
+ file_path, &file_modification_time);
+ if (!status.ok()) {
+ ROCKS_LOG_WARN(ioptions.logger,
+ "Can't get file modification time: %s: %s",
+ file_path.c_str(), status.ToString().c_str());
+ continue;
+ }
+ }
+ if (file_modification_time > 0 &&
+ file_modification_time < allowed_time_limit) {
+ files_marked_for_periodic_compaction_.emplace_back(level, f);
+ }
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::ComputeFilesMarkedForForcedBlobGC(
+ double blob_garbage_collection_age_cutoff,
+ double blob_garbage_collection_force_threshold) {
+ files_marked_for_forced_blob_gc_.clear();
+
+ if (blob_files_.empty()) {
+ return;
+ }
+
+ // Number of blob files eligible for GC based on age
+ const size_t cutoff_count = static_cast<size_t>(
+ blob_garbage_collection_age_cutoff * blob_files_.size());
+ if (!cutoff_count) {
+ return;
+ }
+
+ // Compute the sum of total and garbage bytes over the oldest batch of blob
+ // files. The oldest batch is defined as the set of blob files which are
+ // kept alive by the same SSTs as the very oldest one. Here is a toy example.
+ // Let's assume we have three SSTs 1, 2, and 3, and four blob files 10, 11,
+ // 12, and 13. Also, let's say SSTs 1 and 2 both rely on blob file 10 and
+ // potentially some higher-numbered ones, while SST 3 relies on blob file 12
+ // and potentially some higher-numbered ones. Then, the SST to oldest blob
+ // file mapping is as follows:
+ //
+ // SST file number Oldest blob file number
+ // 1 10
+ // 2 10
+ // 3 12
+ //
+ // This is what the same thing looks like from the blob files' POV. (Note that
+ // the linked SSTs simply denote the inverse mapping of the above.)
+ //
+ // Blob file number Linked SST set
+ // 10 {1, 2}
+ // 11 {}
+ // 12 {3}
+ // 13 {}
+ //
+ // Then, the oldest batch of blob files consists of blob files 10 and 11,
+ // and we can get rid of them by forcing the compaction of SSTs 1 and 2.
+ //
+ // Note that the overall ratio of garbage computed for the batch has to exceed
+ // blob_garbage_collection_force_threshold and the entire batch has to be
+ // eligible for GC according to blob_garbage_collection_age_cutoff in order
+ // for us to schedule any compactions.
+ const auto& oldest_meta = blob_files_.front();
+ assert(oldest_meta);
+
+ const auto& linked_ssts = oldest_meta->GetLinkedSsts();
+ assert(!linked_ssts.empty());
+
+ size_t count = 1;
+ uint64_t sum_total_blob_bytes = oldest_meta->GetTotalBlobBytes();
+ uint64_t sum_garbage_blob_bytes = oldest_meta->GetGarbageBlobBytes();
+
+ assert(cutoff_count <= blob_files_.size());
+
+ for (; count < cutoff_count; ++count) {
+ const auto& meta = blob_files_[count];
+ assert(meta);
+
+ if (!meta->GetLinkedSsts().empty()) {
+ // Found the beginning of the next batch of blob files
+ break;
+ }
+
+ sum_total_blob_bytes += meta->GetTotalBlobBytes();
+ sum_garbage_blob_bytes += meta->GetGarbageBlobBytes();
+ }
+
+ if (count < blob_files_.size()) {
+ const auto& meta = blob_files_[count];
+ assert(meta);
+
+ if (meta->GetLinkedSsts().empty()) {
+ // Some files in the oldest batch are not eligible for GC
+ return;
+ }
+ }
+
+ if (sum_garbage_blob_bytes <
+ blob_garbage_collection_force_threshold * sum_total_blob_bytes) {
+ return;
+ }
+
+ for (uint64_t sst_file_number : linked_ssts) {
+ const FileLocation location = GetFileLocation(sst_file_number);
+ assert(location.IsValid());
+
+ const int level = location.GetLevel();
+ assert(level >= 0);
+
+ const size_t pos = location.GetPosition();
+
+ FileMetaData* const sst_meta = files_[level][pos];
+ assert(sst_meta);
+
+ if (sst_meta->being_compacted) {
+ continue;
+ }
+
+ files_marked_for_forced_blob_gc_.emplace_back(level, sst_meta);
+ }
+}
+
+namespace {
+
+// used to sort files by size
+struct Fsize {
+ size_t index;
+ FileMetaData* file;
+};
+
+// Comparator that is used to sort files based on their size
+// In normal mode: descending size
+bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
+ return (first.file->compensated_file_size >
+ second.file->compensated_file_size);
+}
+} // anonymous namespace
+
+void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
+ auto& level_files = files_[level];
+ level_files.push_back(f);
+
+ f->refs++;
+}
+
+void VersionStorageInfo::AddBlobFile(
+ std::shared_ptr<BlobFileMetaData> blob_file_meta) {
+ assert(blob_file_meta);
+
+ assert(blob_files_.empty() ||
+ (blob_files_.back() && blob_files_.back()->GetBlobFileNumber() <
+ blob_file_meta->GetBlobFileNumber()));
+
+ blob_files_.emplace_back(std::move(blob_file_meta));
+}
+
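+// AddBlobFile above only ever appends blob files in strictly increasing file
+// number order, so blob_files_ stays sorted and the binary search below is
+// well defined.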
+VersionStorageInfo::BlobFiles::const_iterator
+VersionStorageInfo::GetBlobFileMetaDataLB(uint64_t blob_file_number) const {
+ return std::lower_bound(
+ blob_files_.begin(), blob_files_.end(), blob_file_number,
+ [](const std::shared_ptr<BlobFileMetaData>& lhs, uint64_t rhs) {
+ assert(lhs);
+ return lhs->GetBlobFileNumber() < rhs;
+ });
+}
+
+void VersionStorageInfo::SetFinalized() {
+ finalized_ = true;
+
+#ifndef NDEBUG
+ if (compaction_style_ != kCompactionStyleLevel) {
+ // Not level based compaction.
+ return;
+ }
+ assert(base_level_ < 0 || num_levels() == 1 ||
+ (base_level_ >= 1 && base_level_ < num_levels()));
+  // Verify that every level between L0 and base_level (exclusive) is empty
+ for (int level = 1; level < base_level(); level++) {
+ assert(NumLevelBytes(level) == 0);
+ }
+ uint64_t max_bytes_prev_level = 0;
+ for (int level = base_level(); level < num_levels() - 1; level++) {
+ if (LevelFiles(level).size() == 0) {
+ continue;
+ }
+ assert(MaxBytesForLevel(level) >= max_bytes_prev_level);
+ max_bytes_prev_level = MaxBytesForLevel(level);
+ }
+ for (int level = 0; level < num_levels(); level++) {
+ assert(LevelFiles(level).size() == 0 ||
+ LevelFiles(level).size() == LevelFilesBrief(level).num_files);
+ if (LevelFiles(level).size() > 0) {
+ assert(level < num_non_empty_levels());
+ }
+ }
+ assert(compaction_level_.size() > 0);
+ assert(compaction_level_.size() == compaction_score_.size());
+#endif
+}
+
+void VersionStorageInfo::UpdateNumNonEmptyLevels() {
+ num_non_empty_levels_ = num_levels_;
+ for (int i = num_levels_ - 1; i >= 0; i--) {
+ if (files_[i].size() != 0) {
+ return;
+ } else {
+ num_non_empty_levels_ = i;
+ }
+ }
+}
+
+namespace {
+// Sort `temp` based on ratio of overlapping size over file size
+void SortFileByOverlappingRatio(
+ const InternalKeyComparator& icmp, const std::vector<FileMetaData*>& files,
+ const std::vector<FileMetaData*>& next_level_files, SystemClock* clock,
+ int level, int num_non_empty_levels, uint64_t ttl,
+ std::vector<Fsize>* temp) {
+ std::unordered_map<uint64_t, uint64_t> file_to_order;
+ auto next_level_it = next_level_files.begin();
+
+ int64_t curr_time;
+ Status status = clock->GetCurrentTime(&curr_time);
+ if (!status.ok()) {
+ // If we can't get time, disable TTL.
+ ttl = 0;
+ }
+
+ FileTtlBooster ttl_booster(static_cast<uint64_t>(curr_time), ttl,
+ num_non_empty_levels, level);
+
+ for (auto& file : files) {
+ uint64_t overlapping_bytes = 0;
+    // Skip next-level files that fall entirely before the current file
+ while (next_level_it != next_level_files.end() &&
+ icmp.Compare((*next_level_it)->largest, file->smallest) < 0) {
+ next_level_it++;
+ }
+
+ while (next_level_it != next_level_files.end() &&
+ icmp.Compare((*next_level_it)->smallest, file->largest) < 0) {
+ overlapping_bytes += (*next_level_it)->fd.file_size;
+
+ if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) {
+        // The next-level file crosses the largest-key boundary of the
+        // current file.
+ break;
+ }
+ next_level_it++;
+ }
+
+ uint64_t ttl_boost_score = (ttl > 0) ? ttl_booster.GetBoostScore(file) : 1;
+ assert(ttl_boost_score > 0);
+ assert(file->compensated_file_size != 0);
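+    // The ordering key is the number of bytes overlapping the next level,
+    // scaled by 1024 and divided by the file's compensated size and the TTL
+    // boost. Files are partially sorted by this value in ascending order
+    // below, so files with relatively little next-level overlap (cheaper to
+    // compact, or candidates for trivial moves) are picked first.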
+ file_to_order[file->fd.GetNumber()] = overlapping_bytes * 1024U /
+ file->compensated_file_size /
+ ttl_boost_score;
+ }
+
+ size_t num_to_sort = temp->size() > VersionStorageInfo::kNumberFilesToSort
+ ? VersionStorageInfo::kNumberFilesToSort
+ : temp->size();
+
+ std::partial_sort(temp->begin(), temp->begin() + num_to_sort, temp->end(),
+ [&](const Fsize& f1, const Fsize& f2) -> bool {
+ // If score is the same, pick file with smaller keys.
+ // This makes the algorithm more deterministic, and also
+                    // helps the trivial move case have more files to
+ // extend.
+ if (file_to_order[f1.file->fd.GetNumber()] ==
+ file_to_order[f2.file->fd.GetNumber()]) {
+ return icmp.Compare(f1.file->smallest,
+ f2.file->smallest) < 0;
+ }
+ return file_to_order[f1.file->fd.GetNumber()] <
+ file_to_order[f2.file->fd.GetNumber()];
+ });
+}
+
+void SortFileByRoundRobin(const InternalKeyComparator& icmp,
+ std::vector<InternalKey>* compact_cursor,
+ bool level0_non_overlapping, int level,
+ std::vector<Fsize>* temp) {
+ if (level == 0 && !level0_non_overlapping) {
+    // Use kOldestSmallestSeqFirst ordering when level == 0, since the
+ // files may overlap (not fully sorted)
+ std::sort(temp->begin(), temp->end(),
+ [](const Fsize& f1, const Fsize& f2) -> bool {
+ return f1.file->fd.smallest_seqno < f2.file->fd.smallest_seqno;
+ });
+ return;
+ }
+
+ bool should_move_files =
+ compact_cursor->at(level).size() > 0 && temp->size() > 1;
+
+ // The iterator points to the Fsize with smallest key larger than or equal to
+ // the given cursor
+ std::vector<Fsize>::iterator current_file_iter;
+ if (should_move_files) {
+    // Find the file whose smallest key is larger than or equal to
+ // the cursor (the smallest key in the successor file of the last
+ // chosen file), skip this if the cursor is invalid or there is only
+ // one file in this level
+ current_file_iter = std::lower_bound(
+ temp->begin(), temp->end(), compact_cursor->at(level),
+ [&](const Fsize& f, const InternalKey& cursor) -> bool {
+ return icmp.Compare(cursor, f.file->smallest) > 0;
+ });
+
+ should_move_files =
+ current_file_iter != temp->end() && current_file_iter != temp->begin();
+ }
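+  // Illustrative rotation: with files whose smallest keys are [a, c, e, g]
+  // and a cursor of d, current_file_iter points at the file starting at e and
+  // the block below reorders temp to [e, g, a, c], so compaction resumes
+  // where the previous round-robin pick left off.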
+ if (should_move_files) {
+ // Construct a local temporary vector
+ std::vector<Fsize> local_temp;
+ local_temp.reserve(temp->size());
+    // Move the selected file into the first position and its successors
+ // into the second, third, ..., positions
+ for (auto iter = current_file_iter; iter != temp->end(); iter++) {
+ local_temp.push_back(*iter);
+ }
+    // Move the original predecessors of the selected file in a round-robin
+ // manner
+ for (auto iter = temp->begin(); iter != current_file_iter; iter++) {
+ local_temp.push_back(*iter);
+ }
+ // Replace all the items in temp
+ for (size_t i = 0; i < local_temp.size(); i++) {
+ temp->at(i) = local_temp[i];
+ }
+ }
+}
+} // anonymous namespace
+
+void VersionStorageInfo::UpdateFilesByCompactionPri(
+ const ImmutableOptions& ioptions, const MutableCFOptions& options) {
+ if (compaction_style_ == kCompactionStyleNone ||
+ compaction_style_ == kCompactionStyleFIFO ||
+ compaction_style_ == kCompactionStyleUniversal) {
+    // These compaction styles do not use files_by_compaction_pri_.
+ return;
+ }
+ // No need to sort the highest level because it is never compacted.
+ for (int level = 0; level < num_levels() - 1; level++) {
+ const std::vector<FileMetaData*>& files = files_[level];
+ auto& files_by_compaction_pri = files_by_compaction_pri_[level];
+ assert(files_by_compaction_pri.size() == 0);
+
+ // populate a temp vector for sorting based on size
+ std::vector<Fsize> temp(files.size());
+ for (size_t i = 0; i < files.size(); i++) {
+ temp[i].index = i;
+ temp[i].file = files[i];
+ }
+
+    // Sort at least the top kNumberFilesToSort files according to the chosen
+    // compaction priority
+ size_t num = VersionStorageInfo::kNumberFilesToSort;
+ if (num > temp.size()) {
+ num = temp.size();
+ }
+ switch (ioptions.compaction_pri) {
+ case kByCompensatedSize:
+ std::partial_sort(temp.begin(), temp.begin() + num, temp.end(),
+ CompareCompensatedSizeDescending);
+ break;
+ case kOldestLargestSeqFirst:
+ std::sort(temp.begin(), temp.end(),
+ [](const Fsize& f1, const Fsize& f2) -> bool {
+ return f1.file->fd.largest_seqno <
+ f2.file->fd.largest_seqno;
+ });
+ break;
+ case kOldestSmallestSeqFirst:
+ std::sort(temp.begin(), temp.end(),
+ [](const Fsize& f1, const Fsize& f2) -> bool {
+ return f1.file->fd.smallest_seqno <
+ f2.file->fd.smallest_seqno;
+ });
+ break;
+ case kMinOverlappingRatio:
+ SortFileByOverlappingRatio(*internal_comparator_, files_[level],
+ files_[level + 1], ioptions.clock, level,
+ num_non_empty_levels_, options.ttl, &temp);
+ break;
+ case kRoundRobin:
+ SortFileByRoundRobin(*internal_comparator_, &compact_cursor_,
+ level0_non_overlapping_, level, &temp);
+ break;
+ default:
+ assert(false);
+ }
+ assert(temp.size() == files.size());
+
+ // initialize files_by_compaction_pri_
+ for (size_t i = 0; i < temp.size(); i++) {
+ files_by_compaction_pri.push_back(static_cast<int>(temp[i].index));
+ }
+ next_file_to_compact_by_size_[level] = 0;
+ assert(files_[level].size() == files_by_compaction_pri_[level].size());
+ }
+}
+
+void VersionStorageInfo::GenerateLevel0NonOverlapping() {
+ assert(!finalized_);
+ level0_non_overlapping_ = true;
+ if (level_files_brief_.size() == 0) {
+ return;
+ }
+
+ // A copy of L0 files sorted by smallest key
+ std::vector<FdWithKeyRange> level0_sorted_file(
+ level_files_brief_[0].files,
+ level_files_brief_[0].files + level_files_brief_[0].num_files);
+ std::sort(level0_sorted_file.begin(), level0_sorted_file.end(),
+ [this](const FdWithKeyRange& f1, const FdWithKeyRange& f2) -> bool {
+ return (internal_comparator_->Compare(f1.smallest_key,
+ f2.smallest_key) < 0);
+ });
+
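+  // After sorting by smallest key it is enough to check adjacent pairs: two
+  // neighbouring files overlap iff the previous file's largest key reaches
+  // the next file's smallest key, e.g. [a, c] and [b, d] overlap because
+  // c >= b, while [a, c] and [d, f] do not.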
+ for (size_t i = 1; i < level0_sorted_file.size(); ++i) {
+ FdWithKeyRange& f = level0_sorted_file[i];
+ FdWithKeyRange& prev = level0_sorted_file[i - 1];
+ if (internal_comparator_->Compare(prev.largest_key, f.smallest_key) >= 0) {
+ level0_non_overlapping_ = false;
+ break;
+ }
+ }
+}
+
+void VersionStorageInfo::GenerateBottommostFiles() {
+ assert(!finalized_);
+ assert(bottommost_files_.empty());
+ for (size_t level = 0; level < level_files_brief_.size(); ++level) {
+ for (size_t file_idx = 0; file_idx < level_files_brief_[level].num_files;
+ ++file_idx) {
+ const FdWithKeyRange& f = level_files_brief_[level].files[file_idx];
+ int l0_file_idx;
+ if (level == 0) {
+ l0_file_idx = static_cast<int>(file_idx);
+ } else {
+ l0_file_idx = -1;
+ }
+ Slice smallest_user_key = ExtractUserKey(f.smallest_key);
+ Slice largest_user_key = ExtractUserKey(f.largest_key);
+ if (!RangeMightExistAfterSortedRun(smallest_user_key, largest_user_key,
+ static_cast<int>(level),
+ l0_file_idx)) {
+ bottommost_files_.emplace_back(static_cast<int>(level),
+ f.file_metadata);
+ }
+ }
+ }
+}
+
+void VersionStorageInfo::GenerateFileLocationIndex() {
+ size_t num_files = 0;
+
+ for (int level = 0; level < num_levels_; ++level) {
+ num_files += files_[level].size();
+ }
+
+ file_locations_.reserve(num_files);
+
+ for (int level = 0; level < num_levels_; ++level) {
+ for (size_t pos = 0; pos < files_[level].size(); ++pos) {
+ const FileMetaData* const meta = files_[level][pos];
+ assert(meta);
+
+ const uint64_t file_number = meta->fd.GetNumber();
+
+ assert(file_locations_.find(file_number) == file_locations_.end());
+ file_locations_.emplace(file_number, FileLocation(level, pos));
+ }
+ }
+}
+
+void VersionStorageInfo::UpdateOldestSnapshot(SequenceNumber seqnum) {
+ assert(seqnum >= oldest_snapshot_seqnum_);
+ oldest_snapshot_seqnum_ = seqnum;
+ if (oldest_snapshot_seqnum_ > bottommost_files_mark_threshold_) {
+ ComputeBottommostFilesMarkedForCompaction();
+ }
+}
+
+void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() {
+ bottommost_files_marked_for_compaction_.clear();
+ bottommost_files_mark_threshold_ = kMaxSequenceNumber;
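+  // bottommost_files_mark_threshold_ records the smallest largest_seqno that
+  // kept a bottommost file from being marked; once the oldest snapshot
+  // advances past it, UpdateOldestSnapshot() re-runs this computation.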
+ for (auto& level_and_file : bottommost_files_) {
+ if (!level_and_file.second->being_compacted &&
+ level_and_file.second->fd.largest_seqno != 0) {
+ // largest_seqno might be nonzero due to containing the final key in an
+ // earlier compaction, whose seqnum we didn't zero out. Multiple deletions
+      // ensure the file really contains deleted or overwritten keys.
+ if (level_and_file.second->fd.largest_seqno < oldest_snapshot_seqnum_) {
+ bottommost_files_marked_for_compaction_.push_back(level_and_file);
+ } else {
+ bottommost_files_mark_threshold_ =
+ std::min(bottommost_files_mark_threshold_,
+ level_and_file.second->fd.largest_seqno);
+ }
+ }
+ }
+}
+
+void Version::Ref() { ++refs_; }
+
+bool Version::Unref() {
+ assert(refs_ >= 1);
+ --refs_;
+ if (refs_ == 0) {
+ delete this;
+ return true;
+ }
+ return false;
+}
+
+bool VersionStorageInfo::OverlapInLevel(int level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key) {
+ if (level >= num_non_empty_levels_) {
+ // empty level, no overlap
+ return false;
+ }
+ return SomeFileOverlapsRange(*internal_comparator_, (level > 0),
+ level_files_brief_[level], smallest_user_key,
+ largest_user_key);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// If hint_index is specified, then it points to a file in the
+// overlapping range.
+// If file_index is non-null, *file_index is set to the index of one of the
+// overlapping files (or left as -1 if there is no overlap).
+void VersionStorageInfo::GetOverlappingInputs(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+ bool expand_range, InternalKey** next_smallest) const {
+ if (level >= num_non_empty_levels_) {
+ // this level is empty, no overlapping inputs
+ return;
+ }
+
+ inputs->clear();
+ if (file_index) {
+ *file_index = -1;
+ }
+ const Comparator* user_cmp = user_comparator_;
+ if (level > 0) {
+ GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
+ file_index, false, next_smallest);
+ return;
+ }
+
+ if (next_smallest) {
+ // next_smallest key only makes sense for non-level 0, where files are
+ // non-overlapping
+ *next_smallest = nullptr;
+ }
+
+ Slice user_begin, user_end;
+ if (begin != nullptr) {
+ user_begin = begin->user_key();
+ }
+ if (end != nullptr) {
+ user_end = end->user_key();
+ }
+
+  // `index` holds the indices of the files that still need to be checked.
+ std::list<size_t> index;
+ for (size_t i = 0; i < level_files_brief_[level].num_files; i++) {
+ index.emplace_back(i);
+ }
+
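+  // This is the L0 path: files may overlap each other. When expand_range is
+  // set, each newly found overlapping file can widen [user_begin, user_end],
+  // so the remaining files are re-scanned until a full pass finds no new
+  // overlapping file.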
+ while (!index.empty()) {
+ bool found_overlapping_file = false;
+ auto iter = index.begin();
+ while (iter != index.end()) {
+ FdWithKeyRange* f = &(level_files_brief_[level].files[*iter]);
+ const Slice file_start = ExtractUserKey(f->smallest_key);
+ const Slice file_limit = ExtractUserKey(f->largest_key);
+ if (begin != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_limit, user_begin) < 0) {
+ // "f" is completely before specified range; skip it
+ iter++;
+ } else if (end != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_start, user_end) > 0) {
+ // "f" is completely after specified range; skip it
+ iter++;
+ } else {
+ // if overlap
+ inputs->emplace_back(files_[level][*iter]);
+ found_overlapping_file = true;
+ // record the first file index.
+ if (file_index && *file_index == -1) {
+ *file_index = static_cast<int>(*iter);
+ }
+        // This file overlaps; erase it so it is not checked again.
+ iter = index.erase(iter);
+ if (expand_range) {
+ if (begin != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_start, user_begin) < 0) {
+ user_begin = file_start;
+ }
+ if (end != nullptr &&
+ user_cmp->CompareWithoutTimestamp(file_limit, user_end) > 0) {
+ user_end = file_limit;
+ }
+ }
+ }
+ }
+    // If none of the remaining files overlap, stop.
+ if (!found_overlapping_file) {
+ break;
+ }
+ }
+}
+
+// Store in "*inputs" files in "level" that within range [begin,end]
+// Guarantee a "clean cut" boundary between the files in inputs
+// and the surrounding files and the maxinum number of files.
+// This will ensure that no parts of a key are lost during compaction.
+// If hint_index is specified, then it points to a file in the range.
+// The file_index returns a pointer to any file in an overlapping range.
+void VersionStorageInfo::GetCleanInputsWithinInterval(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index) const {
+ inputs->clear();
+ if (file_index) {
+ *file_index = -1;
+ }
+ if (level >= num_non_empty_levels_ || level == 0 ||
+ level_files_brief_[level].num_files == 0) {
+ // this level is empty, no inputs within range
+ // also don't support clean input interval within L0
+ return;
+ }
+
+ GetOverlappingInputsRangeBinarySearch(level, begin, end, inputs, hint_index,
+ file_index, true /* within_interval */);
+}
+
+// Store in "*inputs" all files in "level" that overlap [begin,end]
+// Employ binary search to find at least one file that overlaps the
+// specified range. From that file, iterate backwards and
+// forwards to find all overlapping files.
+// if within_range is set, then only store the maximum clean inputs
+// within range [begin, end]. "clean" means there is a boundary
+// between the files in "*inputs" and the surrounding files
+void VersionStorageInfo::GetOverlappingInputsRangeBinarySearch(
+ int level, const InternalKey* begin, const InternalKey* end,
+ std::vector<FileMetaData*>* inputs, int hint_index, int* file_index,
+ bool within_interval, InternalKey** next_smallest) const {
+ assert(level > 0);
+
+ auto user_cmp = user_comparator_;
+ const FdWithKeyRange* files = level_files_brief_[level].files;
+ const int num_files = static_cast<int>(level_files_brief_[level].num_files);
+
+  // Use binary search to find the lower and upper bound indices.
+ int start_index = 0;
+ int end_index = num_files;
+
+ if (begin != nullptr) {
+    // When within_interval is true, compare against the file's smallest key
+    // so that files merely straddling `begin` are excluded from the lower
+    // bound.
+ auto cmp = [&user_cmp, &within_interval](const FdWithKeyRange& f,
+ const InternalKey* k) {
+ auto& file_key = within_interval ? f.file_metadata->smallest
+ : f.file_metadata->largest;
+ return sstableKeyCompare(user_cmp, file_key, *k) < 0;
+ };
+
+ start_index = static_cast<int>(
+ std::lower_bound(files,
+ files + (hint_index == -1 ? num_files : hint_index),
+ begin, cmp) -
+ files);
+
+ if (start_index > 0 && within_interval) {
+ bool is_overlapping = true;
+ while (is_overlapping && start_index < num_files) {
+ auto& pre_limit = files[start_index - 1].file_metadata->largest;
+ auto& cur_start = files[start_index].file_metadata->smallest;
+ is_overlapping = sstableKeyCompare(user_cmp, pre_limit, cur_start) == 0;
+ start_index += is_overlapping;
+ }
+ }
+ }
+
+ if (end != nullptr) {
+    // When within_interval is true, compare against the file's largest key so
+    // that files extending past `end` are excluded from the upper bound.
+ auto cmp = [&user_cmp, &within_interval](const InternalKey* k,
+ const FdWithKeyRange& f) {
+ auto& file_key = within_interval ? f.file_metadata->largest
+ : f.file_metadata->smallest;
+ return sstableKeyCompare(user_cmp, *k, file_key) < 0;
+ };
+
+ end_index = static_cast<int>(
+ std::upper_bound(files + start_index, files + num_files, end, cmp) -
+ files);
+
+ if (end_index < num_files && within_interval) {
+ bool is_overlapping = true;
+ while (is_overlapping && end_index > start_index) {
+ auto& next_start = files[end_index].file_metadata->smallest;
+ auto& cur_limit = files[end_index - 1].file_metadata->largest;
+ is_overlapping =
+ sstableKeyCompare(user_cmp, cur_limit, next_start) == 0;
+ end_index -= is_overlapping;
+ }
+ }
+ }
+
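+  // With within_interval set, the adjustments above shrink [start_index,
+  // end_index) so that the boundary files do not share a user key with their
+  // excluded neighbours (sstableKeyCompare == 0), i.e. the selection is a
+  // "clean cut" that never splits a user key across the input set boundary.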
+ assert(start_index <= end_index);
+
+ // If there were no overlapping files, return immediately.
+ if (start_index == end_index) {
+ if (next_smallest) {
+ *next_smallest = nullptr;
+ }
+ return;
+ }
+
+ assert(start_index < end_index);
+
+ // returns the index where an overlap is found
+ if (file_index) {
+ *file_index = start_index;
+ }
+
+ // insert overlapping files into vector
+ for (int i = start_index; i < end_index; i++) {
+ inputs->push_back(files_[level][i]);
+ }
+
+ if (next_smallest != nullptr) {
+ // Provide the next key outside the range covered by inputs
+ if (end_index < static_cast<int>(files_[level].size())) {
+ **next_smallest = files_[level][end_index]->smallest;
+ } else {
+ *next_smallest = nullptr;
+ }
+ }
+}
+
+uint64_t VersionStorageInfo::NumLevelBytes(int level) const {
+ assert(level >= 0);
+ assert(level < num_levels());
+ return TotalFileSize(files_[level]);
+}
+
+const char* VersionStorageInfo::LevelSummary(
+ LevelSummaryStorage* scratch) const {
+ int len = 0;
+ if (compaction_style_ == kCompactionStyleLevel && num_levels() > 1) {
+ assert(base_level_ < static_cast<int>(level_max_bytes_.size()));
+ if (level_multiplier_ != 0.0) {
+ len = snprintf(
+ scratch->buffer, sizeof(scratch->buffer),
+ "base level %d level multiplier %.2f max bytes base %" PRIu64 " ",
+ base_level_, level_multiplier_, level_max_bytes_[base_level_]);
+ }
+ }
+ len +=
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "files[");
+ for (int i = 0; i < num_levels(); i++) {
+ int sz = sizeof(scratch->buffer) - len;
+ int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size()));
+ if (ret < 0 || ret >= sz) break;
+ len += ret;
+ }
+ if (len > 0) {
+ // overwrite the last space
+ --len;
+ }
+ len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ "] max score %.2f", compaction_score_[0]);
+
+ if (!files_marked_for_compaction_.empty()) {
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
+ " (%" ROCKSDB_PRIszt " files need compaction)",
+ files_marked_for_compaction_.size());
+ }
+
+ return scratch->buffer;
+}
+
+const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch,
+ int level) const {
+ int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size[");
+ for (const auto& f : files_[level]) {
+ int sz = sizeof(scratch->buffer) - len;
+ char sztxt[16];
+ AppendHumanBytes(f->fd.GetFileSize(), sztxt, sizeof(sztxt));
+ int ret = snprintf(scratch->buffer + len, sz,
+ "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ",
+ f->fd.GetNumber(), f->fd.smallest_seqno, sztxt,
+ static_cast<int>(f->being_compacted));
+ if (ret < 0 || ret >= sz) break;
+ len += ret;
+ }
+ // overwrite the last space (only if files_[level].size() is non-zero)
+ if (files_[level].size() && len > 0) {
+ --len;
+ }
+ snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]");
+ return scratch->buffer;
+}
+
+uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() {
+ uint64_t result = 0;
+ std::vector<FileMetaData*> overlaps;
+ for (int level = 1; level < num_levels() - 1; level++) {
+ for (const auto& f : files_[level]) {
+ GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps);
+ const uint64_t sum = TotalFileSize(overlaps);
+ if (sum > result) {
+ result = sum;
+ }
+ }
+ }
+ return result;
+}
+
+uint64_t VersionStorageInfo::MaxBytesForLevel(int level) const {
+ // Note: the result for level zero is not really used since we set
+ // the level-0 compaction threshold based on number of files.
+ assert(level >= 0);
+ assert(level < static_cast<int>(level_max_bytes_.size()));
+ return level_max_bytes_[level];
+}
+
+void VersionStorageInfo::CalculateBaseBytes(const ImmutableOptions& ioptions,
+ const MutableCFOptions& options) {
+ // Special logic to set number of sorted runs.
+ // It is to match the previous behavior when all files are in L0.
+ int num_l0_count = static_cast<int>(files_[0].size());
+ if (compaction_style_ == kCompactionStyleUniversal) {
+ // For universal compaction, we use level0 score to indicate
+ // compaction score for the whole DB. Adding other levels as if
+ // they are L0 files.
+ for (int i = 1; i < num_levels(); i++) {
+ if (!files_[i].empty()) {
+ num_l0_count++;
+ }
+ }
+ }
+ set_l0_delay_trigger_count(num_l0_count);
+
+ level_max_bytes_.resize(ioptions.num_levels);
+ if (!ioptions.level_compaction_dynamic_level_bytes) {
+ base_level_ = (ioptions.compaction_style == kCompactionStyleLevel) ? 1 : -1;
+
+ // Calculate for static bytes base case
+ for (int i = 0; i < ioptions.num_levels; ++i) {
+ if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) {
+ level_max_bytes_[i] = options.max_bytes_for_level_base;
+ } else if (i > 1) {
+ level_max_bytes_[i] = MultiplyCheckOverflow(
+ MultiplyCheckOverflow(level_max_bytes_[i - 1],
+ options.max_bytes_for_level_multiplier),
+ options.MaxBytesMultiplerAdditional(i - 1));
+ } else {
+ level_max_bytes_[i] = options.max_bytes_for_level_base;
+ }
+ }
+ } else {
+ uint64_t max_level_size = 0;
+
+ int first_non_empty_level = -1;
+    // Find the size of the non-L0 level with the most data.
+    // Cannot use the size of the last level because it can be empty or
+    // smaller than previous levels after compaction.
+ for (int i = 1; i < num_levels_; i++) {
+ uint64_t total_size = 0;
+ for (const auto& f : files_[i]) {
+ total_size += f->fd.GetFileSize();
+ }
+ if (total_size > 0 && first_non_empty_level == -1) {
+ first_non_empty_level = i;
+ }
+ if (total_size > max_level_size) {
+ max_level_size = total_size;
+ }
+ }
+
+ // Prefill every level's max bytes to disallow compaction from there.
+ for (int i = 0; i < num_levels_; i++) {
+ level_max_bytes_[i] = std::numeric_limits<uint64_t>::max();
+ }
+
+ if (max_level_size == 0) {
+ // No data for L1 and up. L0 compacts to last level directly.
+ // No compaction from L1+ needs to be scheduled.
+ base_level_ = num_levels_ - 1;
+ } else {
+ uint64_t base_bytes_max = options.max_bytes_for_level_base;
+ uint64_t base_bytes_min = static_cast<uint64_t>(
+ base_bytes_max / options.max_bytes_for_level_multiplier);
+
+      // Check whether we can make the last level's target size max_level_size
+ uint64_t cur_level_size = max_level_size;
+ for (int i = num_levels_ - 2; i >= first_non_empty_level; i--) {
+ // Round up after dividing
+ cur_level_size = static_cast<uint64_t>(
+ cur_level_size / options.max_bytes_for_level_multiplier);
+ }
+
+ // Calculate base level and its size.
+ uint64_t base_level_size;
+ if (cur_level_size <= base_bytes_min) {
+        // Case 1. If we make the target size of the last level
+        // max_level_size, the target size of the first non-empty level would
+        // be smaller than base_bytes_min. We set it to base_bytes_min + 1
+        // instead.
+ base_level_size = base_bytes_min + 1U;
+ base_level_ = first_non_empty_level;
+ ROCKS_LOG_INFO(ioptions.logger,
+ "More existing levels in DB than needed. "
+ "max_bytes_for_level_multiplier may not be guaranteed.");
+ } else {
+ // Find base level (where L0 data is compacted to).
+ base_level_ = first_non_empty_level;
+ while (base_level_ > 1 && cur_level_size > base_bytes_max) {
+ --base_level_;
+ cur_level_size = static_cast<uint64_t>(
+ cur_level_size / options.max_bytes_for_level_multiplier);
+ }
+ if (cur_level_size > base_bytes_max) {
+ // Even L1 will be too large
+ assert(base_level_ == 1);
+ base_level_size = base_bytes_max;
+ } else {
+ base_level_size = cur_level_size;
+ }
+ }
+
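+      // Worked example: with num_levels = 7, a multiplier of 10,
+      // max_bytes_for_level_base = 256MB and only L6 populated at 256GB,
+      // cur_level_size stays 256GB and the loop above walks base_level_ down
+      // from 6, dividing by 10 each step (25.6GB, 2.56GB, 256MB), stopping at
+      // base_level_ = 3; the loop below then assigns targets of L3 = 256MB,
+      // L4 = 2.56GB, L5 = 25.6GB, L6 = 256GB.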
+ level_multiplier_ = options.max_bytes_for_level_multiplier;
+ assert(base_level_size > 0);
+
+ uint64_t level_size = base_level_size;
+ for (int i = base_level_; i < num_levels_; i++) {
+ if (i > base_level_) {
+ level_size = MultiplyCheckOverflow(level_size, level_multiplier_);
+ }
+ // Don't set any level below base_bytes_max. Otherwise, the LSM can
+ // assume an hourglass shape where L1+ sizes are smaller than L0. This
+ // causes compaction scoring, which depends on level sizes, to favor L1+
+ // at the expense of L0, which may fill up and stall.
+ level_max_bytes_[i] = std::max(level_size, base_bytes_max);
+ }
+ }
+ }
+}
+
+uint64_t VersionStorageInfo::EstimateLiveDataSize() const {
+ // Estimate the live data size by adding up the size of a maximal set of
+ // sst files with no range overlap in same or higher level. The less
+ // compacted, the more optimistic (smaller) this estimate is. Also,
+ // for multiple sorted runs within a level, file order will matter.
+ uint64_t size = 0;
+
+ auto ikey_lt = [this](InternalKey* x, InternalKey* y) {
+ return internal_comparator_->Compare(*x, *y) < 0;
+ };
+ // (Ordered) map of largest keys in files being included in size estimate
+ std::map<InternalKey*, FileMetaData*, decltype(ikey_lt)> ranges(ikey_lt);
+
+ for (int l = num_levels_ - 1; l >= 0; l--) {
+ bool found_end = false;
+ for (auto file : files_[l]) {
+      // Find the first file already included whose largest key is larger than
+ // the smallest key of `file`. If that file does not overlap with the
+ // current file, none of the files in the map does. If there is
+ // no potential overlap, we can safely insert the rest of this level
+ // (if the level is not 0) into the map without checking again because
+ // the elements in the level are sorted and non-overlapping.
+ auto lb = (found_end && l != 0) ? ranges.end()
+ : ranges.lower_bound(&file->smallest);
+ found_end = (lb == ranges.end());
+ if (found_end || internal_comparator_->Compare(
+ file->largest, (*lb).second->smallest) < 0) {
+ ranges.emplace_hint(lb, &file->largest, file);
+ size += file->fd.file_size;
+ }
+ }
+ }
+
+ // For BlobDB, the result also includes the exact value of live bytes in the
+ // blob files of the version.
+ for (const auto& meta : blob_files_) {
+ assert(meta);
+
+ size += meta->GetTotalBlobBytes();
+ size -= meta->GetGarbageBlobBytes();
+ }
+
+ return size;
+}
+
+bool VersionStorageInfo::RangeMightExistAfterSortedRun(
+ const Slice& smallest_user_key, const Slice& largest_user_key,
+ int last_level, int last_l0_idx) {
+ assert((last_l0_idx != -1) == (last_level == 0));
+ // TODO(ajkr): this preserves earlier behavior where we considered an L0 file
+ // bottommost only if it's the oldest L0 file and there are no files on older
+ // levels. It'd be better to consider it bottommost if there's no overlap in
+ // older levels/files.
+ if (last_level == 0 &&
+ last_l0_idx != static_cast<int>(LevelFiles(0).size() - 1)) {
+ return true;
+ }
+
+ // Checks whether there are files living beyond the `last_level`. If lower
+ // levels have files, it checks for overlap between [`smallest_key`,
+  // `largest_key`] and those files. Bottommost-level optimizations can be
+  // made if
+ // there are no files in lower levels or if there is no overlap with the files
+ // in the lower levels.
+ for (int level = last_level + 1; level < num_levels(); level++) {
+ // The range is not in the bottommost level if there are files in lower
+ // levels when the `last_level` is 0 or if there are files in lower levels
+ // which overlap with [`smallest_key`, `largest_key`].
+ if (files_[level].size() > 0 &&
+ (last_level == 0 ||
+ OverlapInLevel(level, &smallest_user_key, &largest_user_key))) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void Version::AddLiveFiles(std::vector<uint64_t>* live_table_files,
+ std::vector<uint64_t>* live_blob_files) const {
+ assert(live_table_files);
+ assert(live_blob_files);
+
+ for (int level = 0; level < storage_info_.num_levels(); ++level) {
+ const auto& level_files = storage_info_.LevelFiles(level);
+ for (const auto& meta : level_files) {
+ assert(meta);
+
+ live_table_files->emplace_back(meta->fd.GetNumber());
+ }
+ }
+
+ const auto& blob_files = storage_info_.GetBlobFiles();
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ live_blob_files->emplace_back(meta->GetBlobFileNumber());
+ }
+}
+
+void Version::RemoveLiveFiles(
+ std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+ std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const {
+ for (ObsoleteFileInfo& fi : sst_delete_candidates) {
+ if (!fi.only_delete_metadata &&
+ storage_info()->GetFileLocation(fi.metadata->fd.GetNumber()) !=
+ VersionStorageInfo::FileLocation::Invalid()) {
+ fi.only_delete_metadata = true;
+ }
+ }
+
+ blob_delete_candidates.erase(
+ std::remove_if(
+ blob_delete_candidates.begin(), blob_delete_candidates.end(),
+ [this](ObsoleteBlobFileInfo& x) {
+ return storage_info()->GetBlobFileMetaData(x.GetBlobFileNumber());
+ }),
+ blob_delete_candidates.end());
+}
+
+std::string Version::DebugString(bool hex, bool print_stats) const {
+ std::string r;
+ for (int level = 0; level < storage_info_.num_levels_; level++) {
+ // E.g.,
+ // --- level 1 ---
+ // 17:123[1 .. 124]['a' .. 'd']
+ // 20:43[124 .. 128]['e' .. 'g']
+ //
+ // if print_stats=true:
+ // 17:123[1 .. 124]['a' .. 'd'](4096)
+ r.append("--- level ");
+ AppendNumberTo(&r, level);
+ r.append(" --- version# ");
+ AppendNumberTo(&r, version_number_);
+ if (storage_info_.compact_cursor_[level].Valid()) {
+ r.append(" --- compact_cursor: ");
+ r.append(storage_info_.compact_cursor_[level].DebugString(hex));
+ }
+ r.append(" ---\n");
+ const std::vector<FileMetaData*>& files = storage_info_.files_[level];
+ for (size_t i = 0; i < files.size(); i++) {
+ r.push_back(' ');
+ AppendNumberTo(&r, files[i]->fd.GetNumber());
+ r.push_back(':');
+ AppendNumberTo(&r, files[i]->fd.GetFileSize());
+ r.append("[");
+ AppendNumberTo(&r, files[i]->fd.smallest_seqno);
+ r.append(" .. ");
+ AppendNumberTo(&r, files[i]->fd.largest_seqno);
+ r.append("]");
+ r.append("[");
+ r.append(files[i]->smallest.DebugString(hex));
+ r.append(" .. ");
+ r.append(files[i]->largest.DebugString(hex));
+ r.append("]");
+ if (files[i]->oldest_blob_file_number != kInvalidBlobFileNumber) {
+ r.append(" blob_file:");
+ AppendNumberTo(&r, files[i]->oldest_blob_file_number);
+ }
+ if (print_stats) {
+ r.append("(");
+ r.append(std::to_string(
+ files[i]->stats.num_reads_sampled.load(std::memory_order_relaxed)));
+ r.append(")");
+ }
+ r.append("\n");
+ }
+ }
+
+ const auto& blob_files = storage_info_.GetBlobFiles();
+ if (!blob_files.empty()) {
+ r.append("--- blob files --- version# ");
+ AppendNumberTo(&r, version_number_);
+ r.append(" ---\n");
+ for (const auto& blob_file_meta : blob_files) {
+ assert(blob_file_meta);
+
+ r.append(blob_file_meta->DebugString());
+ r.push_back('\n');
+ }
+ }
+
+ return r;
+}
+
+// this is used to batch writes to the manifest file
+struct VersionSet::ManifestWriter {
+ Status status;
+ bool done;
+ InstrumentedCondVar cv;
+ ColumnFamilyData* cfd;
+ const MutableCFOptions mutable_cf_options;
+ const autovector<VersionEdit*>& edit_list;
+ const std::function<void(const Status&)> manifest_write_callback;
+
+ explicit ManifestWriter(
+ InstrumentedMutex* mu, ColumnFamilyData* _cfd,
+ const MutableCFOptions& cf_options, const autovector<VersionEdit*>& e,
+ const std::function<void(const Status&)>& manifest_wcb)
+ : done(false),
+ cv(mu),
+ cfd(_cfd),
+ mutable_cf_options(cf_options),
+ edit_list(e),
+ manifest_write_callback(manifest_wcb) {}
+ ~ManifestWriter() { status.PermitUncheckedError(); }
+
+ bool IsAllWalEdits() const {
+ bool all_wal_edits = true;
+ for (const auto& e : edit_list) {
+ if (!e->IsWalManipulation()) {
+ all_wal_edits = false;
+ break;
+ }
+ }
+ return all_wal_edits;
+ }
+};
+
+Status AtomicGroupReadBuffer::AddEdit(VersionEdit* edit) {
+ assert(edit);
+ if (edit->is_in_atomic_group_) {
+ TEST_SYNC_POINT("AtomicGroupReadBuffer::AddEdit:AtomicGroup");
+ if (replay_buffer_.empty()) {
+ replay_buffer_.resize(edit->remaining_entries_ + 1);
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", edit);
+ }
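+    // The first edit of an atomic group carries remaining_entries_ == N - 1,
+    // so the buffer is sized to hold all N edits; every subsequent edit must
+    // keep read_edits_in_atomic_group_ + remaining_entries_ equal to that
+    // size, otherwise the group is reported as corrupted below.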
+ read_edits_in_atomic_group_++;
+ if (read_edits_in_atomic_group_ + edit->remaining_entries_ !=
+ static_cast<uint32_t>(replay_buffer_.size())) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize", edit);
+ return Status::Corruption("corrupted atomic group");
+ }
+ replay_buffer_[read_edits_in_atomic_group_ - 1] = *edit;
+ if (read_edits_in_atomic_group_ == replay_buffer_.size()) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", edit);
+ return Status::OK();
+ }
+ return Status::OK();
+ }
+
+ // A normal edit.
+ if (!replay_buffer().empty()) {
+ TEST_SYNC_POINT_CALLBACK(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits", edit);
+ return Status::Corruption("corrupted atomic group");
+ }
+ return Status::OK();
+}
+
+bool AtomicGroupReadBuffer::IsFull() const {
+ return read_edits_in_atomic_group_ == replay_buffer_.size();
+}
+
+bool AtomicGroupReadBuffer::IsEmpty() const { return replay_buffer_.empty(); }
+
+void AtomicGroupReadBuffer::Clear() {
+ read_edits_in_atomic_group_ = 0;
+ replay_buffer_.clear();
+}
+
+VersionSet::VersionSet(const std::string& dbname,
+ const ImmutableDBOptions* _db_options,
+ const FileOptions& storage_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_id,
+ const std::string& db_session_id)
+ : column_family_set_(new ColumnFamilySet(
+ dbname, _db_options, storage_options, table_cache,
+ write_buffer_manager, write_controller, block_cache_tracer, io_tracer,
+ db_id, db_session_id)),
+ table_cache_(table_cache),
+ env_(_db_options->env),
+ fs_(_db_options->fs, io_tracer),
+ clock_(_db_options->clock),
+ dbname_(dbname),
+ db_options_(_db_options),
+ next_file_number_(2),
+ manifest_file_number_(0), // Filled by Recover()
+ options_file_number_(0),
+ options_file_size_(0),
+ pending_manifest_file_number_(0),
+ last_sequence_(0),
+ last_allocated_sequence_(0),
+ last_published_sequence_(0),
+ prev_log_number_(0),
+ current_version_number_(0),
+ manifest_file_size_(0),
+ file_options_(storage_options),
+ block_cache_tracer_(block_cache_tracer),
+ io_tracer_(io_tracer),
+ db_session_id_(db_session_id) {}
+
+VersionSet::~VersionSet() {
+ // we need to delete column_family_set_ because its destructor depends on
+ // VersionSet
+ column_family_set_.reset();
+ for (auto& file : obsolete_files_) {
+ if (file.metadata->table_reader_handle) {
+ table_cache_->Release(file.metadata->table_reader_handle);
+ TableCache::Evict(table_cache_, file.metadata->fd.GetNumber());
+ }
+ file.DeleteMetadata();
+ }
+ obsolete_files_.clear();
+ io_status_.PermitUncheckedError();
+}
+
+void VersionSet::Reset() {
+ if (column_family_set_) {
+ WriteBufferManager* wbm = column_family_set_->write_buffer_manager();
+ WriteController* wc = column_family_set_->write_controller();
+ // db_id becomes the source of truth after DBImpl::Recover():
+ // https://github.com/facebook/rocksdb/blob/v7.3.1/db/db_impl/db_impl_open.cc#L527
+ // Note: we may not be able to recover db_id from MANIFEST if
+ // options.write_dbid_to_manifest is false (default).
+ column_family_set_.reset(new ColumnFamilySet(
+ dbname_, db_options_, file_options_, table_cache_, wbm, wc,
+ block_cache_tracer_, io_tracer_, db_id_, db_session_id_));
+ }
+ db_id_.clear();
+ next_file_number_.store(2);
+ min_log_number_to_keep_.store(0);
+ manifest_file_number_ = 0;
+ options_file_number_ = 0;
+ pending_manifest_file_number_ = 0;
+ last_sequence_.store(0);
+ last_allocated_sequence_.store(0);
+ last_published_sequence_.store(0);
+ prev_log_number_ = 0;
+ descriptor_log_.reset();
+ current_version_number_ = 0;
+ manifest_writers_.clear();
+ manifest_file_size_ = 0;
+ obsolete_files_.clear();
+ obsolete_manifests_.clear();
+ wals_.Reset();
+}
+
+void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
+ Version* v) {
+ // compute new compaction score
+ v->storage_info()->ComputeCompactionScore(
+ *column_family_data->ioptions(),
+ *column_family_data->GetLatestMutableCFOptions());
+
+ // Mark v finalized
+ v->storage_info_.SetFinalized();
+
+ // Make "v" current
+ assert(v->refs_ == 0);
+ Version* current = column_family_data->current();
+ assert(v != current);
+ if (current != nullptr) {
+ assert(current->refs_ > 0);
+ current->Unref();
+ }
+ column_family_data->SetCurrent(v);
+ v->Ref();
+
+ // Append to linked list
+ v->prev_ = column_family_data->dummy_versions()->prev_;
+ v->next_ = column_family_data->dummy_versions();
+ v->prev_->next_ = v;
+ v->next_->prev_ = v;
+}
+
+Status VersionSet::ProcessManifestWrites(
+ std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
+ FSDirectory* dir_contains_current_file, bool new_descriptor_log,
+ const ColumnFamilyOptions* new_cf_options) {
+ mu->AssertHeld();
+ assert(!writers.empty());
+ ManifestWriter& first_writer = writers.front();
+ ManifestWriter* last_writer = &first_writer;
+
+ assert(!manifest_writers_.empty());
+ assert(manifest_writers_.front() == &first_writer);
+
+ autovector<VersionEdit*> batch_edits;
+ autovector<Version*> versions;
+ autovector<const MutableCFOptions*> mutable_cf_options_ptrs;
+ std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards;
+
+ // Tracking `max_last_sequence` is needed to ensure we write
+ // `VersionEdit::last_sequence_`s in non-decreasing order according to the
+ // recovery code's requirement. It also allows us to defer updating
+ // `descriptor_last_sequence_` until the apply phase, after the log phase
+ // succeeds.
+ SequenceNumber max_last_sequence = descriptor_last_sequence_;
+
+ if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ // No group commits for column family add or drop
+ LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence);
+ batch_edits.push_back(first_writer.edit_list.front());
+ } else {
+ auto it = manifest_writers_.cbegin();
+ size_t group_start = std::numeric_limits<size_t>::max();
+ while (it != manifest_writers_.cend()) {
+ if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) {
+ // no group commits for column family add or drop
+ break;
+ }
+ last_writer = *(it++);
+ assert(last_writer != nullptr);
+ assert(last_writer->cfd != nullptr);
+ if (last_writer->cfd->IsDropped()) {
+ // If we detect a dropped CF at this point, and the corresponding
+ // version edits belong to an atomic group, then we need to find out
+ // the preceding version edits in the same atomic group, and update
+ // their `remaining_entries_` member variable because we are NOT going
+ // to write the version edits' of dropped CF to the MANIFEST. If we
+ // don't update, then Recover can report corrupted atomic group because
+ // the `remaining_entries_` do not match.
+ if (!batch_edits.empty()) {
+ if (batch_edits.back()->is_in_atomic_group_ &&
+ batch_edits.back()->remaining_entries_ > 0) {
+ assert(group_start < batch_edits.size());
+ const auto& edit_list = last_writer->edit_list;
+ size_t k = 0;
+ while (k < edit_list.size()) {
+ if (!edit_list[k]->is_in_atomic_group_) {
+ break;
+ } else if (edit_list[k]->remaining_entries_ == 0) {
+ ++k;
+ break;
+ }
+ ++k;
+ }
+ for (auto i = group_start; i < batch_edits.size(); ++i) {
+ assert(static_cast<uint32_t>(k) <=
+ batch_edits.back()->remaining_entries_);
+ batch_edits[i]->remaining_entries_ -= static_cast<uint32_t>(k);
+ }
+ }
+ }
+ continue;
+ }
+ // We do a linear search on versions because versions is small.
+ // TODO(yanqin) maybe consider unordered_map
+ Version* version = nullptr;
+ VersionBuilder* builder = nullptr;
+ for (int i = 0; i != static_cast<int>(versions.size()); ++i) {
+ uint32_t cf_id = last_writer->cfd->GetID();
+ if (versions[i]->cfd()->GetID() == cf_id) {
+ version = versions[i];
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ builder = builder_guards[i]->version_builder();
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id);
+ break;
+ }
+ }
+ if (version == nullptr) {
+ // WAL manipulations do not need to be applied to versions.
+ if (!last_writer->IsAllWalEdits()) {
+ version = new Version(last_writer->cfd, this, file_options_,
+ last_writer->mutable_cf_options, io_tracer_,
+ current_version_number_++);
+ versions.push_back(version);
+ mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options);
+ builder_guards.emplace_back(
+ new BaseReferencedVersionBuilder(last_writer->cfd));
+ builder = builder_guards.back()->version_builder();
+ }
+ assert(last_writer->IsAllWalEdits() || builder);
+ assert(last_writer->IsAllWalEdits() || version);
+ TEST_SYNC_POINT_CALLBACK("VersionSet::ProcessManifestWrites:NewVersion",
+ version);
+ }
+ for (const auto& e : last_writer->edit_list) {
+ if (e->is_in_atomic_group_) {
+ if (batch_edits.empty() || !batch_edits.back()->is_in_atomic_group_ ||
+ (batch_edits.back()->is_in_atomic_group_ &&
+ batch_edits.back()->remaining_entries_ == 0)) {
+ group_start = batch_edits.size();
+ }
+ } else if (group_start != std::numeric_limits<size_t>::max()) {
+ group_start = std::numeric_limits<size_t>::max();
+ }
+ Status s = LogAndApplyHelper(last_writer->cfd, builder, e,
+ &max_last_sequence, mu);
+ if (!s.ok()) {
+ // free up the allocated memory
+ for (auto v : versions) {
+ delete v;
+ }
+ return s;
+ }
+ batch_edits.push_back(e);
+ }
+ }
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ auto* builder = builder_guards[i]->version_builder();
+ Status s = builder->SaveTo(versions[i]->storage_info());
+ if (!s.ok()) {
+ // free up the allocated memory
+ for (auto v : versions) {
+ delete v;
+ }
+ return s;
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ // Verify that version edits of atomic groups have correct
+ // remaining_entries_.
+ size_t k = 0;
+ while (k < batch_edits.size()) {
+ while (k < batch_edits.size() && !batch_edits[k]->is_in_atomic_group_) {
+ ++k;
+ }
+ if (k == batch_edits.size()) {
+ break;
+ }
+ size_t i = k;
+ while (i < batch_edits.size()) {
+ if (!batch_edits[i]->is_in_atomic_group_) {
+ break;
+ }
+ assert(i - k + batch_edits[i]->remaining_entries_ ==
+ batch_edits[k]->remaining_entries_);
+ if (batch_edits[i]->remaining_entries_ == 0) {
+ ++i;
+ break;
+ }
+ ++i;
+ }
+ assert(batch_edits[i - 1]->is_in_atomic_group_);
+ assert(0 == batch_edits[i - 1]->remaining_entries_);
+ std::vector<VersionEdit*> tmp;
+ for (size_t j = k; j != i; ++j) {
+ tmp.emplace_back(batch_edits[j]);
+ }
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", &tmp);
+ k = i;
+ }
+#endif // NDEBUG
+
+ assert(pending_manifest_file_number_ == 0);
+ if (!descriptor_log_ ||
+ manifest_file_size_ > db_options_->max_manifest_file_size) {
+ TEST_SYNC_POINT("VersionSet::ProcessManifestWrites:BeforeNewManifest");
+ new_descriptor_log = true;
+ } else {
+ pending_manifest_file_number_ = manifest_file_number_;
+ }
+
+ // Local cached copy of state variable(s). WriteCurrentStateToManifest()
+ // reads its content after releasing db mutex to avoid race with
+ // SwitchMemtable().
+ std::unordered_map<uint32_t, MutableCFState> curr_state;
+ VersionEdit wal_additions;
+ if (new_descriptor_log) {
+ pending_manifest_file_number_ = NewFileNumber();
+ batch_edits.back()->SetNextFile(next_file_number_.load());
+
+    // If we are writing out a new snapshot, make sure to persist the max
+    // column family.
+ if (column_family_set_->GetMaxColumnFamily() > 0) {
+ first_writer.edit_list.front()->SetMaxColumnFamily(
+ column_family_set_->GetMaxColumnFamily());
+ }
+ for (const auto* cfd : *column_family_set_) {
+ assert(curr_state.find(cfd->GetID()) == curr_state.end());
+ curr_state.emplace(std::make_pair(
+ cfd->GetID(),
+ MutableCFState(cfd->GetLogNumber(), cfd->GetFullHistoryTsLow())));
+ }
+
+ for (const auto& wal : wals_.GetWals()) {
+ wal_additions.AddWal(wal.first, wal.second);
+ }
+ }
+
+ uint64_t new_manifest_file_size = 0;
+ Status s;
+ IOStatus io_s;
+ IOStatus manifest_io_status;
+ {
+ FileOptions opt_file_opts = fs_->OptimizeForManifestWrite(file_options_);
+ mu->Unlock();
+ TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestStart");
+ TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WriteManifest", nullptr);
+ if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ assert(!builder_guards.empty() &&
+ builder_guards.size() == versions.size());
+ assert(!mutable_cf_options_ptrs.empty() &&
+ builder_guards.size() == versions.size());
+ ColumnFamilyData* cfd = versions[i]->cfd_;
+ s = builder_guards[i]->version_builder()->LoadTableHandlers(
+ cfd->internal_stats(), 1 /* max_threads */,
+ true /* prefetch_index_and_filter_in_cache */,
+ false /* is_initial_load */,
+ mutable_cf_options_ptrs[i]->prefix_extractor,
+ MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]));
+ if (!s.ok()) {
+ if (db_options_->paranoid_checks) {
+ break;
+ }
+ s = Status::OK();
+ }
+ }
+ }
+
+ if (s.ok() && new_descriptor_log) {
+ // This is fine because everything inside of this block is serialized --
+ // only one thread can be here at the same time
+ // create new manifest file
+ ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
+ pending_manifest_file_number_);
+ std::string descriptor_fname =
+ DescriptorFileName(dbname_, pending_manifest_file_number_);
+ std::unique_ptr<FSWritableFile> descriptor_file;
+ io_s = NewWritableFile(fs_.get(), descriptor_fname, &descriptor_file,
+ opt_file_opts);
+ if (io_s.ok()) {
+ descriptor_file->SetPreallocationBlockSize(
+ db_options_->manifest_preallocation_size);
+ FileTypeSet tmp_set = db_options_->checksum_handoff_file_types;
+ std::unique_ptr<WritableFileWriter> file_writer(new WritableFileWriter(
+ std::move(descriptor_file), descriptor_fname, opt_file_opts, clock_,
+ io_tracer_, nullptr, db_options_->listeners, nullptr,
+ tmp_set.Contains(FileType::kDescriptorFile),
+ tmp_set.Contains(FileType::kDescriptorFile)));
+ descriptor_log_.reset(
+ new log::Writer(std::move(file_writer), 0, false));
+ s = WriteCurrentStateToManifest(curr_state, wal_additions,
+ descriptor_log_.get(), io_s);
+ } else {
+ manifest_io_status = io_s;
+ s = io_s;
+ }
+ }
+
+ if (s.ok()) {
+ if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
+ constexpr bool update_stats = true;
+
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], update_stats);
+ }
+ }
+
+ // Write new records to MANIFEST log
+#ifndef NDEBUG
+ size_t idx = 0;
+#endif
+ for (auto& e : batch_edits) {
+ std::string record;
+ if (!e->EncodeTo(&record)) {
+ s = Status::Corruption("Unable to encode VersionEdit:" +
+ e->DebugString(true));
+ break;
+ }
+ TEST_KILL_RANDOM_WITH_WEIGHT("VersionSet::LogAndApply:BeforeAddRecord",
+ REDUCE_ODDS2);
+#ifndef NDEBUG
+ if (batch_edits.size() > 1 && batch_edits.size() - 1 == idx) {
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0",
+ nullptr);
+ TEST_SYNC_POINT(
+ "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:1");
+ }
+ ++idx;
+#endif /* !NDEBUG */
+ io_s = descriptor_log_->AddRecord(record);
+ if (!io_s.ok()) {
+ s = io_s;
+ manifest_io_status = io_s;
+ break;
+ }
+ }
+ if (s.ok()) {
+ io_s = SyncManifest(db_options_, descriptor_log_->file());
+ manifest_io_status = io_s;
+ TEST_SYNC_POINT_CALLBACK(
+ "VersionSet::ProcessManifestWrites:AfterSyncManifest", &io_s);
+ }
+ if (!io_s.ok()) {
+ s = io_s;
+ ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n",
+ s.ToString().c_str());
+ }
+ }
+
+ // If we just created a new descriptor file, install it by writing a
+ // new CURRENT file that points to it.
+ if (s.ok()) {
+ assert(manifest_io_status.ok());
+ }
+ if (s.ok() && new_descriptor_log) {
+ io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_,
+ dir_contains_current_file);
+ if (!io_s.ok()) {
+ s = io_s;
+ }
+ }
+
+ if (s.ok()) {
+      // Record the manifest file size after these writes.
+ new_manifest_file_size = descriptor_log_->file()->GetFileSize();
+ }
+
+ if (first_writer.edit_list.front()->is_column_family_drop_) {
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0");
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1");
+ TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2");
+ }
+
+ LogFlush(db_options_->info_log);
+ TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone");
+ mu->Lock();
+ }
+
+ if (s.ok()) {
+ // Apply WAL edits, DB mutex must be held.
+ for (auto& e : batch_edits) {
+ if (e->IsWalAddition()) {
+ s = wals_.AddWals(e->GetWalAdditions());
+ } else if (e->IsWalDeletion()) {
+ s = wals_.DeleteWalsBefore(e->GetWalDeletion().GetLogNumber());
+ }
+ if (!s.ok()) {
+ break;
+ }
+ }
+ }
+
+ if (!io_s.ok()) {
+ if (io_status_.ok()) {
+ io_status_ = io_s;
+ }
+ } else if (!io_status_.ok()) {
+ io_status_ = io_s;
+ }
+
+  // Append the old manifest file to the obsolete_manifests_ list so it can be
+  // deleted by PurgeObsoleteFiles later.
+ if (s.ok() && new_descriptor_log) {
+ obsolete_manifests_.emplace_back(
+ DescriptorFileName("", manifest_file_number_));
+ }
+
+ // Install the new versions
+ if (s.ok()) {
+ if (first_writer.edit_list.front()->is_column_family_add_) {
+ assert(batch_edits.size() == 1);
+ assert(new_cf_options != nullptr);
+ assert(max_last_sequence == descriptor_last_sequence_);
+ CreateColumnFamily(*new_cf_options, first_writer.edit_list.front());
+ } else if (first_writer.edit_list.front()->is_column_family_drop_) {
+ assert(batch_edits.size() == 1);
+ assert(max_last_sequence == descriptor_last_sequence_);
+ first_writer.cfd->SetDropped();
+ first_writer.cfd->UnrefAndTryDelete();
+ } else {
+ // Each version in versions corresponds to a column family.
+ // For each column family, update its log number indicating that logs
+ // with number smaller than this should be ignored.
+ uint64_t last_min_log_number_to_keep = 0;
+ for (const auto& e : batch_edits) {
+ ColumnFamilyData* cfd = nullptr;
+ if (!e->IsColumnFamilyManipulation()) {
+ cfd = column_family_set_->GetColumnFamily(e->column_family_);
+ // e would not have been added to batch_edits if its corresponding
+ // column family is dropped.
+ assert(cfd);
+ }
+ if (cfd) {
+ if (e->has_log_number_ && e->log_number_ > cfd->GetLogNumber()) {
+ cfd->SetLogNumber(e->log_number_);
+ }
+ if (e->HasFullHistoryTsLow()) {
+ cfd->SetFullHistoryTsLow(e->GetFullHistoryTsLow());
+ }
+ }
+ if (e->has_min_log_number_to_keep_) {
+ last_min_log_number_to_keep =
+ std::max(last_min_log_number_to_keep, e->min_log_number_to_keep_);
+ }
+ }
+
+ if (last_min_log_number_to_keep != 0) {
+ MarkMinLogNumberToKeep(last_min_log_number_to_keep);
+ }
+
+ for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
+ ColumnFamilyData* cfd = versions[i]->cfd_;
+ AppendVersion(cfd, versions[i]);
+ }
+ }
+ assert(max_last_sequence >= descriptor_last_sequence_);
+ descriptor_last_sequence_ = max_last_sequence;
+ manifest_file_number_ = pending_manifest_file_number_;
+ manifest_file_size_ = new_manifest_file_size;
+ prev_log_number_ = first_writer.edit_list.front()->prev_log_number_;
+ } else {
+ std::string version_edits;
+ for (auto& e : batch_edits) {
+ version_edits += ("\n" + e->DebugString(true));
+ }
+ ROCKS_LOG_ERROR(db_options_->info_log,
+ "Error in committing version edit to MANIFEST: %s",
+ version_edits.c_str());
+ for (auto v : versions) {
+ delete v;
+ }
+ if (manifest_io_status.ok()) {
+ manifest_file_number_ = pending_manifest_file_number_;
+ manifest_file_size_ = new_manifest_file_size;
+ }
+ // If manifest append failed for whatever reason, the file could be
+ // corrupted. So we need to force the next version update to start a
+ // new manifest file.
+ descriptor_log_.reset();
+ // If manifest operations failed, then we know the CURRENT file still
+ // points to the original MANIFEST. Therefore, we can safely delete the
+ // new MANIFEST.
+ // If manifest operations succeeded, and we are here, then it is possible
+ // that renaming tmp file to CURRENT failed.
+ //
+ // On local POSIX-compliant FS, the CURRENT must point to the original
+ // MANIFEST. We can delete the new MANIFEST for simplicity, but we can also
+ // keep it. Future recovery will ignore this MANIFEST. It's also ok for the
+ // process not to crash and continue using the db. Any future LogAndApply()
+ // call will switch to a new MANIFEST and update CURRENT, still ignoring
+ // this one.
+ //
+ // On non-local FS, it is
+ // possible that the rename operation succeeded on the server (remote)
+ // side, but the client somehow returns a non-ok status to RocksDB. Note
+ // that this does not violate atomicity. Should we delete the new MANIFEST
+ // successfully, a subsequent recovery attempt will likely see the CURRENT
+ // pointing to the new MANIFEST, thus fail. We will not be able to open the
+ // DB again. Therefore, if manifest operations succeed, we should keep the
+    // new MANIFEST. If the process proceeds, any future LogAndApply() call
+ // will switch to a new MANIFEST and update CURRENT. If user tries to
+ // re-open the DB,
+ // a) CURRENT points to the new MANIFEST, and the new MANIFEST is present.
+ // b) CURRENT points to the original MANIFEST, and the original MANIFEST
+ // also exists.
+ if (new_descriptor_log && !manifest_io_status.ok()) {
+ ROCKS_LOG_INFO(db_options_->info_log,
+ "Deleting manifest %" PRIu64 " current manifest %" PRIu64
+ "\n",
+ pending_manifest_file_number_, manifest_file_number_);
+ Status manifest_del_status = env_->DeleteFile(
+ DescriptorFileName(dbname_, pending_manifest_file_number_));
+ if (!manifest_del_status.ok()) {
+ ROCKS_LOG_WARN(db_options_->info_log,
+ "Failed to delete manifest %" PRIu64 ": %s",
+ pending_manifest_file_number_,
+ manifest_del_status.ToString().c_str());
+ }
+ }
+ }
+
+ pending_manifest_file_number_ = 0;
+
+#ifndef NDEBUG
+ // This is here kind of awkwardly because there's no other consistency
+ // checks on `VersionSet`'s updates for the new `Version`s. We might want
+ // to move it to a dedicated function, or remove it if we gain enough
+ // confidence in `descriptor_last_sequence_`.
+ if (s.ok()) {
+ for (const auto* v : versions) {
+ const auto* vstorage = v->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ for (const auto& file : vstorage->LevelFiles(level)) {
+ assert(file->fd.largest_seqno <= descriptor_last_sequence_);
+ }
+ }
+ }
+ }
+#endif // NDEBUG
+
+ // wake up all the waiting writers
+ while (true) {
+ ManifestWriter* ready = manifest_writers_.front();
+ manifest_writers_.pop_front();
+ bool need_signal = true;
+ for (const auto& w : writers) {
+ if (&w == ready) {
+ need_signal = false;
+ break;
+ }
+ }
+ ready->status = s;
+ ready->done = true;
+ if (ready->manifest_write_callback) {
+ (ready->manifest_write_callback)(s);
+ }
+ if (need_signal) {
+ ready->cv.Signal();
+ }
+ if (ready == last_writer) {
+ break;
+ }
+ }
+ if (!manifest_writers_.empty()) {
+ manifest_writers_.front()->cv.Signal();
+ }
+ return s;
+}
+
+void VersionSet::WakeUpWaitingManifestWriters() {
+  // Notify the new head of the manifest write queue; it will process (and
+  // eventually wake) the remaining waiting writers.
+ if (!manifest_writers_.empty()) {
+ manifest_writers_.front()->cv.Signal();
+ }
+}
+
+// 'datas' is grammatically incorrect. We still use this notation to indicate
+// that this variable represents a collection of column_family_data.
+Status VersionSet::LogAndApply(
+ const autovector<ColumnFamilyData*>& column_family_datas,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+ bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options,
+ const std::vector<std::function<void(const Status&)>>& manifest_wcbs) {
+ mu->AssertHeld();
+ int num_edits = 0;
+ for (const auto& elist : edit_lists) {
+ num_edits += static_cast<int>(elist.size());
+ }
+ if (num_edits == 0) {
+ return Status::OK();
+ } else if (num_edits > 1) {
+#ifndef NDEBUG
+ for (const auto& edit_list : edit_lists) {
+ for (const auto& edit : edit_list) {
+ assert(!edit->IsColumnFamilyManipulation());
+ }
+ }
+#endif /* ! NDEBUG */
+ }
+
+ int num_cfds = static_cast<int>(column_family_datas.size());
+ if (num_cfds == 1 && column_family_datas[0] == nullptr) {
+ assert(edit_lists.size() == 1 && edit_lists[0].size() == 1);
+ assert(edit_lists[0][0]->is_column_family_add_);
+ assert(new_cf_options != nullptr);
+ }
+ std::deque<ManifestWriter> writers;
+ if (num_cfds > 0) {
+ assert(static_cast<size_t>(num_cfds) == mutable_cf_options_list.size());
+ assert(static_cast<size_t>(num_cfds) == edit_lists.size());
+ }
+ for (int i = 0; i < num_cfds; ++i) {
+ const auto wcb =
+ manifest_wcbs.empty() ? [](const Status&) {} : manifest_wcbs[i];
+ writers.emplace_back(mu, column_family_datas[i],
+ *mutable_cf_options_list[i], edit_lists[i], wcb);
+ manifest_writers_.push_back(&writers[i]);
+ }
+ assert(!writers.empty());
+ ManifestWriter& first_writer = writers.front();
+ TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:BeforeWriterWaiting",
+ nullptr);
+ while (!first_writer.done && &first_writer != manifest_writers_.front()) {
+ first_writer.cv.Wait();
+ }
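+  // Writers queue up on manifest_writers_ and only the writer at the head of
+  // the queue runs ProcessManifestWrites(), committing the whole batch on
+  // behalf of the group. A writer that wakes up with done == true has already
+  // been committed by an earlier head and only needs to return its recorded
+  // status.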
+ if (first_writer.done) {
+ // All non-CF-manipulation operations can be grouped together and committed
+ // to MANIFEST. They should all have finished. The status code is stored in
+ // the first manifest writer.
+#ifndef NDEBUG
+ for (const auto& writer : writers) {
+ assert(writer.done);
+ }
+ TEST_SYNC_POINT_CALLBACK("VersionSet::LogAndApply:WakeUpAndDone", mu);
+#endif /* !NDEBUG */
+ return first_writer.status;
+ }
+
+ int num_undropped_cfds = 0;
+ for (auto cfd : column_family_datas) {
+ // if cfd == nullptr, it is a column family add.
+ if (cfd == nullptr || !cfd->IsDropped()) {
+ ++num_undropped_cfds;
+ }
+ }
+ if (0 == num_undropped_cfds) {
+ for (int i = 0; i != num_cfds; ++i) {
+ manifest_writers_.pop_front();
+ }
+ // Notify new head of manifest write queue.
+ if (!manifest_writers_.empty()) {
+ manifest_writers_.front()->cv.Signal();
+ }
+ return Status::ColumnFamilyDropped();
+ }
+ return ProcessManifestWrites(writers, mu, dir_contains_current_file,
+ new_descriptor_log, new_cf_options);
+}
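+// Editor's illustrative note (not upstream documentation): a minimal sketch
+// of how the writer queue above behaves. If threads T1, T2 and T3 each call
+// LogAndApply concurrently with ordinary (non-CF-manipulation) edits, each
+// enqueues its ManifestWriter(s) on manifest_writers_ and waits. Only the
+// thread whose writer reaches the queue head enters ProcessManifestWrites;
+// it may commit the queued edits of the other threads in the same MANIFEST
+// write, store the shared status in their writers, mark them done and signal
+// them, so T2 and T3 return from LogAndApply without writing anything
+// themselves (the first_writer.done branch above).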
+
+void VersionSet::LogAndApplyCFHelper(VersionEdit* edit,
+ SequenceNumber* max_last_sequence) {
+ assert(max_last_sequence != nullptr);
+ assert(edit->IsColumnFamilyManipulation());
+ edit->SetNextFile(next_file_number_.load());
+ assert(!edit->HasLastSequence());
+ edit->SetLastSequence(*max_last_sequence);
+ if (edit->is_column_family_drop_) {
+ // If we drop a column family, we have to make sure to save the max
+ // column family, so that we don't reuse an existing ID.
+ edit->SetMaxColumnFamily(column_family_set_->GetMaxColumnFamily());
+ }
+}
+
+Status VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd,
+ VersionBuilder* builder, VersionEdit* edit,
+ SequenceNumber* max_last_sequence,
+ InstrumentedMutex* mu) {
+#ifdef NDEBUG
+ (void)cfd;
+#endif
+ mu->AssertHeld();
+ assert(!edit->IsColumnFamilyManipulation());
+ assert(max_last_sequence != nullptr);
+
+ if (edit->has_log_number_) {
+ assert(edit->log_number_ >= cfd->GetLogNumber());
+ assert(edit->log_number_ < next_file_number_.load());
+ }
+
+ if (!edit->has_prev_log_number_) {
+ edit->SetPrevLogNumber(prev_log_number_);
+ }
+ edit->SetNextFile(next_file_number_.load());
+ if (edit->HasLastSequence() && edit->GetLastSequence() > *max_last_sequence) {
+ *max_last_sequence = edit->GetLastSequence();
+ } else {
+ edit->SetLastSequence(*max_last_sequence);
+ }
+
+ // The builder can be nullptr only if edit is WAL manipulation,
+ // because WAL edits do not need to be applied to versions,
+ // we return Status::OK() in this case.
+ assert(builder || edit->IsWalManipulation());
+ return builder ? builder->Apply(edit) : Status::OK();
+}
+
+Status VersionSet::GetCurrentManifestPath(const std::string& dbname,
+ FileSystem* fs,
+ std::string* manifest_path,
+ uint64_t* manifest_file_number) {
+ assert(fs != nullptr);
+ assert(manifest_path != nullptr);
+ assert(manifest_file_number != nullptr);
+
+ std::string fname;
+ Status s = ReadFileToString(fs, CurrentFileName(dbname), &fname);
+ if (!s.ok()) {
+ return s;
+ }
+ if (fname.empty() || fname.back() != '\n') {
+ return Status::Corruption("CURRENT file does not end with newline");
+ }
+ // remove the trailing '\n'
+ fname.resize(fname.size() - 1);
+ FileType type;
+ bool parse_ok = ParseFileName(fname, manifest_file_number, &type);
+ if (!parse_ok || type != kDescriptorFile) {
+ return Status::Corruption("CURRENT file corrupted");
+ }
+ *manifest_path = dbname;
+ if (dbname.back() != '/') {
+ manifest_path->push_back('/');
+ }
+ manifest_path->append(fname);
+ return Status::OK();
+}
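+// Editor's illustrative example (hypothetical values, not upstream code): if
+// the CURRENT file contains the single line "MANIFEST-000123\n",
+// GetCurrentManifestPath() strips the trailing newline, parses the name as a
+// descriptor file, sets *manifest_file_number to 123, and sets
+// *manifest_path to "<dbname>/MANIFEST-000123".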
+
+Status VersionSet::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ std::string* db_id, bool no_error_if_files_missing) {
+ // Read "CURRENT" file, which contains a pointer to the current manifest file
+ std::string manifest_path;
+ Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
+ &manifest_file_number_);
+ if (!s.ok()) {
+ return s;
+ }
+
+ ROCKS_LOG_INFO(db_options_->info_log, "Recovering from manifest file: %s\n",
+ manifest_path.c_str());
+
+ std::unique_ptr<SequentialFileReader> manifest_file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> manifest_file;
+ s = fs_->NewSequentialFile(manifest_path,
+ fs_->OptimizeForManifestRead(file_options_),
+ &manifest_file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ manifest_file_reader.reset(new SequentialFileReader(
+ std::move(manifest_file), manifest_path,
+ db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
+ }
+ uint64_t current_manifest_file_size = 0;
+ uint64_t log_number = 0;
+ {
+ VersionSet::LogReporter reporter;
+ Status log_read_status;
+ reporter.status = &log_read_status;
+ log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+ VersionEditHandler handler(
+ read_only, column_families, const_cast<VersionSet*>(this),
+ /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_);
+ handler.Iterate(reader, &log_read_status);
+ s = handler.status();
+ if (s.ok()) {
+ log_number = handler.GetVersionEditParams().log_number_;
+ current_manifest_file_size = reader.GetReadOffset();
+ assert(current_manifest_file_size != 0);
+ handler.GetDbId(db_id);
+ }
+ }
+
+ if (s.ok()) {
+ manifest_file_size_ = current_manifest_file_size;
+ ROCKS_LOG_INFO(
+ db_options_->info_log,
+ "Recovered from manifest file:%s succeeded,"
+ "manifest_file_number is %" PRIu64 ", next_file_number is %" PRIu64
+ ", last_sequence is %" PRIu64 ", log_number is %" PRIu64
+ ",prev_log_number is %" PRIu64 ",max_column_family is %" PRIu32
+ ",min_log_number_to_keep is %" PRIu64 "\n",
+ manifest_path.c_str(), manifest_file_number_, next_file_number_.load(),
+ last_sequence_.load(), log_number, prev_log_number_,
+ column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep());
+
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ ROCKS_LOG_INFO(db_options_->info_log,
+ "Column family [%s] (ID %" PRIu32
+ "), log number is %" PRIu64 "\n",
+ cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
+ }
+ }
+
+ return s;
+}
+
+namespace {
+class ManifestPicker {
+ public:
+ explicit ManifestPicker(const std::string& dbname,
+ const std::vector<std::string>& files_in_dbname);
+ // REQUIRES: Valid() == true
+ std::string GetNextManifest(uint64_t* file_number, std::string* file_name);
+ bool Valid() const { return manifest_file_iter_ != manifest_files_.end(); }
+
+ private:
+ const std::string& dbname_;
+ // MANIFEST file name(s)
+ std::vector<std::string> manifest_files_;
+ std::vector<std::string>::const_iterator manifest_file_iter_;
+};
+
+ManifestPicker::ManifestPicker(const std::string& dbname,
+ const std::vector<std::string>& files_in_dbname)
+ : dbname_(dbname) {
+ // populate manifest files
+ assert(!files_in_dbname.empty());
+ for (const auto& fname : files_in_dbname) {
+ uint64_t file_num = 0;
+ FileType file_type;
+ bool parse_ok = ParseFileName(fname, &file_num, &file_type);
+ if (parse_ok && file_type == kDescriptorFile) {
+ manifest_files_.push_back(fname);
+ }
+ }
+ // seek to first manifest
+ std::sort(manifest_files_.begin(), manifest_files_.end(),
+ [](const std::string& lhs, const std::string& rhs) {
+ uint64_t num1 = 0;
+ uint64_t num2 = 0;
+ FileType type1;
+ FileType type2;
+ bool parse_ok1 = ParseFileName(lhs, &num1, &type1);
+ bool parse_ok2 = ParseFileName(rhs, &num2, &type2);
+#ifndef NDEBUG
+ assert(parse_ok1);
+ assert(parse_ok2);
+#else
+ (void)parse_ok1;
+ (void)parse_ok2;
+#endif
+ return num1 > num2;
+ });
+ manifest_file_iter_ = manifest_files_.begin();
+}
+
+std::string ManifestPicker::GetNextManifest(uint64_t* number,
+ std::string* file_name) {
+ assert(Valid());
+ std::string ret;
+ if (manifest_file_iter_ != manifest_files_.end()) {
+ ret.assign(dbname_);
+ if (ret.back() != kFilePathSeparator) {
+ ret.push_back(kFilePathSeparator);
+ }
+ ret.append(*manifest_file_iter_);
+ if (number) {
+ FileType type;
+ bool parse = ParseFileName(*manifest_file_iter_, number, &type);
+ assert(type == kDescriptorFile);
+#ifndef NDEBUG
+ assert(parse);
+#else
+ (void)parse;
+#endif
+ }
+ if (file_name) {
+ *file_name = *manifest_file_iter_;
+ }
+ ++manifest_file_iter_;
+ }
+ return ret;
+}
+} // anonymous namespace
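+// Editor's illustrative example (hypothetical file names, not upstream
+// code): if files_in_dbname contains MANIFEST-000005, MANIFEST-000003 and a
+// few .sst files, ManifestPicker keeps only the MANIFEST files, sorts them
+// by descending file number, and successive GetNextManifest() calls return
+// the path of MANIFEST-000005 first and MANIFEST-000003 second, so
+// TryRecover() below attempts recovery from the newest manifest before
+// falling back to older ones.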
+
+Status VersionSet::TryRecover(
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ const std::vector<std::string>& files_in_dbname, std::string* db_id,
+ bool* has_missing_table_file) {
+ ManifestPicker manifest_picker(dbname_, files_in_dbname);
+ if (!manifest_picker.Valid()) {
+ return Status::Corruption("Cannot locate MANIFEST file in " + dbname_);
+ }
+ Status s;
+ std::string manifest_path =
+ manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
+ while (!manifest_path.empty()) {
+ s = TryRecoverFromOneManifest(manifest_path, column_families, read_only,
+ db_id, has_missing_table_file);
+ if (s.ok() || !manifest_picker.Valid()) {
+ break;
+ }
+ Reset();
+ manifest_path =
+ manifest_picker.GetNextManifest(&manifest_file_number_, nullptr);
+ }
+ return s;
+}
+
+Status VersionSet::TryRecoverFromOneManifest(
+ const std::string& manifest_path,
+ const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
+ std::string* db_id, bool* has_missing_table_file) {
+ ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n",
+ manifest_path.c_str());
+ std::unique_ptr<SequentialFileReader> manifest_file_reader;
+ Status s;
+ {
+ std::unique_ptr<FSSequentialFile> manifest_file;
+ s = fs_->NewSequentialFile(manifest_path,
+ fs_->OptimizeForManifestRead(file_options_),
+ &manifest_file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ manifest_file_reader.reset(new SequentialFileReader(
+ std::move(manifest_file), manifest_path,
+ db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
+ }
+
+ assert(s.ok());
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter,
+ /*checksum=*/true, /*log_num=*/0);
+ VersionEditHandlerPointInTime handler_pit(
+ read_only, column_families, const_cast<VersionSet*>(this), io_tracer_);
+
+ handler_pit.Iterate(reader, &s);
+
+ handler_pit.GetDbId(db_id);
+
+ assert(nullptr != has_missing_table_file);
+ *has_missing_table_file = handler_pit.HasMissingFiles();
+
+ return handler_pit.status();
+}
+
+Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
+ const std::string& dbname,
+ FileSystem* fs) {
+ // Read "CURRENT" file, which contains a pointer to the current manifest file
+ std::string manifest_path;
+ uint64_t manifest_file_number;
+ Status s =
+ GetCurrentManifestPath(dbname, fs, &manifest_path, &manifest_file_number);
+ if (!s.ok()) {
+ return s;
+ }
+ return ListColumnFamiliesFromManifest(manifest_path, fs, column_families);
+}
+
+Status VersionSet::ListColumnFamiliesFromManifest(
+ const std::string& manifest_path, FileSystem* fs,
+ std::vector<std::string>* column_families) {
+ std::unique_ptr<SequentialFileReader> file_reader;
+ Status s;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ // these are just for performance reasons, not correctness,
+ // so we're fine using the defaults
+ s = fs->NewSequentialFile(manifest_path, FileOptions(), &file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ file_reader = std::make_unique<SequentialFileReader>(
+ std::move(file), manifest_path, /*io_tracer=*/nullptr);
+ }
+
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+
+ ListColumnFamiliesHandler handler;
+ handler.Iterate(reader, &s);
+
+ assert(column_families);
+ column_families->clear();
+ if (handler.status().ok()) {
+ for (const auto& iter : handler.GetColumnFamilyNames()) {
+ column_families->push_back(iter.second);
+ }
+ }
+
+ return handler.status();
+}
+
+#ifndef ROCKSDB_LITE
+Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
+ const Options* options,
+ const FileOptions& file_options,
+ int new_levels) {
+ if (new_levels <= 1) {
+ return Status::InvalidArgument(
+ "Number of levels needs to be bigger than 1");
+ }
+
+ ImmutableDBOptions db_options(*options);
+ ColumnFamilyOptions cf_options(*options);
+ std::shared_ptr<Cache> tc(NewLRUCache(options->max_open_files - 10,
+ options->table_cache_numshardbits));
+ WriteController wc(options->delayed_write_rate);
+ WriteBufferManager wb(options->db_write_buffer_size);
+ VersionSet versions(dbname, &db_options, file_options, tc.get(), &wb, &wc,
+ nullptr /*BlockCacheTracer*/, nullptr /*IOTracer*/,
+ /*db_id*/ "",
+ /*db_session_id*/ "");
+ Status status;
+
+ std::vector<ColumnFamilyDescriptor> dummy;
+ ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName,
+ ColumnFamilyOptions(*options));
+ dummy.push_back(dummy_descriptor);
+ status = versions.Recover(dummy);
+ if (!status.ok()) {
+ return status;
+ }
+
+ Version* current_version =
+ versions.GetColumnFamilySet()->GetDefault()->current();
+ auto* vstorage = current_version->storage_info();
+ int current_levels = vstorage->num_levels();
+
+ if (current_levels <= new_levels) {
+ return Status::OK();
+ }
+
+ // Make sure there are files on only one level from
+ // (new_levels-1) to (current_levels-1)
+ int first_nonempty_level = -1;
+ int first_nonempty_level_filenum = 0;
+ for (int i = new_levels - 1; i < current_levels; i++) {
+ int file_num = vstorage->NumLevelFiles(i);
+ if (file_num != 0) {
+ if (first_nonempty_level < 0) {
+ first_nonempty_level = i;
+ first_nonempty_level_filenum = file_num;
+ } else {
+ char msg[255];
+ snprintf(msg, sizeof(msg),
+ "Found at least two levels containing files: "
+ "[%d:%d],[%d:%d].\n",
+ first_nonempty_level, first_nonempty_level_filenum, i,
+ file_num);
+ return Status::InvalidArgument(msg);
+ }
+ }
+ }
+
+ // We need to allocate an array sized to the old number of levels to
+ // avoid a SIGSEGV in WriteCurrentStateToManifest(); however, all levels
+ // greater than or equal to new_levels will be empty.
+ std::vector<FileMetaData*>* new_files_list =
+ new std::vector<FileMetaData*>[current_levels];
+ for (int i = 0; i < new_levels - 1; i++) {
+ new_files_list[i] = vstorage->LevelFiles(i);
+ }
+
+ if (first_nonempty_level > 0) {
+ auto& new_last_level = new_files_list[new_levels - 1];
+
+ new_last_level = vstorage->LevelFiles(first_nonempty_level);
+
+ for (size_t i = 0; i < new_last_level.size(); ++i) {
+ const FileMetaData* const meta = new_last_level[i];
+ assert(meta);
+
+ const uint64_t file_number = meta->fd.GetNumber();
+
+ vstorage->file_locations_[file_number] =
+ VersionStorageInfo::FileLocation(new_levels - 1, i);
+ }
+ }
+
+ delete[] vstorage->files_;
+ vstorage->files_ = new_files_list;
+ vstorage->num_levels_ = new_levels;
+ vstorage->ResizeCompactCursors(new_levels);
+
+ MutableCFOptions mutable_cf_options(*options);
+ VersionEdit ve;
+ InstrumentedMutex dummy_mutex;
+ InstrumentedMutexLock l(&dummy_mutex);
+ return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options, &ve, &dummy_mutex, nullptr,
+ true);
+}
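+// Editor's illustrative example (hypothetical values, not upstream
+// documentation): reducing a 7-level DB to new_levels == 3 requires that at
+// most one of levels 2..6 contains files. If only level 5 is non-empty, its
+// files become the new last level (level 2), levels 0 and 1 are carried over
+// unchanged, and the reduced level count is then persisted through the
+// LogAndApply call at the end of ReduceNumberOfLevels above.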
+
+// Get the checksum information including the checksum and checksum function
+// name of all SST and blob files in VersionSet. Store the information in
+// FileChecksumList which contains a map from file number to its checksum info.
+// If DB is not running, make sure to call VersionSet::Recover() to load the
+// file metadata from the Manifest into the VersionSet before calling this
+// function.
+Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) {
+ // Clean the previously stored checksum information if any.
+ Status s;
+ if (checksum_list == nullptr) {
+ s = Status::InvalidArgument("checksum_list is nullptr");
+ return s;
+ }
+ checksum_list->reset();
+
+ for (auto cfd : *column_family_set_) {
+ assert(cfd);
+
+ if (cfd->IsDropped() || !cfd->initialized()) {
+ continue;
+ }
+
+ const auto* current = cfd->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ /* SST files */
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ const auto& level_files = vstorage->LevelFiles(level);
+
+ for (const auto& file : level_files) {
+ assert(file);
+
+ s = checksum_list->InsertOneFileChecksum(file->fd.GetNumber(),
+ file->file_checksum,
+ file->file_checksum_func_name);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ /* Blob files */
+ const auto& blob_files = vstorage->GetBlobFiles();
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ std::string checksum_value = meta->GetChecksumValue();
+ std::string checksum_method = meta->GetChecksumMethod();
+ assert(checksum_value.empty() == checksum_method.empty());
+ if (meta->GetChecksumMethod().empty()) {
+ checksum_value = kUnknownFileChecksum;
+ checksum_method = kUnknownFileChecksumFuncName;
+ }
+
+ s = checksum_list->InsertOneFileChecksum(meta->GetBlobFileNumber(),
+ checksum_value, checksum_method);
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ return s;
+}
+
+Status VersionSet::DumpManifest(Options& options, std::string& dscname,
+ bool verbose, bool hex, bool json) {
+ assert(options.env);
+ std::vector<std::string> column_families;
+ Status s = ListColumnFamiliesFromManifest(
+ dscname, options.env->GetFileSystem().get(), &column_families);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Open the specified manifest file.
+ std::unique_ptr<SequentialFileReader> file_reader;
+ {
+ std::unique_ptr<FSSequentialFile> file;
+ const std::shared_ptr<FileSystem>& fs = options.env->GetFileSystem();
+ s = fs->NewSequentialFile(
+ dscname, fs->OptimizeForManifestRead(file_options_), &file, nullptr);
+ if (!s.ok()) {
+ return s;
+ }
+ file_reader = std::make_unique<SequentialFileReader>(
+ std::move(file), dscname, db_options_->log_readahead_size, io_tracer_);
+ }
+
+ std::vector<ColumnFamilyDescriptor> cf_descs;
+ for (const auto& cf : column_families) {
+ cf_descs.emplace_back(cf, options);
+ }
+
+ DumpManifestHandler handler(cf_descs, this, io_tracer_, verbose, hex, json);
+ {
+ VersionSet::LogReporter reporter;
+ reporter.status = &s;
+ log::Reader reader(nullptr, std::move(file_reader), &reporter,
+ true /* checksum */, 0 /* log_number */);
+ handler.Iterate(reader, &s);
+ }
+
+ return handler.status();
+}
+#endif // ROCKSDB_LITE
+
+void VersionSet::MarkFileNumberUsed(uint64_t number) {
+ // Only called during recovery and repair, which are single-threaded, so
+ // this works because there can't be concurrent calls.
+ if (next_file_number_.load(std::memory_order_relaxed) <= number) {
+ next_file_number_.store(number + 1, std::memory_order_relaxed);
+ }
+}
+// Called only either from ::LogAndApply, which is protected by the mutex, or
+// during recovery, which is single-threaded.
+void VersionSet::MarkMinLogNumberToKeep(uint64_t number) {
+ if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) {
+ min_log_number_to_keep_.store(number, std::memory_order_relaxed);
+ }
+}
+
+Status VersionSet::WriteCurrentStateToManifest(
+ const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+ const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s) {
+ // TODO: Break up into multiple records to reduce memory usage on recovery?
+
+ // WARNING: This method doesn't hold a mutex!!
+
+ // This is done without DB mutex lock held, but only within single-threaded
+ // LogAndApply. Column family manipulations can only happen within LogAndApply
+ // (the same single thread), so we're safe to iterate.
+
+ assert(io_s.ok());
+ if (db_options_->write_dbid_to_manifest) {
+ VersionEdit edit_for_db_id;
+ assert(!db_id_.empty());
+ edit_for_db_id.SetDBId(db_id_);
+ std::string db_id_record;
+ if (!edit_for_db_id.EncodeTo(&db_id_record)) {
+ return Status::Corruption("Unable to Encode VersionEdit:" +
+ edit_for_db_id.DebugString(true));
+ }
+ io_s = log->AddRecord(db_id_record);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+
+ // Save WALs.
+ if (!wal_additions.GetWalAdditions().empty()) {
+ TEST_SYNC_POINT_CALLBACK("VersionSet::WriteCurrentStateToManifest:SaveWal",
+ const_cast<VersionEdit*>(&wal_additions));
+ std::string record;
+ if (!wal_additions.EncodeTo(&record)) {
+ return Status::Corruption("Unable to Encode VersionEdit: " +
+ wal_additions.DebugString(true));
+ }
+ io_s = log->AddRecord(record);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+
+ for (auto cfd : *column_family_set_) {
+ assert(cfd);
+
+ if (cfd->IsDropped()) {
+ continue;
+ }
+ assert(cfd->initialized());
+ {
+ // Store column family info
+ VersionEdit edit;
+ if (cfd->GetID() != 0) {
+ // default column family is always there,
+ // no need to explicitly write it
+ edit.AddColumnFamily(cfd->GetName());
+ edit.SetColumnFamily(cfd->GetID());
+ }
+ edit.SetComparatorName(
+ cfd->internal_comparator().user_comparator()->Name());
+ std::string record;
+ if (!edit.EncodeTo(&record)) {
+ return Status::Corruption("Unable to Encode VersionEdit:" +
+ edit.DebugString(true));
+ }
+ io_s = log->AddRecord(record);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+
+ {
+ // Save files
+ VersionEdit edit;
+ edit.SetColumnFamily(cfd->GetID());
+
+ const auto* current = cfd->current();
+ assert(current);
+
+ const auto* vstorage = current->storage_info();
+ assert(vstorage);
+
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ const auto& level_files = vstorage->LevelFiles(level);
+
+ for (const auto& f : level_files) {
+ assert(f);
+
+ edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(),
+ f->fd.GetFileSize(), f->smallest, f->largest,
+ f->fd.smallest_seqno, f->fd.largest_seqno,
+ f->marked_for_compaction, f->temperature,
+ f->oldest_blob_file_number, f->oldest_ancester_time,
+ f->file_creation_time, f->file_checksum,
+ f->file_checksum_func_name, f->unique_id);
+ }
+ }
+
+ edit.SetCompactCursors(vstorage->GetCompactCursors());
+
+ const auto& blob_files = vstorage->GetBlobFiles();
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ const uint64_t blob_file_number = meta->GetBlobFileNumber();
+
+ edit.AddBlobFile(blob_file_number, meta->GetTotalBlobCount(),
+ meta->GetTotalBlobBytes(), meta->GetChecksumMethod(),
+ meta->GetChecksumValue());
+ if (meta->GetGarbageBlobCount() > 0) {
+ edit.AddBlobFileGarbage(blob_file_number, meta->GetGarbageBlobCount(),
+ meta->GetGarbageBlobBytes());
+ }
+ }
+
+ const auto iter = curr_state.find(cfd->GetID());
+ assert(iter != curr_state.end());
+ uint64_t log_number = iter->second.log_number;
+ edit.SetLogNumber(log_number);
+
+ if (cfd->GetID() == 0) {
+ // min_log_number_to_keep is for the whole db, not for a specific column
+ // family. So it does not need to be set for every column family; it just
+ // needs to be set once. Since the default CF can never be dropped, we
+ // set the min_log on the default CF here.
+ uint64_t min_log = min_log_number_to_keep();
+ if (min_log != 0) {
+ edit.SetMinLogNumberToKeep(min_log);
+ }
+ }
+
+ const std::string& full_history_ts_low = iter->second.full_history_ts_low;
+ if (!full_history_ts_low.empty()) {
+ edit.SetFullHistoryTsLow(full_history_ts_low);
+ }
+
+ edit.SetLastSequence(descriptor_last_sequence_);
+
+ std::string record;
+ if (!edit.EncodeTo(&record)) {
+ return Status::Corruption("Unable to Encode VersionEdit:" +
+ edit.DebugString(true));
+ }
+ io_s = log->AddRecord(record);
+ if (!io_s.ok()) {
+ return io_s;
+ }
+ }
+ }
+ return Status::OK();
+}
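+// Editor's illustrative note (not upstream documentation): a sketch of the
+// record sequence the function above appends when writing a full snapshot.
+// It emits an optional DB-id record (if write_dbid_to_manifest is set), an
+// optional WAL-additions record, and then, for every live column family, one
+// record describing the CF (name/ID/comparator) followed by one record
+// listing its table and blob files plus the log number, compact cursors and
+// last sequence.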
+
+// TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this
+// function is called repeatedly with consecutive pairs of slices. For example
+// if the slice list is [a, b, c, d] this function is called with arguments
+// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
+// we avoid doing binary search for the keys b and c twice and instead somehow
+// maintain state of where they first appear in the files.
+uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options,
+ Version* v, const Slice& start,
+ const Slice& end, int start_level,
+ int end_level, TableReaderCaller caller) {
+ const auto& icmp = v->cfd_->internal_comparator();
+
+ // pre-condition
+ assert(icmp.Compare(start, end) <= 0);
+
+ uint64_t total_full_size = 0;
+ const auto* vstorage = v->storage_info();
+ const int num_non_empty_levels = vstorage->num_non_empty_levels();
+ end_level = (end_level == -1) ? num_non_empty_levels
+ : std::min(end_level, num_non_empty_levels);
+
+ assert(start_level <= end_level);
+
+ // Outline of the optimization that uses options.files_size_error_margin.
+ // When approximating the total size of the files used to store a key range,
+ // we first sum up the sizes of the files that fully fall into the range.
+ // Then we sum up the sizes of all the files that may intersect with the range
+ // (this includes all files in L0 as well). Then, if total_intersecting_size
+ // is smaller than total_full_size * options.files_size_error_margin - we can
+ // infer that the intersecting files have a sufficiently negligible
+ // contribution to the total size, and we can approximate the storage required
+ // for the keys in range as just half of the intersecting_files_size.
+ // E.g., if the value of files_size_error_margin is 0.1, then the error of the
+ // approximation is limited to only ~10% of the total size of files that fully
+ // fall into the key range. In such a case, this helps to avoid a costly
+ // process of binary searching the intersecting files that is required only
+ // for a more precise calculation of the total size.
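+ // Editor's illustrative arithmetic (not upstream text), assuming
+ // files_size_error_margin == 0.1: if the fully contained files sum to
+ // 1000 MB and the boundary (possibly intersecting) files sum to 50 MB,
+ // then 50 MB < 1000 MB * 0.1 == 100 MB, so we simply add 50 MB / 2 == 25 MB
+ // instead of binary searching inside the boundary files; the resulting
+ // error is at most 25 MB, well within ~10% of the exactly known 1000 MB.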
+
+ autovector<FdWithKeyRange*, 32> first_files;
+ autovector<FdWithKeyRange*, 16> last_files;
+
+ // scan all the levels
+ for (int level = start_level; level < end_level; ++level) {
+ const LevelFilesBrief& files_brief = vstorage->LevelFilesBrief(level);
+ if (files_brief.num_files == 0) {
+ // empty level, skip exploration
+ continue;
+ }
+
+ if (level == 0) {
+ // Level-0 files are not in sorted order; we need to iterate through
+ // the list to compute the total bytes that require scanning, so handle
+ // this case explicitly (similarly to the first_files case).
+ for (size_t i = 0; i < files_brief.num_files; i++) {
+ first_files.push_back(&files_brief.files[i]);
+ }
+ continue;
+ }
+
+ assert(level > 0);
+ assert(files_brief.num_files > 0);
+
+ // identify the file position for start key
+ const int idx_start =
+ FindFileInRange(icmp, files_brief, start, 0,
+ static_cast<uint32_t>(files_brief.num_files - 1));
+ assert(static_cast<size_t>(idx_start) < files_brief.num_files);
+
+ // identify the file position for end key
+ int idx_end = idx_start;
+ if (icmp.Compare(files_brief.files[idx_end].largest_key, end) < 0) {
+ idx_end =
+ FindFileInRange(icmp, files_brief, end, idx_start,
+ static_cast<uint32_t>(files_brief.num_files - 1));
+ }
+ assert(idx_end >= idx_start &&
+ static_cast<size_t>(idx_end) < files_brief.num_files);
+
+ // scan all files from the starting index to the ending index
+ // (inferred from the sorted order)
+
+ // first scan all the intermediate full files (excluding first and last)
+ for (int i = idx_start + 1; i < idx_end; ++i) {
+ uint64_t file_size = files_brief.files[i].fd.GetFileSize();
+ // The entire file falls into the range, so we can just take its size.
+ assert(file_size ==
+ ApproximateSize(v, files_brief.files[i], start, end, caller));
+ total_full_size += file_size;
+ }
+
+ // save the first and the last files (which may be the same file), so we
+ // can scan them later.
+ first_files.push_back(&files_brief.files[idx_start]);
+ if (idx_start != idx_end) {
+ // we need to estimate size for both files, only if they are different
+ last_files.push_back(&files_brief.files[idx_end]);
+ }
+ }
+
+ // The sum of all file sizes that intersect the [start, end] keys range.
+ uint64_t total_intersecting_size = 0;
+ for (const auto* file_ptr : first_files) {
+ total_intersecting_size += file_ptr->fd.GetFileSize();
+ }
+ for (const auto* file_ptr : last_files) {
+ total_intersecting_size += file_ptr->fd.GetFileSize();
+ }
+
+ // Now scan all the first & last files at each level, and estimate their size.
+ // If the total_intersecting_size is less than X% of the total_full_size, we
+ // approximate the result in order to avoid the costly binary search inside
+ // ApproximateSize. We use half of the file size as an approximation below.
+
+ const double margin = options.files_size_error_margin;
+ if (margin > 0 && total_intersecting_size <
+ static_cast<uint64_t>(total_full_size * margin)) {
+ total_full_size += total_intersecting_size / 2;
+ } else {
+ // Estimate for all the first files (might also be last files), at each
+ // level
+ for (const auto file_ptr : first_files) {
+ total_full_size += ApproximateSize(v, *file_ptr, start, end, caller);
+ }
+
+ // Estimate for all the last files, at each level
+ for (const auto file_ptr : last_files) {
+ // We could use ApproximateSize here, but calling ApproximateOffsetOf
+ // directly is just more efficient.
+ total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller);
+ }
+ }
+
+ return total_full_size;
+}
+
+uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+ const Slice& key,
+ TableReaderCaller caller) {
+ // pre-condition
+ assert(v);
+ const auto& icmp = v->cfd_->internal_comparator();
+
+ uint64_t result = 0;
+ if (icmp.Compare(f.largest_key, key) <= 0) {
+ // Entire file is before "key", so just add the file size
+ result = f.fd.GetFileSize();
+ } else if (icmp.Compare(f.smallest_key, key) > 0) {
+ // Entire file is after "key", so ignore
+ result = 0;
+ } else {
+ // "key" falls in the range for this table. Add the
+ // approximate offset of "key" within the table.
+ TableCache* table_cache = v->cfd_->table_cache();
+ if (table_cache != nullptr) {
+ result = table_cache->ApproximateOffsetOf(
+ key, *f.file_metadata, caller, icmp,
+ v->GetMutableCFOptions().prefix_extractor);
+ }
+ }
+ return result;
+}
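+// Editor's illustrative example (hypothetical keys, not upstream code): for
+// a file whose key range is ["d", "m"], ApproximateOffsetOf with key "z"
+// returns the full file size (the file is entirely before the key), with key
+// "a" it returns 0 (the file is entirely after the key), and with key "g" it
+// asks TableCache::ApproximateOffsetOf for the estimated offset of "g"
+// inside the table.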
+
+uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
+ const Slice& start, const Slice& end,
+ TableReaderCaller caller) {
+ // pre-condition
+ assert(v);
+ const auto& icmp = v->cfd_->internal_comparator();
+ assert(icmp.Compare(start, end) <= 0);
+
+ if (icmp.Compare(f.largest_key, start) <= 0 ||
+ icmp.Compare(f.smallest_key, end) > 0) {
+ // Entire file is before or after the start/end keys range
+ return 0;
+ }
+
+ if (icmp.Compare(f.smallest_key, start) >= 0) {
+ // Start of the range is before the file start - approximate by end offset
+ return ApproximateOffsetOf(v, f, end, caller);
+ }
+
+ if (icmp.Compare(f.largest_key, end) < 0) {
+ // End of the range is after the file end - approximate by subtracting
+ // start offset from the file size
+ uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller);
+ assert(f.fd.GetFileSize() >= start_offset);
+ return f.fd.GetFileSize() - start_offset;
+ }
+
+ // The interval falls entirely in the range for this file.
+ TableCache* table_cache = v->cfd_->table_cache();
+ if (table_cache == nullptr) {
+ return 0;
+ }
+ return table_cache->ApproximateSize(
+ start, end, *f.file_metadata, caller, icmp,
+ v->GetMutableCFOptions().prefix_extractor);
+}
+
+void VersionSet::RemoveLiveFiles(
+ std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+ std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const {
+ assert(column_family_set_);
+ for (auto cfd : *column_family_set_) {
+ assert(cfd);
+ if (!cfd->initialized()) {
+ continue;
+ }
+
+ auto* current = cfd->current();
+ bool found_current = false;
+
+ Version* const dummy_versions = cfd->dummy_versions();
+ assert(dummy_versions);
+
+ for (Version* v = dummy_versions->next_; v != dummy_versions;
+ v = v->next_) {
+ v->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates);
+ if (v == current) {
+ found_current = true;
+ }
+ }
+
+ if (!found_current && current != nullptr) {
+ // Should never happen unless it is a bug.
+ assert(false);
+ current->RemoveLiveFiles(sst_delete_candidates, blob_delete_candidates);
+ }
+ }
+}
+
+void VersionSet::AddLiveFiles(std::vector<uint64_t>* live_table_files,
+ std::vector<uint64_t>* live_blob_files) const {
+ assert(live_table_files);
+ assert(live_blob_files);
+
+ // pre-calculate space requirement
+ size_t total_table_files = 0;
+ size_t total_blob_files = 0;
+
+ assert(column_family_set_);
+ for (auto cfd : *column_family_set_) {
+ assert(cfd);
+
+ if (!cfd->initialized()) {
+ continue;
+ }
+
+ Version* const dummy_versions = cfd->dummy_versions();
+ assert(dummy_versions);
+
+ for (Version* v = dummy_versions->next_; v != dummy_versions;
+ v = v->next_) {
+ assert(v);
+
+ const auto* vstorage = v->storage_info();
+ assert(vstorage);
+
+ for (int level = 0; level < vstorage->num_levels(); ++level) {
+ total_table_files += vstorage->LevelFiles(level).size();
+ }
+
+ total_blob_files += vstorage->GetBlobFiles().size();
+ }
+ }
+
+ // just one time extension to the right size
+ live_table_files->reserve(live_table_files->size() + total_table_files);
+ live_blob_files->reserve(live_blob_files->size() + total_blob_files);
+
+ assert(column_family_set_);
+ for (auto cfd : *column_family_set_) {
+ assert(cfd);
+ if (!cfd->initialized()) {
+ continue;
+ }
+
+ auto* current = cfd->current();
+ bool found_current = false;
+
+ Version* const dummy_versions = cfd->dummy_versions();
+ assert(dummy_versions);
+
+ for (Version* v = dummy_versions->next_; v != dummy_versions;
+ v = v->next_) {
+ v->AddLiveFiles(live_table_files, live_blob_files);
+ if (v == current) {
+ found_current = true;
+ }
+ }
+
+ if (!found_current && current != nullptr) {
+ // Should never happen unless it is a bug.
+ assert(false);
+ current->AddLiveFiles(live_table_files, live_blob_files);
+ }
+ }
+}
+
+InternalIterator* VersionSet::MakeInputIterator(
+ const ReadOptions& read_options, const Compaction* c,
+ RangeDelAggregator* range_del_agg,
+ const FileOptions& file_options_compactions,
+ const std::optional<const Slice>& start,
+ const std::optional<const Slice>& end) {
+ auto cfd = c->column_family_data();
+ // Level-0 files have to be merged together. For other levels,
+ // we will make a concatenating iterator per level.
+ // TODO(opt): use concatenating iterator for level-0 if there is no overlap
+ const size_t space = (c->level() == 0 ? c->input_levels(0)->num_files +
+ c->num_input_levels() - 1
+ : c->num_input_levels());
+ InternalIterator** list = new InternalIterator*[space];
+ size_t num = 0;
+ for (size_t which = 0; which < c->num_input_levels(); which++) {
+ if (c->input_levels(which)->num_files != 0) {
+ if (c->level(which) == 0) {
+ const LevelFilesBrief* flevel = c->input_levels(which);
+ for (size_t i = 0; i < flevel->num_files; i++) {
+ const FileMetaData& fmd = *flevel->files[i].file_metadata;
+ if (start.has_value() &&
+ cfd->user_comparator()->CompareWithoutTimestamp(
+ start.value(), fmd.largest.user_key()) > 0) {
+ continue;
+ }
+ // We should be able to filter out the case where the end key
+ // equals the end boundary, since the end key is exclusive.
+ // We try to be extra safe here.
+ if (end.has_value() &&
+ cfd->user_comparator()->CompareWithoutTimestamp(
+ end.value(), fmd.smallest.user_key()) < 0) {
+ continue;
+ }
+
+ list[num++] = cfd->table_cache()->NewIterator(
+ read_options, file_options_compactions,
+ cfd->internal_comparator(), fmd, range_del_agg,
+ c->mutable_cf_options()->prefix_extractor,
+ /*table_reader_ptr=*/nullptr,
+ /*file_read_hist=*/nullptr, TableReaderCaller::kCompaction,
+ /*arena=*/nullptr,
+ /*skip_filters=*/false,
+ /*level=*/static_cast<int>(c->level(which)),
+ MaxFileSizeForL0MetaPin(*c->mutable_cf_options()),
+ /*smallest_compaction_key=*/nullptr,
+ /*largest_compaction_key=*/nullptr,
+ /*allow_unprepared_value=*/false);
+ }
+ } else {
+ // Create concatenating iterator for the files from this level
+ list[num++] = new LevelIterator(
+ cfd->table_cache(), read_options, file_options_compactions,
+ cfd->internal_comparator(), c->input_levels(which),
+ c->mutable_cf_options()->prefix_extractor,
+ /*should_sample=*/false,
+ /*no per level latency histogram=*/nullptr,
+ TableReaderCaller::kCompaction, /*skip_filters=*/false,
+ /*level=*/static_cast<int>(c->level(which)), range_del_agg,
+ c->boundaries(which));
+ }
+ }
+ }
+ assert(num <= space);
+ InternalIterator* result =
+ NewMergingIterator(&c->column_family_data()->internal_comparator(), list,
+ static_cast<int>(num));
+ delete[] list;
+ return result;
+}
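+// Editor's illustrative arithmetic (hypothetical values, not upstream
+// documentation): for the iterator-count estimate above, a compaction with
+// four L0 input files plus one additional input level needs space for
+// 4 + 2 - 1 == 5 iterators (one per L0 file and one concatenating
+// LevelIterator for the non-L0 level); a compaction that does not include
+// L0 needs just one LevelIterator per input level.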
+
+Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel,
+ FileMetaData** meta,
+ ColumnFamilyData** cfd) {
+ for (auto cfd_iter : *column_family_set_) {
+ if (!cfd_iter->initialized()) {
+ continue;
+ }
+ Version* version = cfd_iter->current();
+ const auto* vstorage = version->storage_info();
+ for (int level = 0; level < vstorage->num_levels(); level++) {
+ for (const auto& file : vstorage->LevelFiles(level)) {
+ if (file->fd.GetNumber() == number) {
+ *meta = file;
+ *filelevel = level;
+ *cfd = cfd_iter;
+ return Status::OK();
+ }
+ }
+ }
+ }
+ return Status::NotFound("File not present in any level");
+}
+
+void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
+ for (auto cfd : *column_family_set_) {
+ if (cfd->IsDropped() || !cfd->initialized()) {
+ continue;
+ }
+ for (int level = 0; level < cfd->NumberLevels(); level++) {
+ for (const auto& file :
+ cfd->current()->storage_info()->LevelFiles(level)) {
+ LiveFileMetaData filemetadata;
+ filemetadata.column_family_name = cfd->GetName();
+ uint32_t path_id = file->fd.GetPathId();
+ if (path_id < cfd->ioptions()->cf_paths.size()) {
+ filemetadata.db_path = cfd->ioptions()->cf_paths[path_id].path;
+ } else {
+ assert(!cfd->ioptions()->cf_paths.empty());
+ filemetadata.db_path = cfd->ioptions()->cf_paths.back().path;
+ }
+ filemetadata.directory = filemetadata.db_path;
+ const uint64_t file_number = file->fd.GetNumber();
+ filemetadata.name = MakeTableFileName("", file_number);
+ filemetadata.relative_filename = filemetadata.name.substr(1);
+ filemetadata.file_number = file_number;
+ filemetadata.level = level;
+ filemetadata.size = file->fd.GetFileSize();
+ filemetadata.smallestkey = file->smallest.user_key().ToString();
+ filemetadata.largestkey = file->largest.user_key().ToString();
+ filemetadata.smallest_seqno = file->fd.smallest_seqno;
+ filemetadata.largest_seqno = file->fd.largest_seqno;
+ filemetadata.num_reads_sampled =
+ file->stats.num_reads_sampled.load(std::memory_order_relaxed);
+ filemetadata.being_compacted = file->being_compacted;
+ filemetadata.num_entries = file->num_entries;
+ filemetadata.num_deletions = file->num_deletions;
+ filemetadata.oldest_blob_file_number = file->oldest_blob_file_number;
+ filemetadata.file_checksum = file->file_checksum;
+ filemetadata.file_checksum_func_name = file->file_checksum_func_name;
+ filemetadata.temperature = file->temperature;
+ filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime();
+ filemetadata.file_creation_time = file->TryGetFileCreationTime();
+ metadata->push_back(filemetadata);
+ }
+ }
+ }
+}
+
+void VersionSet::GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+ std::vector<ObsoleteBlobFileInfo>* blob_files,
+ std::vector<std::string>* manifest_filenames,
+ uint64_t min_pending_output) {
+ assert(files);
+ assert(blob_files);
+ assert(manifest_filenames);
+ assert(files->empty());
+ assert(blob_files->empty());
+ assert(manifest_filenames->empty());
+
+ std::vector<ObsoleteFileInfo> pending_files;
+ for (auto& f : obsolete_files_) {
+ if (f.metadata->fd.GetNumber() < min_pending_output) {
+ files->emplace_back(std::move(f));
+ } else {
+ pending_files.emplace_back(std::move(f));
+ }
+ }
+ obsolete_files_.swap(pending_files);
+
+ std::vector<ObsoleteBlobFileInfo> pending_blob_files;
+ for (auto& blob_file : obsolete_blob_files_) {
+ if (blob_file.GetBlobFileNumber() < min_pending_output) {
+ blob_files->emplace_back(std::move(blob_file));
+ } else {
+ pending_blob_files.emplace_back(std::move(blob_file));
+ }
+ }
+ obsolete_blob_files_.swap(pending_blob_files);
+
+ obsolete_manifests_.swap(*manifest_filenames);
+}
+
+ColumnFamilyData* VersionSet::CreateColumnFamily(
+ const ColumnFamilyOptions& cf_options, const VersionEdit* edit) {
+ assert(edit->is_column_family_add_);
+
+ MutableCFOptions dummy_cf_options;
+ Version* dummy_versions =
+ new Version(nullptr, this, file_options_, dummy_cf_options, io_tracer_);
+ // Ref() the dummy version once so that later we can delete it by calling
+ // Unref(), instead of calling "delete" explicitly (~Version is private).
+ dummy_versions->Ref();
+ auto new_cfd = column_family_set_->CreateColumnFamily(
+ edit->column_family_name_, edit->column_family_, dummy_versions,
+ cf_options);
+
+ Version* v = new Version(new_cfd, this, file_options_,
+ *new_cfd->GetLatestMutableCFOptions(), io_tracer_,
+ current_version_number_++);
+
+ constexpr bool update_stats = false;
+
+ v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), update_stats);
+
+ AppendVersion(new_cfd, v);
+ // GetLatestMutableCFOptions() is safe here without the mutex since the
+ // cfd is not yet available to clients.
+ new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions(),
+ LastSequence());
+ new_cfd->SetLogNumber(edit->log_number_);
+ return new_cfd;
+}
+
+uint64_t VersionSet::GetNumLiveVersions(Version* dummy_versions) {
+ uint64_t count = 0;
+ for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+ count++;
+ }
+ return count;
+}
+
+uint64_t VersionSet::GetTotalSstFilesSize(Version* dummy_versions) {
+ std::unordered_set<uint64_t> unique_files;
+ uint64_t total_files_size = 0;
+ for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+ VersionStorageInfo* storage_info = v->storage_info();
+ for (int level = 0; level < storage_info->num_levels_; level++) {
+ for (const auto& file_meta : storage_info->LevelFiles(level)) {
+ if (unique_files.find(file_meta->fd.packed_number_and_path_id) ==
+ unique_files.end()) {
+ unique_files.insert(file_meta->fd.packed_number_and_path_id);
+ total_files_size += file_meta->fd.GetFileSize();
+ }
+ }
+ }
+ }
+ return total_files_size;
+}
+
+uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
+ std::unordered_set<uint64_t> unique_blob_files;
+
+ uint64_t all_versions_blob_file_size = 0;
+
+ for (auto* v = dummy_versions->next_; v != dummy_versions; v = v->next_) {
+ // iterate all the versions
+ const auto* vstorage = v->storage_info();
+ assert(vstorage);
+
+ const auto& blob_files = vstorage->GetBlobFiles();
+
+ for (const auto& meta : blob_files) {
+ assert(meta);
+
+ const uint64_t blob_file_number = meta->GetBlobFileNumber();
+
+ if (unique_blob_files.find(blob_file_number) == unique_blob_files.end()) {
+ // find Blob file that has not been counted
+ unique_blob_files.insert(blob_file_number);
+ all_versions_blob_file_size += meta->GetBlobFileSize();
+ }
+ }
+ }
+
+ return all_versions_blob_file_size;
+}
+
+Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd,
+ const std::string& fpath, int level,
+ const FileMetaData& meta) {
+ uint64_t fsize = 0;
+ Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr);
+ if (status.ok()) {
+ if (fsize != meta.fd.GetFileSize()) {
+ status = Status::Corruption("File size mismatch: " + fpath);
+ }
+ }
+ if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) {
+ assert(cfd);
+ TableCache* table_cache = cfd->table_cache();
+ assert(table_cache);
+
+ const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions();
+ assert(cf_opts);
+ std::shared_ptr<const SliceTransform> pe = cf_opts->prefix_extractor;
+ size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts);
+
+ const FileOptions& file_opts = file_options();
+
+ Version* version = cfd->current();
+ assert(version);
+ VersionStorageInfo& storage_info = version->storage_info_;
+ const InternalKeyComparator* icmp = storage_info.InternalComparator();
+ assert(icmp);
+
+ InternalStats* internal_stats = cfd->internal_stats();
+
+ FileMetaData meta_copy = meta;
+ status = table_cache->FindTable(
+ ReadOptions(), file_opts, *icmp, meta_copy,
+ &(meta_copy.table_reader_handle), pe,
+ /*no_io=*/false, /*record_read_stats=*/true,
+ internal_stats->GetFileReadHist(level), false, level,
+ /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
+ meta_copy.temperature);
+ if (meta_copy.table_reader_handle) {
+ table_cache->ReleaseHandle(meta_copy.table_reader_handle);
+ }
+ }
+ return status;
+}
+
+ReactiveVersionSet::ReactiveVersionSet(
+ const std::string& dbname, const ImmutableDBOptions* _db_options,
+ const FileOptions& _file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager, WriteController* write_controller,
+ const std::shared_ptr<IOTracer>& io_tracer)
+ : VersionSet(dbname, _db_options, _file_options, table_cache,
+ write_buffer_manager, write_controller,
+ /*block_cache_tracer=*/nullptr, io_tracer, /*db_id*/ "",
+ /*db_session_id*/ "") {}
+
+ReactiveVersionSet::~ReactiveVersionSet() {}
+
+Status ReactiveVersionSet::Recover(
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+ std::unique_ptr<Status>* manifest_reader_status) {
+ assert(manifest_reader != nullptr);
+ assert(manifest_reporter != nullptr);
+ assert(manifest_reader_status != nullptr);
+
+ manifest_reader_status->reset(new Status());
+ manifest_reporter->reset(new LogReporter());
+ static_cast_with_check<LogReporter>(manifest_reporter->get())->status =
+ manifest_reader_status->get();
+ Status s = MaybeSwitchManifest(manifest_reporter->get(), manifest_reader);
+ if (!s.ok()) {
+ return s;
+ }
+ log::Reader* reader = manifest_reader->get();
+ assert(reader);
+
+ manifest_tailer_.reset(new ManifestTailer(
+ column_families, const_cast<ReactiveVersionSet*>(this), io_tracer_));
+
+ manifest_tailer_->Iterate(*reader, manifest_reader_status->get());
+
+ return manifest_tailer_->status();
+}
+
+Status ReactiveVersionSet::ReadAndApply(
+ InstrumentedMutex* mu,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ Status* manifest_read_status,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed) {
+ assert(manifest_reader != nullptr);
+ assert(cfds_changed != nullptr);
+ mu->AssertHeld();
+
+ Status s;
+ log::Reader* reader = manifest_reader->get();
+ assert(reader);
+ s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader);
+ if (!s.ok()) {
+ return s;
+ }
+ manifest_tailer_->Iterate(*(manifest_reader->get()), manifest_read_status);
+ s = manifest_tailer_->status();
+ if (s.ok()) {
+ *cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
+ }
+
+ return s;
+}
+
+Status ReactiveVersionSet::MaybeSwitchManifest(
+ log::Reader::Reporter* reporter,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader) {
+ assert(manifest_reader != nullptr);
+ Status s;
+ std::string manifest_path;
+ s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
+ &manifest_file_number_);
+ if (!s.ok()) {
+ return s;
+ }
+ std::unique_ptr<FSSequentialFile> manifest_file;
+ if (manifest_reader->get() != nullptr &&
+ manifest_reader->get()->file()->file_name() == manifest_path) {
+ // CURRENT points to the same MANIFEST as before, no need to switch
+ // MANIFEST.
+ return s;
+ }
+ assert(nullptr == manifest_reader->get() ||
+ manifest_reader->get()->file()->file_name() != manifest_path);
+ s = fs_->FileExists(manifest_path, IOOptions(), nullptr);
+ if (s.IsNotFound()) {
+ return Status::TryAgain(
+ "The primary may have switched to a new MANIFEST and deleted the old "
+ "one.");
+ } else if (!s.ok()) {
+ return s;
+ }
+ TEST_SYNC_POINT(
+ "ReactiveVersionSet::MaybeSwitchManifest:"
+ "AfterGetCurrentManifestPath:0");
+ TEST_SYNC_POINT(
+ "ReactiveVersionSet::MaybeSwitchManifest:"
+ "AfterGetCurrentManifestPath:1");
+ // The primary can also delete the MANIFEST while the secondary is reading
+ // it. This is OK on POSIX. For other file systems, maybe create a hard link
+ // to MANIFEST. The hard link should be cleaned up later by the secondary.
+ s = fs_->NewSequentialFile(manifest_path,
+ fs_->OptimizeForManifestRead(file_options_),
+ &manifest_file, nullptr);
+ std::unique_ptr<SequentialFileReader> manifest_file_reader;
+ if (s.ok()) {
+ manifest_file_reader.reset(new SequentialFileReader(
+ std::move(manifest_file), manifest_path,
+ db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
+ manifest_reader->reset(new log::FragmentBufferedReader(
+ nullptr, std::move(manifest_file_reader), reporter, true /* checksum */,
+ 0 /* log_number */));
+ ROCKS_LOG_INFO(db_options_->info_log, "Switched to new manifest: %s\n",
+ manifest_path.c_str());
+ if (manifest_tailer_) {
+ manifest_tailer_->PrepareToReadNewManifest();
+ }
+ } else if (s.IsPathNotFound()) {
+ // This can happen if the primary switches to a new MANIFEST after the
+ // secondary reads the CURRENT file but before the secondary actually tries
+ // to open the MANIFEST.
+ s = Status::TryAgain(
+ "The primary may have switched to a new MANIFEST and deleted the old "
+ "one.");
+ }
+ return s;
+}
+
+#ifndef NDEBUG
+uint64_t ReactiveVersionSet::TEST_read_edits_in_atomic_group() const {
+ assert(manifest_tailer_);
+ return manifest_tailer_->GetReadBuffer().TEST_read_edits_in_atomic_group();
+}
+#endif // !NDEBUG
+
+std::vector<VersionEdit>& ReactiveVersionSet::replay_buffer() {
+ assert(manifest_tailer_);
+ return manifest_tailer_->GetReadBuffer().replay_buffer();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set.h b/src/rocksdb/db/version_set.h
new file mode 100644
index 000000000..03176a8b5
--- /dev/null
+++ b/src/rocksdb/db/version_set.h
@@ -0,0 +1,1652 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// The representation of a DBImpl consists of a set of Versions. The
+// newest version is called "current". Older versions may be kept
+// around to provide a consistent view to live iterators.
+//
+// Each Version keeps track of a set of table files per level, as well as a
+// set of blob files. The entire set of versions is maintained in a
+// VersionSet.
+//
+// Version,VersionSet are thread-compatible, but require external
+// synchronization on all accesses.
+
+#pragma once
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <map>
+#include <memory>
+#include <optional>
+#include <set>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "cache/cache_helpers.h"
+#include "db/blob/blob_file_meta.h"
+#include "db/column_family.h"
+#include "db/compaction/compaction.h"
+#include "db/compaction/compaction_picker.h"
+#include "db/dbformat.h"
+#include "db/file_indexer.h"
+#include "db/log_reader.h"
+#include "db/range_del_aggregator.h"
+#include "db/read_callback.h"
+#include "db/table_cache.h"
+#include "db/version_builder.h"
+#include "db/version_edit.h"
+#include "db/write_controller.h"
+#include "env/file_system_tracer.h"
+#if USE_COROUTINES
+#include "folly/experimental/coro/BlockingWait.h"
+#include "folly/experimental/coro/Collect.h"
+#endif
+#include "monitoring/instrumented_mutex.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/file_checksum.h"
+#include "table/get_context.h"
+#include "table/multiget_context.h"
+#include "trace_replay/block_cache_tracer.h"
+#include "util/autovector.h"
+#include "util/coro_utils.h"
+#include "util/hash_containers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace log {
+class Writer;
+}
+
+class BlobIndex;
+class Compaction;
+class LogBuffer;
+class LookupKey;
+class MemTable;
+class Version;
+class VersionSet;
+class WriteBufferManager;
+class MergeContext;
+class ColumnFamilySet;
+class MergeIteratorBuilder;
+class SystemClock;
+class ManifestTailer;
+class FilePickerMultiGet;
+
+// VersionEdit is always supposed to be valid and is used to point at entries
+// in the Manifest. Ideally it should not be used as a container to carry
+// around a few of its fields as function params, because that can cause
+// readers to think it is a valid entry from the Manifest. To avoid that
+// confusion, we introduce VersionEditParams, which simply carries around
+// multiple VersionEdit params. It need not point to a valid record in the
+// Manifest.
+using VersionEditParams = VersionEdit;
+
+// Return the smallest index i such that file_level.files[i]->largest >= key.
+// Return file_level.num_files if there is no such file.
+// REQUIRES: "file_level.files" contains a sorted list of
+// non-overlapping files.
+extern int FindFile(const InternalKeyComparator& icmp,
+ const LevelFilesBrief& file_level, const Slice& key);
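+// Editor's illustrative example (hypothetical keys, not upstream
+// documentation): for a level whose three files have largest keys "b", "d"
+// and "f", FindFile with key "c" returns index 1 (the first file whose
+// largest key is >= "c"), and with key "g" it returns num_files (3),
+// meaning no file in this level can contain the key.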
+
+// Returns true iff some file in "files" overlaps the user key range
+// [*smallest,*largest].
+// smallest==nullptr represents a key smaller than all keys in the DB.
+// largest==nullptr represents a key larger than all keys in the DB.
+// REQUIRES: If disjoint_sorted_files, file_level.files[]
+// contains disjoint ranges in sorted order.
+extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp,
+ bool disjoint_sorted_files,
+ const LevelFilesBrief& file_level,
+ const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+// Generate LevelFilesBrief from vector<FdWithKeyRange*>
+// Would copy smallest_key and largest_key data to sequential memory
+// arena: Arena used to allocate the memory
+extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level,
+ const std::vector<FileMetaData*>& files,
+ Arena* arena);
+
+// Information of the storage associated with each Version, including number of
+// levels of LSM tree, files information at each level, files marked for
+// compaction, blob files, etc.
+class VersionStorageInfo {
+ public:
+ VersionStorageInfo(const InternalKeyComparator* internal_comparator,
+ const Comparator* user_comparator, int num_levels,
+ CompactionStyle compaction_style,
+ VersionStorageInfo* src_vstorage,
+ bool _force_consistency_checks);
+ // No copying allowed
+ VersionStorageInfo(const VersionStorageInfo&) = delete;
+ void operator=(const VersionStorageInfo&) = delete;
+ ~VersionStorageInfo();
+
+ void Reserve(int level, size_t size) { files_[level].reserve(size); }
+
+ void AddFile(int level, FileMetaData* f);
+
+ // Resize/Initialize the space for compact_cursor_
+ void ResizeCompactCursors(int level) {
+ compact_cursor_.resize(level, InternalKey());
+ }
+
+ const std::vector<InternalKey>& GetCompactCursors() const {
+ return compact_cursor_;
+ }
+
+ // REQUIRES: ResizeCompactCursors has been called
+ void AddCursorForOneLevel(int level,
+ const InternalKey& smallest_uncompacted_key) {
+ compact_cursor_[level] = smallest_uncompacted_key;
+ }
+
+ // REQUIRES: lock is held
+ // Update the compact cursor and advance the file index using increment
+ // so that it can point to the next cursor (increment means the number of
+ // input files in this level of the last compaction)
+ const InternalKey& GetNextCompactCursor(int level, size_t increment) {
+ int cmp_idx = next_file_to_compact_by_size_[level] + (int)increment;
+ assert(cmp_idx <= (int)files_by_compaction_pri_[level].size());
+ // TODO(zichen): may need to update next_file_to_compact_by_size_
+ // for parallel compaction.
+ InternalKey new_cursor;
+ if (cmp_idx >= (int)files_by_compaction_pri_[level].size()) {
+ cmp_idx = 0;
+ }
+ // TODO(zichen): rethink if this strategy gives us some good guarantee
+ return files_[level][files_by_compaction_pri_[level][cmp_idx]]->smallest;
+ }
+
+ void ReserveBlob(size_t size) { blob_files_.reserve(size); }
+
+ void AddBlobFile(std::shared_ptr<BlobFileMetaData> blob_file_meta);
+
+ void PrepareForVersionAppend(const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options);
+
+ // REQUIRES: PrepareForVersionAppend has been called
+ void SetFinalized();
+
+ // Update the accumulated stats from a file-meta.
+ void UpdateAccumulatedStats(FileMetaData* file_meta);
+
+ // Decrease the current stat from a to-be-deleted file-meta
+ void RemoveCurrentStats(FileMetaData* file_meta);
+
+ // Updates internal structures that keep track of compaction scores
+ // We use compaction scores to figure out which compaction to do next
+ // REQUIRES: db_mutex held!!
+ // TODO find a better way to pass compaction_options_fifo.
+ void ComputeCompactionScore(const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options);
+
+ // Estimate est_comp_needed_bytes_
+ void EstimateCompactionBytesNeeded(
+ const MutableCFOptions& mutable_cf_options);
+
+ // This computes files_marked_for_compaction_ and is called by
+ // ComputeCompactionScore()
+ void ComputeFilesMarkedForCompaction();
+
+ // This computes ttl_expired_files_ and is called by
+ // ComputeCompactionScore()
+ void ComputeExpiredTtlFiles(const ImmutableOptions& ioptions,
+ const uint64_t ttl);
+
+ // This computes files_marked_for_periodic_compaction_ and is called by
+ // ComputeCompactionScore()
+ void ComputeFilesMarkedForPeriodicCompaction(
+ const ImmutableOptions& ioptions,
+ const uint64_t periodic_compaction_seconds);
+
+ // This computes bottommost_files_marked_for_compaction_ and is called by
+ // ComputeCompactionScore() or UpdateOldestSnapshot().
+ //
+ // Among bottommost files (assumes they've already been computed), marks the
+ // ones that have keys that would be eliminated if recompacted, according to
+ // the seqnum of the oldest existing snapshot. Must be called every time
+ // oldest snapshot changes as that is when bottom-level files can become
+ // eligible for compaction.
+ //
+ // REQUIRES: DB mutex held
+ void ComputeBottommostFilesMarkedForCompaction();
+
+ // This computes files_marked_for_forced_blob_gc_ and is called by
+ // ComputeCompactionScore()
+ //
+ // REQUIRES: DB mutex held
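+  //
+  // For example (assuming the usual interpretation of these options), an age
+  // cutoff of 0.25 restricts the computation to the oldest quarter of blob
+  // files, and the referencing SST files are marked once the garbage ratio
+  // within that batch reaches blob_garbage_collection_force_threshold.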
+ void ComputeFilesMarkedForForcedBlobGC(
+ double blob_garbage_collection_age_cutoff,
+ double blob_garbage_collection_force_threshold);
+
+ bool level0_non_overlapping() const { return level0_non_overlapping_; }
+
+ // Updates the oldest snapshot and related internal state, like the bottommost
+ // files marked for compaction.
+ // REQUIRES: DB mutex held
+ void UpdateOldestSnapshot(SequenceNumber oldest_snapshot_seqnum);
+
+ int MaxInputLevel() const;
+ int MaxOutputLevel(bool allow_ingest_behind) const;
+
+ // Return level number that has idx'th highest score
+ int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; }
+
+ // Return idx'th highest score
+ double CompactionScore(int idx) const { return compaction_score_[idx]; }
+
+ void GetOverlappingInputs(
+ int level, const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index = -1, // index of overlap file
+ int* file_index = nullptr, // return index of overlap file
+ bool expand_range = true, // if set, returns files which overlap the
+ // range and overlap each other. If false,
+ // then just files intersecting the range
+ InternalKey** next_smallest = nullptr) // if non-null, returns the
+ const; // smallest key of next file not included
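+  // Illustrative sketch only for GetOverlappingInputs above, assuming
+  // `vstorage` is a saved VersionStorageInfo and `begin`/`end` are
+  // InternalKeys bounding the range of interest:
+  //
+  //   std::vector<FileMetaData*> inputs;
+  //   vstorage->GetOverlappingInputs(level, &begin, &end, &inputs);
+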
+ void GetCleanInputsWithinInterval(
+ int level, const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index = -1, // index of overlap file
+ int* file_index = nullptr) // return index of overlap file
+ const;
+
+ void GetOverlappingInputsRangeBinarySearch(
+ int level, // level > 0
+ const InternalKey* begin, // nullptr means before all keys
+ const InternalKey* end, // nullptr means after all keys
+ std::vector<FileMetaData*>* inputs,
+ int hint_index, // index of overlap file
+ int* file_index, // return index of overlap file
+ bool within_interval = false, // if set, force the inputs within interval
+ InternalKey** next_smallest = nullptr) // if non-null, returns the
+ const; // smallest key of next file not included
+
+ // Returns true iff some file in the specified level overlaps
+ // some part of [*smallest_user_key,*largest_user_key].
+ // smallest_user_key==NULL represents a key smaller than all keys in the DB.
+  // largest_user_key==NULL represents a key larger than all keys in the DB.
+ bool OverlapInLevel(int level, const Slice* smallest_user_key,
+ const Slice* largest_user_key);
+
+ // Returns true iff the first or last file in inputs contains
+ // an overlapping user key to the file "just outside" of it (i.e.
+ // just after the last file, or just before the first file)
+ // REQUIRES: "*inputs" is a sorted list of non-overlapping files
+ bool HasOverlappingUserKey(const std::vector<FileMetaData*>* inputs,
+ int level);
+
+ int num_levels() const { return num_levels_; }
+
+ // REQUIRES: PrepareForVersionAppend has been called
+ int num_non_empty_levels() const {
+ assert(finalized_);
+ return num_non_empty_levels_;
+ }
+
+ // REQUIRES: PrepareForVersionAppend has been called
+  // This may or may not equal the number of L0 files; it exists to keep
+  // backward-compatible behavior for universal compaction.
+ int l0_delay_trigger_count() const { return l0_delay_trigger_count_; }
+
+ void set_l0_delay_trigger_count(int v) { l0_delay_trigger_count_ = v; }
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ int NumLevelFiles(int level) const {
+ assert(finalized_);
+ return static_cast<int>(files_[level].size());
+ }
+
+ // Return the combined file size of all files at the specified level.
+ uint64_t NumLevelBytes(int level) const;
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ const std::vector<FileMetaData*>& LevelFiles(int level) const {
+ return files_[level];
+ }
+
+ class FileLocation {
+ public:
+ FileLocation() = default;
+ FileLocation(int level, size_t position)
+ : level_(level), position_(position) {}
+
+ int GetLevel() const { return level_; }
+ size_t GetPosition() const { return position_; }
+
+ bool IsValid() const { return level_ >= 0; }
+
+ bool operator==(const FileLocation& rhs) const {
+ return level_ == rhs.level_ && position_ == rhs.position_;
+ }
+
+ bool operator!=(const FileLocation& rhs) const { return !(*this == rhs); }
+
+ static FileLocation Invalid() { return FileLocation(); }
+
+ private:
+ int level_ = -1;
+ size_t position_ = 0;
+ };
+
+ // REQUIRES: PrepareForVersionAppend has been called
+ FileLocation GetFileLocation(uint64_t file_number) const {
+ const auto it = file_locations_.find(file_number);
+
+ if (it == file_locations_.end()) {
+ return FileLocation::Invalid();
+ }
+
+ assert(it->second.GetLevel() < num_levels_);
+ assert(it->second.GetPosition() < files_[it->second.GetLevel()].size());
+ assert(files_[it->second.GetLevel()][it->second.GetPosition()]);
+ assert(files_[it->second.GetLevel()][it->second.GetPosition()]
+ ->fd.GetNumber() == file_number);
+
+ return it->second;
+ }
+
+ // REQUIRES: PrepareForVersionAppend has been called
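+  // Illustrative sketch only: resolving a table file number to its metadata,
+  // assuming `vstorage` points to a prepared VersionStorageInfo:
+  //
+  //   if (const FileMetaData* meta =
+  //           vstorage->GetFileMetaDataByNumber(file_number)) {
+  //     assert(meta->fd.GetNumber() == file_number);
+  //   }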
+ FileMetaData* GetFileMetaDataByNumber(uint64_t file_number) const {
+ auto location = GetFileLocation(file_number);
+
+ if (!location.IsValid()) {
+ return nullptr;
+ }
+
+ return files_[location.GetLevel()][location.GetPosition()];
+ }
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ using BlobFiles = std::vector<std::shared_ptr<BlobFileMetaData>>;
+ const BlobFiles& GetBlobFiles() const { return blob_files_; }
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ BlobFiles::const_iterator GetBlobFileMetaDataLB(
+ uint64_t blob_file_number) const;
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ std::shared_ptr<BlobFileMetaData> GetBlobFileMetaData(
+ uint64_t blob_file_number) const {
+ const auto it = GetBlobFileMetaDataLB(blob_file_number);
+
+ assert(it == blob_files_.end() || *it);
+
+ if (it != blob_files_.end() &&
+ (*it)->GetBlobFileNumber() == blob_file_number) {
+ return *it;
+ }
+
+ return std::shared_ptr<BlobFileMetaData>();
+ }
+
+ // REQUIRES: This version has been saved (see VersionBuilder::SaveTo)
+ struct BlobStats {
+ uint64_t total_file_size = 0;
+ uint64_t total_garbage_size = 0;
+ double space_amp = 0.0;
+ };
+
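+  // Aggregates blob file size and garbage statistics over blob_files_. Worked
+  // example: 100 GB of blob files carrying 25 GB of garbage gives
+  // space_amp = 100 / (100 - 25), i.e. roughly 1.33; if the garbage size
+  // equals or exceeds the total blob file size, space_amp is left at 0.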
+ BlobStats GetBlobStats() const {
+ uint64_t total_file_size = 0;
+ uint64_t total_garbage_size = 0;
+
+ for (const auto& meta : blob_files_) {
+ assert(meta);
+
+ total_file_size += meta->GetBlobFileSize();
+ total_garbage_size += meta->GetGarbageBlobBytes();
+ }
+
+ double space_amp = 0.0;
+ if (total_file_size > total_garbage_size) {
+ space_amp = static_cast<double>(total_file_size) /
+ (total_file_size - total_garbage_size);
+ }
+
+ return BlobStats{total_file_size, total_garbage_size, space_amp};
+ }
+
+ const ROCKSDB_NAMESPACE::LevelFilesBrief& LevelFilesBrief(int level) const {
+ assert(level < static_cast<int>(level_files_brief_.size()));
+ return level_files_brief_[level];
+ }
+
+ // REQUIRES: PrepareForVersionAppend has been called
+ const std::vector<int>& FilesByCompactionPri(int level) const {
+ assert(finalized_);
+ return files_by_compaction_pri_[level];
+ }
+
+ // REQUIRES: ComputeCompactionScore has been called
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForCompaction()
+ const {
+ assert(finalized_);
+ return files_marked_for_compaction_;
+ }
+
+ // REQUIRES: ComputeCompactionScore has been called
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const {
+ assert(finalized_);
+ return expired_ttl_files_;
+ }
+
+ // REQUIRES: ComputeCompactionScore has been called
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>&
+ FilesMarkedForPeriodicCompaction() const {
+ assert(finalized_);
+ return files_marked_for_periodic_compaction_;
+ }
+
+ void TEST_AddFileMarkedForPeriodicCompaction(int level, FileMetaData* f) {
+ files_marked_for_periodic_compaction_.emplace_back(level, f);
+ }
+
+ // REQUIRES: ComputeCompactionScore has been called
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>&
+ BottommostFilesMarkedForCompaction() const {
+ assert(finalized_);
+ return bottommost_files_marked_for_compaction_;
+ }
+
+ // REQUIRES: ComputeCompactionScore has been called
+ // REQUIRES: DB mutex held during access
+ const autovector<std::pair<int, FileMetaData*>>& FilesMarkedForForcedBlobGC()
+ const {
+ assert(finalized_);
+ return files_marked_for_forced_blob_gc_;
+ }
+
+ int base_level() const { return base_level_; }
+ double level_multiplier() const { return level_multiplier_; }
+
+ // REQUIRES: lock is held
+ // Set the index that is used to offset into files_by_compaction_pri_ to find
+ // the next compaction candidate file.
+ void SetNextCompactionIndex(int level, int index) {
+ next_file_to_compact_by_size_[level] = index;
+ }
+
+ // REQUIRES: lock is held
+ int NextCompactionIndex(int level) const {
+ return next_file_to_compact_by_size_[level];
+ }
+
+ // REQUIRES: PrepareForVersionAppend has been called
+ const FileIndexer& file_indexer() const {
+ assert(finalized_);
+ return file_indexer_;
+ }
+
+ // Only the first few entries of files_by_compaction_pri_ are sorted.
+ // There is no need to sort all the files because it is likely
+ // that on a running system, we need to look at only the first
+ // few largest files because a new version is created every few
+ // seconds/minutes (because of concurrent compactions).
+ static const size_t kNumberFilesToSort = 50;
+
+ // Return a human-readable short (single-line) summary of the number
+ // of files per level. Uses *scratch as backing store.
+ struct LevelSummaryStorage {
+ char buffer[1000];
+ };
+ struct FileSummaryStorage {
+ char buffer[3000];
+ };
+ const char* LevelSummary(LevelSummaryStorage* scratch) const;
+ // Return a human-readable short (single-line) summary of files
+ // in a specified level. Uses *scratch as backing store.
+ const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
+
+ // Return the maximum overlapping data (in bytes) at next level for any
+ // file at a level >= 1.
+ uint64_t MaxNextLevelOverlappingBytes();
+
+ // Return a human readable string that describes this version's contents.
+ std::string DebugString(bool hex = false) const;
+
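+  // Estimates the average on-disk value size from the sampled stats below:
+  // the average raw value size, accumulated_raw_value_size_ /
+  // accumulated_num_non_deletions_, scaled by the sampled size ratio
+  // accumulated_file_size_ / (accumulated_raw_key_size_ +
+  // accumulated_raw_value_size_).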
+ uint64_t GetAverageValueSize() const {
+ if (accumulated_num_non_deletions_ == 0) {
+ return 0;
+ }
+ assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0);
+ assert(accumulated_file_size_ > 0);
+ return accumulated_raw_value_size_ / accumulated_num_non_deletions_ *
+ accumulated_file_size_ /
+ (accumulated_raw_key_size_ + accumulated_raw_value_size_);
+ }
+
+ uint64_t GetEstimatedActiveKeys() const;
+
+ double GetEstimatedCompressionRatioAtLevel(int level) const;
+
+ // re-initializes the index that is used to offset into
+ // files_by_compaction_pri_
+ // to find the next compaction candidate file.
+ void ResetNextCompactionIndex(int level) {
+ next_file_to_compact_by_size_[level] = 0;
+ }
+
+ const InternalKeyComparator* InternalComparator() const {
+ return internal_comparator_;
+ }
+
+ // Returns maximum total bytes of data on a given level.
+ uint64_t MaxBytesForLevel(int level) const;
+
+ // Returns an estimate of the amount of live data in bytes.
+ uint64_t EstimateLiveDataSize() const;
+
+ uint64_t estimated_compaction_needed_bytes() const {
+ return estimated_compaction_needed_bytes_;
+ }
+
+ void TEST_set_estimated_compaction_needed_bytes(uint64_t v) {
+ estimated_compaction_needed_bytes_ = v;
+ }
+
+ bool force_consistency_checks() const { return force_consistency_checks_; }
+
+ SequenceNumber bottommost_files_mark_threshold() const {
+ return bottommost_files_mark_threshold_;
+ }
+
+  // Returns whether any key in [`smallest_user_key`, `largest_user_key`] could
+  // appear in an older L0 file than `last_l0_idx` or in a greater level than
+  // `last_level`.
+ //
+ // @param last_level Level after which we check for overlap
+ // @param last_l0_idx If `last_level == 0`, index of L0 file after which we
+ // check for overlap; otherwise, must be -1
+ bool RangeMightExistAfterSortedRun(const Slice& smallest_user_key,
+ const Slice& largest_user_key,
+ int last_level, int last_l0_idx);
+
+ private:
+ void ComputeCompensatedSizes();
+ void UpdateNumNonEmptyLevels();
+ void CalculateBaseBytes(const ImmutableOptions& ioptions,
+ const MutableCFOptions& options);
+ void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options,
+ const MutableCFOptions& mutable_cf_options);
+
+ void GenerateFileIndexer() {
+ file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_);
+ }
+
+ void GenerateLevelFilesBrief();
+ void GenerateLevel0NonOverlapping();
+ void GenerateBottommostFiles();
+ void GenerateFileLocationIndex();
+
+ const InternalKeyComparator* internal_comparator_;
+ const Comparator* user_comparator_;
+ int num_levels_; // Number of levels
+  int num_non_empty_levels_;  // Any level at or above this count is
+                              // guaranteed to be empty.
+ // Per-level max bytes
+ std::vector<uint64_t> level_max_bytes_;
+
+ // A short brief metadata of files per level
+ autovector<ROCKSDB_NAMESPACE::LevelFilesBrief> level_files_brief_;
+ FileIndexer file_indexer_;
+  Arena arena_;  // Used to allocate space for level_files_brief_
+
+ CompactionStyle compaction_style_;
+
+ // List of files per level, files in each level are arranged
+ // in increasing order of keys
+ std::vector<FileMetaData*>* files_;
+
+ // Map of all table files in version. Maps file number to (level, position on
+ // level).
+ using FileLocations = UnorderedMap<uint64_t, FileLocation>;
+ FileLocations file_locations_;
+
+ // Vector of blob files in version sorted by blob file number.
+ BlobFiles blob_files_;
+
+ // Level that L0 data should be compacted to. All levels < base_level_ should
+  // be empty. -1 if it is not level compaction (not applicable).
+ int base_level_;
+
+ double level_multiplier_;
+
+  // A list of the same set of files that are stored in files_, but files in
+  // each level are sorted by compaction priority (for example, by compensated
+  // file size, largest first). This vector stores indexes into files_.
+ std::vector<std::vector<int>> files_by_compaction_pri_;
+
+  // If true, files in L0 have keys with non-overlapping ranges
+ bool level0_non_overlapping_;
+
+ // An index into files_by_compaction_pri_ that specifies the first
+ // file that is not yet compacted
+ std::vector<int> next_file_to_compact_by_size_;
+
+ // Only the first few entries of files_by_compaction_pri_ are sorted.
+ // There is no need to sort all the files because it is likely
+ // that on a running system, we need to look at only the first
+ // few largest files because a new version is created every few
+ // seconds/minutes (because of concurrent compactions).
+ static const size_t number_of_files_to_sort_ = 50;
+
+  // This vector contains the list of files marked for compaction that are not
+  // currently being compacted. It is protected by the DB mutex and is
+  // calculated in ComputeCompactionScore().
+ autovector<std::pair<int, FileMetaData*>> files_marked_for_compaction_;
+
+ autovector<std::pair<int, FileMetaData*>> expired_ttl_files_;
+
+ autovector<std::pair<int, FileMetaData*>>
+ files_marked_for_periodic_compaction_;
+
+ // These files are considered bottommost because none of their keys can exist
+ // at lower levels. They are not necessarily all in the same level. The marked
+ // ones are eligible for compaction because they contain duplicate key
+ // versions that are no longer protected by snapshot. These variables are
+ // protected by DB mutex and are calculated in `GenerateBottommostFiles()` and
+ // `ComputeBottommostFilesMarkedForCompaction()`.
+ autovector<std::pair<int, FileMetaData*>> bottommost_files_;
+ autovector<std::pair<int, FileMetaData*>>
+ bottommost_files_marked_for_compaction_;
+
+ autovector<std::pair<int, FileMetaData*>> files_marked_for_forced_blob_gc_;
+
+ // Threshold for needing to mark another bottommost file. Maintain it so we
+ // can quickly check when releasing a snapshot whether more bottommost files
+ // became eligible for compaction. It's defined as the min of the max nonzero
+ // seqnums of unmarked bottommost files.
+ SequenceNumber bottommost_files_mark_threshold_ = kMaxSequenceNumber;
+
+ // Monotonically increases as we release old snapshots. Zero indicates no
+ // snapshots have been released yet. When no snapshots remain we set it to the
+ // current seqnum, which needs to be protected as a snapshot can still be
+ // created that references it.
+ SequenceNumber oldest_snapshot_seqnum_ = 0;
+
+ // Level that should be compacted next and its compaction score.
+ // Score < 1 means compaction is not strictly needed. These fields
+ // are initialized by ComputeCompactionScore.
+ // The most critical level to be compacted is listed first
+ // These are used to pick the best compaction level
+ std::vector<double> compaction_score_;
+ std::vector<int> compaction_level_;
+  int l0_delay_trigger_count_ = 0;  // L0 file count used to trigger write
+                                    // slowdown and stop.
+
+ // Compact cursors for round-robin compactions in each level
+ std::vector<InternalKey> compact_cursor_;
+
+ // the following are the sampled temporary stats.
+ // the current accumulated size of sampled files.
+ uint64_t accumulated_file_size_;
+ // the current accumulated size of all raw keys based on the sampled files.
+ uint64_t accumulated_raw_key_size_;
+  // the current accumulated size of all raw values based on the sampled files.
+ uint64_t accumulated_raw_value_size_;
+ // total number of non-deletion entries
+ uint64_t accumulated_num_non_deletions_;
+ // total number of deletion entries
+ uint64_t accumulated_num_deletions_;
+  // current number of non-deletion entries
+ uint64_t current_num_non_deletions_;
+ // current number of deletion entries
+ uint64_t current_num_deletions_;
+ // current number of file samples
+ uint64_t current_num_samples_;
+ // Estimated bytes needed to be compacted until all levels' size is down to
+ // target sizes.
+ uint64_t estimated_compaction_needed_bytes_;
+
+ bool finalized_;
+
+ // If set to true, we will run consistency checks even if RocksDB
+ // is compiled in release mode
+ bool force_consistency_checks_;
+
+ friend class Version;
+ friend class VersionSet;
+};
+
+struct ObsoleteFileInfo {
+ FileMetaData* metadata;
+ std::string path;
+  // If true, the FileMetaData should be destroyed but the file should
+  // not be deleted. This is because another FileMetaData still references
+  // the file, usually because the file was trivially moved so two FileMetaData
+  // objects are managing the file.
+ bool only_delete_metadata = false;
+
+ ObsoleteFileInfo() noexcept
+ : metadata(nullptr), only_delete_metadata(false) {}
+ ObsoleteFileInfo(FileMetaData* f, const std::string& file_path,
+ std::shared_ptr<CacheReservationManager>
+ file_metadata_cache_res_mgr_arg = nullptr)
+ : metadata(f),
+ path(file_path),
+ only_delete_metadata(false),
+ file_metadata_cache_res_mgr(file_metadata_cache_res_mgr_arg) {}
+
+ ObsoleteFileInfo(const ObsoleteFileInfo&) = delete;
+ ObsoleteFileInfo& operator=(const ObsoleteFileInfo&) = delete;
+
+ ObsoleteFileInfo(ObsoleteFileInfo&& rhs) noexcept : ObsoleteFileInfo() {
+ *this = std::move(rhs);
+ }
+
+ ObsoleteFileInfo& operator=(ObsoleteFileInfo&& rhs) noexcept {
+ path = std::move(rhs.path);
+ metadata = rhs.metadata;
+ rhs.metadata = nullptr;
+ file_metadata_cache_res_mgr = rhs.file_metadata_cache_res_mgr;
+ rhs.file_metadata_cache_res_mgr = nullptr;
+
+ return *this;
+ }
+ void DeleteMetadata() {
+ if (file_metadata_cache_res_mgr) {
+ Status s = file_metadata_cache_res_mgr->UpdateCacheReservation(
+ metadata->ApproximateMemoryUsage(), false /* increase */);
+ s.PermitUncheckedError();
+ }
+ delete metadata;
+ metadata = nullptr;
+ }
+
+ private:
+ std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr;
+};
+
+class ObsoleteBlobFileInfo {
+ public:
+ ObsoleteBlobFileInfo(uint64_t blob_file_number, std::string path)
+ : blob_file_number_(blob_file_number), path_(std::move(path)) {}
+
+ uint64_t GetBlobFileNumber() const { return blob_file_number_; }
+ const std::string& GetPath() const { return path_; }
+
+ private:
+ uint64_t blob_file_number_;
+ std::string path_;
+};
+
+using MultiGetRange = MultiGetContext::Range;
+// A column family's version consists of the table and blob files owned by
+// the column family at a certain point in time.
+class Version {
+ public:
+ // Append to *iters a sequence of iterators that will
+ // yield the contents of this Version when merged together.
+ // @param read_options Must outlive any iterator built by
+ // `merger_iter_builder`.
+ void AddIterators(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merger_iter_builder,
+ bool allow_unprepared_value);
+
+ // @param read_options Must outlive any iterator built by
+ // `merger_iter_builder`.
+ void AddIteratorsForLevel(const ReadOptions& read_options,
+ const FileOptions& soptions,
+ MergeIteratorBuilder* merger_iter_builder,
+ int level, bool allow_unprepared_value);
+
+ Status OverlapWithLevelIterator(const ReadOptions&, const FileOptions&,
+ const Slice& smallest_user_key,
+ const Slice& largest_user_key, int level,
+ bool* overlap);
+
+  // Look up the value for key, or get all merge operands for key.
+ // If do_merge = true (default) then lookup value for key.
+ // Behavior if do_merge = true:
+ // If found, store it in *value and
+ // return OK. Else return a non-OK status.
+ // Uses *operands to store merge_operator operations to apply later.
+ //
+ // If the ReadOptions.read_tier is set to do a read-only fetch, then
+ // *value_found will be set to false if it cannot be determined whether
+ // this value exists without doing IO.
+ //
+ // If the key is Deleted, *status will be set to NotFound and
+ // *key_exists will be set to true.
+ // If no key was found, *status will be set to NotFound and
+ // *key_exists will be set to false.
+ // If seq is non-null, *seq will be set to the sequence number found
+ // for the key if a key was found.
+ // Behavior if do_merge = false
+ // If the key has any merge operands then store them in
+ // merge_context.operands_list and don't merge the operands
+ // REQUIRES: lock is not held
+ // REQUIRES: pinned_iters_mgr != nullptr
+ void Get(const ReadOptions&, const LookupKey& key, PinnableSlice* value,
+ PinnableWideColumns* columns, std::string* timestamp, Status* status,
+ MergeContext* merge_context,
+ SequenceNumber* max_covering_tombstone_seq,
+ PinnedIteratorsManager* pinned_iters_mgr,
+ bool* value_found = nullptr, bool* key_exists = nullptr,
+ SequenceNumber* seq = nullptr, ReadCallback* callback = nullptr,
+ bool* is_blob = nullptr, bool do_merge = true);
+
+ void MultiGet(const ReadOptions&, MultiGetRange* range,
+ ReadCallback* callback = nullptr);
+
+ // Interprets blob_index_slice as a blob reference, and (assuming the
+ // corresponding blob file is part of this Version) retrieves the blob and
+ // saves it in *value.
+ // REQUIRES: blob_index_slice stores an encoded blob reference
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ const Slice& blob_index_slice,
+ FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+ uint64_t* bytes_read) const;
+
+ // Retrieves a blob using a blob reference and saves it in *value,
+ // assuming the corresponding blob file is part of this Version.
+ Status GetBlob(const ReadOptions& read_options, const Slice& user_key,
+ const BlobIndex& blob_index,
+ FilePrefetchBuffer* prefetch_buffer, PinnableSlice* value,
+ uint64_t* bytes_read) const;
+
+ using BlobReadContext =
+ std::pair<BlobIndex, std::reference_wrapper<const KeyContext>>;
+ using BlobReadContexts = std::vector<BlobReadContext>;
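+  // Retrieves the blobs referenced by `blob_ctxs`, a map from blob file
+  // number to the (BlobIndex, KeyContext) pairs collected during the table
+  // lookups (see MultiGetFromSST), and stores the resulting values in the
+  // corresponding key contexts of `range`.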
+ void MultiGetBlob(const ReadOptions& read_options, MultiGetRange& range,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs);
+
+ // Loads some stats information from files (if update_stats is set) and
+ // populates derived data structures. Call without mutex held. It needs to be
+ // called before appending the version to the version set.
+ void PrepareAppend(const MutableCFOptions& mutable_cf_options,
+ bool update_stats);
+
+ // Reference count management (so Versions do not disappear out from
+ // under live iterators)
+ void Ref();
+ // Decrease reference count. Delete the object if no reference left
+ // and return true. Otherwise, return false.
+ bool Unref();
+
+ // Add all files listed in the current version to *live_table_files and
+ // *live_blob_files.
+ void AddLiveFiles(std::vector<uint64_t>* live_table_files,
+ std::vector<uint64_t>* live_blob_files) const;
+
+ // Remove live files that are in the delete candidate lists.
+ void RemoveLiveFiles(
+ std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+ std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
+
+ // Return a human readable string that describes this version's contents.
+ std::string DebugString(bool hex = false, bool print_stats = false) const;
+
+ // Returns the version number of this version
+ uint64_t GetVersionNumber() const { return version_number_; }
+
+ // REQUIRES: lock is held
+  // On success, "tp" will contain the table properties of the file
+ // specified in "file_meta". If the file name of "file_meta" is
+ // known ahead, passing it by a non-null "fname" can save a
+ // file-name conversion.
+ Status GetTableProperties(std::shared_ptr<const TableProperties>* tp,
+ const FileMetaData* file_meta,
+ const std::string* fname = nullptr) const;
+
+ // REQUIRES: lock is held
+ // On success, *props will be populated with all SSTables' table properties.
+ // The keys of `props` are the sst file name, the values of `props` are the
+ // tables' properties, represented as std::shared_ptr.
+ Status GetPropertiesOfAllTables(TablePropertiesCollection* props);
+ Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level);
+ Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n,
+ TablePropertiesCollection* props) const;
+
+ // Print summary of range delete tombstones in SST files into out_str,
+ // with maximum max_entries_to_print entries printed out.
+ Status TablesRangeTombstoneSummary(int max_entries_to_print,
+ std::string* out_str);
+
+ // REQUIRES: lock is held
+  // On success, "tp" will contain the table properties aggregated over all
+  // SST files in this version.
+ Status GetAggregatedTableProperties(
+ std::shared_ptr<const TableProperties>* tp, int level = -1);
+
+ uint64_t GetEstimatedActiveKeys() {
+ return storage_info_.GetEstimatedActiveKeys();
+ }
+
+ size_t GetMemoryUsageByTableReaders();
+
+ ColumnFamilyData* cfd() const { return cfd_; }
+
+ // Return the next Version in the linked list.
+ Version* Next() const { return next_; }
+
+ int TEST_refs() const { return refs_; }
+
+ VersionStorageInfo* storage_info() { return &storage_info_; }
+ const VersionStorageInfo* storage_info() const { return &storage_info_; }
+
+ VersionSet* version_set() { return vset_; }
+
+ void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);
+
+ uint64_t GetSstFilesSize();
+
+ // Retrieves the file_creation_time of the oldest file in the DB.
+ // Prerequisite for this API is max_open_files = -1
+ void GetCreationTimeOfOldestFile(uint64_t* creation_time);
+
+ const MutableCFOptions& GetMutableCFOptions() { return mutable_cf_options_; }
+
+ InternalIterator* TEST_GetLevelIterator(
+ const ReadOptions& read_options, MergeIteratorBuilder* merge_iter_builder,
+ int level, bool allow_unprepared_value);
+
+ private:
+ Env* env_;
+ SystemClock* clock_;
+
+ friend class ReactiveVersionSet;
+ friend class VersionSet;
+ friend class VersionEditHandler;
+ friend class VersionEditHandlerPointInTime;
+
+ const InternalKeyComparator* internal_comparator() const {
+ return storage_info_.internal_comparator_;
+ }
+ const Comparator* user_comparator() const {
+ return storage_info_.user_comparator_;
+ }
+
+ // Returns true if the filter blocks in the specified level will not be
+ // checked during read operations. In certain cases (trivial move or preload),
+ // the filter block may already be cached, but we still do not access it such
+ // that it eventually expires from the cache.
+ bool IsFilterSkipped(int level, bool is_file_last_in_level = false);
+
+ // The helper function of UpdateAccumulatedStats, which may fill the missing
+ // fields of file_meta from its associated TableProperties.
+ // Returns true if it does initialize FileMetaData.
+ bool MaybeInitializeFileMetaData(FileMetaData* file_meta);
+
+ // Update the accumulated stats associated with the current version.
+ // This accumulated stats will be used in compaction.
+ void UpdateAccumulatedStats();
+
+ DECLARE_SYNC_AND_ASYNC(
+ /* ret_type */ Status, /* func_name */ MultiGetFromSST,
+ const ReadOptions& read_options, MultiGetRange file_range,
+ int hit_file_level, bool skip_filters, bool skip_range_deletions,
+ FdWithKeyRange* f,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
+ Cache::Handle* table_handle, uint64_t& num_filter_read,
+ uint64_t& num_index_read, uint64_t& num_sst_read);
+
+#ifdef USE_COROUTINES
+ // MultiGet using async IO to read data blocks from SST files in parallel
+ // within and across levels
+ Status MultiGetAsync(
+ const ReadOptions& options, MultiGetRange* range,
+ std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs);
+
+  // A helper function to look up a batch of keys in a single level. It will
+ // queue coroutine tasks to mget_tasks. It may also split the input batch
+ // by creating a new batch with keys definitely not in this level and
+ // enqueuing it to to_process.
+ Status ProcessBatch(
+ const ReadOptions& read_options, FilePickerMultiGet* batch,
+ std::vector<folly::coro::Task<Status>>& mget_tasks,
+ std::unordered_map<uint64_t, BlobReadContexts>* blob_ctxs,
+ autovector<FilePickerMultiGet, 4>& batches, std::deque<size_t>& waiting,
+ std::deque<size_t>& to_process, unsigned int& num_tasks_queued,
+ std::unordered_map<int, std::tuple<uint64_t, uint64_t, uint64_t>>&
+ mget_stats);
+#endif
+
+ ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs
+ Logger* info_log_;
+ Statistics* db_statistics_;
+ TableCache* table_cache_;
+ BlobSource* blob_source_;
+ const MergeOperator* merge_operator_;
+
+ VersionStorageInfo storage_info_;
+ VersionSet* vset_; // VersionSet to which this Version belongs
+ Version* next_; // Next version in linked list
+ Version* prev_; // Previous version in linked list
+ int refs_; // Number of live refs to this version
+ const FileOptions file_options_;
+ const MutableCFOptions mutable_cf_options_;
+ // Cached value to avoid recomputing it on every read.
+ const size_t max_file_size_for_l0_meta_pin_;
+
+ // A version number that uniquely represents this version. This is
+ // used for debugging and logging purposes only.
+ uint64_t version_number_;
+ std::shared_ptr<IOTracer> io_tracer_;
+
+ Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt,
+ MutableCFOptions mutable_cf_options,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ uint64_t version_number = 0);
+
+ ~Version();
+
+ // No copying allowed
+ Version(const Version&) = delete;
+ void operator=(const Version&) = delete;
+};
+
+class BaseReferencedVersionBuilder;
+
+class AtomicGroupReadBuffer {
+ public:
+ AtomicGroupReadBuffer() = default;
+ Status AddEdit(VersionEdit* edit);
+ void Clear();
+ bool IsFull() const;
+ bool IsEmpty() const;
+
+ uint64_t TEST_read_edits_in_atomic_group() const {
+ return read_edits_in_atomic_group_;
+ }
+ std::vector<VersionEdit>& replay_buffer() { return replay_buffer_; }
+
+ private:
+ uint64_t read_edits_in_atomic_group_ = 0;
+ std::vector<VersionEdit> replay_buffer_;
+};
+
+// VersionSet is the collection of versions of all the column families of the
+// database. Each database owns one VersionSet. A VersionSet has access to all
+// column families via ColumnFamilySet, i.e. the set of column families.
+class VersionSet {
+ public:
+ VersionSet(const std::string& dbname, const ImmutableDBOptions* db_options,
+ const FileOptions& file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ BlockCacheTracer* const block_cache_tracer,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const std::string& db_id, const std::string& db_session_id);
+ // No copying allowed
+ VersionSet(const VersionSet&) = delete;
+ void operator=(const VersionSet&) = delete;
+
+ virtual ~VersionSet();
+
+ Status LogAndApplyToDefaultColumnFamily(
+ VersionEdit* edit, InstrumentedMutex* mu,
+ FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
+ const ColumnFamilyOptions* column_family_options = nullptr) {
+ ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault();
+ const MutableCFOptions* cf_options =
+ default_cf->GetLatestMutableCFOptions();
+ return LogAndApply(default_cf, *cf_options, edit, mu,
+ dir_contains_current_file, new_descriptor_log,
+ column_family_options);
+ }
+
+ // Apply *edit to the current version to form a new descriptor that
+ // is both saved to persistent state and installed as the new
+ // current version. Will release *mu while actually writing to the file.
+  // column_family_options has to be set if the edit is a column family add.
+ // REQUIRES: *mu is held on entry.
+ // REQUIRES: no other thread concurrently calls LogAndApply()
+ Status LogAndApply(
+ ColumnFamilyData* column_family_data,
+ const MutableCFOptions& mutable_cf_options, VersionEdit* edit,
+ InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+ bool new_descriptor_log = false,
+ const ColumnFamilyOptions* column_family_options = nullptr) {
+ autovector<ColumnFamilyData*> cfds;
+ cfds.emplace_back(column_family_data);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ mutable_cf_options_list.emplace_back(&mutable_cf_options);
+ autovector<autovector<VersionEdit*>> edit_lists;
+ autovector<VersionEdit*> edit_list;
+ edit_list.emplace_back(edit);
+ edit_lists.emplace_back(edit_list);
+ return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ dir_contains_current_file, new_descriptor_log,
+ column_family_options);
+ }
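+
+  // Illustrative sketch only: applying a single VersionEdit to one column
+  // family, assuming `versions` is this VersionSet, `cfd` is the target
+  // ColumnFamilyData, the DB mutex `mu` is already held, and `db_dir` is the
+  // FSDirectory containing the CURRENT file:
+  //
+  //   VersionEdit edit;
+  //   edit.SetLogNumber(new_log_number);
+  //   Status s = versions->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+  //                                    &edit, mu, db_dir);
+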
+  // The batch version. If edit_list.size() > 1, caller must ensure that
+  // no edit in the list is a column family add or drop.
+ Status LogAndApply(
+ ColumnFamilyData* column_family_data,
+ const MutableCFOptions& mutable_cf_options,
+ const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
+ FSDirectory* dir_contains_current_file, bool new_descriptor_log = false,
+ const ColumnFamilyOptions* column_family_options = nullptr,
+ const std::function<void(const Status&)>& manifest_wcb = {}) {
+ autovector<ColumnFamilyData*> cfds;
+ cfds.emplace_back(column_family_data);
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ mutable_cf_options_list.emplace_back(&mutable_cf_options);
+ autovector<autovector<VersionEdit*>> edit_lists;
+ edit_lists.emplace_back(edit_list);
+ return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
+ dir_contains_current_file, new_descriptor_log,
+ column_family_options, {manifest_wcb});
+ }
+
+  // The across-multiple-column-family batch version. If edit_lists contains
+  // more than one version edit, the caller must ensure that no edit in the
+  // lists is a column family manipulation.
+ virtual Status LogAndApply(
+ const autovector<ColumnFamilyData*>& cfds,
+ const autovector<const MutableCFOptions*>& mutable_cf_options_list,
+ const autovector<autovector<VersionEdit*>>& edit_lists,
+ InstrumentedMutex* mu, FSDirectory* dir_contains_current_file,
+ bool new_descriptor_log = false,
+ const ColumnFamilyOptions* new_cf_options = nullptr,
+ const std::vector<std::function<void(const Status&)>>& manifest_wcbs =
+ {});
+
+ static Status GetCurrentManifestPath(const std::string& dbname,
+ FileSystem* fs,
+ std::string* manifest_filename,
+ uint64_t* manifest_file_number);
+ void WakeUpWaitingManifestWriters();
+
+ // Recover the last saved descriptor from persistent storage.
+ // If read_only == true, Recover() will not complain if some column families
+ // are not opened
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only = false, std::string* db_id = nullptr,
+ bool no_error_if_files_missing = false);
+
+ Status TryRecover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only,
+ const std::vector<std::string>& files_in_dbname,
+ std::string* db_id, bool* has_missing_table_file);
+
+ // Try to recover the version set to the most recent consistent state
+ // recorded in the specified manifest.
+ Status TryRecoverFromOneManifest(
+ const std::string& manifest_path,
+ const std::vector<ColumnFamilyDescriptor>& column_families,
+ bool read_only, std::string* db_id, bool* has_missing_table_file);
+
+ // Reads a manifest file and returns a list of column families in
+ // column_families.
+ static Status ListColumnFamilies(std::vector<std::string>* column_families,
+ const std::string& dbname, FileSystem* fs);
+ static Status ListColumnFamiliesFromManifest(
+ const std::string& manifest_path, FileSystem* fs,
+ std::vector<std::string>* column_families);
+
+#ifndef ROCKSDB_LITE
+  // Try to reduce the number of levels. This call is valid only when a single
+  // level between the new max level and the old max level contains files.
+  // The call is static, since the number of levels is immutable during
+  // the lifetime of a RocksDB instance. It reduces the number of levels
+  // in a DB by applying changes to the manifest.
+  // For example, if a db currently has 7 levels [0-6], a call to reduce to
+  // 5 levels [0-4] can only be executed when only one level among [4-6]
+  // contains files.
+ static Status ReduceNumberOfLevels(const std::string& dbname,
+ const Options* options,
+ const FileOptions& file_options,
+ int new_levels);
+
+ // Get the checksum information of all live files
+ Status GetLiveFilesChecksumInfo(FileChecksumList* checksum_list);
+
+  // Print the manifest contents (for debugging)
+ Status DumpManifest(Options& options, std::string& manifestFileName,
+ bool verbose, bool hex = false, bool json = false);
+
+#endif // ROCKSDB_LITE
+
+ const std::string& DbSessionId() const { return db_session_id_; }
+
+ // Return the current manifest file number
+ uint64_t manifest_file_number() const { return manifest_file_number_; }
+
+ uint64_t options_file_number() const { return options_file_number_; }
+
+ uint64_t pending_manifest_file_number() const {
+ return pending_manifest_file_number_;
+ }
+
+ uint64_t current_next_file_number() const { return next_file_number_.load(); }
+
+ uint64_t min_log_number_to_keep() const {
+ return min_log_number_to_keep_.load();
+ }
+
+ // Allocate and return a new file number
+ uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); }
+
+  // Fetch-and-add n new file numbers; returns the first number of the
+  // allocated range.
+ uint64_t FetchAddFileNumber(uint64_t n) {
+ return next_file_number_.fetch_add(n);
+ }
+
+ // Return the last sequence number.
+ uint64_t LastSequence() const {
+ return last_sequence_.load(std::memory_order_acquire);
+ }
+
+ // Note: memory_order_acquire must be sufficient.
+ uint64_t LastAllocatedSequence() const {
+ return last_allocated_sequence_.load(std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_acquire must be sufficient.
+ uint64_t LastPublishedSequence() const {
+ return last_published_sequence_.load(std::memory_order_seq_cst);
+ }
+
+ // Set the last sequence number to s.
+ void SetLastSequence(uint64_t s) {
+ assert(s >= last_sequence_);
+    // The last visible sequence must not exceed the last allocated sequence
+ assert(!db_options_->two_write_queues || s <= last_allocated_sequence_);
+ last_sequence_.store(s, std::memory_order_release);
+ }
+
+ // Note: memory_order_release must be sufficient
+ void SetLastPublishedSequence(uint64_t s) {
+ assert(s >= last_published_sequence_);
+ last_published_sequence_.store(s, std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_release must be sufficient
+ void SetLastAllocatedSequence(uint64_t s) {
+ assert(s >= last_allocated_sequence_);
+ last_allocated_sequence_.store(s, std::memory_order_seq_cst);
+ }
+
+ // Note: memory_order_release must be sufficient
+ uint64_t FetchAddLastAllocatedSequence(uint64_t s) {
+ return last_allocated_sequence_.fetch_add(s, std::memory_order_seq_cst);
+ }
+
+ // Mark the specified file number as used.
+ // REQUIRED: this is only called during single-threaded recovery or repair.
+ void MarkFileNumberUsed(uint64_t number);
+
+  // Record that WAL files with numbers smaller than the specified number may
+  // be deleted.
+ // REQUIRED: this is only called during single-threaded recovery or repair, or
+ // from ::LogAndApply where the global mutex is held.
+ void MarkMinLogNumberToKeep(uint64_t number);
+
+ // Return the log file number for the log file that is currently
+ // being compacted, or zero if there is no such log file.
+ uint64_t prev_log_number() const { return prev_log_number_; }
+
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file.
+ // In non-2PC mode, all the log numbers smaller than this number can be safely
+ // deleted, although we still use `min_log_number_to_keep_` to determine when
+ // to delete a WAL file.
+ uint64_t MinLogNumberWithUnflushedData() const {
+ return PreComputeMinLogNumberWithUnflushedData(nullptr);
+ }
+
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file.
+ // Empty column families' log number is considered to be
+ // new_log_number_for_empty_cf.
+ uint64_t PreComputeMinLogNumberWithUnflushedData(
+ uint64_t new_log_number_for_empty_cf) const {
+ uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+ for (auto cfd : *column_family_set_) {
+ // It's safe to ignore dropped column families here:
+ // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+ uint64_t num =
+ cfd->IsEmpty() ? new_log_number_for_empty_cf : cfd->GetLogNumber();
+ if (min_log_num > num && !cfd->IsDropped()) {
+ min_log_num = num;
+ }
+ }
+ return min_log_num;
+ }
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file, except data from `cfd_to_skip`.
+ uint64_t PreComputeMinLogNumberWithUnflushedData(
+ const ColumnFamilyData* cfd_to_skip) const {
+ uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+ for (auto cfd : *column_family_set_) {
+ if (cfd == cfd_to_skip) {
+ continue;
+ }
+ // It's safe to ignore dropped column families here:
+ // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+ if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
+ min_log_num = cfd->GetLogNumber();
+ }
+ }
+ return min_log_num;
+ }
+ // Returns the minimum log number which still has data not flushed to any SST
+ // file, except data from `cfds_to_skip`.
+ uint64_t PreComputeMinLogNumberWithUnflushedData(
+ const std::unordered_set<const ColumnFamilyData*>& cfds_to_skip) const {
+ uint64_t min_log_num = std::numeric_limits<uint64_t>::max();
+ for (auto cfd : *column_family_set_) {
+ if (cfds_to_skip.count(cfd)) {
+ continue;
+ }
+ // It's safe to ignore dropped column families here:
+ // cfd->IsDropped() becomes true after the drop is persisted in MANIFEST.
+ if (min_log_num > cfd->GetLogNumber() && !cfd->IsDropped()) {
+ min_log_num = cfd->GetLogNumber();
+ }
+ }
+ return min_log_num;
+ }
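+  // For example, if the live column families have log numbers {5, 7, 12} and
+  // none of them is empty, dropped, or skipped, all of the overloads above
+  // return 5, so every WAL numbered below 5 contains only flushed data.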
+
+ // Create an iterator that reads over the compaction inputs for "*c".
+ // The caller should delete the iterator when no longer needed.
+ // @param read_options Must outlive the returned iterator.
+ // @param start, end indicates compaction range
+ InternalIterator* MakeInputIterator(
+ const ReadOptions& read_options, const Compaction* c,
+ RangeDelAggregator* range_del_agg,
+ const FileOptions& file_options_compactions,
+ const std::optional<const Slice>& start,
+ const std::optional<const Slice>& end);
+
+ // Add all files listed in any live version to *live_table_files and
+ // *live_blob_files. Note that these lists may contain duplicates.
+ void AddLiveFiles(std::vector<uint64_t>* live_table_files,
+ std::vector<uint64_t>* live_blob_files) const;
+
+ // Remove live files that are in the delete candidate lists.
+ void RemoveLiveFiles(
+ std::vector<ObsoleteFileInfo>& sst_delete_candidates,
+ std::vector<ObsoleteBlobFileInfo>& blob_delete_candidates) const;
+
+ // Return the approximate size of data to be scanned for range [start, end)
+ // in levels [start_level, end_level). If end_level == -1 it will search
+ // through all non-empty levels
+ uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v,
+ const Slice& start, const Slice& end,
+ int start_level, int end_level,
+ TableReaderCaller caller);
+
+ // Return the size of the current manifest file
+ uint64_t manifest_file_size() const { return manifest_file_size_; }
+
+ Status GetMetadataForFile(uint64_t number, int* filelevel,
+ FileMetaData** metadata, ColumnFamilyData** cfd);
+
+ // This function doesn't support leveldb SST filenames
+ void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);
+
+ void AddObsoleteBlobFile(uint64_t blob_file_number, std::string path) {
+ assert(table_cache_);
+
+ table_cache_->Erase(GetSlice(&blob_file_number));
+
+ obsolete_blob_files_.emplace_back(blob_file_number, std::move(path));
+ }
+
+ void GetObsoleteFiles(std::vector<ObsoleteFileInfo>* files,
+ std::vector<ObsoleteBlobFileInfo>* blob_files,
+ std::vector<std::string>* manifest_filenames,
+ uint64_t min_pending_output);
+
+ ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); }
+ RefedColumnFamilySet GetRefedColumnFamilySet() {
+ return RefedColumnFamilySet(GetColumnFamilySet());
+ }
+
+ const FileOptions& file_options() { return file_options_; }
+ void ChangeFileOptions(const MutableDBOptions& new_options) {
+ file_options_.writable_file_max_buffer_size =
+ new_options.writable_file_max_buffer_size;
+ }
+
+ const ImmutableDBOptions* db_options() const { return db_options_; }
+
+ static uint64_t GetNumLiveVersions(Version* dummy_versions);
+
+ static uint64_t GetTotalSstFilesSize(Version* dummy_versions);
+
+ static uint64_t GetTotalBlobFileSize(Version* dummy_versions);
+
+ // Get the IO Status returned by written Manifest.
+ const IOStatus& io_status() const { return io_status_; }
+
+ // The returned WalSet needs to be accessed with DB mutex held.
+ const WalSet& GetWalSet() const { return wals_; }
+
+ void TEST_CreateAndAppendVersion(ColumnFamilyData* cfd) {
+ assert(cfd);
+
+ const auto& mutable_cf_options = *cfd->GetLatestMutableCFOptions();
+ Version* const version =
+ new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_);
+
+ constexpr bool update_stats = false;
+ version->PrepareAppend(mutable_cf_options, update_stats);
+ AppendVersion(cfd, version);
+ }
+
+ protected:
+ using VersionBuilderMap =
+ UnorderedMap<uint32_t, std::unique_ptr<BaseReferencedVersionBuilder>>;
+
+ struct ManifestWriter;
+
+ friend class Version;
+ friend class VersionEditHandler;
+ friend class VersionEditHandlerPointInTime;
+ friend class DumpManifestHandler;
+ friend class DBImpl;
+ friend class DBImplReadOnly;
+
+ struct LogReporter : public log::Reader::Reporter {
+ Status* status;
+ virtual void Corruption(size_t /*bytes*/, const Status& s) override {
+ if (status->ok()) {
+ *status = s;
+ }
+ }
+ };
+
+ void Reset();
+
+ // Returns approximated offset of a key in a file for a given version.
+ uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f,
+ const Slice& key, TableReaderCaller caller);
+
+ // Returns approximated data size between start and end keys in a file
+ // for a given version.
+ uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f,
+ const Slice& start, const Slice& end,
+ TableReaderCaller caller);
+
+ struct MutableCFState {
+ uint64_t log_number;
+ std::string full_history_ts_low;
+
+ explicit MutableCFState() = default;
+ explicit MutableCFState(uint64_t _log_number, std::string ts_low)
+ : log_number(_log_number), full_history_ts_low(std::move(ts_low)) {}
+ };
+
+ // Save current contents to *log
+ Status WriteCurrentStateToManifest(
+ const std::unordered_map<uint32_t, MutableCFState>& curr_state,
+ const VersionEdit& wal_additions, log::Writer* log, IOStatus& io_s);
+
+ void AppendVersion(ColumnFamilyData* column_family_data, Version* v);
+
+ ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
+ const VersionEdit* edit);
+
+ Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath,
+ int level, const FileMetaData& meta);
+
+ // Protected by DB mutex.
+ WalSet wals_;
+
+ std::unique_ptr<ColumnFamilySet> column_family_set_;
+ Cache* table_cache_;
+ Env* const env_;
+ FileSystemPtr const fs_;
+ SystemClock* const clock_;
+ const std::string dbname_;
+ std::string db_id_;
+ const ImmutableDBOptions* const db_options_;
+ std::atomic<uint64_t> next_file_number_;
+ // Any WAL number smaller than this should be ignored during recovery,
+  // and qualifies for deletion.
+ std::atomic<uint64_t> min_log_number_to_keep_ = {0};
+ uint64_t manifest_file_number_;
+ uint64_t options_file_number_;
+ uint64_t options_file_size_;
+ uint64_t pending_manifest_file_number_;
+ // The last seq visible to reads. It normally indicates the last sequence in
+ // the memtable but when using two write queues it could also indicate the
+ // last sequence in the WAL visible to reads.
+ std::atomic<uint64_t> last_sequence_;
+ // The last sequence number of data committed to the descriptor (manifest
+ // file).
+ SequenceNumber descriptor_last_sequence_ = 0;
+ // The last seq that is already allocated. It is applicable only when we have
+  // two write queues. In that case the seq might or might not have appeared in
+  // the memtable but it is expected to appear in the WAL.
+ // We have last_sequence <= last_allocated_sequence_
+ std::atomic<uint64_t> last_allocated_sequence_;
+ // The last allocated sequence that is also published to the readers. This is
+ // applicable only when last_seq_same_as_publish_seq_ is not set. Otherwise
+ // last_sequence_ also indicates the last published seq.
+ // We have last_sequence <= last_published_sequence_ <=
+ // last_allocated_sequence_
+ std::atomic<uint64_t> last_published_sequence_;
+ uint64_t prev_log_number_; // 0 or backing store for memtable being compacted
+
+ // Opened lazily
+ std::unique_ptr<log::Writer> descriptor_log_;
+
+  // generates an increasing version number for every new version
+ uint64_t current_version_number_;
+
+ // Queue of writers to the manifest file
+ std::deque<ManifestWriter*> manifest_writers_;
+
+ // Current size of manifest file
+ uint64_t manifest_file_size_;
+
+ std::vector<ObsoleteFileInfo> obsolete_files_;
+ std::vector<ObsoleteBlobFileInfo> obsolete_blob_files_;
+ std::vector<std::string> obsolete_manifests_;
+
+ // env options for all reads and writes except compactions
+ FileOptions file_options_;
+
+ BlockCacheTracer* const block_cache_tracer_;
+
+ // Store the IO status when Manifest is written
+ IOStatus io_status_;
+
+ std::shared_ptr<IOTracer> io_tracer_;
+
+ std::string db_session_id_;
+
+ private:
+  // REQUIRES: DB mutex held at beginning; may release and re-acquire the DB
+  // mutex.
+ Status ProcessManifestWrites(std::deque<ManifestWriter>& writers,
+ InstrumentedMutex* mu,
+ FSDirectory* dir_contains_current_file,
+ bool new_descriptor_log,
+ const ColumnFamilyOptions* new_cf_options);
+
+ void LogAndApplyCFHelper(VersionEdit* edit,
+ SequenceNumber* max_last_sequence);
+ Status LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b,
+ VersionEdit* edit, SequenceNumber* max_last_sequence,
+ InstrumentedMutex* mu);
+};
+
+// ReactiveVersionSet represents a collection of versions of the column
+// families of the database. Users of ReactiveVersionSet, e.g. DBImplSecondary,
+// need to replay the MANIFEST (description log in older terms) in order to
+// reconstruct and install versions.
+class ReactiveVersionSet : public VersionSet {
+ public:
+ ReactiveVersionSet(const std::string& dbname,
+ const ImmutableDBOptions* _db_options,
+ const FileOptions& _file_options, Cache* table_cache,
+ WriteBufferManager* write_buffer_manager,
+ WriteController* write_controller,
+ const std::shared_ptr<IOTracer>& io_tracer);
+
+ ~ReactiveVersionSet() override;
+
+ Status ReadAndApply(
+ InstrumentedMutex* mu,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ Status* manifest_read_status,
+ std::unordered_set<ColumnFamilyData*>* cfds_changed);
+
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
+ std::unique_ptr<log::Reader::Reporter>* manifest_reporter,
+ std::unique_ptr<Status>* manifest_reader_status);
+#ifndef NDEBUG
+ uint64_t TEST_read_edits_in_atomic_group() const;
+#endif //! NDEBUG
+
+ std::vector<VersionEdit>& replay_buffer();
+
+ protected:
+ // REQUIRES db mutex
+ Status ApplyOneVersionEditToBuilder(
+ VersionEdit& edit, std::unordered_set<ColumnFamilyData*>* cfds_changed,
+ VersionEdit* version_edit);
+
+ Status MaybeSwitchManifest(
+ log::Reader::Reporter* reporter,
+ std::unique_ptr<log::FragmentBufferedReader>* manifest_reader);
+
+ private:
+ std::unique_ptr<ManifestTailer> manifest_tailer_;
+
+ using VersionSet::LogAndApply;
+ using VersionSet::Recover;
+
+ Status LogAndApply(
+ const autovector<ColumnFamilyData*>& /*cfds*/,
+ const autovector<const MutableCFOptions*>& /*mutable_cf_options_list*/,
+ const autovector<autovector<VersionEdit*>>& /*edit_lists*/,
+ InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/,
+ bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/,
+ const std::vector<std::function<void(const Status&)>>& /*manifest_wcbs*/)
+ override {
+ return Status::NotSupported("not supported in reactive mode");
+ }
+
+ // No copy allowed
+ ReactiveVersionSet(const ReactiveVersionSet&);
+ ReactiveVersionSet& operator=(const ReactiveVersionSet&);
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/version_set_sync_and_async.h b/src/rocksdb/db/version_set_sync_and_async.h
new file mode 100644
index 000000000..755585990
--- /dev/null
+++ b/src/rocksdb/db/version_set_sync_and_async.h
@@ -0,0 +1,151 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+//
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "util/coro_utils.h"
+
+#if defined(WITHOUT_COROUTINES) || \
+ (defined(USE_COROUTINES) && defined(WITH_COROUTINES))
+
+namespace ROCKSDB_NAMESPACE {
+
+// Lookup a batch of keys in a single SST file
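+// Note: DEFINE_SYNC_AND_ASYNC and the CO_AWAIT/CO_RETURN macros come from
+// util/coro_utils.h (included above). Under the include guard at the top of
+// this file the body below is compiled both as the plain synchronous
+// Version::MultiGetFromSST and, when coroutines are enabled, as an async
+// variant, with CO_AWAIT and CO_RETURN degenerating to ordinary calls and
+// returns in the synchronous build.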
+DEFINE_SYNC_AND_ASYNC(Status, Version::MultiGetFromSST)
+(const ReadOptions& read_options, MultiGetRange file_range, int hit_file_level,
+ bool skip_filters, bool skip_range_deletions, FdWithKeyRange* f,
+ std::unordered_map<uint64_t, BlobReadContexts>& blob_ctxs,
+ Cache::Handle* table_handle, uint64_t& num_filter_read,
+ uint64_t& num_index_read, uint64_t& num_sst_read) {
+ bool timer_enabled = GetPerfLevel() >= PerfLevel::kEnableTimeExceptForMutex &&
+ get_perf_context()->per_level_perf_context_enabled;
+
+ Status s;
+ StopWatchNano timer(clock_, timer_enabled /* auto_start */);
+ s = CO_AWAIT(table_cache_->MultiGet)(
+ read_options, *internal_comparator(), *f->file_metadata, &file_range,
+ mutable_cf_options_.prefix_extractor,
+ cfd_->internal_stats()->GetFileReadHist(hit_file_level), skip_filters,
+ skip_range_deletions, hit_file_level, table_handle);
+ // TODO: examine the behavior for corrupted key
+ if (timer_enabled) {
+ PERF_COUNTER_BY_LEVEL_ADD(get_from_table_nanos, timer.ElapsedNanos(),
+ hit_file_level);
+ }
+ if (!s.ok()) {
+ // TODO: Set status for individual keys appropriately
+ for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
+ *iter->s = s;
+ file_range.MarkKeyDone(iter);
+ }
+ CO_RETURN s;
+ }
+ uint64_t batch_size = 0;
+ for (auto iter = file_range.begin(); s.ok() && iter != file_range.end();
+ ++iter) {
+ GetContext& get_context = *iter->get_context;
+ Status* status = iter->s;
+ // The Status in the KeyContext takes precedence over GetContext state
+ // Status may be an error if there were any IO errors in the table
+ // reader. We never expect Status to be NotFound(), as that is
+ // determined by get_context
+ assert(!status->IsNotFound());
+ if (!status->ok()) {
+ file_range.MarkKeyDone(iter);
+ continue;
+ }
+
+ if (get_context.sample()) {
+ sample_file_read_inc(f->file_metadata);
+ }
+ batch_size++;
+ num_index_read += get_context.get_context_stats_.num_index_read;
+ num_filter_read += get_context.get_context_stats_.num_filter_read;
+ num_sst_read += get_context.get_context_stats_.num_sst_read;
+ // Reset these stats since they're specific to a level
+ get_context.get_context_stats_.num_index_read = 0;
+ get_context.get_context_stats_.num_filter_read = 0;
+ get_context.get_context_stats_.num_sst_read = 0;
+
+ // report the counters before returning
+ if (get_context.State() != GetContext::kNotFound &&
+ get_context.State() != GetContext::kMerge &&
+ db_statistics_ != nullptr) {
+ get_context.ReportCounters();
+ } else {
+ if (iter->max_covering_tombstone_seq > 0) {
+ // The remaining files we look at will only contain covered keys, so
+ // we stop here for this key
+ file_range.SkipKey(iter);
+ }
+ }
+ switch (get_context.State()) {
+ case GetContext::kNotFound:
+ // Keep searching in other files
+ break;
+ case GetContext::kMerge:
+ // TODO: update per-level perfcontext user_key_return_count for kMerge
+ break;
+ case GetContext::kFound:
+ if (hit_file_level == 0) {
+ RecordTick(db_statistics_, GET_HIT_L0);
+ } else if (hit_file_level == 1) {
+ RecordTick(db_statistics_, GET_HIT_L1);
+ } else if (hit_file_level >= 2) {
+ RecordTick(db_statistics_, GET_HIT_L2_AND_UP);
+ }
+
+ PERF_COUNTER_BY_LEVEL_ADD(user_key_return_count, 1, hit_file_level);
+
+ file_range.MarkKeyDone(iter);
+
+ if (iter->is_blob_index) {
+ if (iter->value) {
+ TEST_SYNC_POINT_CALLBACK("Version::MultiGet::TamperWithBlobIndex",
+ &(*iter));
+
+ const Slice& blob_index_slice = *(iter->value);
+ BlobIndex blob_index;
+ Status tmp_s = blob_index.DecodeFrom(blob_index_slice);
+ if (tmp_s.ok()) {
+ const uint64_t blob_file_num = blob_index.file_number();
+ blob_ctxs[blob_file_num].emplace_back(
+ std::make_pair(blob_index, std::cref(*iter)));
+ } else {
+ *(iter->s) = tmp_s;
+ }
+ }
+ } else {
+ file_range.AddValueSize(iter->value->size());
+ if (file_range.GetValueSize() > read_options.value_size_soft_limit) {
+ s = Status::Aborted();
+ break;
+ }
+ }
+ continue;
+ case GetContext::kDeleted:
+ // Use empty error message for speed
+ *status = Status::NotFound();
+ file_range.MarkKeyDone(iter);
+ continue;
+ case GetContext::kCorrupt:
+ *status =
+ Status::Corruption("corrupted key for ", iter->lkey->user_key());
+ file_range.MarkKeyDone(iter);
+ continue;
+ case GetContext::kUnexpectedBlobIndex:
+ ROCKS_LOG_ERROR(info_log_, "Encounter unexpected blob index.");
+ *status = Status::NotSupported(
+ "Encounter unexpected blob index. Please open DB with "
+ "ROCKSDB_NAMESPACE::blob_db::BlobDB instead.");
+ file_range.MarkKeyDone(iter);
+ continue;
+ }
+ }
+
+ RecordInHistogram(db_statistics_, SST_BATCH_SIZE, batch_size);
+ CO_RETURN s;
+}
+} // namespace ROCKSDB_NAMESPACE
+#endif
diff --git a/src/rocksdb/db/version_set_test.cc b/src/rocksdb/db/version_set_test.cc
new file mode 100644
index 000000000..7d17406c1
--- /dev/null
+++ b/src/rocksdb/db/version_set_test.cc
@@ -0,0 +1,3587 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/version_set.h"
+
+#include <algorithm>
+
+#include "db/db_impl/db_impl.h"
+#include "db/db_test_util.h"
+#include "db/log_writer.h"
+#include "rocksdb/advanced_options.h"
+#include "rocksdb/convenience.h"
+#include "rocksdb/file_system.h"
+#include "table/block_based/block_based_table_factory.h"
+#include "table/mock_table.h"
+#include "table/unique_id_impl.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class GenerateLevelFilesBriefTest : public testing::Test {
+ public:
+ std::vector<FileMetaData*> files_;
+ LevelFilesBrief file_level_;
+ Arena arena_;
+
+ GenerateLevelFilesBriefTest() {}
+
+ ~GenerateLevelFilesBriefTest() override {
+ for (size_t i = 0; i < files_.size(); i++) {
+ delete files_[i];
+ }
+ }
+
+ void Add(const char* smallest, const char* largest,
+ SequenceNumber smallest_seq = 100,
+ SequenceNumber largest_seq = 100) {
+ FileMetaData* f = new FileMetaData(
+ files_.size() + 1, 0, 0,
+ InternalKey(smallest, smallest_seq, kTypeValue),
+ InternalKey(largest, largest_seq, kTypeValue), smallest_seq,
+ largest_seq, /* marked_for_compact */ false, Temperature::kUnknown,
+ kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
+ kUnknownFileCreationTime, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ files_.push_back(f);
+ }
+
+ int Compare() {
+ int diff = 0;
+ for (size_t i = 0; i < files_.size(); i++) {
+ if (file_level_.files[i].fd.GetNumber() != files_[i]->fd.GetNumber()) {
+ diff++;
+ }
+ }
+ return diff;
+ }
+};
+
+TEST_F(GenerateLevelFilesBriefTest, Empty) {
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(0u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Single) {
+ Add("p", "q");
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(1u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
+TEST_F(GenerateLevelFilesBriefTest, Multiple) {
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ DoGenerateLevelFilesBrief(&file_level_, files_, &arena_);
+ ASSERT_EQ(4u, file_level_.num_files);
+ ASSERT_EQ(0, Compare());
+}
+
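+// Logger that simply counts how many messages were logged. The tests below
+// use it to check whether VersionStorageInfo emitted any informational or
+// warning messages while computing level targets.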
+class CountingLogger : public Logger {
+ public:
+ CountingLogger() : log_count(0) {}
+ using Logger::Logv;
+ void Logv(const char* /*format*/, va_list /*ap*/) override { log_count++; }
+ int log_count;
+};
+
+Options GetOptionsWithNumLevels(int num_levels,
+ std::shared_ptr<CountingLogger> logger) {
+ Options opt;
+ opt.num_levels = num_levels;
+ opt.info_log = logger;
+ return opt;
+}
+
+class VersionStorageInfoTestBase : public testing::Test {
+ public:
+ const Comparator* ucmp_;
+ InternalKeyComparator icmp_;
+ std::shared_ptr<CountingLogger> logger_;
+ Options options_;
+ ImmutableOptions ioptions_;
+ MutableCFOptions mutable_cf_options_;
+ VersionStorageInfo vstorage_;
+
+ InternalKey GetInternalKey(const char* ukey,
+ SequenceNumber smallest_seq = 100) {
+ return InternalKey(ukey, smallest_seq, kTypeValue);
+ }
+
+ explicit VersionStorageInfoTestBase(const Comparator* ucmp)
+ : ucmp_(ucmp),
+ icmp_(ucmp_),
+ logger_(new CountingLogger()),
+ options_(GetOptionsWithNumLevels(6, logger_)),
+ ioptions_(options_),
+ mutable_cf_options_(options_),
+ vstorage_(&icmp_, ucmp_, 6, kCompactionStyleLevel,
+ /*src_vstorage=*/nullptr,
+ /*_force_consistency_checks=*/false) {}
+
+ ~VersionStorageInfoTestBase() override {
+ for (int i = 0; i < vstorage_.num_levels(); ++i) {
+ for (auto* f : vstorage_.LevelFiles(i)) {
+ if (--f->refs == 0) {
+ delete f;
+ }
+ }
+ }
+ }
+
+ void Add(int level, uint32_t file_number, const char* smallest,
+ const char* largest, uint64_t file_size = 0,
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+ constexpr SequenceNumber dummy_seq = 0;
+
+ Add(level, file_number, GetInternalKey(smallest, dummy_seq),
+ GetInternalKey(largest, dummy_seq), file_size, oldest_blob_file_number);
+ }
+
+ void Add(int level, uint32_t file_number, const InternalKey& smallest,
+ const InternalKey& largest, uint64_t file_size = 0,
+ uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) {
+ assert(level < vstorage_.num_levels());
+ FileMetaData* f = new FileMetaData(
+ file_number, 0, file_size, smallest, largest, /* smallest_seq */ 0,
+ /* largest_seq */ 0, /* marked_for_compact */ false,
+ Temperature::kUnknown, oldest_blob_file_number,
+ kUnknownOldestAncesterTime, kUnknownFileCreationTime,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ f->compensated_file_size = file_size;
+ vstorage_.AddFile(level, f);
+ }
+
+ void AddBlob(uint64_t blob_file_number, uint64_t total_blob_count,
+ uint64_t total_blob_bytes,
+ BlobFileMetaData::LinkedSsts linked_ssts,
+ uint64_t garbage_blob_count, uint64_t garbage_blob_bytes) {
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, total_blob_count, total_blob_bytes,
+ /* checksum_method */ std::string(),
+ /* checksum_value */ std::string());
+ auto meta =
+ BlobFileMetaData::Create(std::move(shared_meta), std::move(linked_ssts),
+ garbage_blob_count, garbage_blob_bytes);
+
+ vstorage_.AddBlobFile(std::move(meta));
+ }
+
+ void UpdateVersionStorageInfo() {
+ vstorage_.PrepareForVersionAppend(ioptions_, mutable_cf_options_);
+ vstorage_.SetFinalized();
+ }
+
+ std::string GetOverlappingFiles(int level, const InternalKey& begin,
+ const InternalKey& end) {
+ std::vector<FileMetaData*> inputs;
+ vstorage_.GetOverlappingInputs(level, &begin, &end, &inputs);
+
+ std::string result;
+ for (size_t i = 0; i < inputs.size(); ++i) {
+ if (i > 0) {
+ result += ",";
+ }
+ AppendNumberTo(&result, inputs[i]->fd.GetNumber());
+ }
+ return result;
+ }
+};
+
+class VersionStorageInfoTest : public VersionStorageInfoTestBase {
+ public:
+ VersionStorageInfoTest() : VersionStorageInfoTestBase(BytewiseComparator()) {}
+
+ ~VersionStorageInfoTest() override {}
+};
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelStatic) {
+ ioptions_.level_compaction_dynamic_level_bytes = false;
+ mutable_cf_options_.max_bytes_for_level_base = 10;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(4, 100U, "1", "2", 100U);
+ Add(5, 101U, "1", "2", 100U);
+
+ UpdateVersionStorageInfo();
+
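+  // With static sizing the targets grow geometrically from the base:
+  // L1 = 10, L2 = 10 * 5 = 50, L3 = 250, L4 = 1250.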
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 10U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 50U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 250U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1250U);
+
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_1) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(5, 1U, "1", "2", 500U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.base_level(), 5);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_2) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(5, 1U, "1", "2", 500U);
+ Add(5, 2U, "3", "4", 550U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 4);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_3) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(5, 1U, "1", "2", 500U);
+ Add(5, 2U, "3", "4", 550U);
+ Add(4, 3U, "3", "4", 550U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 4);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_4) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(5, 1U, "1", "2", 500U);
+ Add(5, 2U, "3", "4", 550U);
+ Add(4, 3U, "3", "4", 550U);
+ Add(3, 4U, "3", "4", 250U);
+ Add(3, 5U, "5", "7", 300U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(1, logger_->log_count);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 3);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamic_5) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 1000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+
+ Add(5, 1U, "1", "2", 500U);
+ Add(5, 2U, "3", "4", 550U);
+ Add(4, 3U, "3", "4", 550U);
+ Add(3, 4U, "3", "4", 250U);
+ Add(3, 5U, "5", "7", 300U);
+ Add(1, 6U, "3", "4", 5U);
+ Add(1, 7U, "8", "9", 5U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(1, logger_->log_count);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(4), 1005U);
+ ASSERT_GT(vstorage_.MaxBytesForLevel(3), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 1005U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 1000U);
+ ASSERT_EQ(vstorage_.base_level(), 1);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLotsOfData) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 100;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 2;
+
+ Add(0, 1U, "1", "2", 50U);
+ Add(1, 2U, "1", "2", 50U);
+ Add(2, 3U, "1", "2", 500U);
+ Add(3, 4U, "1", "2", 500U);
+ Add(4, 5U, "1", "2", 1700U);
+ Add(5, 6U, "1", "2", 500U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 800U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 400U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 200U);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(1), 100U);
+ ASSERT_EQ(vstorage_.base_level(), 1);
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicLargeLevel) {
+ uint64_t kOneGB = 1000U * 1000U * 1000U;
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 10U * kOneGB;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 10;
+
+ Add(0, 1U, "1", "2", 50U);
+ Add(3, 4U, "1", "2", 32U * kOneGB);
+ Add(4, 5U, "1", "2", 500U * kOneGB);
+ Add(5, 6U, "1", "2", 3000U * kOneGB);
+
+ UpdateVersionStorageInfo();
+
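+  // Roughly: dividing the bottommost level's 3000 GB by the multiplier per
+  // level gives 300 GB, 30 GB, 3 GB; each target is floored at
+  // max_bytes_for_level_base, so L2 becomes the base level with a 10 GB
+  // target.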
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(5), 3000U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(4), 300U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(3), 30U * kOneGB);
+ ASSERT_EQ(vstorage_.MaxBytesForLevel(2), 10U * kOneGB);
+ ASSERT_EQ(vstorage_.base_level(), 2);
+ ASSERT_EQ(0, logger_->log_count);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_1) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 40000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+
+ Add(0, 1U, "1", "2", 10000U);
+ Add(0, 2U, "1", "2", 10000U);
+ Add(0, 3U, "1", "2", 10000U);
+
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 200000U);
+ Add(3, 6U, "1", "2", 40000U);
+ Add(2, 7U, "1", "2", 8000U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(2, vstorage_.base_level());
+  // level_multiplier() reports the configured multiplier (5.0); it is not
+  // adjusted here (e.g. to ~3.5) to account for the large L0.
+ ASSERT_EQ(vstorage_.level_multiplier(), 5.0);
+ ASSERT_EQ(40000U, vstorage_.MaxBytesForLevel(2));
+ ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
+ ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+
+ vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+ // Only L0 hits compaction.
+ ASSERT_EQ(vstorage_.CompactionScoreLevel(0), 0);
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_2) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 10000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+
+ Add(0, 11U, "1", "2", 10000U);
+ Add(0, 12U, "1", "2", 10000U);
+ Add(0, 13U, "1", "2", 10000U);
+
+ // Level size should be around 10,000, 10,290, 51,450, 257,250
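+  // (Roughly: the bottommost level's 1,286,250 divided by the multiplier per
+  // level gives 257,250, 51,450, 10,290, 2,058; every target is then floored
+  // at max_bytes_for_level_base, so the base level L1 ends up at 10,000.)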
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 258000U); // unadjusted score 1.003
+ Add(3, 6U, "1", "2", 53000U); // unadjusted score 1.03
+ Add(2, 7U, "1", "2", 20000U); // unadjusted score 1.94
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(1, vstorage_.base_level());
+ ASSERT_EQ(10000U, vstorage_.MaxBytesForLevel(1));
+ ASSERT_EQ(10290U, vstorage_.MaxBytesForLevel(2));
+ ASSERT_EQ(51450U, vstorage_.MaxBytesForLevel(3));
+ ASSERT_EQ(257250U, vstorage_.MaxBytesForLevel(4));
+
+ vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  // Although L2 and L3 have higher unadjusted compaction scores, L4 is picked
+  // for compaction because the relatively large L0 will soon be compacted
+  // down into the lower levels. L0 itself is still picked because it is
+  // oversized.
+ ASSERT_EQ(0, vstorage_.CompactionScoreLevel(0));
+ ASSERT_EQ(4, vstorage_.CompactionScoreLevel(1));
+}
+
+TEST_F(VersionStorageInfoTest, MaxBytesForLevelDynamicWithLargeL0_3) {
+ ioptions_.level_compaction_dynamic_level_bytes = true;
+ mutable_cf_options_.max_bytes_for_level_base = 20000;
+ mutable_cf_options_.max_bytes_for_level_multiplier = 5;
+ mutable_cf_options_.level0_file_num_compaction_trigger = 5;
+
+ Add(0, 11U, "1", "2", 2500U);
+ Add(0, 12U, "1", "2", 2500U);
+ Add(0, 13U, "1", "2", 2500U);
+ Add(0, 14U, "1", "2", 2500U);
+
+  // Level sizes should be around 20,000, 53,000, and 258,000
+ Add(5, 4U, "1", "2", 1286250U);
+ Add(4, 5U, "1", "2", 260000U); // Unadjusted score 1.01, adjusted about 4.3
+ Add(3, 6U, "1", "2", 85000U); // Unadjusted score 1.42, adjusted about 11.6
+ Add(2, 7U, "1", "2", 30000); // Unadjusted score 1.5, adjusted about 10.0
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(0, logger_->log_count);
+ ASSERT_EQ(2, vstorage_.base_level());
+ ASSERT_EQ(20000U, vstorage_.MaxBytesForLevel(2));
+
+ vstorage_.ComputeCompactionScore(ioptions_, mutable_cf_options_);
+  // Although L2 has a higher unadjusted compaction score, L3 is picked for
+  // compaction because the relatively large L0 will soon be compacted down
+  // into the lower levels.
+
+ ASSERT_EQ(3, vstorage_.CompactionScoreLevel(0));
+ ASSERT_EQ(2, vstorage_.CompactionScoreLevel(1));
+ ASSERT_EQ(4, vstorage_.CompactionScoreLevel(2));
+}
+
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize) {
+ // Test whether the overlaps are detected as expected
+ Add(1, 1U, "4", "7", 1U); // Perfect overlap with last level
+ Add(2, 2U, "3", "5", 1U); // Partial overlap with last level
+ Add(2, 3U, "6", "8", 1U); // Partial overlap with last level
+ Add(3, 4U, "1", "9", 1U); // Contains range of last level
+ Add(4, 5U, "4", "5", 1U); // Inside range of last level
+ Add(4, 6U, "6", "7", 1U); // Inside range of last level
+ Add(5, 7U, "4", "7", 10U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(10U, vstorage_.EstimateLiveDataSize());
+}
+
+TEST_F(VersionStorageInfoTest, EstimateLiveDataSize2) {
+ Add(0, 1U, "9", "9", 1U); // Level 0 is not ordered
+ Add(0, 2U, "5", "6", 1U); // Ignored because of [5,6] in l1
+ Add(1, 3U, "1", "2", 1U); // Ignored because of [2,3] in l2
+ Add(1, 4U, "3", "4", 1U); // Ignored because of [2,3] in l2
+ Add(1, 5U, "5", "6", 1U);
+ Add(2, 6U, "2", "3", 1U);
+ Add(3, 7U, "7", "8", 1U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(4U, vstorage_.EstimateLiveDataSize());
+}
+
+TEST_F(VersionStorageInfoTest, GetOverlappingInputs) {
+ // Two files that overlap at the range deletion tombstone sentinel.
+ Add(1, 1U, {"a", 0, kTypeValue},
+ {"b", kMaxSequenceNumber, kTypeRangeDeletion}, 1);
+ Add(1, 2U, {"b", 0, kTypeValue}, {"c", 0, kTypeValue}, 1);
+ // Two files that overlap at the same user key.
+ Add(1, 3U, {"d", 0, kTypeValue}, {"e", kMaxSequenceNumber, kTypeValue}, 1);
+ Add(1, 4U, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}, 1);
+ // Two files that do not overlap.
+ Add(1, 5U, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}, 1);
+ Add(1, 6U, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}, 1);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ("1,2",
+ GetOverlappingFiles(1, {"a", 0, kTypeValue}, {"b", 0, kTypeValue}));
+ ASSERT_EQ("1",
+ GetOverlappingFiles(1, {"a", 0, kTypeValue},
+ {"b", kMaxSequenceNumber, kTypeRangeDeletion}));
+ ASSERT_EQ("2", GetOverlappingFiles(1, {"b", kMaxSequenceNumber, kTypeValue},
+ {"c", 0, kTypeValue}));
+ ASSERT_EQ("3,4",
+ GetOverlappingFiles(1, {"d", 0, kTypeValue}, {"e", 0, kTypeValue}));
+ ASSERT_EQ("3",
+ GetOverlappingFiles(1, {"d", 0, kTypeValue},
+ {"e", kMaxSequenceNumber, kTypeRangeDeletion}));
+ ASSERT_EQ("3,4", GetOverlappingFiles(1, {"e", kMaxSequenceNumber, kTypeValue},
+ {"f", 0, kTypeValue}));
+ ASSERT_EQ("3,4",
+ GetOverlappingFiles(1, {"e", 0, kTypeValue}, {"f", 0, kTypeValue}));
+ ASSERT_EQ("5",
+ GetOverlappingFiles(1, {"g", 0, kTypeValue}, {"h", 0, kTypeValue}));
+ ASSERT_EQ("6",
+ GetOverlappingFiles(1, {"i", 0, kTypeValue}, {"j", 0, kTypeValue}));
+}
+
+TEST_F(VersionStorageInfoTest, FileLocationAndMetaDataByNumber) {
+ Add(0, 11U, "1", "2", 5000U);
+ Add(0, 12U, "1", "2", 5000U);
+
+ Add(2, 7U, "1", "2", 8000U);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(vstorage_.GetFileLocation(11U),
+ VersionStorageInfo::FileLocation(0, 0));
+ ASSERT_NE(vstorage_.GetFileMetaDataByNumber(11U), nullptr);
+
+ ASSERT_EQ(vstorage_.GetFileLocation(12U),
+ VersionStorageInfo::FileLocation(0, 1));
+ ASSERT_NE(vstorage_.GetFileMetaDataByNumber(12U), nullptr);
+
+ ASSERT_EQ(vstorage_.GetFileLocation(7U),
+ VersionStorageInfo::FileLocation(2, 0));
+ ASSERT_NE(vstorage_.GetFileMetaDataByNumber(7U), nullptr);
+
+ ASSERT_FALSE(vstorage_.GetFileLocation(999U).IsValid());
+ ASSERT_EQ(vstorage_.GetFileMetaDataByNumber(999U), nullptr);
+}
+
+TEST_F(VersionStorageInfoTest, ForcedBlobGCEmpty) {
+ // No SST or blob files in VersionStorageInfo
+ UpdateVersionStorageInfo();
+
+ constexpr double age_cutoff = 0.5;
+ constexpr double force_threshold = 0.75;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+}
+
+TEST_F(VersionStorageInfoTest, ForcedBlobGCSingleBatch) {
+ // Test the edge case when all blob files are part of the oldest batch.
+ // We have one L0 SST file #1, and four blob files #10, #11, #12, and #13.
+ // The oldest blob file used by SST #1 is blob file #10.
+
+ constexpr int level = 0;
+
+ constexpr uint64_t sst = 1;
+
+ constexpr uint64_t first_blob = 10;
+ constexpr uint64_t second_blob = 11;
+ constexpr uint64_t third_blob = 12;
+ constexpr uint64_t fourth_blob = 13;
+
+ {
+ constexpr char smallest[] = "bar1";
+ constexpr char largest[] = "foo1";
+ constexpr uint64_t file_size = 1000;
+
+ Add(level, sst, smallest, largest, file_size, first_blob);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 10;
+ constexpr uint64_t total_blob_bytes = 100000;
+ constexpr uint64_t garbage_blob_count = 2;
+ constexpr uint64_t garbage_blob_bytes = 15000;
+
+ AddBlob(first_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{sst}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 4;
+ constexpr uint64_t total_blob_bytes = 400000;
+ constexpr uint64_t garbage_blob_count = 3;
+ constexpr uint64_t garbage_blob_bytes = 235000;
+
+ AddBlob(second_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 20;
+ constexpr uint64_t total_blob_bytes = 1000000;
+ constexpr uint64_t garbage_blob_count = 8;
+ constexpr uint64_t garbage_blob_bytes = 400000;
+
+ AddBlob(third_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 128;
+ constexpr uint64_t total_blob_bytes = 1000000;
+ constexpr uint64_t garbage_blob_count = 67;
+ constexpr uint64_t garbage_blob_bytes = 600000;
+
+ AddBlob(fourth_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ UpdateVersionStorageInfo();
+
+ assert(vstorage_.num_levels() > 0);
+ const auto& level_files = vstorage_.LevelFiles(level);
+
+ assert(level_files.size() == 1);
+ assert(level_files[0] && level_files[0]->fd.GetNumber() == sst);
+
+ // No blob files eligible for GC due to the age cutoff
+
+ {
+ constexpr double age_cutoff = 0.1;
+ constexpr double force_threshold = 0.0;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Part of the oldest batch of blob files (specifically, #12 and #13) is
+ // ineligible for GC due to the age cutoff
+
+ {
+ constexpr double age_cutoff = 0.5;
+ constexpr double force_threshold = 0.0;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+ // is below threshold
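+  // (the batch's cumulative garbage ratio is
+  // (15,000 + 235,000 + 400,000 + 600,000) /
+  // (100,000 + 400,000 + 1,000,000 + 1,000,000) = 0.5, so a force_threshold
+  // of 0.6 is not met while 0.5 is)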
+
+ {
+ constexpr double age_cutoff = 1.0;
+ constexpr double force_threshold = 0.6;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+ // meets threshold
+
+ {
+ constexpr double age_cutoff = 1.0;
+ constexpr double force_threshold = 0.5;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+ ASSERT_EQ(ssts_to_be_compacted.size(), 1);
+
+ const autovector<std::pair<int, FileMetaData*>>
+ expected_ssts_to_be_compacted{{level, level_files[0]}};
+
+ ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+ }
+}
+
+TEST_F(VersionStorageInfoTest, ForcedBlobGCMultipleBatches) {
+ // Add three L0 SSTs (1, 2, and 3) and four blob files (10, 11, 12, and 13).
+ // The first two SSTs have the same oldest blob file, namely, the very oldest
+ // one (10), while the third SST's oldest blob file reference points to the
+ // third blob file (12). Thus, the oldest batch of blob files contains the
+ // first two blob files 10 and 11, and assuming they are eligible for GC based
+ // on the age cutoff, compacting away the SSTs 1 and 2 will eliminate them.
+
+ constexpr int level = 0;
+
+ constexpr uint64_t first_sst = 1;
+ constexpr uint64_t second_sst = 2;
+ constexpr uint64_t third_sst = 3;
+
+ constexpr uint64_t first_blob = 10;
+ constexpr uint64_t second_blob = 11;
+ constexpr uint64_t third_blob = 12;
+ constexpr uint64_t fourth_blob = 13;
+
+ {
+ constexpr char smallest[] = "bar1";
+ constexpr char largest[] = "foo1";
+ constexpr uint64_t file_size = 1000;
+
+ Add(level, first_sst, smallest, largest, file_size, first_blob);
+ }
+
+ {
+ constexpr char smallest[] = "bar2";
+ constexpr char largest[] = "foo2";
+ constexpr uint64_t file_size = 2000;
+
+ Add(level, second_sst, smallest, largest, file_size, first_blob);
+ }
+
+ {
+ constexpr char smallest[] = "bar3";
+ constexpr char largest[] = "foo3";
+ constexpr uint64_t file_size = 3000;
+
+ Add(level, third_sst, smallest, largest, file_size, third_blob);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 10;
+ constexpr uint64_t total_blob_bytes = 100000;
+ constexpr uint64_t garbage_blob_count = 2;
+ constexpr uint64_t garbage_blob_bytes = 15000;
+
+ AddBlob(first_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{first_sst, second_sst},
+ garbage_blob_count, garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 4;
+ constexpr uint64_t total_blob_bytes = 400000;
+ constexpr uint64_t garbage_blob_count = 3;
+ constexpr uint64_t garbage_blob_bytes = 235000;
+
+ AddBlob(second_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 20;
+ constexpr uint64_t total_blob_bytes = 1000000;
+ constexpr uint64_t garbage_blob_count = 8;
+ constexpr uint64_t garbage_blob_bytes = 123456;
+
+ AddBlob(third_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{third_sst}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ {
+ constexpr uint64_t total_blob_count = 128;
+ constexpr uint64_t total_blob_bytes = 789012345;
+ constexpr uint64_t garbage_blob_count = 67;
+ constexpr uint64_t garbage_blob_bytes = 88888888;
+
+ AddBlob(fourth_blob, total_blob_count, total_blob_bytes,
+ BlobFileMetaData::LinkedSsts{}, garbage_blob_count,
+ garbage_blob_bytes);
+ }
+
+ UpdateVersionStorageInfo();
+
+ assert(vstorage_.num_levels() > 0);
+ const auto& level_files = vstorage_.LevelFiles(level);
+
+ assert(level_files.size() == 3);
+ assert(level_files[0] && level_files[0]->fd.GetNumber() == first_sst);
+ assert(level_files[1] && level_files[1]->fd.GetNumber() == second_sst);
+ assert(level_files[2] && level_files[2]->fd.GetNumber() == third_sst);
+
+ // No blob files eligible for GC due to the age cutoff
+
+ {
+ constexpr double age_cutoff = 0.1;
+ constexpr double force_threshold = 0.0;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Part of the oldest batch of blob files (specifically, the second file) is
+ // ineligible for GC due to the age cutoff
+
+ {
+ constexpr double age_cutoff = 0.25;
+ constexpr double force_threshold = 0.0;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+ // is below threshold
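+  // (for the oldest batch, blob files #10 and #11, the combined garbage ratio
+  // is (15,000 + 235,000) / (100,000 + 400,000) = 0.5, so 0.6 is not met
+  // while 0.5 is)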
+
+ {
+ constexpr double age_cutoff = 0.5;
+ constexpr double force_threshold = 0.6;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+ // meets threshold
+
+ {
+ constexpr double age_cutoff = 0.5;
+ constexpr double force_threshold = 0.5;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+ ASSERT_EQ(ssts_to_be_compacted.size(), 2);
+
+ std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(),
+ [](const std::pair<int, FileMetaData*>& lhs,
+ const std::pair<int, FileMetaData*>& rhs) {
+ assert(lhs.second);
+ assert(rhs.second);
+ return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber();
+ });
+
+ const autovector<std::pair<int, FileMetaData*>>
+ expected_ssts_to_be_compacted{{level, level_files[0]},
+ {level, level_files[1]}};
+
+ ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+ ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]);
+ }
+
+ // Now try the last two cases again with a greater than necessary age cutoff
+
+ // Oldest batch is eligible based on age cutoff but its overall garbage ratio
+ // is below threshold
+
+ {
+ constexpr double age_cutoff = 0.75;
+ constexpr double force_threshold = 0.6;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ ASSERT_TRUE(vstorage_.FilesMarkedForForcedBlobGC().empty());
+ }
+
+ // Oldest batch is eligible based on age cutoff and its overall garbage ratio
+ // meets threshold
+
+ {
+ constexpr double age_cutoff = 0.75;
+ constexpr double force_threshold = 0.5;
+ vstorage_.ComputeFilesMarkedForForcedBlobGC(age_cutoff, force_threshold);
+
+ auto ssts_to_be_compacted = vstorage_.FilesMarkedForForcedBlobGC();
+ ASSERT_EQ(ssts_to_be_compacted.size(), 2);
+
+ std::sort(ssts_to_be_compacted.begin(), ssts_to_be_compacted.end(),
+ [](const std::pair<int, FileMetaData*>& lhs,
+ const std::pair<int, FileMetaData*>& rhs) {
+ assert(lhs.second);
+ assert(rhs.second);
+ return lhs.second->fd.GetNumber() < rhs.second->fd.GetNumber();
+ });
+
+ const autovector<std::pair<int, FileMetaData*>>
+ expected_ssts_to_be_compacted{{level, level_files[0]},
+ {level, level_files[1]}};
+
+ ASSERT_EQ(ssts_to_be_compacted[0], expected_ssts_to_be_compacted[0]);
+ ASSERT_EQ(ssts_to_be_compacted[1], expected_ssts_to_be_compacted[1]);
+ }
+}
+
+class VersionStorageInfoTimestampTest : public VersionStorageInfoTestBase {
+ public:
+ VersionStorageInfoTimestampTest()
+ : VersionStorageInfoTestBase(test::BytewiseComparatorWithU64TsWrapper()) {
+ }
+ ~VersionStorageInfoTimestampTest() override {}
+ std::string Timestamp(uint64_t ts) const {
+ std::string ret;
+ PutFixed64(&ret, ts);
+ return ret;
+ }
+ std::string PackUserKeyAndTimestamp(const Slice& ukey, uint64_t ts) const {
+ std::string ret;
+ ret.assign(ukey.data(), ukey.size());
+ PutFixed64(&ret, ts);
+ return ret;
+ }
+};
+
+TEST_F(VersionStorageInfoTimestampTest, GetOverlappingInputs) {
+ Add(/*level=*/1, /*file_number=*/1, /*smallest=*/
+ {PackUserKeyAndTimestamp("a", /*ts=*/9), /*s=*/0, kTypeValue},
+ /*largest=*/
+ {PackUserKeyAndTimestamp("a", /*ts=*/8), /*s=*/0, kTypeValue},
+ /*file_size=*/100);
+ Add(/*level=*/1, /*file_number=*/2, /*smallest=*/
+ {PackUserKeyAndTimestamp("a", /*ts=*/5), /*s=*/0, kTypeValue},
+ /*largest=*/
+ {PackUserKeyAndTimestamp("b", /*ts=*/10), /*s=*/0, kTypeValue},
+ /*file_size=*/100);
+ Add(/*level=*/1, /*file_number=*/3, /*smallest=*/
+ {PackUserKeyAndTimestamp("c", /*ts=*/12), /*s=*/0, kTypeValue},
+ /*largest=*/
+ {PackUserKeyAndTimestamp("d", /*ts=*/1), /*s=*/0, kTypeValue},
+ /*file_size=*/100);
+
+ UpdateVersionStorageInfo();
+
+ ASSERT_EQ(
+ "1,2",
+ GetOverlappingFiles(
+ /*level=*/1,
+ {PackUserKeyAndTimestamp("a", /*ts=*/12), /*s=*/0, kTypeValue},
+ {PackUserKeyAndTimestamp("a", /*ts=*/11), /*s=*/0, kTypeValue}));
+ ASSERT_EQ("3",
+ GetOverlappingFiles(
+ /*level=*/1,
+ {PackUserKeyAndTimestamp("c", /*ts=*/15), /*s=*/0, kTypeValue},
+ {PackUserKeyAndTimestamp("c", /*ts=*/2), /*s=*/0, kTypeValue}));
+}
+
+class FindLevelFileTest : public testing::Test {
+ public:
+ LevelFilesBrief file_level_;
+ bool disjoint_sorted_files_;
+ Arena arena_;
+
+ FindLevelFileTest() : disjoint_sorted_files_(true) {}
+
+ ~FindLevelFileTest() override {}
+
+ void LevelFileInit(size_t num = 0) {
+ char* mem = arena_.AllocateAligned(num * sizeof(FdWithKeyRange));
+ file_level_.files = new (mem) FdWithKeyRange[num];
+ file_level_.num_files = 0;
+ }
+
+ void Add(const char* smallest, const char* largest,
+ SequenceNumber smallest_seq = 100,
+ SequenceNumber largest_seq = 100) {
+ InternalKey smallest_key = InternalKey(smallest, smallest_seq, kTypeValue);
+ InternalKey largest_key = InternalKey(largest, largest_seq, kTypeValue);
+
+ Slice smallest_slice = smallest_key.Encode();
+ Slice largest_slice = largest_key.Encode();
+
+ char* mem =
+ arena_.AllocateAligned(smallest_slice.size() + largest_slice.size());
+ memcpy(mem, smallest_slice.data(), smallest_slice.size());
+ memcpy(mem + smallest_slice.size(), largest_slice.data(),
+ largest_slice.size());
+
+ // add to file_level_
+ size_t num = file_level_.num_files;
+ auto& file = file_level_.files[num];
+ file.fd = FileDescriptor(num + 1, 0, 0);
+ file.smallest_key = Slice(mem, smallest_slice.size());
+ file.largest_key = Slice(mem + smallest_slice.size(), largest_slice.size());
+ file_level_.num_files++;
+ }
+
+ int Find(const char* key) {
+ InternalKey target(key, 100, kTypeValue);
+ InternalKeyComparator cmp(BytewiseComparator());
+ return FindFile(cmp, file_level_, target.Encode());
+ }
+
+ bool Overlaps(const char* smallest, const char* largest) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ Slice s(smallest != nullptr ? smallest : "");
+ Slice l(largest != nullptr ? largest : "");
+ return SomeFileOverlapsRange(cmp, disjoint_sorted_files_, file_level_,
+ (smallest != nullptr ? &s : nullptr),
+ (largest != nullptr ? &l : nullptr));
+ }
+};
+
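+// FindFile returns the index of the first file whose largest key is >= the
+// target, so a result equal to num_files means the target sorts after every
+// file in the level.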
+TEST_F(FindLevelFileTest, LevelEmpty) {
+ LevelFileInit(0);
+
+ ASSERT_EQ(0, Find("foo"));
+ ASSERT_TRUE(!Overlaps("a", "z"));
+ ASSERT_TRUE(!Overlaps(nullptr, "z"));
+ ASSERT_TRUE(!Overlaps("a", nullptr));
+ ASSERT_TRUE(!Overlaps(nullptr, nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelSingle) {
+ LevelFileInit(1);
+
+ Add("p", "q");
+ ASSERT_EQ(0, Find("a"));
+ ASSERT_EQ(0, Find("p"));
+ ASSERT_EQ(0, Find("p1"));
+ ASSERT_EQ(0, Find("q"));
+ ASSERT_EQ(1, Find("q1"));
+ ASSERT_EQ(1, Find("z"));
+
+ ASSERT_TRUE(!Overlaps("a", "b"));
+ ASSERT_TRUE(!Overlaps("z1", "z2"));
+ ASSERT_TRUE(Overlaps("a", "p"));
+ ASSERT_TRUE(Overlaps("a", "q"));
+ ASSERT_TRUE(Overlaps("a", "z"));
+ ASSERT_TRUE(Overlaps("p", "p1"));
+ ASSERT_TRUE(Overlaps("p", "q"));
+ ASSERT_TRUE(Overlaps("p", "z"));
+ ASSERT_TRUE(Overlaps("p1", "p2"));
+ ASSERT_TRUE(Overlaps("p1", "z"));
+ ASSERT_TRUE(Overlaps("q", "q"));
+ ASSERT_TRUE(Overlaps("q", "q1"));
+
+ ASSERT_TRUE(!Overlaps(nullptr, "j"));
+ ASSERT_TRUE(!Overlaps("r", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, "p"));
+ ASSERT_TRUE(Overlaps(nullptr, "p1"));
+ ASSERT_TRUE(Overlaps("q", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelMultiple) {
+ LevelFileInit(4);
+
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_EQ(0, Find("100"));
+ ASSERT_EQ(0, Find("150"));
+ ASSERT_EQ(0, Find("151"));
+ ASSERT_EQ(0, Find("199"));
+ ASSERT_EQ(0, Find("200"));
+ ASSERT_EQ(1, Find("201"));
+ ASSERT_EQ(1, Find("249"));
+ ASSERT_EQ(1, Find("250"));
+ ASSERT_EQ(2, Find("251"));
+ ASSERT_EQ(2, Find("299"));
+ ASSERT_EQ(2, Find("300"));
+ ASSERT_EQ(2, Find("349"));
+ ASSERT_EQ(2, Find("350"));
+ ASSERT_EQ(3, Find("351"));
+ ASSERT_EQ(3, Find("400"));
+ ASSERT_EQ(3, Find("450"));
+ ASSERT_EQ(4, Find("451"));
+
+ ASSERT_TRUE(!Overlaps("100", "149"));
+ ASSERT_TRUE(!Overlaps("251", "299"));
+ ASSERT_TRUE(!Overlaps("451", "500"));
+ ASSERT_TRUE(!Overlaps("351", "399"));
+
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+}
+
+TEST_F(FindLevelFileTest, LevelMultipleNullBoundaries) {
+ LevelFileInit(4);
+
+ Add("150", "200");
+ Add("200", "250");
+ Add("300", "350");
+ Add("400", "450");
+ ASSERT_TRUE(!Overlaps(nullptr, "149"));
+ ASSERT_TRUE(!Overlaps("451", nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, nullptr));
+ ASSERT_TRUE(Overlaps(nullptr, "150"));
+ ASSERT_TRUE(Overlaps(nullptr, "199"));
+ ASSERT_TRUE(Overlaps(nullptr, "200"));
+ ASSERT_TRUE(Overlaps(nullptr, "201"));
+ ASSERT_TRUE(Overlaps(nullptr, "400"));
+ ASSERT_TRUE(Overlaps(nullptr, "800"));
+ ASSERT_TRUE(Overlaps("100", nullptr));
+ ASSERT_TRUE(Overlaps("200", nullptr));
+ ASSERT_TRUE(Overlaps("449", nullptr));
+ ASSERT_TRUE(Overlaps("450", nullptr));
+}
+
+TEST_F(FindLevelFileTest, LevelOverlapSequenceChecks) {
+ LevelFileInit(1);
+
+ Add("200", "200", 5000, 3000);
+ ASSERT_TRUE(!Overlaps("199", "199"));
+ ASSERT_TRUE(!Overlaps("201", "300"));
+ ASSERT_TRUE(Overlaps("200", "200"));
+ ASSERT_TRUE(Overlaps("190", "200"));
+ ASSERT_TRUE(Overlaps("200", "210"));
+}
+
+TEST_F(FindLevelFileTest, LevelOverlappingFiles) {
+ LevelFileInit(2);
+
+ Add("150", "600");
+ Add("400", "500");
+ disjoint_sorted_files_ = false;
+ ASSERT_TRUE(!Overlaps("100", "149"));
+ ASSERT_TRUE(!Overlaps("601", "700"));
+ ASSERT_TRUE(Overlaps("100", "150"));
+ ASSERT_TRUE(Overlaps("100", "200"));
+ ASSERT_TRUE(Overlaps("100", "300"));
+ ASSERT_TRUE(Overlaps("100", "400"));
+ ASSERT_TRUE(Overlaps("100", "500"));
+ ASSERT_TRUE(Overlaps("375", "400"));
+ ASSERT_TRUE(Overlaps("450", "450"));
+ ASSERT_TRUE(Overlaps("450", "500"));
+ ASSERT_TRUE(Overlaps("450", "700"));
+ ASSERT_TRUE(Overlaps("600", "700"));
+}
+
+class VersionSetTestBase {
+ public:
+ const static std::string kColumnFamilyName1;
+ const static std::string kColumnFamilyName2;
+ const static std::string kColumnFamilyName3;
+ int num_initial_edits_;
+
+ explicit VersionSetTestBase(const std::string& name)
+ : env_(nullptr),
+ dbname_(test::PerThreadDBPath(name)),
+ options_(),
+ db_options_(options_),
+ cf_options_(options_),
+ immutable_options_(db_options_, cf_options_),
+ mutable_cf_options_(cf_options_),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ shutting_down_(false),
+ mock_table_factory_(std::make_shared<mock::MockTableFactory>()) {
+ EXPECT_OK(test::CreateEnvFromSystem(ConfigOptions(), &env_, &env_guard_));
+ if (env_ == Env::Default() && getenv("MEM_ENV")) {
+ env_guard_.reset(NewMemEnv(Env::Default()));
+ env_ = env_guard_.get();
+ }
+ EXPECT_NE(nullptr, env_);
+
+ fs_ = env_->GetFileSystem();
+ EXPECT_OK(fs_->CreateDirIfMissing(dbname_, IOOptions(), nullptr));
+
+ options_.env = env_;
+ db_options_.env = env_;
+ db_options_.fs = fs_;
+ immutable_options_.env = env_;
+ immutable_options_.fs = fs_;
+ immutable_options_.clock = env_->GetSystemClock().get();
+
+ versions_.reset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ reactive_versions_ = std::make_shared<ReactiveVersionSet>(
+ dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_, nullptr);
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ }
+
+ virtual ~VersionSetTestBase() {
+ if (getenv("KEEP_DB")) {
+ fprintf(stdout, "DB is still at %s\n", dbname_.c_str());
+ } else {
+ Options options;
+ options.env = env_;
+ EXPECT_OK(DestroyDB(dbname_, options));
+ }
+ }
+
+ protected:
+ virtual void PrepareManifest(
+ std::vector<ColumnFamilyDescriptor>* column_families,
+ SequenceNumber* last_seqno, std::unique_ptr<log::Writer>* log_writer) {
+ assert(column_families != nullptr);
+ assert(last_seqno != nullptr);
+ assert(log_writer != nullptr);
+ VersionEdit new_db;
+ if (db_options_.write_dbid_to_manifest) {
+ DBOptions tmp_db_options;
+ tmp_db_options.env = env_;
+ std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ new_db.SetLogNumber(0);
+ new_db.SetNextFile(2);
+ new_db.SetLastSequence(0);
+
+ const std::vector<std::string> cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ const int kInitialNumOfCfs = static_cast<int>(cf_names.size());
+ autovector<VersionEdit> new_cfs;
+ uint64_t last_seq = 1;
+ uint32_t cf_id = 1;
+ for (int i = 1; i != kInitialNumOfCfs; ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(cf_names[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ new_cf.SetLogNumber(0);
+ new_cf.SetNextFile(2);
+ new_cf.SetLastSequence(last_seq++);
+ new_cfs.emplace_back(new_cf);
+ }
+ *last_seqno = last_seq;
+ num_initial_edits_ = static_cast<int>(new_cfs.size() + 1);
+ std::unique_ptr<WritableFileWriter> file_writer;
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ const auto& fs = env_->GetFileSystem();
+ Status s = WritableFileWriter::Create(
+ fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
+ nullptr);
+ ASSERT_OK(s);
+ {
+ log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = (*log_writer)->AddRecord(record);
+ for (const auto& e : new_cfs) {
+ record.clear();
+ e.EncodeTo(&record);
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ }
+ ASSERT_OK(s);
+
+ cf_options_.table_factory = mock_table_factory_;
+ for (const auto& cf_name : cf_names) {
+ column_families->emplace_back(cf_name, cf_options_);
+ }
+ }
+
+  // Create a DB with the default column family plus three additional column
+  // families.
+ void NewDB() {
+ SequenceNumber last_seqno;
+ std::unique_ptr<log::Writer> log_writer;
+ SetIdentityFile(env_, dbname_);
+ PrepareManifest(&column_families_, &last_seqno, &log_writer);
+ log_writer.reset();
+ // Make "CURRENT" file point to the new manifest file.
+ Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ }
+
+ void ReopenDB() {
+ versions_.reset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ }
+
+ void VerifyManifest(std::string* manifest_path) const {
+ assert(manifest_path != nullptr);
+ uint64_t manifest_file_number = 0;
+ Status s = versions_->GetCurrentManifestPath(
+ dbname_, fs_.get(), manifest_path, &manifest_file_number);
+ ASSERT_OK(s);
+ ASSERT_EQ(1, manifest_file_number);
+ }
+
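+  // Helpers that apply version edits to the default column family under the
+  // DB mutex, mirroring how DBImpl normally drives VersionSet::LogAndApply.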
+ Status LogAndApplyToDefaultCF(VersionEdit& edit) {
+ mutex_.Lock();
+ Status s =
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, &edit, &mutex_, nullptr);
+ mutex_.Unlock();
+ return s;
+ }
+
+ Status LogAndApplyToDefaultCF(
+ const autovector<std::unique_ptr<VersionEdit>>& edits) {
+ autovector<VersionEdit*> vedits;
+ for (auto& e : edits) {
+ vedits.push_back(e.get());
+ }
+ mutex_.Lock();
+ Status s =
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, vedits, &mutex_, nullptr);
+ mutex_.Unlock();
+ return s;
+ }
+
+ void CreateNewManifest() {
+ constexpr FSDirectory* db_directory = nullptr;
+ constexpr bool new_descriptor_log = true;
+ mutex_.Lock();
+ VersionEdit dummy;
+ ASSERT_OK(versions_->LogAndApply(
+ versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_,
+ &dummy, &mutex_, db_directory, new_descriptor_log));
+ mutex_.Unlock();
+ }
+
+ ColumnFamilyData* CreateColumnFamily(const std::string& cf_name,
+ const ColumnFamilyOptions& cf_options) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(cf_name);
+ uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
+ new_cf.SetColumnFamily(new_id);
+ new_cf.SetLogNumber(0);
+ new_cf.SetComparatorName(cf_options.comparator->Name());
+ Status s;
+ mutex_.Lock();
+ s = versions_->LogAndApply(/*column_family_data=*/nullptr,
+ MutableCFOptions(cf_options), &new_cf, &mutex_,
+ /*db_directory=*/nullptr,
+ /*new_descriptor_log=*/false, &cf_options);
+ mutex_.Unlock();
+ EXPECT_OK(s);
+ ColumnFamilyData* cfd =
+ versions_->GetColumnFamilySet()->GetColumnFamily(cf_name);
+ EXPECT_NE(nullptr, cfd);
+ return cfd;
+ }
+
+ Env* mem_env_;
+ Env* env_;
+ std::shared_ptr<Env> env_guard_;
+ std::shared_ptr<FileSystem> fs_;
+ const std::string dbname_;
+ EnvOptions env_options_;
+ Options options_;
+ ImmutableDBOptions db_options_;
+ ColumnFamilyOptions cf_options_;
+ ImmutableOptions immutable_options_;
+ MutableCFOptions mutable_cf_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteController write_controller_;
+ WriteBufferManager write_buffer_manager_;
+ std::shared_ptr<VersionSet> versions_;
+ std::shared_ptr<ReactiveVersionSet> reactive_versions_;
+ InstrumentedMutex mutex_;
+ std::atomic<bool> shutting_down_;
+ std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
+ std::vector<ColumnFamilyDescriptor> column_families_;
+};
+
+const std::string VersionSetTestBase::kColumnFamilyName1 = "alice";
+const std::string VersionSetTestBase::kColumnFamilyName2 = "bob";
+const std::string VersionSetTestBase::kColumnFamilyName3 = "charles";
+
+class VersionSetTest : public VersionSetTestBase, public testing::Test {
+ public:
+ VersionSetTest() : VersionSetTestBase("version_set_test") {}
+};
+
+TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) {
+ NewDB();
+ const int kGroupSize = 5;
+ autovector<VersionEdit> edits;
+ for (int i = 0; i != kGroupSize; ++i) {
+ edits.emplace_back(VersionEdit());
+ }
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const MutableCFOptions*> all_mutable_cf_options;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ for (int i = 0; i != kGroupSize; ++i) {
+ cfds.emplace_back(versions_->GetColumnFamilySet()->GetDefault());
+ all_mutable_cf_options.emplace_back(&mutable_cf_options_);
+ autovector<VersionEdit*> edit_list;
+ edit_list.emplace_back(&edits[i]);
+ edit_lists.emplace_back(edit_list);
+ }
+
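+  // All edits in the group target the default column family, so the
+  // SameColumnFamily sync point is expected to fire once for every edit after
+  // the first one.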
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ int count = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) {
+ uint32_t* cf_id = reinterpret_cast<uint32_t*>(arg);
+ EXPECT_EQ(0u, *cf_id);
+ ++count;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ mutex_.Lock();
+ Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists,
+ &mutex_, nullptr);
+ mutex_.Unlock();
+ EXPECT_OK(s);
+ EXPECT_EQ(kGroupSize - 1, count);
+}
+
+TEST_F(VersionSetTest, PersistBlobFileStateInNewManifest) {
+ // Initialize the database and add a couple of blob files, one with some
+ // garbage in it, and one without any garbage.
+ NewDB();
+
+ assert(versions_);
+ assert(versions_->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const version = cfd->current();
+ assert(version);
+
+ VersionStorageInfo* const storage_info = version->storage_info();
+ assert(storage_info);
+
+ {
+ constexpr uint64_t blob_file_number = 123;
+ constexpr uint64_t total_blob_count = 456;
+ constexpr uint64_t total_blob_bytes = 77777777;
+ constexpr char checksum_method[] = "SHA1";
+ constexpr char checksum_value[] =
+ "\xbd\xb7\xf3\x4a\x59\xdf\xa1\x59\x2c\xe7\xf5\x2e\x99\xf9\x8c\x57\x0c"
+ "\x52\x5c\xbd";
+
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+ checksum_value);
+
+ constexpr uint64_t garbage_blob_count = 89;
+ constexpr uint64_t garbage_blob_bytes = 1000000;
+
+ auto meta = BlobFileMetaData::Create(
+ std::move(shared_meta), BlobFileMetaData::LinkedSsts(),
+ garbage_blob_count, garbage_blob_bytes);
+
+ storage_info->AddBlobFile(std::move(meta));
+ }
+
+ {
+ constexpr uint64_t blob_file_number = 234;
+ constexpr uint64_t total_blob_count = 555;
+ constexpr uint64_t total_blob_bytes = 66666;
+ constexpr char checksum_method[] = "CRC32";
+ constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+
+ auto shared_meta = SharedBlobFileMetaData::Create(
+ blob_file_number, total_blob_count, total_blob_bytes, checksum_method,
+ checksum_value);
+
+ constexpr uint64_t garbage_blob_count = 0;
+ constexpr uint64_t garbage_blob_bytes = 0;
+
+ auto meta = BlobFileMetaData::Create(
+ std::move(shared_meta), BlobFileMetaData::LinkedSsts(),
+ garbage_blob_count, garbage_blob_bytes);
+
+ storage_info->AddBlobFile(std::move(meta));
+ }
+
+ // Force the creation of a new manifest file and make sure metadata for
+ // the blob files is re-persisted.
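+  // Both blob files should produce a BlobFileAddition record, but only the
+  // first one, which has garbage, should also produce a BlobFileGarbage
+  // record.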
+ size_t addition_encoded = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileAddition::EncodeTo::CustomFields",
+ [&](void* /* arg */) { ++addition_encoded; });
+
+ size_t garbage_encoded = 0;
+ SyncPoint::GetInstance()->SetCallBack(
+ "BlobFileGarbage::EncodeTo::CustomFields",
+ [&](void* /* arg */) { ++garbage_encoded; });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateNewManifest();
+
+ ASSERT_EQ(addition_encoded, 2);
+ ASSERT_EQ(garbage_encoded, 1);
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(VersionSetTest, AddLiveBlobFiles) {
+ // Initialize the database and add a blob file.
+ NewDB();
+
+ assert(versions_);
+ assert(versions_->GetColumnFamilySet());
+
+ ColumnFamilyData* const cfd = versions_->GetColumnFamilySet()->GetDefault();
+ assert(cfd);
+
+ Version* const first_version = cfd->current();
+ assert(first_version);
+
+ VersionStorageInfo* const first_storage_info = first_version->storage_info();
+ assert(first_storage_info);
+
+ constexpr uint64_t first_blob_file_number = 234;
+ constexpr uint64_t first_total_blob_count = 555;
+ constexpr uint64_t first_total_blob_bytes = 66666;
+ constexpr char first_checksum_method[] = "CRC32";
+ constexpr char first_checksum_value[] = "\x3d\x87\xff\x57";
+
+ auto first_shared_meta = SharedBlobFileMetaData::Create(
+ first_blob_file_number, first_total_blob_count, first_total_blob_bytes,
+ first_checksum_method, first_checksum_value);
+
+ constexpr uint64_t garbage_blob_count = 0;
+ constexpr uint64_t garbage_blob_bytes = 0;
+
+ auto first_meta = BlobFileMetaData::Create(
+ std::move(first_shared_meta), BlobFileMetaData::LinkedSsts(),
+ garbage_blob_count, garbage_blob_bytes);
+
+ first_storage_info->AddBlobFile(first_meta);
+
+ // Reference the version so it stays alive even after the following version
+ // edit.
+ first_version->Ref();
+
+ // Get live files directly from version.
+ std::vector<uint64_t> version_table_files;
+ std::vector<uint64_t> version_blob_files;
+
+ first_version->AddLiveFiles(&version_table_files, &version_blob_files);
+
+ ASSERT_EQ(version_blob_files.size(), 1);
+ ASSERT_EQ(version_blob_files[0], first_blob_file_number);
+
+ // Create a new version containing an additional blob file.
+ versions_->TEST_CreateAndAppendVersion(cfd);
+
+ Version* const second_version = cfd->current();
+ assert(second_version);
+ assert(second_version != first_version);
+
+ VersionStorageInfo* const second_storage_info =
+ second_version->storage_info();
+ assert(second_storage_info);
+
+ constexpr uint64_t second_blob_file_number = 456;
+ constexpr uint64_t second_total_blob_count = 100;
+ constexpr uint64_t second_total_blob_bytes = 2000000;
+ constexpr char second_checksum_method[] = "CRC32B";
+ constexpr char second_checksum_value[] = "\x6d\xbd\xf2\x3a";
+
+ auto second_shared_meta = SharedBlobFileMetaData::Create(
+ second_blob_file_number, second_total_blob_count, second_total_blob_bytes,
+ second_checksum_method, second_checksum_value);
+
+ auto second_meta = BlobFileMetaData::Create(
+ std::move(second_shared_meta), BlobFileMetaData::LinkedSsts(),
+ garbage_blob_count, garbage_blob_bytes);
+
+ second_storage_info->AddBlobFile(std::move(first_meta));
+ second_storage_info->AddBlobFile(std::move(second_meta));
+
+ // Get all live files from version set. Note that the result contains
+ // duplicates.
+ std::vector<uint64_t> all_table_files;
+ std::vector<uint64_t> all_blob_files;
+
+ versions_->AddLiveFiles(&all_table_files, &all_blob_files);
+
+ ASSERT_EQ(all_blob_files.size(), 3);
+ ASSERT_EQ(all_blob_files[0], first_blob_file_number);
+ ASSERT_EQ(all_blob_files[1], first_blob_file_number);
+ ASSERT_EQ(all_blob_files[2], second_blob_file_number);
+
+ // Clean up previous version.
+ first_version->Unref();
+}
+
+TEST_F(VersionSetTest, ObsoleteBlobFile) {
+ // Initialize the database and add a blob file that is entirely garbage
+ // and thus can immediately be marked obsolete.
+ NewDB();
+
+ VersionEdit edit;
+
+ constexpr uint64_t blob_file_number = 234;
+ constexpr uint64_t total_blob_count = 555;
+ constexpr uint64_t total_blob_bytes = 66666;
+ constexpr char checksum_method[] = "CRC32";
+ constexpr char checksum_value[] = "\x3d\x87\xff\x57";
+
+ edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes,
+ checksum_method, checksum_value);
+
+ edit.AddBlobFileGarbage(blob_file_number, total_blob_count, total_blob_bytes);
+
+ mutex_.Lock();
+ Status s =
+ versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
+ mutable_cf_options_, &edit, &mutex_, nullptr);
+ mutex_.Unlock();
+
+ ASSERT_OK(s);
+
+ // Make sure blob files from the pending number range are not returned
+ // as obsolete.
+ {
+ std::vector<ObsoleteFileInfo> table_files;
+ std::vector<ObsoleteBlobFileInfo> blob_files;
+ std::vector<std::string> manifest_files;
+ constexpr uint64_t min_pending_output = blob_file_number;
+
+ versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+ min_pending_output);
+
+ ASSERT_TRUE(blob_files.empty());
+ }
+
+ // Make sure the blob file is returned as obsolete if it's not in the pending
+ // range.
+ {
+ std::vector<ObsoleteFileInfo> table_files;
+ std::vector<ObsoleteBlobFileInfo> blob_files;
+ std::vector<std::string> manifest_files;
+ constexpr uint64_t min_pending_output = blob_file_number + 1;
+
+ versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+ min_pending_output);
+
+ ASSERT_EQ(blob_files.size(), 1);
+ ASSERT_EQ(blob_files[0].GetBlobFileNumber(), blob_file_number);
+ }
+
+ // Make sure it's not returned a second time.
+ {
+ std::vector<ObsoleteFileInfo> table_files;
+ std::vector<ObsoleteBlobFileInfo> blob_files;
+ std::vector<std::string> manifest_files;
+ constexpr uint64_t min_pending_output = blob_file_number + 1;
+
+ versions_->GetObsoleteFiles(&table_files, &blob_files, &manifest_files,
+ min_pending_output);
+
+ ASSERT_TRUE(blob_files.empty());
+ }
+}
+
+TEST_F(VersionSetTest, WalEditsNotAppliedToVersion) {
+ NewDB();
+
+ constexpr uint64_t kNumWals = 5;
+
+ autovector<std::unique_ptr<VersionEdit>> edits;
+ // Add some WALs.
+ for (uint64_t i = 1; i <= kNumWals; i++) {
+ edits.emplace_back(new VersionEdit);
+ // WAL's size equals its log number.
+ edits.back()->AddWal(i, WalMetadata(i));
+ }
+ // Delete the first half of the WALs.
+ edits.emplace_back(new VersionEdit);
+ edits.back()->DeleteWalsBefore(kNumWals / 2 + 1);
+
+ autovector<Version*> versions;
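+ // Capture the Version* (possibly nullptr) that is passed to the sync point
+ // when the manifest writes are processed.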
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:NewVersion",
+ [&](void* arg) { versions.push_back(reinterpret_cast<Version*>(arg)); });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edits));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Since the edits are all WAL edits, no version should be created.
+ ASSERT_EQ(versions.size(), 1);
+ ASSERT_EQ(versions[0], nullptr);
+}
+
+// Similar to WalEditsNotAppliedToVersion, but contains a non-WAL edit.
+TEST_F(VersionSetTest, NonWalEditsAppliedToVersion) {
+ NewDB();
+
+ const std::string kDBId = "db_db";
+ constexpr uint64_t kNumWals = 5;
+
+ autovector<std::unique_ptr<VersionEdit>> edits;
+ // Add some WALs.
+ for (uint64_t i = 1; i <= kNumWals; i++) {
+ edits.emplace_back(new VersionEdit);
+ // WAL's size equals its log number.
+ edits.back()->AddWal(i, WalMetadata(i));
+ }
+ // Delete the first half of the WALs.
+ edits.emplace_back(new VersionEdit);
+ edits.back()->DeleteWalsBefore(kNumWals / 2 + 1);
+ edits.emplace_back(new VersionEdit);
+ edits.back()->SetDBId(kDBId);
+
+ autovector<Version*> versions;
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:NewVersion",
+ [&](void* arg) { versions.push_back(reinterpret_cast<Version*>(arg)); });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edits));
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ // Since the edit list contains a non-WAL edit, a new version should be
+ // created.
+ ASSERT_EQ(versions.size(), 1);
+ ASSERT_NE(versions[0], nullptr);
+}
+
+TEST_F(VersionSetTest, WalAddition) {
+ NewDB();
+
+ constexpr WalNumber kLogNumber = 10;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ // A WAL has just been created.
+ {
+ VersionEdit edit;
+ edit.AddWal(kLogNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize());
+ }
+
+ // The WAL is synced several times before closing.
+ {
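+ // Each iteration records a strictly larger synced size that is still smaller
+ // than kSizeInBytes, simulating repeated syncs of a growing WAL.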
+ for (uint64_t size_delta = 100; size_delta > 0; size_delta /= 2) {
+ uint64_t size = kSizeInBytes - size_delta;
+ WalMetadata wal(size);
+ VersionEdit edit;
+ edit.AddWal(kLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), size);
+ }
+ }
+
+ // The WAL is closed.
+ {
+ WalMetadata wal(kSizeInBytes);
+ VersionEdit edit;
+ edit.AddWal(kLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes);
+ }
+
+ // Recover a new VersionSet.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, /*read_only=*/false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSizeInBytes);
+ }
+}
+
+TEST_F(VersionSetTest, WalCloseWithoutSync) {
+ NewDB();
+
+ constexpr WalNumber kLogNumber = 10;
+ constexpr uint64_t kSizeInBytes = 111;
+ constexpr uint64_t kSyncedSizeInBytes = kSizeInBytes / 2;
+
+ // A WAL has just been created.
+ {
+ VersionEdit edit;
+ edit.AddWal(kLogNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kLogNumber).HasSyncedSize());
+ }
+
+ // The WAL is partially synced before closing.
+ {
+ WalMetadata wal(kSyncedSizeInBytes);
+ VersionEdit edit;
+ edit.AddWal(kLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes);
+ }
+
+ // A new WAL with a larger log number is created, implicitly marking the
+ // current WAL closed.
+ {
+ VersionEdit edit;
+ edit.AddWal(kLogNumber + 1);
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 2);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes);
+ ASSERT_TRUE(wals.find(kLogNumber + 1) != wals.end());
+ ASSERT_FALSE(wals.at(kLogNumber + 1).HasSyncedSize());
+ }
+
+ // Recover a new VersionSet.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 2);
+ ASSERT_TRUE(wals.find(kLogNumber) != wals.end());
+ ASSERT_TRUE(wals.at(kLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kLogNumber).GetSyncedSizeInBytes(), kSyncedSizeInBytes);
+ }
+}
+
+TEST_F(VersionSetTest, WalDeletion) {
+ NewDB();
+
+ constexpr WalNumber kClosedLogNumber = 10;
+ constexpr WalNumber kNonClosedLogNumber = 20;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ // Add a non-closed and a closed WAL.
+ {
+ VersionEdit edit;
+ edit.AddWal(kClosedLogNumber, WalMetadata(kSizeInBytes));
+ edit.AddWal(kNonClosedLogNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 2);
+ ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+ ASSERT_TRUE(wals.find(kClosedLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+ ASSERT_TRUE(wals.at(kClosedLogNumber).HasSyncedSize());
+ ASSERT_EQ(wals.at(kClosedLogNumber).GetSyncedSizeInBytes(), kSizeInBytes);
+ }
+
+ // Delete the closed WAL.
+ {
+ VersionEdit edit;
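+ // DeleteWalsBefore(n) is expected to remove the WALs with log numbers smaller
+ // than n, so only the closed WAL should be deleted here.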
+ edit.DeleteWalsBefore(kNonClosedLogNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ const auto& wals = versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+ }
+
+ // Recover a new VersionSet, only the non-closed WAL should show up.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+ }
+
+ // Force the creation of a new MANIFEST file,
+ // only the non-closed WAL should be written to the new MANIFEST.
+ {
+ std::vector<WalAddition> wal_additions;
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::WriteCurrentStateToManifest:SaveWal", [&](void* arg) {
+ VersionEdit* edit = reinterpret_cast<VersionEdit*>(arg);
+ ASSERT_TRUE(edit->IsWalAddition());
+ for (auto& addition : edit->GetWalAdditions()) {
+ wal_additions.push_back(addition);
+ }
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+
+ CreateNewManifest();
+
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+
+ ASSERT_EQ(wal_additions.size(), 1);
+ ASSERT_EQ(wal_additions[0].GetLogNumber(), kNonClosedLogNumber);
+ ASSERT_FALSE(wal_additions[0].GetMetadata().HasSyncedSize());
+ }
+
+ // Recover from the new MANIFEST, only the non-closed WAL should show up.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kNonClosedLogNumber) != wals.end());
+ ASSERT_FALSE(wals.at(kNonClosedLogNumber).HasSyncedSize());
+ }
+}
+
+TEST_F(VersionSetTest, WalCreateTwice) {
+ NewDB();
+
+ constexpr WalNumber kLogNumber = 10;
+
+ VersionEdit edit;
+ edit.AddWal(kLogNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+
+ Status s = LogAndApplyToDefaultCF(edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") !=
+ std::string::npos)
+ << s.ToString();
+}
+
+TEST_F(VersionSetTest, WalCreateAfterClose) {
+ NewDB();
+
+ constexpr WalNumber kLogNumber = 10;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ {
+ // Add a closed WAL.
+ VersionEdit edit;
+ edit.AddWal(kLogNumber);
+ WalMetadata wal(kSizeInBytes);
+ edit.AddWal(kLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+
+ {
+ // Create the same WAL again.
+ VersionEdit edit;
+ edit.AddWal(kLogNumber);
+
+ Status s = LogAndApplyToDefaultCF(edit);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("WAL 10 is created more than once") !=
+ std::string::npos)
+ << s.ToString();
+ }
+}
+
+TEST_F(VersionSetTest, AddWalWithSmallerSize) {
+ NewDB();
+ assert(versions_);
+
+ constexpr WalNumber kLogNumber = 10;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ {
+ // Add a closed WAL.
+ VersionEdit edit;
+ WalMetadata wal(kSizeInBytes);
+ edit.AddWal(kLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+ // Copy for future comparison.
+ const std::map<WalNumber, WalMetadata> wals1 =
+ versions_->GetWalSet().GetWals();
+
+ {
+ // Add the same WAL with smaller synced size.
+ VersionEdit edit;
+ WalMetadata wal(kSizeInBytes / 2);
+ edit.AddWal(kLogNumber, wal);
+
+ Status s = LogAndApplyToDefaultCF(edit);
+ ASSERT_OK(s);
+ }
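+ // The edit with the smaller synced size is accepted, but the recorded WAL
+ // metadata should remain unchanged.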
+ const std::map<WalNumber, WalMetadata> wals2 =
+ versions_->GetWalSet().GetWals();
+ ASSERT_EQ(wals1, wals2);
+}
+
+TEST_F(VersionSetTest, DeleteWalsBeforeNonExistingWalNumber) {
+ NewDB();
+
+ constexpr WalNumber kLogNumber0 = 10;
+ constexpr WalNumber kLogNumber1 = 20;
+ constexpr WalNumber kNonExistingNumber = 15;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ {
+ // Add closed WALs.
+ VersionEdit edit;
+ WalMetadata wal(kSizeInBytes);
+ edit.AddWal(kLogNumber0, wal);
+ edit.AddWal(kLogNumber1, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+
+ {
+ // Delete WALs before a non-existing WAL.
+ VersionEdit edit;
+ edit.DeleteWalsBefore(kNonExistingNumber);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+
+ // Recover a new VersionSet, WAL0 is deleted, WAL1 is not.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kLogNumber1) != wals.end());
+ }
+}
+
+TEST_F(VersionSetTest, DeleteAllWals) {
+ NewDB();
+
+ constexpr WalNumber kMaxLogNumber = 10;
+ constexpr uint64_t kSizeInBytes = 111;
+
+ {
+ // Add a closed WAL.
+ VersionEdit edit;
+ WalMetadata wal(kSizeInBytes);
+ edit.AddWal(kMaxLogNumber, wal);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+
+ {
+ VersionEdit edit;
+ edit.DeleteWalsBefore(kMaxLogNumber + 10);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ }
+
+ // Recover a new VersionSet, all WALs are deleted.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(new_versions->Recover(column_families_, false));
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 0);
+ }
+}
+
+TEST_F(VersionSetTest, AtomicGroupWithWalEdits) {
+ NewDB();
+
+ constexpr int kAtomicGroupSize = 7;
+ constexpr uint64_t kNumWals = 5;
+ const std::string kDBId = "db_db";
+
+ int remaining = kAtomicGroupSize;
+ autovector<std::unique_ptr<VersionEdit>> edits;
+ // Add 5 WALs.
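+ // Each edit records the number of edits remaining after it in the atomic
+ // group, so the countdown reaches zero on the group's last edit.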
+ for (uint64_t i = 1; i <= kNumWals; i++) {
+ edits.emplace_back(new VersionEdit);
+ // WAL's size equals its log number.
+ edits.back()->AddWal(i, WalMetadata(i));
+ edits.back()->MarkAtomicGroup(--remaining);
+ }
+ // One edit setting the DB ID.
+ edits.emplace_back(new VersionEdit);
+ edits.back()->SetDBId(kDBId);
+ edits.back()->MarkAtomicGroup(--remaining);
+ // Delete the first four of the five WALs added above.
+ edits.emplace_back(new VersionEdit);
+ edits.back()->DeleteWalsBefore(kNumWals);
+ edits.back()->MarkAtomicGroup(--remaining);
+ ASSERT_EQ(remaining, 0);
+
+ ASSERT_OK(LogAndApplyToDefaultCF(edits));
+
+ // Recover a new VersionSet; the DB ID and the last WAL should be kept.
+ {
+ std::unique_ptr<VersionSet> new_versions(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ std::string db_id;
+ ASSERT_OK(
+ new_versions->Recover(column_families_, /*read_only=*/false, &db_id));
+
+ ASSERT_EQ(db_id, kDBId);
+
+ const auto& wals = new_versions->GetWalSet().GetWals();
+ ASSERT_EQ(wals.size(), 1);
+ ASSERT_TRUE(wals.find(kNumWals) != wals.end());
+ ASSERT_TRUE(wals.at(kNumWals).HasSyncedSize());
+ ASSERT_EQ(wals.at(kNumWals).GetSyncedSizeInBytes(), kNumWals);
+ }
+}
+
+class VersionSetWithTimestampTest : public VersionSetTest {
+ public:
+ static const std::string kNewCfName;
+
+ explicit VersionSetWithTimestampTest() : VersionSetTest() {}
+
+ void SetUp() override {
+ NewDB();
+ Options options;
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+ cfd_ = CreateColumnFamily(kNewCfName, options);
+ EXPECT_NE(nullptr, cfd_);
+ EXPECT_NE(nullptr, cfd_->GetLatestMutableCFOptions());
+ column_families_.emplace_back(kNewCfName, options);
+ }
+
+ void TearDown() override {
+ for (auto* e : edits_) {
+ delete e;
+ }
+ edits_.clear();
+ }
+
+ void GenVersionEditsToSetFullHistoryTsLow(
+ const std::vector<uint64_t>& ts_lbs) {
+ for (const auto ts_lb : ts_lbs) {
+ VersionEdit* edit = new VersionEdit;
+ edit->SetColumnFamily(cfd_->GetID());
+ std::string ts_str = test::EncodeInt(ts_lb);
+ edit->SetFullHistoryTsLow(ts_str);
+ edits_.emplace_back(edit);
+ }
+ }
+
+ void VerifyFullHistoryTsLow(uint64_t expected_ts_low) {
+ std::unique_ptr<VersionSet> vset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+ ASSERT_OK(vset->Recover(column_families_, /*read_only=*/false,
+ /*db_id=*/nullptr));
+ for (auto* cfd : *(vset->GetColumnFamilySet())) {
+ ASSERT_NE(nullptr, cfd);
+ if (cfd->GetName() == kNewCfName) {
+ ASSERT_EQ(test::EncodeInt(expected_ts_low), cfd->GetFullHistoryTsLow());
+ } else {
+ ASSERT_TRUE(cfd->GetFullHistoryTsLow().empty());
+ }
+ }
+ }
+
+ void DoTest(const std::vector<uint64_t>& ts_lbs) {
+ if (ts_lbs.empty()) {
+ return;
+ }
+
+ GenVersionEditsToSetFullHistoryTsLow(ts_lbs);
+
+ Status s;
+ mutex_.Lock();
+ s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()),
+ edits_, &mutex_, nullptr);
+ mutex_.Unlock();
+ ASSERT_OK(s);
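+ // full_history_ts_low can only move forward, so the recovered value should
+ // equal the maximum of the requested lower bounds.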
+ VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end()));
+ }
+
+ protected:
+ ColumnFamilyData* cfd_{nullptr};
+ // edits_ must contain and own pointers to heap-allocated VersionEdit objects.
+ autovector<VersionEdit*> edits_;
+};
+
+const std::string VersionSetWithTimestampTest::kNewCfName("new_cf");
+
+TEST_F(VersionSetWithTimestampTest, SetFullHistoryTsLbOnce) {
+ constexpr uint64_t kTsLow = 100;
+ DoTest({kTsLow});
+}
+
+// Simulate the application increasing full_history_ts_low.
+TEST_F(VersionSetWithTimestampTest, IncreaseFullHistoryTsLb) {
+ const std::vector<uint64_t> ts_lbs = {100, 101, 102, 103};
+ DoTest(ts_lbs);
+}
+
+// Simulate the application trying to decrease full_history_ts_low
+// unsuccessfully. If the application calls public API sequentially to
+// decrease the lower bound ts, RocksDB will return an InvalidArgument
+// status before involving VersionSet. Only when multiple threads trying
+// to decrease the lower bound concurrently will this case ever happen. Even
+// so, the lower bound cannot be decreased. The application will be notified
+// via return value of the API.
+TEST_F(VersionSetWithTimestampTest, TryDecreaseFullHistoryTsLb) {
+ const std::vector<uint64_t> ts_lbs = {103, 102, 101, 100};
+ DoTest(ts_lbs);
+}
+
+class VersionSetAtomicGroupTest : public VersionSetTestBase,
+ public testing::Test {
+ public:
+ VersionSetAtomicGroupTest()
+ : VersionSetTestBase("version_set_atomic_group_test") {}
+
+ void SetUp() override {
+ PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+ SetupTestSyncPoints();
+ }
+
+ void SetupValidAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
+ edits_[i].MarkAtomicGroup(--remaining);
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+ }
+
+ void SetupIncompleteTrailingAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
+ edits_[i].MarkAtomicGroup(--remaining);
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+ }
+
+ void SetupCorruptedAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
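+ // Leave the middle edit unmarked so that a normal edit appears inside the
+ // atomic group; recovery should detect this as corruption.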
+ if (i != ((size_t)atomic_group_size / 2)) {
+ edits_[i].MarkAtomicGroup(--remaining);
+ }
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+ }
+
+ void SetupIncorrectAtomicGroup(int atomic_group_size) {
+ edits_.resize(atomic_group_size);
+ int remaining = atomic_group_size;
+ for (size_t i = 0; i != edits_.size(); ++i) {
+ edits_[i].SetLogNumber(0);
+ edits_[i].SetNextFile(2);
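+ // Give the second edit a wrong remaining count so the atomic group size is
+ // inconsistent; recovery should flag this edit.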
+ if (i != 1) {
+ edits_[i].MarkAtomicGroup(--remaining);
+ } else {
+ edits_[i].MarkAtomicGroup(remaining--);
+ }
+ edits_[i].SetLastSequence(last_seqno_++);
+ }
+ ASSERT_OK(SetCurrentFile(fs_.get(), dbname_, 1, nullptr));
+ }
+
+ void SetupTestSyncPoints() {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:FirstInAtomicGroup", [&](void* arg) {
+ VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+ EXPECT_EQ(edits_.front().DebugString(),
+ e->DebugString()); // compare based on value
+ first_in_atomic_group_ = true;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:LastInAtomicGroup", [&](void* arg) {
+ VersionEdit* e = reinterpret_cast<VersionEdit*>(arg);
+ EXPECT_EQ(edits_.back().DebugString(),
+ e->DebugString()); // compare based on value
+ EXPECT_TRUE(first_in_atomic_group_);
+ last_in_atomic_group_ = true;
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionEditHandlerBase::Iterate:Finish", [&](void* arg) {
+ num_recovered_edits_ = *reinterpret_cast<size_t*>(arg);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroup",
+ [&](void* /* arg */) { ++num_edits_in_atomic_group_; });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:AtomicGroupMixedWithNormalEdits",
+ [&](void* arg) {
+ corrupted_edit_ = *reinterpret_cast<VersionEdit*>(arg);
+ });
+ SyncPoint::GetInstance()->SetCallBack(
+ "AtomicGroupReadBuffer::AddEdit:IncorrectAtomicGroupSize",
+ [&](void* arg) {
+ edit_with_incorrect_group_size_ =
+ *reinterpret_cast<VersionEdit*>(arg);
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ }
+
+ void AddNewEditsToLog(int num_edits) {
+ for (int i = 0; i < num_edits; i++) {
+ std::string record;
+ edits_[i].EncodeTo(&record);
+ ASSERT_OK(log_writer_->AddRecord(record));
+ }
+ }
+
+ void TearDown() override {
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ log_writer_.reset();
+ }
+
+ protected:
+ std::vector<ColumnFamilyDescriptor> column_families_;
+ SequenceNumber last_seqno_;
+ std::vector<VersionEdit> edits_;
+ bool first_in_atomic_group_ = false;
+ bool last_in_atomic_group_ = false;
+ int num_edits_in_atomic_group_ = 0;
+ size_t num_recovered_edits_ = 0;
+ VersionEdit corrupted_edit_;
+ VersionEdit edit_with_incorrect_group_size_;
+ std::unique_ptr<log::Writer> log_writer_;
+};
+
+TEST_F(VersionSetAtomicGroupTest, HandleValidAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+ EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleValidAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+ // Recovery should clean up the replay buffer.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(num_initial_edits_ + kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleValidAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 3;
+ SetupValidAtomicGroup(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ AddNewEditsToLog(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(reactive_versions_->ReadAndApply(
+ &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+ mu.Unlock();
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_TRUE(last_in_atomic_group_);
+ // Reading and applying the atomic group should leave the replay buffer empty.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(kAtomicGroupSize, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ EXPECT_OK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ // Reactive version set should store the edits in the replay buffer.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+ kNumberOfPersistedVersionEdits);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+ // Write the last record. The reactive version set should now apply all
+ // edits.
+ std::string last_record;
+ edits_[kAtomicGroupSize - 1].EncodeTo(&last_record);
+ EXPECT_OK(log_writer_->AddRecord(last_record));
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(reactive_versions_->ReadAndApply(
+ &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+ mu.Unlock();
+ // The reactive version set's replay buffer should be empty now.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == 0);
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncompleteTrailingAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ const int kNumberOfPersistedVersionEdits = kAtomicGroupSize - 1;
+ SetupIncompleteTrailingAtomicGroup(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ // The MANIFEST does not contain any atomic group edits at this point.
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(num_initial_edits_, num_recovered_edits_);
+ // Write a few edits in an atomic group.
+ AddNewEditsToLog(kNumberOfPersistedVersionEdits);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ mu.Lock();
+ EXPECT_OK(reactive_versions_->ReadAndApply(
+ &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+ mu.Unlock();
+ EXPECT_TRUE(first_in_atomic_group_);
+ EXPECT_FALSE(last_in_atomic_group_);
+ EXPECT_EQ(kNumberOfPersistedVersionEdits, num_edits_in_atomic_group_);
+ // Reactive version set should store the edits in the replay buffer.
+ EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() ==
+ kNumberOfPersistedVersionEdits);
+ EXPECT_TRUE(reactive_versions_->replay_buffer().size() == kAtomicGroupSize);
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_NOK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleCorruptedAtomicGroupWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ SetupCorruptedAtomicGroup(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ // Write the corrupted edits.
+ AddNewEditsToLog(kAtomicGroupSize);
+ mu.Lock();
+ EXPECT_NOK(reactive_versions_->ReadAndApply(
+ &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+ mu.Unlock();
+ EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
+ corrupted_edit_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ EXPECT_NOK(versions_->Recover(column_families_, false));
+ EXPECT_EQ(column_families_.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithReactiveVersionSetRecover) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ AddNewEditsToLog(kAtomicGroupSize);
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_NOK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ EXPECT_EQ(column_families_.size(),
+ reactive_versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+TEST_F(VersionSetAtomicGroupTest,
+ HandleIncorrectAtomicGroupSizeWithReactiveVersionSetReadAndApply) {
+ const int kAtomicGroupSize = 4;
+ SetupIncorrectAtomicGroup(kAtomicGroupSize);
+ InstrumentedMutex mu;
+ std::unordered_set<ColumnFamilyData*> cfds_changed;
+ std::unique_ptr<log::FragmentBufferedReader> manifest_reader;
+ std::unique_ptr<log::Reader::Reporter> manifest_reporter;
+ std::unique_ptr<Status> manifest_reader_status;
+ EXPECT_OK(reactive_versions_->Recover(column_families_, &manifest_reader,
+ &manifest_reporter,
+ &manifest_reader_status));
+ AddNewEditsToLog(kAtomicGroupSize);
+ mu.Lock();
+ EXPECT_NOK(reactive_versions_->ReadAndApply(
+ &mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
+ mu.Unlock();
+ EXPECT_EQ(edits_[1].DebugString(),
+ edit_with_incorrect_group_size_.DebugString());
+}
+
+class VersionSetTestDropOneCF : public VersionSetTestBase,
+ public testing::TestWithParam<std::string> {
+ public:
+ VersionSetTestDropOneCF()
+ : VersionSetTestBase("version_set_test_drop_one_cf") {}
+};
+
+// This test simulates the following execution sequence
+// Time thread1 bg_flush_thr
+// | Prepare version edits (e1,e2,e3) for atomic
+// | flush cf1, cf2, cf3
+// | Enqueue e to drop cfi
+// | to manifest_writers_
+// | Enqueue (e1,e2,e3) to manifest_writers_
+// |
+// | Apply e,
+// | cfi.IsDropped() is true
+// | Apply (e1,e2,e3),
+// | since cfi.IsDropped() == true, we need to
+// | drop ei and write the rest to MANIFEST.
+// V
+//
+// Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and
+// last column family in an atomic group.
+TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) {
+ std::vector<ColumnFamilyDescriptor> column_families;
+ SequenceNumber last_seqno;
+ std::unique_ptr<log::Writer> log_writer;
+ PrepareManifest(&column_families, &last_seqno, &log_writer);
+ Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+
+ EXPECT_OK(versions_->Recover(column_families, false /* read_only */));
+ EXPECT_EQ(column_families.size(),
+ versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
+
+ const int kAtomicGroupSize = 3;
+ const std::vector<std::string> non_default_cf_names = {
+ kColumnFamilyName1, kColumnFamilyName2, kColumnFamilyName3};
+
+ // Drop one column family
+ VersionEdit drop_cf_edit;
+ drop_cf_edit.DropColumnFamily();
+ const std::string cf_to_drop_name(GetParam());
+ auto cfd_to_drop =
+ versions_->GetColumnFamilySet()->GetColumnFamily(cf_to_drop_name);
+ ASSERT_NE(nullptr, cfd_to_drop);
+ // Increase its refcount because cfd_to_drop is used later, and we need to
+ // prevent it from being deleted.
+ cfd_to_drop->Ref();
+ drop_cf_edit.SetColumnFamily(cfd_to_drop->GetID());
+ mutex_.Lock();
+ s = versions_->LogAndApply(cfd_to_drop,
+ *cfd_to_drop->GetLatestMutableCFOptions(),
+ &drop_cf_edit, &mutex_, nullptr);
+ mutex_.Unlock();
+ ASSERT_OK(s);
+
+ std::vector<VersionEdit> edits(kAtomicGroupSize);
+ uint32_t remaining = kAtomicGroupSize;
+ size_t i = 0;
+ autovector<ColumnFamilyData*> cfds;
+ autovector<const MutableCFOptions*> mutable_cf_options_list;
+ autovector<autovector<VersionEdit*>> edit_lists;
+ for (const auto& cf_name : non_default_cf_names) {
+ auto cfd = (cf_name != cf_to_drop_name)
+ ? versions_->GetColumnFamilySet()->GetColumnFamily(cf_name)
+ : cfd_to_drop;
+ ASSERT_NE(nullptr, cfd);
+ cfds.push_back(cfd);
+ mutable_cf_options_list.emplace_back(cfd->GetLatestMutableCFOptions());
+ edits[i].SetColumnFamily(cfd->GetID());
+ edits[i].SetLogNumber(0);
+ edits[i].SetNextFile(2);
+ edits[i].MarkAtomicGroup(--remaining);
+ edits[i].SetLastSequence(last_seqno++);
+ autovector<VersionEdit*> tmp_edits;
+ tmp_edits.push_back(&edits[i]);
+ edit_lists.emplace_back(tmp_edits);
+ ++i;
+ }
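+ // The sync point below verifies that the edit belonging to the dropped column
+ // family is excluded, leaving kAtomicGroupSize - 1 edits in the group.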
+ int called = 0;
+ SyncPoint::GetInstance()->DisableProcessing();
+ SyncPoint::GetInstance()->ClearAllCallBacks();
+ SyncPoint::GetInstance()->SetCallBack(
+ "VersionSet::ProcessManifestWrites:CheckOneAtomicGroup", [&](void* arg) {
+ std::vector<VersionEdit*>* tmp_edits =
+ reinterpret_cast<std::vector<VersionEdit*>*>(arg);
+ EXPECT_EQ(kAtomicGroupSize - 1, tmp_edits->size());
+ for (const auto e : *tmp_edits) {
+ bool found = false;
+ for (const auto& e2 : edits) {
+ if (&e2 == e) {
+ found = true;
+ break;
+ }
+ }
+ ASSERT_TRUE(found);
+ }
+ ++called;
+ });
+ SyncPoint::GetInstance()->EnableProcessing();
+ mutex_.Lock();
+ s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists, &mutex_,
+ nullptr);
+ mutex_.Unlock();
+ ASSERT_OK(s);
+ ASSERT_EQ(1, called);
+ cfd_to_drop->UnrefAndTryDelete();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ AtomicGroup, VersionSetTestDropOneCF,
+ testing::Values(VersionSetTestBase::kColumnFamilyName1,
+ VersionSetTestBase::kColumnFamilyName2,
+ VersionSetTestBase::kColumnFamilyName3));
+
+class EmptyDefaultCfNewManifest : public VersionSetTestBase,
+ public testing::Test {
+ public:
+ EmptyDefaultCfNewManifest() : VersionSetTestBase("version_set_new_db_test") {}
+ // Emulate DBImpl::NewDB()
+ void PrepareManifest(std::vector<ColumnFamilyDescriptor>* /*column_families*/,
+ SequenceNumber* /*last_seqno*/,
+ std::unique_ptr<log::Writer>* log_writer) override {
+ assert(log_writer != nullptr);
+ VersionEdit new_db;
+ new_db.SetLogNumber(0);
+ const std::string manifest_path = DescriptorFileName(dbname_, 1);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(
+ fs, manifest_path, fs->OptimizeForManifestWrite(env_options_),
+ &file_writer, nullptr);
+ ASSERT_OK(s);
+ log_writer->reset(new log::Writer(std::move(file_writer), 0, true));
+ std::string record;
+ ASSERT_TRUE(new_db.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ // Create new column family
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1);
+ new_cf.SetColumnFamily(1);
+ new_cf.SetLastSequence(2);
+ new_cf.SetNextFile(2);
+ record.clear();
+ ASSERT_TRUE(new_cf.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+
+ protected:
+ bool write_dbid_to_manifest_ = false;
+ std::unique_ptr<log::Writer> log_writer_;
+};
+
+// Create the db and a column family. The column family creation switches to a
+// new MANIFEST. Then reopen the db and try to recover.
+TEST_F(EmptyDefaultCfNewManifest, Recover) {
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ log_writer_.reset();
+ Status s =
+ SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
+ column_families.emplace_back(VersionSetTestBase::kColumnFamilyName1,
+ cf_options_);
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(
+ manifest_path, column_families, false, &db_id, &has_missing_table_file);
+ ASSERT_OK(s);
+ ASSERT_FALSE(has_missing_table_file);
+}
+
+class VersionSetTestEmptyDb
+ : public VersionSetTestBase,
+ public testing::TestWithParam<
+ std::tuple<bool, bool, std::vector<std::string>>> {
+ public:
+ static const std::string kUnknownColumnFamilyName;
+ VersionSetTestEmptyDb() : VersionSetTestBase("version_set_test_empty_db") {}
+
+ protected:
+ void PrepareManifest(std::vector<ColumnFamilyDescriptor>* /*column_families*/,
+ SequenceNumber* /*last_seqno*/,
+ std::unique_ptr<log::Writer>* log_writer) override {
+ assert(nullptr != log_writer);
+ VersionEdit new_db;
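+ // When the DB ID is to be written to the MANIFEST, read it from the IDENTITY
+ // file and record it in the initial version edit.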
+ if (db_options_.write_dbid_to_manifest) {
+ DBOptions tmp_db_options;
+ tmp_db_options.env = env_;
+ std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ const std::string manifest_path = DescriptorFileName(dbname_, 1);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(
+ fs, manifest_path, fs->OptimizeForManifestWrite(env_options_),
+ &file_writer, nullptr);
+ ASSERT_OK(s);
+ {
+ log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+ std::string record;
+ new_db.EncodeTo(&record);
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ }
+
+ std::unique_ptr<log::Writer> log_writer_;
+};
+
+const std::string VersionSetTestEmptyDb::kUnknownColumnFamilyName = "unknown";
+
+TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest0) {
+ db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ log_writer_.reset();
+ Status s =
+ SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+
+ bool read_only = std::get<1>(GetParam());
+ const std::vector<std::string> cf_names = std::get<2>(GetParam());
+
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (const auto& cf_name : cf_names) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+ read_only, &db_id,
+ &has_missing_table_file);
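+ // Recovery from this incomplete MANIFEST should fail: without the default
+ // column family the request is invalid; with it, corruption referencing the
+ // MANIFEST path is reported.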
+ auto iter =
+ std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+ if (iter == cf_names.end()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else {
+ ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+ ASSERT_TRUE(s.IsCorruption());
+ }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest1) {
+ db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ // Only a subset of column families in the MANIFEST.
+ VersionEdit new_cf1;
+ new_cf1.AddColumnFamily(VersionSetTestBase::kColumnFamilyName1);
+ new_cf1.SetColumnFamily(1);
+ Status s;
+ {
+ std::string record;
+ new_cf1.EncodeTo(&record);
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ log_writer_.reset();
+ s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+
+ bool read_only = std::get<1>(GetParam());
+ const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (const auto& cf_name : cf_names) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+ read_only, &db_id,
+ &has_missing_table_file);
+ auto iter =
+ std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+ if (iter == cf_names.end()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else {
+ ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+ ASSERT_TRUE(s.IsCorruption());
+ }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenFromIncompleteManifest2) {
+ db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ // Write all column families but no log_number, next_file_number and
+ // last_sequence.
+ const std::vector<std::string> all_cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ uint32_t cf_id = 1;
+ Status s;
+ for (size_t i = 1; i != all_cf_names.size(); ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(all_cf_names[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ std::string record;
+ ASSERT_TRUE(new_cf.EncodeTo(&record));
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ log_writer_.reset();
+ s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+
+ bool read_only = std::get<1>(GetParam());
+ const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (const auto& cf_name : cf_names) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+ read_only, &db_id,
+ &has_missing_table_file);
+ auto iter =
+ std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+ if (iter == cf_names.end()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else {
+ ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+ ASSERT_TRUE(s.IsCorruption());
+ }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenManifestWithUnknownCF) {
+ db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ // Write all column families, then an edit for column family ID 4, which was
+ // never added to the MANIFEST.
+ const std::vector<std::string> all_cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ uint32_t cf_id = 1;
+ Status s;
+ for (size_t i = 1; i != all_cf_names.size(); ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(all_cf_names[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ std::string record;
+ ASSERT_TRUE(new_cf.EncodeTo(&record));
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ {
+ VersionEdit tmp_edit;
+ tmp_edit.SetColumnFamily(4);
+ tmp_edit.SetLogNumber(0);
+ tmp_edit.SetNextFile(2);
+ tmp_edit.SetLastSequence(0);
+ std::string record;
+ ASSERT_TRUE(tmp_edit.EncodeTo(&record));
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ log_writer_.reset();
+ s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+
+ bool read_only = std::get<1>(GetParam());
+ const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (const auto& cf_name : cf_names) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+ read_only, &db_id,
+ &has_missing_table_file);
+ auto iter =
+ std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+ if (iter == cf_names.end()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else {
+ ASSERT_NE(s.ToString().find(manifest_path), std::string::npos);
+ ASSERT_TRUE(s.IsCorruption());
+ }
+}
+
+TEST_P(VersionSetTestEmptyDb, OpenCompleteManifest) {
+ db_options_.write_dbid_to_manifest = std::get<0>(GetParam());
+ PrepareManifest(nullptr, nullptr, &log_writer_);
+ // Write all column families as well as the log_number, next_file_number and
+ // last_sequence, making the MANIFEST complete.
+ const std::vector<std::string> all_cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ uint32_t cf_id = 1;
+ Status s;
+ for (size_t i = 1; i != all_cf_names.size(); ++i) {
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(all_cf_names[i]);
+ new_cf.SetColumnFamily(cf_id++);
+ std::string record;
+ ASSERT_TRUE(new_cf.EncodeTo(&record));
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ {
+ VersionEdit tmp_edit;
+ tmp_edit.SetLogNumber(0);
+ tmp_edit.SetNextFile(2);
+ tmp_edit.SetLastSequence(0);
+ std::string record;
+ ASSERT_TRUE(tmp_edit.EncodeTo(&record));
+ s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ log_writer_.reset();
+ s = SetCurrentFile(fs_.get(), dbname_, 1, /*directory_to_fsync=*/nullptr);
+ ASSERT_OK(s);
+
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+
+ bool read_only = std::get<1>(GetParam());
+ const std::vector<std::string>& cf_names = std::get<2>(GetParam());
+ std::vector<ColumnFamilyDescriptor> column_families;
+ for (const auto& cf_name : cf_names) {
+ column_families.emplace_back(cf_name, cf_options_);
+ }
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families,
+ read_only, &db_id,
+ &has_missing_table_file);
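+ // Expected outcomes: missing default column family -> InvalidArgument;
+ // read-only open or a column family list covering all recorded families -> OK;
+ // a strict subset of the families -> InvalidArgument; a superset that also
+ // names the unknown family -> OK, with no ColumnFamilyData created for it.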
+ auto iter =
+ std::find(cf_names.begin(), cf_names.end(), kDefaultColumnFamilyName);
+ if (iter == cf_names.end()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else if (read_only) {
+ ASSERT_OK(s);
+ ASSERT_FALSE(has_missing_table_file);
+ } else if (cf_names.size() == all_cf_names.size()) {
+ ASSERT_OK(s);
+ ASSERT_FALSE(has_missing_table_file);
+ } else if (cf_names.size() < all_cf_names.size()) {
+ ASSERT_TRUE(s.IsInvalidArgument());
+ } else {
+ ASSERT_OK(s);
+ ASSERT_FALSE(has_missing_table_file);
+ ColumnFamilyData* cfd = versions_->GetColumnFamilySet()->GetColumnFamily(
+ kUnknownColumnFamilyName);
+ ASSERT_EQ(nullptr, cfd);
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ BestEffortRecovery, VersionSetTestEmptyDb,
+ testing::Combine(
+ /*write_dbid_to_manifest=*/testing::Bool(),
+ /*read_only=*/testing::Bool(),
+ /*cf_names=*/
+ testing::Values(
+ std::vector<std::string>(),
+ std::vector<std::string>({kDefaultColumnFamilyName}),
+ std::vector<std::string>({VersionSetTestBase::kColumnFamilyName1,
+ VersionSetTestBase::kColumnFamilyName2,
+ VersionSetTestBase::kColumnFamilyName3}),
+ std::vector<std::string>({kDefaultColumnFamilyName,
+ VersionSetTestBase::kColumnFamilyName1}),
+ std::vector<std::string>({kDefaultColumnFamilyName,
+ VersionSetTestBase::kColumnFamilyName1,
+ VersionSetTestBase::kColumnFamilyName2,
+ VersionSetTestBase::kColumnFamilyName3}),
+ std::vector<std::string>(
+ {kDefaultColumnFamilyName,
+ VersionSetTestBase::kColumnFamilyName1,
+ VersionSetTestBase::kColumnFamilyName2,
+ VersionSetTestBase::kColumnFamilyName3,
+ VersionSetTestEmptyDb::kUnknownColumnFamilyName}))));
+
+class VersionSetTestMissingFiles : public VersionSetTestBase,
+ public testing::Test {
+ public:
+ VersionSetTestMissingFiles()
+ : VersionSetTestBase("version_set_test_missing_files"),
+ block_based_table_options_(),
+ table_factory_(std::make_shared<BlockBasedTableFactory>(
+ block_based_table_options_)),
+ internal_comparator_(
+ std::make_shared<InternalKeyComparator>(options_.comparator)) {}
+
+ protected:
+ void PrepareManifest(std::vector<ColumnFamilyDescriptor>* column_families,
+ SequenceNumber* last_seqno,
+ std::unique_ptr<log::Writer>* log_writer) override {
+ assert(column_families != nullptr);
+ assert(last_seqno != nullptr);
+ assert(log_writer != nullptr);
+ const std::string manifest = DescriptorFileName(dbname_, 1);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<WritableFileWriter> file_writer;
+ Status s = WritableFileWriter::Create(
+ fs, manifest, fs->OptimizeForManifestWrite(env_options_), &file_writer,
+ nullptr);
+ ASSERT_OK(s);
+ log_writer->reset(new log::Writer(std::move(file_writer), 0, false));
+ VersionEdit new_db;
+ if (db_options_.write_dbid_to_manifest) {
+ DBOptions tmp_db_options;
+ tmp_db_options.env = env_;
+ std::unique_ptr<DBImpl> impl(new DBImpl(tmp_db_options, dbname_));
+ std::string db_id;
+ impl->GetDbIdentityFromIdentityFile(&db_id);
+ new_db.SetDBId(db_id);
+ }
+ {
+ std::string record;
+ ASSERT_TRUE(new_db.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ const std::vector<std::string> cf_names = {
+ kDefaultColumnFamilyName, kColumnFamilyName1, kColumnFamilyName2,
+ kColumnFamilyName3};
+ uint32_t cf_id = 1; // default cf id is 0
+ cf_options_.table_factory = table_factory_;
+ for (const auto& cf_name : cf_names) {
+ column_families->emplace_back(cf_name, cf_options_);
+ if (cf_name == kDefaultColumnFamilyName) {
+ continue;
+ }
+ VersionEdit new_cf;
+ new_cf.AddColumnFamily(cf_name);
+ new_cf.SetColumnFamily(cf_id);
+ std::string record;
+ ASSERT_TRUE(new_cf.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+
+ VersionEdit cf_files;
+ cf_files.SetColumnFamily(cf_id);
+ cf_files.SetLogNumber(0);
+ record.clear();
+ ASSERT_TRUE(cf_files.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ ++cf_id;
+ }
+ SequenceNumber seq = 2;
+ {
+ VersionEdit edit;
+ edit.SetNextFile(7);
+ edit.SetLastSequence(seq);
+ std::string record;
+ ASSERT_TRUE(edit.EncodeTo(&record));
+ s = (*log_writer)->AddRecord(record);
+ ASSERT_OK(s);
+ }
+ *last_seqno = seq + 1;
+ }
+
+ struct SstInfo {
+ uint64_t file_number;
+ std::string column_family;
+ std::string key; // the only key
+ int level = 0;
+ SstInfo(uint64_t file_num, const std::string& cf_name,
+ const std::string& _key)
+ : SstInfo(file_num, cf_name, _key, 0) {}
+ SstInfo(uint64_t file_num, const std::string& cf_name,
+ const std::string& _key, int lvl)
+ : file_number(file_num),
+ column_family(cf_name),
+ key(_key),
+ level(lvl) {}
+ };
+
+ // Create dummy SST files and return their metadata. Note that only the file
+ // name and size are used.
+ void CreateDummyTableFiles(const std::vector<SstInfo>& file_infos,
+ std::vector<FileMetaData>* file_metas) {
+ assert(file_metas != nullptr);
+ for (const auto& info : file_infos) {
+ uint64_t file_num = info.file_number;
+ std::string fname = MakeTableFileName(dbname_, file_num);
+ std::unique_ptr<FSWritableFile> file;
+ Status s = fs_->NewWritableFile(fname, FileOptions(), &file, nullptr);
+ ASSERT_OK(s);
+ std::unique_ptr<WritableFileWriter> fwriter(new WritableFileWriter(
+ std::move(file), fname, FileOptions(), env_->GetSystemClock().get()));
+ IntTblPropCollectorFactories int_tbl_prop_collector_factories;
+
+ std::unique_ptr<TableBuilder> builder(table_factory_->NewTableBuilder(
+ TableBuilderOptions(
+ immutable_options_, mutable_cf_options_, *internal_comparator_,
+ &int_tbl_prop_collector_factories, kNoCompression,
+ CompressionOptions(),
+ TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+ info.column_family, info.level),
+ fwriter.get()));
+ InternalKey ikey(info.key, 0, ValueType::kTypeValue);
+ builder->Add(ikey.Encode(), "value");
+ ASSERT_OK(builder->Finish());
+ ASSERT_OK(fwriter->Flush());
+ uint64_t file_size = 0;
+ s = fs_->GetFileSize(fname, IOOptions(), &file_size, nullptr);
+ ASSERT_OK(s);
+ ASSERT_NE(0, file_size);
+ file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey,
+ ikey, 0, 0, false, Temperature::kUnknown, 0, 0,
+ 0, kUnknownFileChecksum,
+ kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ }
+ }
+
+ // This method updates last_sequence_.
+ void WriteFileAdditionAndDeletionToManifest(
+ uint32_t cf, const std::vector<std::pair<int, FileMetaData>>& added_files,
+ const std::vector<std::pair<int, uint64_t>>& deleted_files) {
+ VersionEdit edit;
+ edit.SetColumnFamily(cf);
+ for (const auto& elem : added_files) {
+ int level = elem.first;
+ edit.AddFile(level, elem.second);
+ }
+ for (const auto& elem : deleted_files) {
+ int level = elem.first;
+ edit.DeleteFile(level, elem.second);
+ }
+ edit.SetLastSequence(last_seqno_);
+ ++last_seqno_;
+ assert(log_writer_.get() != nullptr);
+ std::string record;
+ ASSERT_TRUE(edit.EncodeTo(&record));
+ Status s = log_writer_->AddRecord(record);
+ ASSERT_OK(s);
+ }
+
+ BlockBasedTableOptions block_based_table_options_;
+ std::shared_ptr<TableFactory> table_factory_;
+ std::shared_ptr<InternalKeyComparator> internal_comparator_;
+ std::vector<ColumnFamilyDescriptor> column_families_;
+ SequenceNumber last_seqno_;
+ std::unique_ptr<log::Writer> log_writer_;
+};
+
+TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) {
+ std::vector<SstInfo> existing_files = {
+ SstInfo(100, kDefaultColumnFamilyName, "a"),
+ SstInfo(102, kDefaultColumnFamilyName, "b"),
+ SstInfo(103, kDefaultColumnFamilyName, "c"),
+ SstInfo(107, kDefaultColumnFamilyName, "d"),
+ SstInfo(110, kDefaultColumnFamilyName, "e")};
+ std::vector<FileMetaData> file_metas;
+ CreateDummyTableFiles(existing_files, &file_metas);
+
+ PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+ std::vector<std::pair<int, FileMetaData>> added_files;
+ for (uint64_t file_num = 10; file_num < 15; ++file_num) {
+ std::string smallest_ukey = "a";
+ std::string largest_ukey = "b";
+ InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue);
+ InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue);
+ FileMetaData meta = FileMetaData(
+ file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
+ largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ added_files.emplace_back(0, meta);
+ }
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+ std::vector<std::pair<int, uint64_t>> deleted_files;
+ deleted_files.emplace_back(0, 10);
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
+ log_writer_.reset();
+ Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+ /*read_only=*/false, &db_id,
+ &has_missing_table_file);
+ ASSERT_OK(s);
+ ASSERT_TRUE(has_missing_table_file);
+ for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+ const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+ ASSERT_TRUE(files.empty());
+ }
+}
+
+TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) {
+ std::vector<SstInfo> existing_files = {
+ SstInfo(100, kDefaultColumnFamilyName, "a"),
+ SstInfo(102, kDefaultColumnFamilyName, "b"),
+ SstInfo(103, kDefaultColumnFamilyName, "c"),
+ SstInfo(107, kDefaultColumnFamilyName, "d"),
+ SstInfo(110, kDefaultColumnFamilyName, "e")};
+ std::vector<FileMetaData> file_metas;
+ CreateDummyTableFiles(existing_files, &file_metas);
+
+ PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+ std::vector<std::pair<int, FileMetaData>> added_files;
+ for (size_t i = 3; i != 5; ++i) {
+ added_files.emplace_back(0, file_metas[i]);
+ }
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+
+ added_files.clear();
+ for (uint64_t file_num = 120; file_num < 130; ++file_num) {
+ std::string smallest_ukey = "a";
+ std::string largest_ukey = "b";
+ InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue);
+ InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue);
+ FileMetaData meta = FileMetaData(
+ file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey,
+ largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0,
+ kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2);
+ added_files.emplace_back(0, meta);
+ }
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+ log_writer_.reset();
+ Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+ /*read_only=*/false, &db_id,
+ &has_missing_table_file);
+ ASSERT_OK(s);
+ ASSERT_TRUE(has_missing_table_file);
+ for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+ const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+ if (cfd->GetName() == kDefaultColumnFamilyName) {
+ ASSERT_EQ(2, files.size());
+ for (const auto* fmeta : files) {
+ ASSERT_TRUE(fmeta->fd.GetNumber() == 107 || fmeta->fd.GetNumber() == 110);
+ }
+ } else {
+ ASSERT_TRUE(files.empty());
+ }
+ }
+}
+
+TEST_F(VersionSetTestMissingFiles, NoFileMissing) {
+ std::vector<SstInfo> existing_files = {
+ SstInfo(100, kDefaultColumnFamilyName, "a"),
+ SstInfo(102, kDefaultColumnFamilyName, "b"),
+ SstInfo(103, kDefaultColumnFamilyName, "c"),
+ SstInfo(107, kDefaultColumnFamilyName, "d"),
+ SstInfo(110, kDefaultColumnFamilyName, "e")};
+ std::vector<FileMetaData> file_metas;
+ CreateDummyTableFiles(existing_files, &file_metas);
+
+ PrepareManifest(&column_families_, &last_seqno_, &log_writer_);
+ std::vector<std::pair<int, FileMetaData>> added_files;
+ for (const auto& meta : file_metas) {
+ added_files.emplace_back(0, meta);
+ }
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, added_files, std::vector<std::pair<int, uint64_t>>());
+ std::vector<std::pair<int, uint64_t>> deleted_files;
+ deleted_files.emplace_back(/*level=*/0, 100);
+ WriteFileAdditionAndDeletionToManifest(
+ /*cf=*/0, std::vector<std::pair<int, FileMetaData>>(), deleted_files);
+ log_writer_.reset();
+ Status s = SetCurrentFile(fs_.get(), dbname_, 1, nullptr);
+ ASSERT_OK(s);
+ std::string manifest_path;
+ VerifyManifest(&manifest_path);
+ std::string db_id;
+ bool has_missing_table_file = false;
+ s = versions_->TryRecoverFromOneManifest(manifest_path, column_families_,
+ /*read_only=*/false, &db_id,
+ &has_missing_table_file);
+ ASSERT_OK(s);
+ ASSERT_FALSE(has_missing_table_file);
+ for (ColumnFamilyData* cfd : *(versions_->GetColumnFamilySet())) {
+ VersionStorageInfo* vstorage = cfd->current()->storage_info();
+ const std::vector<FileMetaData*>& files = vstorage->LevelFiles(0);
+ if (cfd->GetName() == kDefaultColumnFamilyName) {
+ ASSERT_EQ(existing_files.size() - deleted_files.size(), files.size());
+ bool has_deleted_file = false;
+ for (const auto* fmeta : files) {
+ if (fmeta->fd.GetNumber() == 100) {
+ has_deleted_file = true;
+ break;
+ }
+ }
+ ASSERT_FALSE(has_deleted_file);
+ } else {
+ ASSERT_TRUE(files.empty());
+ }
+ }
+}
+
+TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) {
+ db_options_.allow_2pc = true;
+ NewDB();
+
+ SstInfo sst(100, kDefaultColumnFamilyName, "a");
+ std::vector<FileMetaData> file_metas;
+ CreateDummyTableFiles({sst}, &file_metas);
+
+ constexpr WalNumber kMinWalNumberToKeep2PC = 10;
+ VersionEdit edit;
+ edit.AddFile(0, file_metas[0]);
+ edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC);
+ ASSERT_OK(LogAndApplyToDefaultCF(edit));
+ ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
+
+ for (int i = 0; i < 3; i++) {
+ CreateNewManifest();
+ ReopenDB();
+ ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC);
+ }
+}
+
+class ChargeFileMetadataTest : public DBTestBase {
+ public:
+ ChargeFileMetadataTest()
+ : DBTestBase("charge_file_metadata_test", /*env_do_fsync=*/true) {}
+};
+
+class ChargeFileMetadataTestWithParam
+ : public ChargeFileMetadataTest,
+ public testing::WithParamInterface<CacheEntryRoleOptions::Decision> {
+ public:
+ ChargeFileMetadataTestWithParam() {}
+};
+
+#ifndef ROCKSDB_LITE
+INSTANTIATE_TEST_CASE_P(
+ ChargeFileMetadataTestWithParam, ChargeFileMetadataTestWithParam,
+ ::testing::Values(CacheEntryRoleOptions::Decision::kEnabled,
+ CacheEntryRoleOptions::Decision::kDisabled));
+
+TEST_P(ChargeFileMetadataTestWithParam, Basic) {
+ Options options;
+ BlockBasedTableOptions table_options;
+ CacheEntryRoleOptions::Decision charge_file_metadata = GetParam();
+ table_options.cache_usage_options.options_overrides.insert(
+ {CacheEntryRole::kFileMetadata, {/*.charged = */ charge_file_metadata}});
+ std::shared_ptr<TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>
+ file_metadata_charge_only_cache = std::make_shared<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
+ NewLRUCache(
+ 4 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
+ 0 /* num_shard_bits */, true /* strict_capacity_limit */));
+ table_options.block_cache = file_metadata_charge_only_cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ options.create_if_missing = true;
+ options.disable_auto_compactions = true;
+ DestroyAndReopen(options);
+
+ // Create 128 file metadata, each of which is roughly 1024 bytes.
+ // This results in 1 *
+ // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
+ // cache reservation for file metadata.
+ for (int i = 1; i <= 128; ++i) {
+ ASSERT_OK(Put(std::string(1024, 'a'), "va"));
+ ASSERT_OK(Put("b", "vb"));
+ ASSERT_OK(Flush());
+ }
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+ 1 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+
+ } else {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+ }
+
+ // Create another 128 file metadata.
+ // This increases the file metadata cache reservation to 2 *
+ // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize().
+ for (int i = 1; i <= 128; ++i) {
+ ASSERT_OK(Put(std::string(1024, 'a'), "vva"));
+ ASSERT_OK(Put("b", "vvb"));
+ ASSERT_OK(Flush());
+ }
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+ 2 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+ } else {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+ }
+ // Compaction will create 1 new file metadata and will obsolete and delete
+ // all 256 file metadata above. This results in 1 *
+ // CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>::GetDummyEntrySize()
+ // cache reservation for file metadata.
+ SyncPoint::GetInstance()->LoadDependency(
+ {{"DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles",
+ "ChargeFileMetadataTestWithParam::"
+ "PreVerifyingCacheReservationRelease"}});
+ SyncPoint::GetInstance()->EnableProcessing();
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr));
+ ASSERT_EQ("0,1", FilesPerLevel(0));
+ TEST_SYNC_POINT(
+ "ChargeFileMetadataTestWithParam::PreVerifyingCacheReservationRelease");
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+ 1 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+ } else {
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(), 0);
+ }
+ SyncPoint::GetInstance()->DisableProcessing();
+
+ // Destroying the db will delete the one remaining new file metadata.
+ // This results in no cache reservation for file metadata.
+ Destroy(options);
+ EXPECT_EQ(file_metadata_charge_only_cache->GetCacheCharge(),
+ 0 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize());
+
+ // Reopen the db with a smaller cache in order to test failure to allocate
+ // file metadata due to the memory limit based on cache capacity.
+ file_metadata_charge_only_cache = std::make_shared<
+ TargetCacheChargeTrackingCache<CacheEntryRole::kFileMetadata>>(
+ NewLRUCache(1 * CacheReservationManagerImpl<
+ CacheEntryRole::kFileMetadata>::GetDummyEntrySize(),
+ 0 /* num_shard_bits */, true /* strict_capacity_limit */));
+ table_options.block_cache = file_metadata_charge_only_cache;
+ options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+ Reopen(options);
+ ASSERT_OK(Put(std::string(1024, 'a'), "va"));
+ ASSERT_OK(Put("b", "vb"));
+ Status s = Flush();
+ if (charge_file_metadata == CacheEntryRoleOptions::Decision::kEnabled) {
+ EXPECT_TRUE(s.IsMemoryLimit());
+ EXPECT_TRUE(s.ToString().find(
+ kCacheEntryRoleToCamelString[static_cast<std::uint32_t>(
+ CacheEntryRole::kFileMetadata)]) != std::string::npos);
+ EXPECT_TRUE(s.ToString().find("memory limit based on cache capacity") !=
+ std::string::npos);
+ } else {
+ EXPECT_TRUE(s.ok());
+ }
+}
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
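The parameterized test above configures charging through BlockBasedTableOptions::cache_usage_options. For reference, a hedged sketch of the same setup in application code; the 64 MB capacity is an arbitrary example and the usual public headers (rocksdb/options.h, rocksdb/table.h, rocksdb/cache.h) are assumed:

    // Sketch only: enable charging of file metadata against the block cache.
    rocksdb::Options MakeOptionsChargingFileMetadata() {
      rocksdb::BlockBasedTableOptions table_options;
      table_options.block_cache =
          rocksdb::NewLRUCache(64 << 20 /* arbitrary 64 MB capacity */);
      table_options.cache_usage_options.options_overrides.insert(
          {rocksdb::CacheEntryRole::kFileMetadata,
           {/*.charged = */ rocksdb::CacheEntryRoleOptions::Decision::kEnabled}});
      rocksdb::Options options;
      options.table_factory.reset(
          rocksdb::NewBlockBasedTableFactory(table_options));
      return options;
    }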
diff --git a/src/rocksdb/db/version_util.h b/src/rocksdb/db/version_util.h
new file mode 100644
index 000000000..5ec6fda11
--- /dev/null
+++ b/src/rocksdb/db/version_util.h
@@ -0,0 +1,71 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "db/version_set.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Instead of opening a `DB` to perform certain manifest updates, this
+// uses the underlying `VersionSet` API to read and modify the MANIFEST. This
+// allows us to use the user's real options, while not having to worry about
+// the DB persisting new SST files via flush/compaction or attempting to read/
+// compact files which may fail, particularly for the file we intend to remove
+// (the user may want to remove an already deleted file from MANIFEST).
+class OfflineManifestWriter {
+ public:
+ OfflineManifestWriter(const DBOptions& options, const std::string& db_path)
+ : wc_(options.delayed_write_rate),
+ wb_(options.db_write_buffer_size),
+ immutable_db_options_(WithDbPath(options, db_path)),
+ tc_(NewLRUCache(1 << 20 /* capacity */,
+ options.table_cache_numshardbits)),
+ versions_(db_path, &immutable_db_options_, sopt_, tc_.get(), &wb_, &wc_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ "") {}
+
+ Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families) {
+ return versions_.Recover(column_families, /*read_only*/ false,
+ /*db_id*/ nullptr,
+ /*no_error_if_files_missing*/ true);
+ }
+
+ Status LogAndApply(ColumnFamilyData* cfd, VersionEdit* edit,
+ FSDirectory* dir_contains_current_file) {
+ // Use `mutex` to imitate a locked DB mutex when calling `LogAndApply()`.
+ InstrumentedMutex mutex;
+ mutex.Lock();
+ Status s = versions_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
+ edit, &mutex, dir_contains_current_file,
+ false /* new_descriptor_log */);
+ mutex.Unlock();
+ return s;
+ }
+
+ VersionSet& Versions() { return versions_; }
+ const ImmutableDBOptions& IOptions() { return immutable_db_options_; }
+
+ private:
+ WriteController wc_;
+ WriteBufferManager wb_;
+ ImmutableDBOptions immutable_db_options_;
+ std::shared_ptr<Cache> tc_;
+ EnvOptions sopt_;
+ VersionSet versions_;
+
+ static ImmutableDBOptions WithDbPath(const DBOptions& options,
+ const std::string& db_path) {
+ ImmutableDBOptions rv(options);
+ if (rv.db_paths.empty()) {
+ // `VersionSet` expects options that have been through
+ // `SanitizeOptions()`, which would sanitize an empty `db_paths`.
+ rv.db_paths.emplace_back(db_path, 0 /* target_size */);
+ }
+ return rv;
+ }
+};
+
+} // namespace ROCKSDB_NAMESPACE
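As a usage illustration only (not part of this header), an offline repair tool might drive the writer roughly as follows; the default-column-family-only setup and the target file number are placeholder assumptions:

    // Hedged sketch, assumed to live inside ROCKSDB_NAMESPACE with this header
    // included; error handling is reduced to early returns.
    Status RemoveSstFromManifest(const DBOptions& db_options,
                                 const std::string& db_path,
                                 uint64_t sst_file_number) {
      std::vector<ColumnFamilyDescriptor> cf_descs = {ColumnFamilyDescriptor(
          kDefaultColumnFamilyName, ColumnFamilyOptions())};
      OfflineManifestWriter writer(db_options, db_path);
      Status s = writer.Recover(cf_descs);
      if (!s.ok()) {
        return s;
      }
      VersionEdit edit;
      edit.SetColumnFamily(0);
      edit.DeleteFile(/*level=*/0, sst_file_number);  // placeholder target
      std::unique_ptr<FSDirectory> db_dir;
      s = db_options.env->GetFileSystem()->NewDirectory(db_path, IOOptions(),
                                                        &db_dir, nullptr);
      if (!s.ok()) {
        return s;
      }
      return writer.LogAndApply(
          writer.Versions().GetColumnFamilySet()->GetDefault(), &edit,
          db_dir.get());
    }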
diff --git a/src/rocksdb/db/wal_edit.cc b/src/rocksdb/db/wal_edit.cc
new file mode 100644
index 000000000..2525be610
--- /dev/null
+++ b/src/rocksdb/db/wal_edit.cc
@@ -0,0 +1,211 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wal_edit.h"
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void WalAddition::EncodeTo(std::string* dst) const {
+ PutVarint64(dst, number_);
+
+ if (metadata_.HasSyncedSize()) {
+ PutVarint32(dst, static_cast<uint32_t>(WalAdditionTag::kSyncedSize));
+ PutVarint64(dst, metadata_.GetSyncedSizeInBytes());
+ }
+
+ PutVarint32(dst, static_cast<uint32_t>(WalAdditionTag::kTerminate));
+}
+
+Status WalAddition::DecodeFrom(Slice* src) {
+ constexpr char class_name[] = "WalAddition";
+
+ if (!GetVarint64(src, &number_)) {
+ return Status::Corruption(class_name, "Error decoding WAL log number");
+ }
+
+ while (true) {
+ uint32_t tag_value = 0;
+ if (!GetVarint32(src, &tag_value)) {
+ return Status::Corruption(class_name, "Error decoding tag");
+ }
+ WalAdditionTag tag = static_cast<WalAdditionTag>(tag_value);
+ switch (tag) {
+ case WalAdditionTag::kSyncedSize: {
+ uint64_t size = 0;
+ if (!GetVarint64(src, &size)) {
+ return Status::Corruption(class_name, "Error decoding WAL file size");
+ }
+ metadata_.SetSyncedSizeInBytes(size);
+ break;
+ }
+ // TODO: process future tags such as checksum.
+ case WalAdditionTag::kTerminate:
+ return Status::OK();
+ default: {
+ std::stringstream ss;
+ ss << "Unknown tag " << tag_value;
+ return Status::Corruption(class_name, ss.str());
+ }
+ }
+ }
+}
+
+JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal) {
+ jw << "LogNumber" << wal.GetLogNumber() << "SyncedSizeInBytes"
+ << wal.GetMetadata().GetSyncedSizeInBytes();
+ return jw;
+}
+
+std::ostream& operator<<(std::ostream& os, const WalAddition& wal) {
+ os << "log_number: " << wal.GetLogNumber()
+ << " synced_size_in_bytes: " << wal.GetMetadata().GetSyncedSizeInBytes();
+ return os;
+}
+
+std::string WalAddition::DebugString() const {
+ std::ostringstream oss;
+ oss << *this;
+ return oss.str();
+}
+
+void WalDeletion::EncodeTo(std::string* dst) const {
+ PutVarint64(dst, number_);
+}
+
+Status WalDeletion::DecodeFrom(Slice* src) {
+ constexpr char class_name[] = "WalDeletion";
+
+ if (!GetVarint64(src, &number_)) {
+ return Status::Corruption(class_name, "Error decoding WAL log number");
+ }
+
+ return Status::OK();
+}
+
+JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal) {
+ jw << "LogNumber" << wal.GetLogNumber();
+ return jw;
+}
+
+std::ostream& operator<<(std::ostream& os, const WalDeletion& wal) {
+ os << "log_number: " << wal.GetLogNumber();
+ return os;
+}
+
+std::string WalDeletion::DebugString() const {
+ std::ostringstream oss;
+ oss << *this;
+ return oss.str();
+}
+
+Status WalSet::AddWal(const WalAddition& wal) {
+ if (wal.GetLogNumber() < min_wal_number_to_keep_) {
+ // The WAL is already obsolete, ignore it.
+ return Status::OK();
+ }
+
+ auto it = wals_.lower_bound(wal.GetLogNumber());
+ bool existing = it != wals_.end() && it->first == wal.GetLogNumber();
+
+ if (!existing) {
+ wals_.insert(it, {wal.GetLogNumber(), wal.GetMetadata()});
+ return Status::OK();
+ }
+
+ assert(existing);
+ if (!wal.GetMetadata().HasSyncedSize()) {
+ std::stringstream ss;
+ ss << "WAL " << wal.GetLogNumber() << " is created more than once";
+ return Status::Corruption("WalSet::AddWal", ss.str());
+ }
+
+ assert(wal.GetMetadata().HasSyncedSize());
+ if (it->second.HasSyncedSize() && wal.GetMetadata().GetSyncedSizeInBytes() <=
+ it->second.GetSyncedSizeInBytes()) {
+ // This is possible because version edits with different synced WAL sizes
+ // for the same WAL can be committed out-of-order. For example, thread
+ // 1 syncs the first 10 bytes of 1.log, while thread 2 syncs the first 20
+ // bytes of 1.log. It's possible that thread 1 calls LogAndApply() after
+ // thread 2.
+ // In this case, just return ok.
+ return Status::OK();
+ }
+
+ // Update synced size for the given WAL.
+ it->second.SetSyncedSizeInBytes(wal.GetMetadata().GetSyncedSizeInBytes());
+ return Status::OK();
+}
+
+Status WalSet::AddWals(const WalAdditions& wals) {
+ Status s;
+ for (const WalAddition& wal : wals) {
+ s = AddWal(wal);
+ if (!s.ok()) {
+ break;
+ }
+ }
+ return s;
+}
+
+Status WalSet::DeleteWalsBefore(WalNumber wal) {
+ if (wal > min_wal_number_to_keep_) {
+ min_wal_number_to_keep_ = wal;
+ wals_.erase(wals_.begin(), wals_.lower_bound(wal));
+ }
+ return Status::OK();
+}
+
+void WalSet::Reset() {
+ wals_.clear();
+ min_wal_number_to_keep_ = 0;
+}
+
+Status WalSet::CheckWals(
+ Env* env,
+ const std::unordered_map<WalNumber, std::string>& logs_on_disk) const {
+ assert(env != nullptr);
+
+ Status s;
+ for (const auto& wal : wals_) {
+ const uint64_t log_number = wal.first;
+ const WalMetadata& wal_meta = wal.second;
+
+ if (!wal_meta.HasSyncedSize()) {
+ // The WAL and the WAL directory have not been synced,
+ // so the WAL's inode may not be persisted,
+ // and the WAL might not show up when listing the WAL directory.
+ continue;
+ }
+
+ if (logs_on_disk.find(log_number) == logs_on_disk.end()) {
+ std::stringstream ss;
+ ss << "Missing WAL with log number: " << log_number << ".";
+ s = Status::Corruption(ss.str());
+ break;
+ }
+
+ uint64_t log_file_size = 0;
+ s = env->GetFileSize(logs_on_disk.at(log_number), &log_file_size);
+ if (!s.ok()) {
+ break;
+ }
+ if (log_file_size < wal_meta.GetSyncedSizeInBytes()) {
+ std::stringstream ss;
+ ss << "Size mismatch: WAL (log number: " << log_number
+ << ") in MANIFEST is " << wal_meta.GetSyncedSizeInBytes()
+ << " bytes , but actually is " << log_file_size << " bytes on disk.";
+ s = Status::Corruption(ss.str());
+ break;
+ }
+ }
+
+ return s;
+}
+
+} // namespace ROCKSDB_NAMESPACE
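To make the tag layout above concrete, here is a small hedged round-trip sketch (test-style code; the values are arbitrary and <cassert> is assumed):

    // Sketch: encode a WalAddition and decode it back.
    void WalAdditionRoundTripSketch() {
      WalAddition addition(/*number=*/5, WalMetadata(/*synced_size_bytes=*/4096));
      std::string encoded;
      addition.EncodeTo(&encoded);  // varint log number, kSyncedSize tag + size,
                                    // then the kTerminate tag
      WalAddition decoded;
      Slice input(encoded);
      Status s = decoded.DecodeFrom(&input);
      assert(s.ok());
      assert(decoded.GetLogNumber() == 5);
      assert(decoded.GetMetadata().GetSyncedSizeInBytes() == 4096);
    }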
diff --git a/src/rocksdb/db/wal_edit.h b/src/rocksdb/db/wal_edit.h
new file mode 100644
index 000000000..bb5c5e292
--- /dev/null
+++ b/src/rocksdb/db/wal_edit.h
@@ -0,0 +1,177 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+// WAL related classes used in VersionEdit and VersionSet.
+// Modifications to WalAddition and WalDeletion may need to update
+// VersionEdit and its related tests.
+
+#pragma once
+
+#include <map>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "logging/event_logger.h"
+#include "port/port.h"
+#include "rocksdb/rocksdb_namespace.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class JSONWriter;
+class Slice;
+class Status;
+
+using WalNumber = uint64_t;
+
+// Metadata of a WAL.
+class WalMetadata {
+ public:
+ WalMetadata() = default;
+
+ explicit WalMetadata(uint64_t synced_size_bytes)
+ : synced_size_bytes_(synced_size_bytes) {}
+
+ bool HasSyncedSize() const { return synced_size_bytes_ != kUnknownWalSize; }
+
+ void SetSyncedSizeInBytes(uint64_t bytes) { synced_size_bytes_ = bytes; }
+
+ uint64_t GetSyncedSizeInBytes() const { return synced_size_bytes_; }
+
+ private:
+ friend bool operator==(const WalMetadata& lhs, const WalMetadata& rhs);
+ friend bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs);
+ // The size of WAL is unknown, used when the WAL is not synced yet or is
+ // empty.
+ constexpr static uint64_t kUnknownWalSize =
+ std::numeric_limits<uint64_t>::max();
+
+ // Size of the most recently synced WAL in bytes.
+ uint64_t synced_size_bytes_ = kUnknownWalSize;
+};
+
+inline bool operator==(const WalMetadata& lhs, const WalMetadata& rhs) {
+ return lhs.synced_size_bytes_ == rhs.synced_size_bytes_;
+}
+
+inline bool operator!=(const WalMetadata& lhs, const WalMetadata& rhs) {
+ return !(lhs == rhs);
+}
+
+// These tags are persisted to MANIFEST, so they are part of the user API.
+enum class WalAdditionTag : uint32_t {
+ // Indicates that there are no more tags.
+ kTerminate = 1,
+ // Synced Size in bytes.
+ kSyncedSize = 2,
+ // Add tags in the future, such as checksum?
+};
+
+// Records the event of adding a WAL in VersionEdit.
+class WalAddition {
+ public:
+ WalAddition() : number_(0), metadata_() {}
+
+ explicit WalAddition(WalNumber number) : number_(number), metadata_() {}
+
+ WalAddition(WalNumber number, WalMetadata meta)
+ : number_(number), metadata_(std::move(meta)) {}
+
+ WalNumber GetLogNumber() const { return number_; }
+
+ const WalMetadata& GetMetadata() const { return metadata_; }
+
+ void EncodeTo(std::string* dst) const;
+
+ Status DecodeFrom(Slice* src);
+
+ std::string DebugString() const;
+
+ private:
+ WalNumber number_;
+ WalMetadata metadata_;
+};
+
+std::ostream& operator<<(std::ostream& os, const WalAddition& wal);
+JSONWriter& operator<<(JSONWriter& jw, const WalAddition& wal);
+
+using WalAdditions = std::vector<WalAddition>;
+
+// Records the event of deleting WALs before the specified log number.
+class WalDeletion {
+ public:
+ WalDeletion() : number_(kEmpty) {}
+
+ explicit WalDeletion(WalNumber number) : number_(number) {}
+
+ WalNumber GetLogNumber() const { return number_; }
+
+ void EncodeTo(std::string* dst) const;
+
+ Status DecodeFrom(Slice* src);
+
+ std::string DebugString() const;
+
+ bool IsEmpty() const { return number_ == kEmpty; }
+
+ void Reset() { number_ = kEmpty; }
+
+ private:
+ static constexpr WalNumber kEmpty = 0;
+
+ WalNumber number_;
+};
+
+std::ostream& operator<<(std::ostream& os, const WalDeletion& wal);
+JSONWriter& operator<<(JSONWriter& jw, const WalDeletion& wal);
+
+// Used in VersionSet to keep the current set of WALs.
+//
+// When a WAL is synced or becomes obsolete,
+// a VersionEdit is logged to MANIFEST and
+// the WAL is added to or deleted from WalSet.
+//
+// Not thread safe, needs external synchronization such as holding DB mutex.
+class WalSet {
+ public:
+ // Add WAL(s).
+ // If the WAL is closed,
+ // then there must be an existing unclosed WAL,
+ // otherwise, return Status::Corruption.
+ // Can happen when applying a VersionEdit or recovering from MANIFEST.
+ Status AddWal(const WalAddition& wal);
+ Status AddWals(const WalAdditions& wals);
+
+ // Delete WALs with log number smaller than the specified wal number.
+ // Can happen when applying a VersionEdit or recovering from MANIFEST.
+ Status DeleteWalsBefore(WalNumber wal);
+
+ // Resets the internal state.
+ void Reset();
+
+ // WALs with number less than MinWalNumberToKeep should not exist in WalSet.
+ WalNumber GetMinWalNumberToKeep() const { return min_wal_number_to_keep_; }
+
+ const std::map<WalNumber, WalMetadata>& GetWals() const { return wals_; }
+
+ // Checks whether there are missing or corrupted WALs.
+ // Returns Status::OK if there are no missing or corrupted WALs,
+ // otherwise returns Status::Corruption.
+ // logs_on_disk is a map from log number to the log filename.
+ // Note that logs_on_disk may contain logs that are obsolete but
+ // haven't been deleted from disk yet.
+ Status CheckWals(
+ Env* env,
+ const std::unordered_map<WalNumber, std::string>& logs_on_disk) const;
+
+ private:
+ std::map<WalNumber, WalMetadata> wals_;
+ // WAL number < min_wal_number_to_keep_ should not exist in wals_.
+ // It's monotonically increasing, in-memory only, not written to MANIFEST.
+ WalNumber min_wal_number_to_keep_ = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
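A short hedged sketch of the lifecycle described in the WalSet comment above (creation, sync/close, obsolescence); the WAL numbers and size are arbitrary:

    // Sketch: typical WalSet usage while applying VersionEdits.
    void WalSetLifecycleSketch() {
      WalSet wal_set;
      // WAL 7 is created, later synced/closed with a known size, then obsoleted.
      Status s = wal_set.AddWal(WalAddition(/*number=*/7));
      if (s.ok()) {
        s = wal_set.AddWal(
            WalAddition(/*number=*/7, WalMetadata(/*synced_size_bytes=*/1 << 20)));
      }
      if (s.ok()) {
        s = wal_set.DeleteWalsBefore(/*wal=*/8);
      }
      // On success, GetWals() is now empty and GetMinWalNumberToKeep() == 8.
    }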
diff --git a/src/rocksdb/db/wal_edit_test.cc b/src/rocksdb/db/wal_edit_test.cc
new file mode 100644
index 000000000..0c18fb125
--- /dev/null
+++ b/src/rocksdb/db/wal_edit_test.cc
@@ -0,0 +1,213 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wal_edit.h"
+
+#include "db/db_test_util.h"
+#include "file/file_util.h"
+#include "port/port.h"
+#include "port/stack_trace.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(WalSet, AddDeleteReset) {
+ WalSet wals;
+ ASSERT_TRUE(wals.GetWals().empty());
+
+ // Create WAL 1 - 10.
+ for (WalNumber log_number = 1; log_number <= 10; log_number++) {
+ wals.AddWal(WalAddition(log_number));
+ }
+ ASSERT_EQ(wals.GetWals().size(), 10);
+
+ // Delete WAL 1 - 5.
+ wals.DeleteWalsBefore(6);
+ ASSERT_EQ(wals.GetWals().size(), 5);
+
+ WalNumber expected_log_number = 6;
+ for (auto it : wals.GetWals()) {
+ WalNumber log_number = it.first;
+ ASSERT_EQ(log_number, expected_log_number++);
+ }
+
+ wals.Reset();
+ ASSERT_TRUE(wals.GetWals().empty());
+}
+
+TEST(WalSet, Overwrite) {
+ constexpr WalNumber kNumber = 100;
+ constexpr uint64_t kBytes = 200;
+ WalSet wals;
+ wals.AddWal(WalAddition(kNumber));
+ ASSERT_FALSE(wals.GetWals().at(kNumber).HasSyncedSize());
+ wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes)));
+ ASSERT_TRUE(wals.GetWals().at(kNumber).HasSyncedSize());
+ ASSERT_EQ(wals.GetWals().at(kNumber).GetSyncedSizeInBytes(), kBytes);
+}
+
+TEST(WalSet, SmallerSyncedSize) {
+ constexpr WalNumber kNumber = 100;
+ constexpr uint64_t kBytes = 100;
+ WalSet wals;
+ ASSERT_OK(wals.AddWal(WalAddition(kNumber, WalMetadata(kBytes))));
+ const auto wals1 = wals.GetWals();
+ Status s = wals.AddWal(WalAddition(kNumber, WalMetadata(0)));
+ const auto wals2 = wals.GetWals();
+ ASSERT_OK(s);
+ ASSERT_EQ(wals1, wals2);
+}
+
+TEST(WalSet, CreateTwice) {
+ constexpr WalNumber kNumber = 100;
+ WalSet wals;
+ ASSERT_OK(wals.AddWal(WalAddition(kNumber)));
+ Status s = wals.AddWal(WalAddition(kNumber));
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(s.ToString().find("WAL 100 is created more than once") !=
+ std::string::npos);
+}
+
+TEST(WalSet, DeleteAllWals) {
+ constexpr WalNumber kMaxWalNumber = 10;
+ WalSet wals;
+ for (WalNumber i = 1; i <= kMaxWalNumber; i++) {
+ wals.AddWal(WalAddition(i));
+ }
+ ASSERT_OK(wals.DeleteWalsBefore(kMaxWalNumber + 1));
+}
+
+TEST(WalSet, AddObsoleteWal) {
+ constexpr WalNumber kNumber = 100;
+ WalSet wals;
+ ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1));
+ ASSERT_OK(wals.AddWal(WalAddition(kNumber)));
+ ASSERT_TRUE(wals.GetWals().empty());
+}
+
+TEST(WalSet, MinWalNumberToKeep) {
+ constexpr WalNumber kNumber = 100;
+ WalSet wals;
+ ASSERT_EQ(wals.GetMinWalNumberToKeep(), 0);
+ ASSERT_OK(wals.DeleteWalsBefore(kNumber));
+ ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber);
+ ASSERT_OK(wals.DeleteWalsBefore(kNumber - 1));
+ ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber);
+ ASSERT_OK(wals.DeleteWalsBefore(kNumber + 1));
+ ASSERT_EQ(wals.GetMinWalNumberToKeep(), kNumber + 1);
+}
+
+class WalSetTest : public DBTestBase {
+ public:
+ WalSetTest() : DBTestBase("WalSetTest", /* env_do_fsync */ true) {}
+
+ void SetUp() override {
+ test_dir_ = test::PerThreadDBPath("wal_set_test");
+ ASSERT_OK(env_->CreateDir(test_dir_));
+ }
+
+ void TearDown() override {
+ EXPECT_OK(DestroyDir(env_, test_dir_));
+ logs_on_disk_.clear();
+ wals_.Reset();
+ }
+
+ void CreateWalOnDisk(WalNumber number, const std::string& fname,
+ uint64_t size_bytes) {
+ std::unique_ptr<WritableFile> f;
+ std::string fpath = Path(fname);
+ ASSERT_OK(env_->NewWritableFile(fpath, &f, EnvOptions()));
+ std::string content(size_bytes, '0');
+ ASSERT_OK(f->Append(content));
+ ASSERT_OK(f->Close());
+
+ logs_on_disk_[number] = fpath;
+ }
+
+ void AddWalToWalSet(WalNumber number, uint64_t size_bytes) {
+ // Create WAL.
+ ASSERT_OK(wals_.AddWal(WalAddition(number)));
+ // Close WAL.
+ WalMetadata wal(size_bytes);
+ ASSERT_OK(wals_.AddWal(WalAddition(number, wal)));
+ }
+
+ Status CheckWals() const { return wals_.CheckWals(env_, logs_on_disk_); }
+
+ private:
+ std::string test_dir_;
+ std::unordered_map<WalNumber, std::string> logs_on_disk_;
+ WalSet wals_;
+
+ std::string Path(const std::string& fname) { return test_dir_ + "/" + fname; }
+};
+
+TEST_F(WalSetTest, CheckEmptyWals) { ASSERT_OK(CheckWals()); }
+
+TEST_F(WalSetTest, CheckWals) {
+ for (int number = 1; number < 10; number++) {
+ uint64_t size = rand() % 100;
+ std::stringstream ss;
+ ss << "log" << number;
+ std::string fname = ss.str();
+ CreateWalOnDisk(number, fname, size);
+ // logs 1 - 5 are obsolete.
+ if (number > 5) {
+ AddWalToWalSet(number, size);
+ }
+ }
+ ASSERT_OK(CheckWals());
+}
+
+TEST_F(WalSetTest, CheckMissingWals) {
+ for (int number = 1; number < 10; number++) {
+ uint64_t size = rand() % 100;
+ AddWalToWalSet(number, size);
+ // logs with even numbers are missing from disk.
+ if (number % 2) {
+ std::stringstream ss;
+ ss << "log" << number;
+ std::string fname = ss.str();
+ CreateWalOnDisk(number, fname, size);
+ }
+ }
+
+ Status s = CheckWals();
+ ASSERT_TRUE(s.IsCorruption()) << s.ToString();
+ // The first log with even number is missing.
+ std::stringstream expected_err;
+ expected_err << "Missing WAL with log number: " << 2;
+ ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos)
+ << s.ToString();
+}
+
+TEST_F(WalSetTest, CheckWalsWithShrinkedSize) {
+ for (int number = 1; number < 10; number++) {
+ uint64_t size = rand() % 100 + 1;
+ AddWalToWalSet(number, size);
+ // logs with even numbers have a smaller size on disk.
+ std::stringstream ss;
+ ss << "log" << number;
+ std::string fname = ss.str();
+ CreateWalOnDisk(number, fname, (number % 2) ? size : size - 1);
+ }
+
+ Status s = CheckWals();
+ ASSERT_TRUE(s.IsCorruption()) << s.ToString();
+ // The first log with even number has wrong size.
+ std::stringstream expected_err;
+ expected_err << "Size mismatch: WAL (log number: " << 2 << ")";
+ ASSERT_TRUE(s.ToString().find(expected_err.str()) != std::string::npos)
+ << s.ToString();
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/wal_manager.cc b/src/rocksdb/db/wal_manager.cc
new file mode 100644
index 000000000..a6060235f
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.cc
@@ -0,0 +1,529 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include "db/wal_manager.h"
+
+#include <algorithm>
+#include <cinttypes>
+#include <memory>
+#include <vector>
+
+#include "db/log_reader.h"
+#include "db/log_writer.h"
+#include "db/transaction_log_impl.h"
+#include "db/write_batch_internal.h"
+#include "file/file_util.h"
+#include "file/filename.h"
+#include "file/sequence_file_reader.h"
+#include "logging/logging.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/options.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/mutexlock.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+Status WalManager::DeleteFile(const std::string& fname, uint64_t number) {
+ auto s = env_->DeleteFile(wal_dir_ + "/" + fname);
+ if (s.ok()) {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ return s;
+}
+
+Status WalManager::GetSortedWalFiles(VectorLogPtr& files) {
+ // First get sorted files in db dir, then get sorted files from archived
+ // dir, to avoid a race condition where a log file is moved to archived
+ // dir in between.
+ Status s;
+ // list wal files in main db dir.
+ VectorLogPtr logs;
+ s = GetSortedWalsOfType(wal_dir_, logs, kAliveLogFile);
+ if (!s.ok()) {
+ return s;
+ }
+
+ // Reproduce the race condition where a log file is moved
+ // to archived dir, between these two sync points, used in
+ // (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::GetSortedWalFiles:1");
+ TEST_SYNC_POINT("WalManager::GetSortedWalFiles:2");
+
+ files.clear();
+ // list wal files in archive dir.
+ std::string archivedir = ArchivalDirectory(wal_dir_);
+ Status exists = env_->FileExists(archivedir);
+ if (exists.ok()) {
+ s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
+ if (!s.ok()) {
+ return s;
+ }
+ } else if (!exists.IsNotFound()) {
+ assert(s.IsIOError());
+ return s;
+ }
+
+ uint64_t latest_archived_log_number = 0;
+ if (!files.empty()) {
+ latest_archived_log_number = files.back()->LogNumber();
+ ROCKS_LOG_INFO(db_options_.info_log, "Latest Archived log: %" PRIu64,
+ latest_archived_log_number);
+ }
+
+ files.reserve(files.size() + logs.size());
+ for (auto& log : logs) {
+ if (log->LogNumber() > latest_archived_log_number) {
+ files.push_back(std::move(log));
+ } else {
+ // When the race condition happens, we could see the
+ // same log in both db dir and archived dir. Simply
+ // ignore the one in db dir. Note that, if we read
+ // archived dir first, we would have missed the log file.
+ ROCKS_LOG_WARN(db_options_.info_log, "%s already moved to archive",
+ log->PathName().c_str());
+ }
+ }
+
+ return s;
+}
+
+Status WalManager::GetUpdatesSince(
+ SequenceNumber seq, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options,
+ VersionSet* version_set) {
+ if (seq_per_batch_) {
+ return Status::NotSupported();
+ }
+
+ assert(!seq_per_batch_);
+
+ // Get all sorted Wal Files.
+ // Do binary search and open files and find the seq number.
+
+ std::unique_ptr<VectorLogPtr> wal_files(new VectorLogPtr);
+ Status s = GetSortedWalFiles(*wal_files);
+ if (!s.ok()) {
+ return s;
+ }
+
+ s = RetainProbableWalFiles(*wal_files, seq);
+ if (!s.ok()) {
+ return s;
+ }
+ iter->reset(new TransactionLogIteratorImpl(
+ wal_dir_, &db_options_, read_options, file_options_, seq,
+ std::move(wal_files), version_set, seq_per_batch_, io_tracer_));
+ return (*iter)->status();
+}
+
+// 1. Go through all archived files and
+// a. if ttl is enabled, delete outdated files
+// b. if archive size limit is enabled, delete empty files,
+// compute file number and size.
+// 2. If size limit is enabled:
+// a. compute how many files should be deleted
+// b. get sorted non-empty archived logs
+// c. delete what should be deleted
+void WalManager::PurgeObsoleteWALFiles() {
+ bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0;
+ bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0;
+ if (!ttl_enabled && !size_limit_enabled) {
+ return;
+ }
+
+ int64_t current_time = 0;
+ Status s = db_options_.clock->GetCurrentTime(&current_time);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "Can't get current time: %s",
+ s.ToString().c_str());
+ assert(false);
+ return;
+ }
+ uint64_t const now_seconds = static_cast<uint64_t>(current_time);
+ uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled)
+ ? db_options_.WAL_ttl_seconds / 2
+ : kDefaultIntervalToDeleteObsoleteWAL;
+
+ if (purge_wal_files_last_run_ + time_to_check > now_seconds) {
+ return;
+ }
+
+ purge_wal_files_last_run_ = now_seconds;
+
+ std::string archival_dir = ArchivalDirectory(wal_dir_);
+ std::vector<std::string> files;
+ s = env_->GetChildren(archival_dir, &files);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "Can't get archive files: %s",
+ s.ToString().c_str());
+ assert(false);
+ return;
+ }
+
+ size_t log_files_num = 0;
+ uint64_t log_file_size = 0;
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kWalFile) {
+ std::string const file_path = archival_dir + "/" + f;
+ if (ttl_enabled) {
+ uint64_t file_m_time;
+ s = env_->GetFileModificationTime(file_path, &file_m_time);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Can't get file mod time: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ continue;
+ }
+ if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) {
+ s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log, "Can't delete file: %s: %s",
+ file_path.c_str(), s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ continue;
+ }
+ }
+
+ if (size_limit_enabled) {
+ uint64_t file_size;
+ s = env_->GetFileSize(file_path, &file_size);
+ if (!s.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log,
+ "Unable to get file size: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ return;
+ } else {
+ if (file_size > 0) {
+ log_file_size = std::max(log_file_size, file_size);
+ ++log_files_num;
+ } else {
+ s = DeleteDBFile(&db_options_, file_path, archival_dir, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Unable to delete file: %s: %s", file_path.c_str(),
+ s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(number);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (0 == log_files_num || !size_limit_enabled) {
+ return;
+ }
+
+ size_t const files_keep_num = static_cast<size_t>(
+ db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size);
+ if (log_files_num <= files_keep_num) {
+ return;
+ }
+
+ size_t files_del_num = log_files_num - files_keep_num;
+ VectorLogPtr archived_logs;
+ s = GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Unable to get archived WALs from: %s: %s",
+ archival_dir.c_str(), s.ToString().c_str());
+ files_del_num = 0;
+ } else if (files_del_num > archived_logs.size()) {
+ ROCKS_LOG_WARN(db_options_.info_log,
+ "Trying to delete more archived log files than "
+ "exist. Deleting all");
+ files_del_num = archived_logs.size();
+ }
+
+ for (size_t i = 0; i < files_del_num; ++i) {
+ std::string const file_path = archived_logs[i]->PathName();
+ s = DeleteDBFile(&db_options_, wal_dir_ + "/" + file_path, wal_dir_, false,
+ /*force_fg=*/!wal_in_db_path_);
+ if (!s.ok()) {
+ ROCKS_LOG_WARN(db_options_.info_log, "Unable to delete file: %s: %s",
+ file_path.c_str(), s.ToString().c_str());
+ continue;
+ } else {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.erase(archived_logs[i]->LogNumber());
+ }
+ }
+}
+
+void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) {
+ auto archived_log_name = ArchivedLogFileName(wal_dir_, number);
+ // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1");
+ Status s = env_->RenameFile(fname, archived_log_name);
+ // The sync point below is used in (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2");
+ ROCKS_LOG_INFO(db_options_.info_log, "Move log file %s to %s -- %s\n",
+ fname.c_str(), archived_log_name.c_str(),
+ s.ToString().c_str());
+}
+
+Status WalManager::GetSortedWalsOfType(const std::string& path,
+ VectorLogPtr& log_files,
+ WalFileType log_type) {
+ std::vector<std::string> all_files;
+ const Status status = env_->GetChildren(path, &all_files);
+ if (!status.ok()) {
+ return status;
+ }
+ log_files.reserve(all_files.size());
+ for (const auto& f : all_files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kWalFile) {
+ SequenceNumber sequence;
+ Status s = ReadFirstRecord(log_type, number, &sequence);
+ if (!s.ok()) {
+ return s;
+ }
+ if (sequence == 0) {
+ // empty file
+ continue;
+ }
+
+ // Reproduce the race condition where a log file is moved
+ // to archived dir, between these two sync points, used in
+ // (DBTest,TransactionLogIteratorRace)
+ TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:1");
+ TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:2");
+
+ uint64_t size_bytes;
+ s = env_->GetFileSize(LogFileName(path, number), &size_bytes);
+ // re-try in case the alive log file has been moved to archive.
+ if (!s.ok() && log_type == kAliveLogFile) {
+ std::string archived_file = ArchivedLogFileName(path, number);
+ if (env_->FileExists(archived_file).ok()) {
+ s = env_->GetFileSize(archived_file, &size_bytes);
+ if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+ // oops, the file just got deleted from archived dir! move on
+ s = Status::OK();
+ continue;
+ }
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+
+ log_files.push_back(std::unique_ptr<LogFile>(
+ new LogFileImpl(number, log_type, sequence, size_bytes)));
+ }
+ }
+ std::sort(
+ log_files.begin(), log_files.end(),
+ [](const std::unique_ptr<LogFile>& a, const std::unique_ptr<LogFile>& b) {
+ LogFileImpl* a_impl = static_cast_with_check<LogFileImpl>(a.get());
+ LogFileImpl* b_impl = static_cast_with_check<LogFileImpl>(b.get());
+ return *a_impl < *b_impl;
+ });
+ return status;
+}
+
+Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs,
+ const SequenceNumber target) {
+ int64_t start = 0; // signed to avoid overflow when target is < first file.
+ int64_t end = static_cast<int64_t>(all_logs.size()) - 1;
+ // Binary search to avoid opening all files.
+ while (end >= start) {
+ int64_t mid = start + (end - start) / 2; // Avoid overflow.
+ SequenceNumber current_seq_num =
+ all_logs.at(static_cast<size_t>(mid))->StartSequence();
+ if (current_seq_num == target) {
+ end = mid;
+ break;
+ } else if (current_seq_num < target) {
+ start = mid + 1;
+ } else {
+ end = mid - 1;
+ }
+ }
+ // end could be negative.
+ size_t start_index =
+ static_cast<size_t>(std::max(static_cast<int64_t>(0), end));
+ // The last WAL file is always included.
+ all_logs.erase(all_logs.begin(), all_logs.begin() + start_index);
+ return Status::OK();
+}
+
+Status WalManager::ReadFirstRecord(const WalFileType type,
+ const uint64_t number,
+ SequenceNumber* sequence) {
+ *sequence = 0;
+ if (type != kAliveLogFile && type != kArchivedLogFile) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "[WalManger] Unknown file type %s",
+ std::to_string(type).c_str());
+ return Status::NotSupported("File Type Not Known " + std::to_string(type));
+ }
+ {
+ MutexLock l(&read_first_record_cache_mutex_);
+ auto itr = read_first_record_cache_.find(number);
+ if (itr != read_first_record_cache_.end()) {
+ *sequence = itr->second;
+ return Status::OK();
+ }
+ }
+ Status s;
+ if (type == kAliveLogFile) {
+ std::string fname = LogFileName(wal_dir_, number);
+ s = ReadFirstLine(fname, number, sequence);
+ if (!s.ok() && env_->FileExists(fname).ok()) {
+ // return any error that is not caused by non-existing file
+ return s;
+ }
+ }
+
+ if (type == kArchivedLogFile || !s.ok()) {
+ // check if the file got moved to archive.
+ std::string archived_file = ArchivedLogFileName(wal_dir_, number);
+ s = ReadFirstLine(archived_file, number, sequence);
+ // Maybe the file was deleted from the archive dir. If that's the case,
+ // return Status::OK(). The caller will identify this as an empty file
+ // because *sequence == 0.
+ if (!s.ok() && env_->FileExists(archived_file).IsNotFound()) {
+ return Status::OK();
+ }
+ }
+
+ if (s.ok() && *sequence != 0) {
+ MutexLock l(&read_first_record_cache_mutex_);
+ read_first_record_cache_.insert({number, *sequence});
+ }
+ return s;
+}
+
+Status WalManager::GetLiveWalFile(uint64_t number,
+ std::unique_ptr<LogFile>* log_file) {
+ if (!log_file) {
+ return Status::InvalidArgument("log_file not preallocated.");
+ }
+
+ if (!number) {
+ return Status::PathNotFound("log file not available");
+ }
+
+ Status s;
+
+ uint64_t size_bytes;
+ s = env_->GetFileSize(LogFileName(wal_dir_, number), &size_bytes);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ log_file->reset(new LogFileImpl(number, kAliveLogFile,
+ 0, // SequenceNumber
+ size_bytes));
+
+ return Status::OK();
+}
+
+// The function returns status.ok() and sequence == 0 if the file exists but is
+// empty.
+Status WalManager::ReadFirstLine(const std::string& fname,
+ const uint64_t number,
+ SequenceNumber* sequence) {
+ struct LogReporter : public log::Reader::Reporter {
+ Env* env;
+ Logger* info_log;
+ const char* fname;
+
+ Status* status;
+ bool ignore_error; // true if db_options_.paranoid_checks==false
+ void Corruption(size_t bytes, const Status& s) override {
+ ROCKS_LOG_WARN(info_log, "[WalManager] %s%s: dropping %d bytes; %s",
+ (this->ignore_error ? "(ignoring error) " : ""), fname,
+ static_cast<int>(bytes), s.ToString().c_str());
+ if (this->status->ok()) {
+ // only keep the first error
+ *this->status = s;
+ }
+ }
+ };
+
+ std::unique_ptr<FSSequentialFile> file;
+ Status status = fs_->NewSequentialFile(
+ fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
+ std::unique_ptr<SequentialFileReader> file_reader(
+ new SequentialFileReader(std::move(file), fname, io_tracer_));
+
+ if (!status.ok()) {
+ return status;
+ }
+
+ LogReporter reporter;
+ reporter.env = env_;
+ reporter.info_log = db_options_.info_log.get();
+ reporter.fname = fname.c_str();
+ reporter.status = &status;
+ reporter.ignore_error = !db_options_.paranoid_checks;
+ log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
+ true /*checksum*/, number);
+ std::string scratch;
+ Slice record;
+
+ if (reader.ReadRecord(&record, &scratch) &&
+ (status.ok() || !db_options_.paranoid_checks)) {
+ if (record.size() < WriteBatchInternal::kHeader) {
+ reporter.Corruption(record.size(),
+ Status::Corruption("log record too small"));
+ // TODO: read records until the first non-corrupt entry?
+ } else {
+ WriteBatch batch;
+ // We can overwrite an existing non-OK Status since it'd only reach here
+ // with `paranoid_checks == false`.
+ status = WriteBatchInternal::SetContents(&batch, record);
+ if (status.ok()) {
+ *sequence = WriteBatchInternal::Sequence(&batch);
+ return status;
+ }
+ }
+ }
+
+ if (status.ok() && reader.IsCompressedAndEmptyFile()) {
+ // In case of wal_compression, the WAL contains a `kSetCompressionType`
+ // record which is not associated with any sequence number. As a result, for
+ // an otherwise empty file, GetSortedWalsOfType() would skip the WAL, causing
+ // the operations to fail.
+ // Therefore, to avoid that failure, set the sequence number to 1, indicating
+ // that the WAL should be included.
+ *sequence = 1;
+ } else {
+ // ReadRecord might have returned false on EOF, which means that the log
+ // file is empty. Or, a failure may have occurred while processing the first
+ // entry. In any case, return status and set sequence number to 0.
+ *sequence = 0;
+ }
+ return status;
+}
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
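The retention step in RetainProbableWalFiles() above is essentially a lower-bound binary search over the files' start sequence numbers. A standalone hedged sketch of the same idea over plain data (illustration only, not the WalManager API):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Sketch: index of the last WAL whose start sequence is <= target (0 if
    // every WAL starts after target). `start_seqs` is sorted ascending, one
    // entry per WAL file; files before this index can be dropped by the caller.
    size_t FirstProbableWalIndex(const std::vector<uint64_t>& start_seqs,
                                 uint64_t target) {
      int64_t start = 0;
      int64_t end = static_cast<int64_t>(start_seqs.size()) - 1;
      while (end >= start) {
        int64_t mid = start + (end - start) / 2;  // avoid overflow
        if (start_seqs[static_cast<size_t>(mid)] <= target) {
          start = mid + 1;
        } else {
          end = mid - 1;
        }
      }
      return static_cast<size_t>(std::max<int64_t>(0, end));
    }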
diff --git a/src/rocksdb/db/wal_manager.h b/src/rocksdb/db/wal_manager.h
new file mode 100644
index 000000000..8cc067935
--- /dev/null
+++ b/src/rocksdb/db/wal_manager.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+#pragma once
+
+#include <atomic>
+#include <deque>
+#include <limits>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/version_set.h"
+#include "file/file_util.h"
+#include "options/db_options.h"
+#include "port/port.h"
+#include "rocksdb/env.h"
+#include "rocksdb/status.h"
+#include "rocksdb/transaction_log.h"
+#include "rocksdb/types.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+#ifndef ROCKSDB_LITE
+
+// WAL manager provides the abstraction for reading the WAL files as a single
+// unit. Internally, it opens and reads the files using Reader or Writer
+// abstraction.
+class WalManager {
+ public:
+ WalManager(const ImmutableDBOptions& db_options,
+ const FileOptions& file_options,
+ const std::shared_ptr<IOTracer>& io_tracer,
+ const bool seq_per_batch = false)
+ : db_options_(db_options),
+ file_options_(file_options),
+ env_(db_options.env),
+ fs_(db_options.fs, io_tracer),
+ purge_wal_files_last_run_(0),
+ seq_per_batch_(seq_per_batch),
+ wal_dir_(db_options_.GetWalDir()),
+ wal_in_db_path_(db_options_.IsWalDirSameAsDBPath()),
+ io_tracer_(io_tracer) {}
+
+ Status GetSortedWalFiles(VectorLogPtr& files);
+
+ // Allow user to tail transaction log to find all recent changes to the
+ // database that are newer than `seq_number`.
+ Status GetUpdatesSince(
+ SequenceNumber seq_number, std::unique_ptr<TransactionLogIterator>* iter,
+ const TransactionLogIterator::ReadOptions& read_options,
+ VersionSet* version_set);
+
+ void PurgeObsoleteWALFiles();
+
+ void ArchiveWALFile(const std::string& fname, uint64_t number);
+
+ Status DeleteFile(const std::string& fname, uint64_t number);
+
+ Status GetLiveWalFile(uint64_t number, std::unique_ptr<LogFile>* log_file);
+
+ Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number,
+ SequenceNumber* sequence) {
+ return ReadFirstRecord(type, number, sequence);
+ }
+
+ Status TEST_ReadFirstLine(const std::string& fname, const uint64_t number,
+ SequenceNumber* sequence) {
+ return ReadFirstLine(fname, number, sequence);
+ }
+
+ private:
+ Status GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files,
+ WalFileType type);
+ // Requires: all_logs should be sorted with the earliest log file first.
+ // Retains all log files in all_logs which contain updates with sequence
+ // numbers greater than or equal to the requested SequenceNumber.
+ Status RetainProbableWalFiles(VectorLogPtr& all_logs,
+ const SequenceNumber target);
+
+ // ReadFirstRecord checks the read_first_record_cache_ to see if the entry
+ // exists or not. If not, it will read the WAL file.
+ // In case of wal_compression, the WAL contains a `kSetCompressionType`
+ // record which is not associated with any sequence number. If the WAL
+ // doesn't include any other record (i.e. it is effectively empty), the
+ // sequence number is set to 1 so that the WAL is included, and that value is
+ // inserted into read_first_record_cache_. In that case the sequence number
+ // acts only as a boolean indicating whether the WAL should be included and
+ // shouldn't be used for any other purpose.
+ Status ReadFirstRecord(const WalFileType type, const uint64_t number,
+ SequenceNumber* sequence);
+
+ // Without wal_compression, ReadFirstLine returns status.ok() and
+ // sequence == 0 if the file exists but is empty.
+ // With wal_compression, the WAL contains a `kSetCompressionType` record that
+ // is not associated with any sequence number if the WAL doesn't include any
+ // other record (i.e. it is effectively empty). As a result, for such an
+ // empty file, GetSortedWalsOfType() would skip the WAL, causing the
+ // operations to fail. To avoid that, ReadFirstLine sets the sequence number
+ // to 1 in order to include that WAL.
+ Status ReadFirstLine(const std::string& fname, const uint64_t number,
+ SequenceNumber* sequence);
+
+ // ------- state from DBImpl ------
+ const ImmutableDBOptions& db_options_;
+ const FileOptions file_options_;
+ Env* env_;
+ const FileSystemPtr fs_;
+
+ // ------- WalManager state -------
+ // cache for ReadFirstRecord() calls
+ std::unordered_map<uint64_t, SequenceNumber> read_first_record_cache_;
+ port::Mutex read_first_record_cache_mutex_;
+
+ // last time when PurgeObsoleteWALFiles ran.
+ uint64_t purge_wal_files_last_run_;
+
+ bool seq_per_batch_;
+
+ const std::string& wal_dir_;
+
+ bool wal_in_db_path_;
+
+ // Obsolete files will be checked for deletion at this interval (in seconds)
+ // if TTL-based deletion is enabled and the archive size limit is disabled.
+ static constexpr uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600;
+
+ std::shared_ptr<IOTracer> io_tracer_;
+};
+
+#endif // ROCKSDB_LITE
+} // namespace ROCKSDB_NAMESPACE
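For orientation, a hedged application-level sketch of the options that feed PurgeObsoleteWALFiles() and of tailing updates through the public DB API; the path and numbers are placeholders, and rocksdb/db.h, rocksdb/options.h and rocksdb/transaction_log.h are assumed:

    // Sketch: keep archived WALs bounded by TTL/size and tail recent updates.
    void TailWalSketch() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.WAL_ttl_seconds = 3600;   // archived WALs kept for about an hour
      options.WAL_size_limit_MB = 64;   // or until the archive exceeds ~64 MB
      rocksdb::DB* db = nullptr;
      rocksdb::Status s = rocksdb::DB::Open(options, "/path/to/db", &db);
      if (!s.ok()) {
        return;
      }
      std::unique_ptr<rocksdb::TransactionLogIterator> iter;
      s = db->GetUpdatesSince(/*seq_number=*/1, &iter);
      for (; s.ok() && iter && iter->Valid(); iter->Next()) {
        rocksdb::BatchResult batch = iter->GetBatch();
        // batch.sequence and batch.writeBatchPtr carry the recovered updates.
      }
      delete db;
    }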
diff --git a/src/rocksdb/db/wal_manager_test.cc b/src/rocksdb/db/wal_manager_test.cc
new file mode 100644
index 000000000..4ad4e9749
--- /dev/null
+++ b/src/rocksdb/db/wal_manager_test.cc
@@ -0,0 +1,346 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/wal_manager.h"
+
+#include <map>
+#include <string>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/log_writer.h"
+#include "db/version_set.h"
+#include "env/mock_env.h"
+#include "file/writable_file_writer.h"
+#include "rocksdb/cache.h"
+#include "rocksdb/file_system.h"
+#include "rocksdb/write_batch.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/mock_table.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// TODO(icanadi) mock out VersionSet
+// TODO(icanadi) move other WalManager-specific tests from db_test here
+class WalManagerTest : public testing::Test {
+ public:
+ WalManagerTest()
+ : dbname_(test::PerThreadDBPath("wal_manager_test")),
+ db_options_(),
+ table_cache_(NewLRUCache(50000, 16)),
+ write_buffer_manager_(db_options_.db_write_buffer_size),
+ current_log_number_(0) {
+ env_.reset(MockEnv::Create(Env::Default()));
+ EXPECT_OK(DestroyDB(dbname_, Options()));
+ }
+
+ void Init() {
+ ASSERT_OK(env_->CreateDirIfMissing(dbname_));
+ ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_)));
+ db_options_.db_paths.emplace_back(dbname_,
+ std::numeric_limits<uint64_t>::max());
+ db_options_.wal_dir = dbname_;
+ db_options_.env = env_.get();
+ db_options_.fs = env_->GetFileSystem();
+ db_options_.clock = env_->GetSystemClock().get();
+
+ versions_.reset(
+ new VersionSet(dbname_, &db_options_, env_options_, table_cache_.get(),
+ &write_buffer_manager_, &write_controller_,
+ /*block_cache_tracer=*/nullptr, /*io_tracer=*/nullptr,
+ /*db_id*/ "", /*db_session_id*/ ""));
+
+ wal_manager_.reset(
+ new WalManager(db_options_, env_options_, nullptr /*IOTracer*/));
+ }
+
+ void Reopen() {
+ wal_manager_.reset(
+ new WalManager(db_options_, env_options_, nullptr /*IOTracer*/));
+ }
+
+ // NOT thread safe
+ void Put(const std::string& key, const std::string& value) {
+ assert(current_log_writer_.get() != nullptr);
+ uint64_t seq = versions_->LastSequence() + 1;
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(key, value));
+ WriteBatchInternal::SetSequence(&batch, seq);
+ ASSERT_OK(
+ current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch)));
+ versions_->SetLastAllocatedSequence(seq);
+ versions_->SetLastPublishedSequence(seq);
+ versions_->SetLastSequence(seq);
+ }
+
+ // NOT thread safe
+ void RollTheLog(bool /*archived*/) {
+ current_log_number_++;
+ std::string fname = ArchivedLogFileName(dbname_, current_log_number_);
+ const auto& fs = env_->GetFileSystem();
+ std::unique_ptr<WritableFileWriter> file_writer;
+ ASSERT_OK(WritableFileWriter::Create(fs, fname, env_options_, &file_writer,
+ nullptr));
+ current_log_writer_.reset(
+ new log::Writer(std::move(file_writer), 0, false));
+ }
+
+ void CreateArchiveLogs(int num_logs, int entries_per_log) {
+ for (int i = 1; i <= num_logs; ++i) {
+ RollTheLog(true);
+ for (int k = 0; k < entries_per_log; ++k) {
+ Put(std::to_string(k), std::string(1024, 'a'));
+ }
+ }
+ }
+
+ std::unique_ptr<TransactionLogIterator> OpenTransactionLogIter(
+ const SequenceNumber seq) {
+ std::unique_ptr<TransactionLogIterator> iter;
+ Status status = wal_manager_->GetUpdatesSince(
+ seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get());
+ EXPECT_OK(status);
+ return iter;
+ }
+
+ std::unique_ptr<MockEnv> env_;
+ std::string dbname_;
+ ImmutableDBOptions db_options_;
+ WriteController write_controller_;
+ EnvOptions env_options_;
+ std::shared_ptr<Cache> table_cache_;
+ WriteBufferManager write_buffer_manager_;
+ std::unique_ptr<VersionSet> versions_;
+ std::unique_ptr<WalManager> wal_manager_;
+
+ std::unique_ptr<log::Writer> current_log_writer_;
+ uint64_t current_log_number_;
+};
+
+TEST_F(WalManagerTest, ReadFirstRecordCache) {
+ Init();
+ std::string path = dbname_ + "/000001.log";
+ std::unique_ptr<FSWritableFile> file;
+ ASSERT_OK(env_->GetFileSystem()->NewWritableFile(path, FileOptions(), &file,
+ nullptr));
+
+ SequenceNumber s;
+ ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, 1 /* number */, &s));
+ ASSERT_EQ(s, 0U);
+
+ ASSERT_OK(
+ wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1 /* number */, &s));
+ ASSERT_EQ(s, 0U);
+
+ std::unique_ptr<WritableFileWriter> file_writer(
+ new WritableFileWriter(std::move(file), path, FileOptions()));
+ log::Writer writer(std::move(file_writer), 1,
+ db_options_.recycle_log_file_num > 0);
+ WriteBatch batch;
+ ASSERT_OK(batch.Put("foo", "bar"));
+ WriteBatchInternal::SetSequence(&batch, 10);
+ ASSERT_OK(writer.AddRecord(WriteBatchInternal::Contents(&batch)));
+
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here.
+ // Waiting for lei to finish with db_test
+ // env_->count_sequential_reads_ = true;
+ // sequential_read_counter_ sanity test
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 0);
+
+ ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+ ASSERT_EQ(s, 10U);
+ // did a read
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+
+ ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s));
+ ASSERT_EQ(s, 10U);
+ // no new reads since the value is cached
+ // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here
+ // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1);
+}
+
+namespace {
+uint64_t GetLogDirSize(std::string dir_path, Env* env) {
+ uint64_t dir_size = 0;
+ std::vector<std::string> files;
+ EXPECT_OK(env->GetChildren(dir_path, &files));
+ for (auto& f : files) {
+ uint64_t number;
+ FileType type;
+ if (ParseFileName(f, &number, &type) && type == kWalFile) {
+ std::string const file_path = dir_path + "/" + f;
+ uint64_t file_size;
+ EXPECT_OK(env->GetFileSize(file_path, &file_size));
+ dir_size += file_size;
+ }
+ }
+ return dir_size;
+}
+std::vector<std::uint64_t> ListSpecificFiles(
+ Env* env, const std::string& path, const FileType expected_file_type) {
+ std::vector<std::string> files;
+ std::vector<uint64_t> file_numbers;
+ uint64_t number;
+ FileType type;
+ EXPECT_OK(env->GetChildren(path, &files));
+ for (size_t i = 0; i < files.size(); ++i) {
+ if (ParseFileName(files[i], &number, &type)) {
+ if (type == expected_file_type) {
+ file_numbers.push_back(number);
+ }
+ }
+ }
+ return file_numbers;
+}
+
+int CountRecords(TransactionLogIterator* iter) {
+ int count = 0;
+ SequenceNumber lastSequence = 0;
+ BatchResult res;
+ while (iter->Valid()) {
+ res = iter->GetBatch();
+ EXPECT_TRUE(res.sequence > lastSequence);
+ ++count;
+ lastSequence = res.sequence;
+ EXPECT_OK(iter->status());
+ iter->Next();
+ }
+ EXPECT_OK(iter->status());
+ return count;
+}
+} // anonymous namespace
+
+TEST_F(WalManagerTest, WALArchivalSizeLimit) {
+ db_options_.WAL_ttl_seconds = 0;
+ db_options_.WAL_size_limit_MB = 1000;
+ Init();
+
+ // TEST: Create a WalManager with a huge size limit and no TTL.
+ // Create some archived files and call PurgeObsoleteWALFiles().
+ // Count the archived log files that survived.
+ // Assert that all of them did.
+ // Change the size limit and re-open the WalManager.
+ // Assert that the archive is not larger than WAL_size_limit_MB after
+ // PurgeObsoleteWALFiles().
+ // Set the TTL and time_to_check_ to small values. Re-open the DB.
+ // Assert that there are no archived logs left.
+
+ std::string archive_dir = ArchivalDirectory(dbname_);
+ CreateArchiveLogs(20, 5000);
+
+ std::vector<std::uint64_t> log_files =
+ ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+ ASSERT_EQ(log_files.size(), 20U);
+
+ db_options_.WAL_size_limit_MB = 8;
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ uint64_t archive_size = GetLogDirSize(archive_dir, env_.get());
+ ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024);
+
+ db_options_.WAL_ttl_seconds = 1;
+ env_->SleepForMicroseconds(2 * 1000 * 1000);
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+ ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, WALArchivalTtl) {
+ db_options_.WAL_ttl_seconds = 1000;
+ Init();
+
+ // TEST: Create a WalManager with a TTL and no size limit.
+ // Create some archived log files and call PurgeObsoleteWALFiles().
+ // Assert that the files are not deleted.
+ // Re-open the DB with a small TTL.
+ // Assert that all archived logs were removed.
+
+ std::string archive_dir = ArchivalDirectory(dbname_);
+ CreateArchiveLogs(20, 5000);
+
+ std::vector<uint64_t> log_files =
+ ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+ ASSERT_GT(log_files.size(), 0U);
+
+ db_options_.WAL_ttl_seconds = 1;
+ env_->SleepForMicroseconds(3 * 1000 * 1000);
+ Reopen();
+ wal_manager_->PurgeObsoleteWALFiles();
+
+ log_files = ListSpecificFiles(env_.get(), archive_dir, kWalFile);
+ ASSERT_TRUE(log_files.empty());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) {
+ Init();
+ RollTheLog(false);
+ Put("key1", std::string(1024, 'a'));
+ // Create a zero record WAL file.
+ RollTheLog(false);
+ RollTheLog(false);
+
+ Put("key2", std::string(1024, 'a'));
+
+ auto iter = OpenTransactionLogIter(0);
+ ASSERT_EQ(2, CountRecords(iter.get()));
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorJustEmptyFile) {
+ Init();
+ RollTheLog(false);
+ auto iter = OpenTransactionLogIter(0);
+ // Check that an empty iterator is returned
+ ASSERT_TRUE(!iter->Valid());
+}
+
+TEST_F(WalManagerTest, TransactionLogIteratorNewFileWhileScanning) {
+ Init();
+ CreateArchiveLogs(2, 100);
+ auto iter = OpenTransactionLogIter(0);
+ CreateArchiveLogs(1, 100);
+ int i = 0;
+ for (; iter->Valid(); iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 200);
+ // A new log file was added after the iterator was created.
+ // TryAgain indicates a new iterator is needed to fetch the new data
+ ASSERT_TRUE(iter->status().IsTryAgain());
+
+ iter = OpenTransactionLogIter(0);
+ i = 0;
+ for (; iter->Valid(); iter->Next()) {
+ i++;
+ }
+ ASSERT_EQ(i, 300);
+ ASSERT_TRUE(iter->status().ok());
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr, "SKIPPED as WalManager is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/wide/db_wide_basic_test.cc b/src/rocksdb/db/wide/db_wide_basic_test.cc
new file mode 100644
index 000000000..1ffe314fe
--- /dev/null
+++ b/src/rocksdb/db/wide/db_wide_basic_test.cc
@@ -0,0 +1,654 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include <array>
+#include <memory>
+
+#include "db/db_test_util.h"
+#include "port/stack_trace.h"
+#include "test_util/testutil.h"
+#include "utilities/merge_operators.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBWideBasicTest : public DBTestBase {
+ protected:
+ explicit DBWideBasicTest()
+ : DBTestBase("db_wide_basic_test", /* env_do_fsync */ false) {}
+};
+
+TEST_F(DBWideBasicTest, PutEntity) {
+ Options options = GetDefaultOptions();
+
+ // Write a couple of wide-column entities and a plain old key-value, then read
+ // them back.
+ constexpr char first_key[] = "first";
+ constexpr char first_value_of_default_column[] = "hello";
+ WideColumns first_columns{
+ {kDefaultWideColumnName, first_value_of_default_column},
+ {"attr_name1", "foo"},
+ {"attr_name2", "bar"}};
+
+ constexpr char second_key[] = "second";
+ WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
+
+ constexpr char third_key[] = "third";
+ constexpr char third_value[] = "baz";
+
+ auto verify = [&]() {
+ const WideColumns expected_third_columns{
+ {kDefaultWideColumnName, third_value}};
+
+ {
+ PinnableSlice result;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), first_key,
+ &result));
+ ASSERT_EQ(result, first_value_of_default_column);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ first_key, &result));
+ ASSERT_EQ(result.columns(), first_columns);
+ }
+
+ {
+ PinnableSlice result;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), second_key,
+ &result));
+ ASSERT_TRUE(result.empty());
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ second_key, &result));
+ ASSERT_EQ(result.columns(), second_columns);
+ }
+
+ {
+ PinnableSlice result;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), third_key,
+ &result));
+ ASSERT_EQ(result, third_value);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ third_key, &result));
+
+ ASSERT_EQ(result.columns(), expected_third_columns);
+ }
+
+ {
+ constexpr size_t num_keys = 3;
+
+ std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+ &keys[0], &values[0], &statuses[0]);
+
+ ASSERT_OK(statuses[0]);
+ ASSERT_EQ(values[0], first_value_of_default_column);
+
+ ASSERT_OK(statuses[1]);
+ ASSERT_TRUE(values[1].empty());
+
+ ASSERT_OK(statuses[2]);
+ ASSERT_EQ(values[2], third_value);
+ }
+
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value_of_default_column);
+ ASSERT_EQ(iter->columns(), first_columns);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_TRUE(iter->value().empty());
+ ASSERT_EQ(iter->columns(), second_columns);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+ ASSERT_EQ(iter->columns(), expected_third_columns);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), third_value);
+ ASSERT_EQ(iter->columns(), expected_third_columns);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_TRUE(iter->value().empty());
+ ASSERT_EQ(iter->columns(), second_columns);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_value_of_default_column);
+ ASSERT_EQ(iter->columns(), first_columns);
+
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+ };
+
+ // Use the DB::PutEntity API to write the first entity
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
+ first_key, first_columns));
+
+ // Use WriteBatch to write the second entity
+ WriteBatch batch;
+ ASSERT_OK(
+ batch.PutEntity(db_->DefaultColumnFamily(), second_key, second_columns));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+
+ // Use Put to write the plain key-value
+ ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), third_key,
+ third_value));
+
+ // Try reading from memtable
+ verify();
+
+ // Try reading after recovery
+ Close();
+ options.avoid_flush_during_recovery = true;
+ Reopen(options);
+
+ verify();
+
+ // Try reading from storage
+ ASSERT_OK(Flush());
+
+ verify();
+}
+
+TEST_F(DBWideBasicTest, PutEntityColumnFamily) {
+ Options options = GetDefaultOptions();
+ CreateAndReopenWithCF({"corinthian"}, options);
+
+ // Use the DB::PutEntity API
+ constexpr char first_key[] = "first";
+ WideColumns first_columns{{"attr_name1", "foo"}, {"attr_name2", "bar"}};
+
+ ASSERT_OK(
+ db_->PutEntity(WriteOptions(), handles_[1], first_key, first_columns));
+
+ // Use WriteBatch
+ constexpr char second_key[] = "second";
+ WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
+
+ WriteBatch batch;
+ ASSERT_OK(batch.PutEntity(handles_[1], second_key, second_columns));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+}
+
+TEST_F(DBWideBasicTest, MergePlainKeyValue) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ Reopen(options);
+
+ // Put + Merge
+ constexpr char first_key[] = "first";
+ constexpr char first_base_value[] = "hello";
+ constexpr char first_merge_op[] = "world";
+
+ // Delete + Merge
+ constexpr char second_key[] = "second";
+ constexpr char second_merge_op[] = "foo";
+
+ // Merge without any preceding KV
+ constexpr char third_key[] = "third";
+ constexpr char third_merge_op[] = "bar";
+
+ auto write_base = [&]() {
+ // Write "base" KVs: a Put for the 1st key and a Delete for the 2nd one;
+ // note there is no "base" KV for the 3rd
+ ASSERT_OK(db_->Put(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+ first_base_value));
+ ASSERT_OK(
+ db_->Delete(WriteOptions(), db_->DefaultColumnFamily(), second_key));
+ };
+
+ auto write_merge = [&]() {
+ // Write Merge operands
+ ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+ first_merge_op));
+ ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key,
+ second_merge_op));
+ ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), third_key,
+ third_merge_op));
+ };
+
+ const std::string expected_first_column(std::string(first_base_value) + "," +
+ first_merge_op);
+ const WideColumns expected_first_columns{
+ {kDefaultWideColumnName, expected_first_column}};
+ const WideColumns expected_second_columns{
+ {kDefaultWideColumnName, second_merge_op}};
+ const WideColumns expected_third_columns{
+ {kDefaultWideColumnName, third_merge_op}};
+
+ auto verify = [&]() {
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ first_key, &result));
+ ASSERT_EQ(result.columns(), expected_first_columns);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ second_key, &result));
+ ASSERT_EQ(result.columns(), expected_second_columns);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ third_key, &result));
+
+ ASSERT_EQ(result.columns(), expected_third_columns);
+ }
+
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), expected_first_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_first_columns);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), expected_second_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_second_columns);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), expected_third_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_third_columns);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), third_key);
+ ASSERT_EQ(iter->value(), expected_third_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_third_columns);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), expected_second_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_second_columns);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), expected_first_columns[0].value());
+ ASSERT_EQ(iter->columns(), expected_first_columns);
+
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+ };
+
+ {
+ // Base KVs (if any) and Merge operands both in memtable (note: we take a
+ // snapshot in between to make sure they do not get reconciled during the
+ // subsequent flush)
+ write_base();
+ ManagedSnapshot snapshot(db_);
+ write_merge();
+ verify();
+
+ // Base KVs (if any) and Merge operands both in storage
+ ASSERT_OK(Flush());
+ verify();
+ }
+
+ // Base KVs (if any) in storage, Merge operands in memtable
+ DestroyAndReopen(options);
+ write_base();
+ ASSERT_OK(Flush());
+ write_merge();
+ verify();
+}
+
+TEST_F(DBWideBasicTest, MergeEntity) {
+ Options options = GetDefaultOptions();
+ options.create_if_missing = true;
+
+ const std::string delim("|");
+ options.merge_operator = MergeOperators::CreateStringAppendOperator(delim);
+
+ Reopen(options);
+
+ // Test Merge with two entities: one that has the default column and one that
+ // doesn't
+ constexpr char first_key[] = "first";
+ WideColumns first_columns{{kDefaultWideColumnName, "a"},
+ {"attr_name1", "foo"},
+ {"attr_name2", "bar"}};
+ constexpr char first_merge_operand[] = "bla1";
+
+ constexpr char second_key[] = "second";
+ WideColumns second_columns{{"attr_one", "two"}, {"attr_three", "four"}};
+ constexpr char second_merge_operand[] = "bla2";
+
+ auto write_base = [&]() {
+ // Use the DB::PutEntity API
+ ASSERT_OK(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
+ first_key, first_columns));
+
+ // Use WriteBatch
+ WriteBatch batch;
+ ASSERT_OK(batch.PutEntity(db_->DefaultColumnFamily(), second_key,
+ second_columns));
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+ };
+
+ auto write_merge = [&]() {
+ ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), first_key,
+ first_merge_operand));
+ ASSERT_OK(db_->Merge(WriteOptions(), db_->DefaultColumnFamily(), second_key,
+ second_merge_operand));
+ };
+
+ const std::string first_expected_default(first_columns[0].value().ToString() +
+ delim + first_merge_operand);
+ const std::string second_expected_default(delim + second_merge_operand);
+
+ auto verify_basic = [&]() {
+ WideColumns first_expected_columns{
+ {kDefaultWideColumnName, first_expected_default},
+ first_columns[1],
+ first_columns[2]};
+
+ WideColumns second_expected_columns{
+ {kDefaultWideColumnName, second_expected_default},
+ second_columns[0],
+ second_columns[1]};
+
+ {
+ PinnableSlice result;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), first_key,
+ &result));
+ ASSERT_EQ(result, first_expected_default);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ first_key, &result));
+ ASSERT_EQ(result.columns(), first_expected_columns);
+ }
+
+ {
+ PinnableSlice result;
+ ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), second_key,
+ &result));
+ ASSERT_EQ(result, second_expected_default);
+ }
+
+ {
+ PinnableWideColumns result;
+ ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+ second_key, &result));
+ ASSERT_EQ(result.columns(), second_expected_columns);
+ }
+
+ {
+ constexpr size_t num_keys = 2;
+
+ std::array<Slice, num_keys> keys{{first_key, second_key}};
+ std::array<PinnableSlice, num_keys> values;
+ std::array<Status, num_keys> statuses;
+
+ db_->MultiGet(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+ &keys[0], &values[0], &statuses[0]);
+
+ ASSERT_EQ(values[0], first_expected_default);
+ ASSERT_OK(statuses[0]);
+
+ ASSERT_EQ(values[1], second_expected_default);
+ ASSERT_OK(statuses[1]);
+ }
+
+ {
+ std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+
+ iter->SeekToFirst();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_expected_default);
+ ASSERT_EQ(iter->columns(), first_expected_columns);
+
+ iter->Next();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_expected_default);
+ ASSERT_EQ(iter->columns(), second_expected_columns);
+
+ iter->Next();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+
+ iter->SeekToLast();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), second_key);
+ ASSERT_EQ(iter->value(), second_expected_default);
+ ASSERT_EQ(iter->columns(), second_expected_columns);
+
+ iter->Prev();
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_OK(iter->status());
+ ASSERT_EQ(iter->key(), first_key);
+ ASSERT_EQ(iter->value(), first_expected_default);
+ ASSERT_EQ(iter->columns(), first_expected_columns);
+
+ iter->Prev();
+ ASSERT_FALSE(iter->Valid());
+ ASSERT_OK(iter->status());
+ }
+ };
+
+ auto verify_merge_ops_pre_compaction = [&]() {
+ constexpr size_t num_merge_operands = 2;
+
+ GetMergeOperandsOptions get_merge_opts;
+ get_merge_opts.expected_max_number_of_operands = num_merge_operands;
+
+ {
+ std::array<PinnableSlice, num_merge_operands> merge_operands;
+ int number_of_operands = 0;
+
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ first_key, &merge_operands[0],
+ &get_merge_opts, &number_of_operands));
+
+ ASSERT_EQ(number_of_operands, num_merge_operands);
+ ASSERT_EQ(merge_operands[0], first_columns[0].value());
+ ASSERT_EQ(merge_operands[1], first_merge_operand);
+ }
+
+ {
+ std::array<PinnableSlice, num_merge_operands> merge_operands;
+ int number_of_operands = 0;
+
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ second_key, &merge_operands[0],
+ &get_merge_opts, &number_of_operands));
+
+ ASSERT_EQ(number_of_operands, num_merge_operands);
+ ASSERT_TRUE(merge_operands[0].empty());
+ ASSERT_EQ(merge_operands[1], second_merge_operand);
+ }
+ };
+
+ auto verify_merge_ops_post_compaction = [&]() {
+ constexpr size_t num_merge_operands = 1;
+
+ GetMergeOperandsOptions get_merge_opts;
+ get_merge_opts.expected_max_number_of_operands = num_merge_operands;
+
+ {
+ std::array<PinnableSlice, num_merge_operands> merge_operands;
+ int number_of_operands = 0;
+
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ first_key, &merge_operands[0],
+ &get_merge_opts, &number_of_operands));
+
+ ASSERT_EQ(number_of_operands, num_merge_operands);
+ ASSERT_EQ(merge_operands[0], first_expected_default);
+ }
+
+ {
+ std::array<PinnableSlice, num_merge_operands> merge_operands;
+ int number_of_operands = 0;
+
+ ASSERT_OK(db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(),
+ second_key, &merge_operands[0],
+ &get_merge_opts, &number_of_operands));
+
+ ASSERT_EQ(number_of_operands, num_merge_operands);
+ ASSERT_EQ(merge_operands[0], second_expected_default);
+ }
+ };
+
+ {
+ // Base KVs and Merge operands both in memtable (note: we take a snapshot in
+ // between to make sure they do not get reconciled during the subsequent
+ // flush)
+ write_base();
+ ManagedSnapshot snapshot(db_);
+ write_merge();
+ verify_basic();
+ verify_merge_ops_pre_compaction();
+
+ // Base KVs and Merge operands both in storage
+ ASSERT_OK(Flush());
+ verify_basic();
+ verify_merge_ops_pre_compaction();
+ }
+
+ // Base KVs in storage, Merge operands in memtable
+ DestroyAndReopen(options);
+ write_base();
+ ASSERT_OK(Flush());
+ write_merge();
+ verify_basic();
+ verify_merge_ops_pre_compaction();
+
+ // Flush and compact
+ ASSERT_OK(Flush());
+ ASSERT_OK(db_->CompactRange(CompactRangeOptions(), /* begin */ nullptr,
+ /* end */ nullptr));
+ verify_basic();
+ verify_merge_ops_post_compaction();
+}
+
+TEST_F(DBWideBasicTest, PutEntityTimestampError) {
+ // Note: timestamps are currently not supported
+
+ Options options = GetDefaultOptions();
+ options.comparator = test::BytewiseComparatorWithU64TsWrapper();
+
+ ColumnFamilyHandle* handle = nullptr;
+ ASSERT_OK(db_->CreateColumnFamily(options, "corinthian", &handle));
+ std::unique_ptr<ColumnFamilyHandle> handle_guard(handle);
+
+ // Use the DB::PutEntity API
+ constexpr char first_key[] = "first";
+ WideColumns first_columns{{"attr_name1", "foo"}, {"attr_name2", "bar"}};
+
+ ASSERT_TRUE(db_->PutEntity(WriteOptions(), handle, first_key, first_columns)
+ .IsInvalidArgument());
+
+ // Use WriteBatch
+ constexpr char second_key[] = "second";
+ WideColumns second_columns{{"doric", "column"}, {"ionic", "column"}};
+
+ WriteBatch batch;
+ ASSERT_TRUE(
+ batch.PutEntity(handle, second_key, second_columns).IsInvalidArgument());
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+}
+
+TEST_F(DBWideBasicTest, PutEntitySerializationError) {
+ // Make sure duplicate columns are caught
+
+ Options options = GetDefaultOptions();
+
+ // Use the DB::PutEntity API
+ constexpr char first_key[] = "first";
+ WideColumns first_columns{{"foo", "bar"}, {"foo", "baz"}};
+
+ ASSERT_TRUE(db_->PutEntity(WriteOptions(), db_->DefaultColumnFamily(),
+ first_key, first_columns)
+ .IsCorruption());
+
+ // Use WriteBatch
+ constexpr char second_key[] = "second";
+ WideColumns second_columns{{"column", "doric"}, {"column", "ionic"}};
+
+ WriteBatch batch;
+ ASSERT_TRUE(
+ batch.PutEntity(db_->DefaultColumnFamily(), second_key, second_columns)
+ .IsCorruption());
+ ASSERT_OK(db_->Write(WriteOptions(), &batch));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ RegisterCustomObjects(argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/wide/wide_column_serialization.cc b/src/rocksdb/db/wide/wide_column_serialization.cc
new file mode 100644
index 000000000..f62143c40
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_column_serialization.cc
@@ -0,0 +1,182 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wide/wide_column_serialization.h"
+
+#include <algorithm>
+#include <cassert>
+#include <limits>
+
+#include "rocksdb/slice.h"
+#include "util/autovector.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+Status WideColumnSerialization::SerializeImpl(const Slice* value_of_default,
+ const WideColumns& columns,
+ std::string& output) {
+ const size_t num_columns =
+ value_of_default ? columns.size() + 1 : columns.size();
+
+ if (num_columns > static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+ return Status::InvalidArgument("Too many wide columns");
+ }
+
+ PutVarint32(&output, kCurrentVersion);
+
+ PutVarint32(&output, static_cast<uint32_t>(num_columns));
+
+ const Slice* prev_name = nullptr;
+ if (value_of_default) {
+ if (value_of_default->size() >
+ static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+ return Status::InvalidArgument("Wide column value too long");
+ }
+
+ PutLengthPrefixedSlice(&output, kDefaultWideColumnName);
+ PutVarint32(&output, static_cast<uint32_t>(value_of_default->size()));
+
+ prev_name = &kDefaultWideColumnName;
+ }
+
+ for (size_t i = 0; i < columns.size(); ++i) {
+ const WideColumn& column = columns[i];
+
+ const Slice& name = column.name();
+ if (name.size() >
+ static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+ return Status::InvalidArgument("Wide column name too long");
+ }
+
+ if (prev_name && prev_name->compare(name) >= 0) {
+ return Status::Corruption("Wide columns out of order");
+ }
+
+ const Slice& value = column.value();
+ if (value.size() >
+ static_cast<size_t>(std::numeric_limits<uint32_t>::max())) {
+ return Status::InvalidArgument("Wide column value too long");
+ }
+
+ PutLengthPrefixedSlice(&output, name);
+ PutVarint32(&output, static_cast<uint32_t>(value.size()));
+
+ prev_name = &name;
+ }
+
+ if (value_of_default) {
+ output.append(value_of_default->data(), value_of_default->size());
+ }
+
+ for (const auto& column : columns) {
+ const Slice& value = column.value();
+
+ output.append(value.data(), value.size());
+ }
+
+ return Status::OK();
+}
+
+Status WideColumnSerialization::Deserialize(Slice& input,
+ WideColumns& columns) {
+ assert(columns.empty());
+
+ uint32_t version = 0;
+ if (!GetVarint32(&input, &version)) {
+ return Status::Corruption("Error decoding wide column version");
+ }
+
+ if (version > kCurrentVersion) {
+ return Status::NotSupported("Unsupported wide column version");
+ }
+
+ uint32_t num_columns = 0;
+ if (!GetVarint32(&input, &num_columns)) {
+ return Status::Corruption("Error decoding number of wide columns");
+ }
+
+ if (!num_columns) {
+ return Status::OK();
+ }
+
+ columns.reserve(num_columns);
+
+ autovector<uint32_t, 16> column_value_sizes;
+ column_value_sizes.reserve(num_columns);
+
+ for (uint32_t i = 0; i < num_columns; ++i) {
+ Slice name;
+ if (!GetLengthPrefixedSlice(&input, &name)) {
+ return Status::Corruption("Error decoding wide column name");
+ }
+
+ if (!columns.empty() && columns.back().name().compare(name) >= 0) {
+ return Status::Corruption("Wide columns out of order");
+ }
+
+ columns.emplace_back(name, Slice());
+
+ uint32_t value_size = 0;
+ if (!GetVarint32(&input, &value_size)) {
+ return Status::Corruption("Error decoding wide column value size");
+ }
+
+ column_value_sizes.emplace_back(value_size);
+ }
+
+ const Slice data(input);
+ size_t pos = 0;
+
+ for (uint32_t i = 0; i < num_columns; ++i) {
+ const uint32_t value_size = column_value_sizes[i];
+
+ if (pos + value_size > data.size()) {
+ return Status::Corruption("Error decoding wide column value payload");
+ }
+
+ columns[i].value() = Slice(data.data() + pos, value_size);
+
+ pos += value_size;
+ }
+
+ return Status::OK();
+}
+
+WideColumns::const_iterator WideColumnSerialization::Find(
+ const WideColumns& columns, const Slice& column_name) {
+ const auto it =
+ std::lower_bound(columns.cbegin(), columns.cend(), column_name,
+ [](const WideColumn& lhs, const Slice& rhs) {
+ return lhs.name().compare(rhs) < 0;
+ });
+
+ if (it == columns.cend() || it->name() != column_name) {
+ return columns.cend();
+ }
+
+ return it;
+}
+
+Status WideColumnSerialization::GetValueOfDefaultColumn(Slice& input,
+ Slice& value) {
+ WideColumns columns;
+
+ const Status s = Deserialize(input, columns);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (columns.empty() || columns[0].name() != kDefaultWideColumnName) {
+ value.clear();
+ return Status::OK();
+ }
+
+ value = columns[0].value();
+
+ return Status::OK();
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/wide/wide_column_serialization.h b/src/rocksdb/db/wide/wide_column_serialization.h
new file mode 100644
index 000000000..f0ffbd392
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_column_serialization.h
@@ -0,0 +1,77 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "rocksdb/rocksdb_namespace.h"
+#include "rocksdb/status.h"
+#include "rocksdb/wide_columns.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class Slice;
+
+// Wide-column serialization/deserialization primitives.
+//
+// The two main parts of the layout are 1) a sorted index containing the column
+// names and column value sizes and 2) the column values themselves. Keeping the
+// index and the values separate will enable selectively reading column values
+// down the line. Note that currently the index has to be fully parsed in order
+// to find out the offset of each column value.
+//
+// Legend: cn = column name, cv = column value, cns = column name size, cvs =
+// column value size.
+//
+// +----------+--------------+----------+-------+----------+---...
+// | version  | # of columns |  cns 1   | cn 1  |  cvs 1   |
+// +----------+--------------+----------+-------+----------+---...
+// | varint32 |   varint32   | varint32 | bytes | varint32 |
+// +----------+--------------+----------+-------+----------+---...
+//
+// ... continued ...
+//
+// ...---+----------+-------+----------+-------+---...---+-------+
+//       |  cns N   | cn N  |  cvs N   | cv 1  |         | cv N  |
+// ...---+----------+-------+----------+-------+---...---+-------+
+//       | varint32 | bytes | varint32 | bytes |         | bytes |
+// ...---+----------+-------+----------+-------+---...---+-------+
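+//
+// Illustrative example (editorial addition, not part of the original header):
+// with the current version (1), serializing the two columns
+// {"foo" -> "bar", "hello" -> "world"} produces, byte by byte:
+//
+//   0x01                      version
+//   0x02                      number of columns
+//   0x03 'f' 'o' 'o'          length-prefixed name of column 1
+//   0x03                      value size of column 1
+//   0x05 'h' 'e' 'l' 'l' 'o'  length-prefixed name of column 2
+//   0x05                      value size of column 2
+//   'b' 'a' 'r'               value of column 1
+//   'w' 'o' 'r' 'l' 'd'       value of column 2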
+
+class WideColumnSerialization {
+ public:
+ static Status Serialize(const WideColumns& columns, std::string& output);
+ static Status Serialize(const Slice& value_of_default,
+ const WideColumns& other_columns,
+ std::string& output);
+
+ static Status Deserialize(Slice& input, WideColumns& columns);
+
+ static WideColumns::const_iterator Find(const WideColumns& columns,
+ const Slice& column_name);
+ static Status GetValueOfDefaultColumn(Slice& input, Slice& value);
+
+ static constexpr uint32_t kCurrentVersion = 1;
+
+ private:
+ static Status SerializeImpl(const Slice* value_of_default,
+ const WideColumns& columns, std::string& output);
+};
+
+inline Status WideColumnSerialization::Serialize(const WideColumns& columns,
+ std::string& output) {
+ constexpr Slice* value_of_default = nullptr;
+
+ return SerializeImpl(value_of_default, columns, output);
+}
+
+inline Status WideColumnSerialization::Serialize(
+ const Slice& value_of_default, const WideColumns& other_columns,
+ std::string& output) {
+ return SerializeImpl(&value_of_default, other_columns, output);
+}
+
+} // namespace ROCKSDB_NAMESPACE
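
The header above only declares the interface; the following editorial sketch (not part of the patch) shows the round trip it is meant to support. One detail worth noting, visible in the Deserialize() implementation earlier in this diff, is that the deserialized WideColumns hold Slices pointing into the serialized buffer, so output must outlive decoded here. The test file that follows exercises the same round trip.

    #include <cassert>

    #include "db/wide/wide_column_serialization.h"

    using namespace ROCKSDB_NAMESPACE;

    Status RoundTripSketch() {
      WideColumns columns{{"foo", "bar"}, {"hello", "world"}};

      std::string output;
      Status s = WideColumnSerialization::Serialize(columns, output);
      if (!s.ok()) {
        return s;
      }

      Slice input(output);  // Deserialize advances this Slice as it parses
      WideColumns decoded;
      s = WideColumnSerialization::Deserialize(input, decoded);
      if (!s.ok()) {
        return s;
      }

      // Find() does a binary search over the sorted column index
      const auto it = WideColumnSerialization::Find(decoded, "hello");
      assert(it != decoded.cend() && it->value() == "world");

      return Status::OK();
    }
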
diff --git a/src/rocksdb/db/wide/wide_column_serialization_test.cc b/src/rocksdb/db/wide/wide_column_serialization_test.cc
new file mode 100644
index 000000000..8060d2f24
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_column_serialization_test.cc
@@ -0,0 +1,338 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/wide/wide_column_serialization.h"
+
+#include "test_util/testharness.h"
+#include "util/coding.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+TEST(WideColumnSerializationTest, Construct) {
+ constexpr char foo[] = "foo";
+ constexpr char bar[] = "bar";
+
+ const std::string foo_str(foo);
+ const std::string bar_str(bar);
+
+ const Slice foo_slice(foo_str);
+ const Slice bar_slice(bar_str);
+
+ {
+ WideColumn column(foo, bar);
+ ASSERT_EQ(column.name(), foo);
+ ASSERT_EQ(column.value(), bar);
+ }
+
+ {
+ WideColumn column(foo_str, bar);
+ ASSERT_EQ(column.name(), foo_str);
+ ASSERT_EQ(column.value(), bar);
+ }
+
+ {
+ WideColumn column(foo_slice, bar);
+ ASSERT_EQ(column.name(), foo_slice);
+ ASSERT_EQ(column.value(), bar);
+ }
+
+ {
+ WideColumn column(foo, bar_str);
+ ASSERT_EQ(column.name(), foo);
+ ASSERT_EQ(column.value(), bar_str);
+ }
+
+ {
+ WideColumn column(foo_str, bar_str);
+ ASSERT_EQ(column.name(), foo_str);
+ ASSERT_EQ(column.value(), bar_str);
+ }
+
+ {
+ WideColumn column(foo_slice, bar_str);
+ ASSERT_EQ(column.name(), foo_slice);
+ ASSERT_EQ(column.value(), bar_str);
+ }
+
+ {
+ WideColumn column(foo, bar_slice);
+ ASSERT_EQ(column.name(), foo);
+ ASSERT_EQ(column.value(), bar_slice);
+ }
+
+ {
+ WideColumn column(foo_str, bar_slice);
+ ASSERT_EQ(column.name(), foo_str);
+ ASSERT_EQ(column.value(), bar_slice);
+ }
+
+ {
+ WideColumn column(foo_slice, bar_slice);
+ ASSERT_EQ(column.name(), foo_slice);
+ ASSERT_EQ(column.value(), bar_slice);
+ }
+
+ {
+ constexpr char foo_name[] = "foo_name";
+ constexpr char bar_value[] = "bar_value";
+
+ WideColumn column(std::piecewise_construct,
+ std::forward_as_tuple(foo_name, sizeof(foo) - 1),
+ std::forward_as_tuple(bar_value, sizeof(bar) - 1));
+ ASSERT_EQ(column.name(), foo);
+ ASSERT_EQ(column.value(), bar);
+ }
+}
+
+TEST(WideColumnSerializationTest, SerializeDeserialize) {
+ WideColumns columns{{"foo", "bar"}, {"hello", "world"}};
+ std::string output;
+
+ ASSERT_OK(WideColumnSerialization::Serialize(columns, output));
+
+ Slice input(output);
+ WideColumns deserialized_columns;
+
+ ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized_columns));
+ ASSERT_EQ(columns, deserialized_columns);
+
+ {
+ const auto it = WideColumnSerialization::Find(deserialized_columns, "foo");
+ ASSERT_NE(it, deserialized_columns.cend());
+ ASSERT_EQ(*it, deserialized_columns.front());
+ }
+
+ {
+ const auto it =
+ WideColumnSerialization::Find(deserialized_columns, "hello");
+ ASSERT_NE(it, deserialized_columns.cend());
+ ASSERT_EQ(*it, deserialized_columns.back());
+ }
+
+ {
+ const auto it =
+ WideColumnSerialization::Find(deserialized_columns, "fubar");
+ ASSERT_EQ(it, deserialized_columns.cend());
+ }
+
+ {
+ const auto it =
+ WideColumnSerialization::Find(deserialized_columns, "snafu");
+ ASSERT_EQ(it, deserialized_columns.cend());
+ }
+}
+
+TEST(WideColumnSerializationTest, SerializeWithPrepend) {
+ Slice value_of_default("baz");
+ WideColumns other_columns{{"foo", "bar"}, {"hello", "world"}};
+
+ std::string output;
+ ASSERT_OK(WideColumnSerialization::Serialize(value_of_default, other_columns,
+ output));
+
+ Slice input(output);
+
+ WideColumns deserialized_columns;
+ ASSERT_OK(WideColumnSerialization::Deserialize(input, deserialized_columns));
+
+ WideColumns expected_columns{{kDefaultWideColumnName, value_of_default},
+ other_columns[0],
+ other_columns[1]};
+ ASSERT_EQ(deserialized_columns, expected_columns);
+}
+
+TEST(WideColumnSerializationTest, SerializeDuplicateError) {
+ WideColumns columns{{"foo", "bar"}, {"foo", "baz"}};
+ std::string output;
+
+ ASSERT_TRUE(
+ WideColumnSerialization::Serialize(columns, output).IsCorruption());
+}
+
+TEST(WideColumnSerializationTest, SerializeWithPrependDuplicateError) {
+ Slice value_of_default("baz");
+ WideColumns other_columns{{kDefaultWideColumnName, "dup"}, {"foo", "bar"}};
+
+ std::string output;
+ ASSERT_TRUE(WideColumnSerialization::Serialize(value_of_default,
+ other_columns, output)
+ .IsCorruption());
+}
+
+TEST(WideColumnSerializationTest, SerializeOutOfOrderError) {
+ WideColumns columns{{"hello", "world"}, {"foo", "bar"}};
+ std::string output;
+
+ ASSERT_TRUE(
+ WideColumnSerialization::Serialize(columns, output).IsCorruption());
+}
+
+TEST(WideColumnSerializationTest, DeserializeVersionError) {
+ // Can't decode version
+
+ std::string buf;
+
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "version"));
+}
+
+TEST(WideColumnSerializationTest, DeserializeUnsupportedVersion) {
+ // Unsupported version
+ constexpr uint32_t future_version = 1000;
+
+ std::string buf;
+ PutVarint32(&buf, future_version);
+
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsNotSupported());
+ ASSERT_TRUE(std::strstr(s.getState(), "version"));
+}
+
+TEST(WideColumnSerializationTest, DeserializeNumberOfColumnsError) {
+ // Can't decode number of columns
+
+ std::string buf;
+ PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "number"));
+}
+
+TEST(WideColumnSerializationTest, DeserializeColumnsError) {
+ std::string buf;
+
+ PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+
+ constexpr uint32_t num_columns = 2;
+ PutVarint32(&buf, num_columns);
+
+ // Can't decode the first column name
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "name"));
+ }
+
+ constexpr char first_column_name[] = "foo";
+ PutLengthPrefixedSlice(&buf, first_column_name);
+
+ // Can't decode the size of the first column value
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "value size"));
+ }
+
+ constexpr uint32_t first_value_size = 16;
+ PutVarint32(&buf, first_value_size);
+
+ // Can't decode the second column name
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "name"));
+ }
+
+ constexpr char second_column_name[] = "hello";
+ PutLengthPrefixedSlice(&buf, second_column_name);
+
+ // Can't decode the size of the second column value
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "value size"));
+ }
+
+ constexpr uint32_t second_value_size = 64;
+ PutVarint32(&buf, second_value_size);
+
+ // Can't decode the payload of the first column
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "payload"));
+ }
+
+ buf.append(first_value_size, '0');
+
+ // Can't decode the payload of the second column
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "payload"));
+ }
+
+ buf.append(second_value_size, 'x');
+
+ // Success
+ {
+ Slice input(buf);
+ WideColumns columns;
+
+ ASSERT_OK(WideColumnSerialization::Deserialize(input, columns));
+ }
+}
+
+TEST(WideColumnSerializationTest, DeserializeColumnsOutOfOrder) {
+ std::string buf;
+
+ PutVarint32(&buf, WideColumnSerialization::kCurrentVersion);
+
+ constexpr uint32_t num_columns = 2;
+ PutVarint32(&buf, num_columns);
+
+ constexpr char first_column_name[] = "b";
+ PutLengthPrefixedSlice(&buf, first_column_name);
+
+ constexpr uint32_t first_value_size = 16;
+ PutVarint32(&buf, first_value_size);
+
+ constexpr char second_column_name[] = "a";
+ PutLengthPrefixedSlice(&buf, second_column_name);
+
+ Slice input(buf);
+ WideColumns columns;
+
+ const Status s = WideColumnSerialization::Deserialize(input, columns);
+ ASSERT_TRUE(s.IsCorruption());
+ ASSERT_TRUE(std::strstr(s.getState(), "order"));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/wide/wide_columns.cc b/src/rocksdb/db/wide/wide_columns.cc
new file mode 100644
index 000000000..186be7f85
--- /dev/null
+++ b/src/rocksdb/db/wide/wide_columns.cc
@@ -0,0 +1,22 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/wide_columns.h"
+
+#include "db/wide/wide_column_serialization.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+const Slice kDefaultWideColumnName;
+
+const WideColumns kNoWideColumns;
+
+Status PinnableWideColumns::CreateIndexForWideColumns() {
+ Slice value_copy = value_;
+
+ return WideColumnSerialization::Deserialize(value_copy, columns_);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch.cc b/src/rocksdb/db/write_batch.cc
new file mode 100644
index 000000000..796697cfc
--- /dev/null
+++ b/src/rocksdb/db/write_batch.cc
@@ -0,0 +1,3137 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// WriteBatch::rep_ :=
+// sequence: fixed64
+// count: fixed32
+// data: record[count]
+// record :=
+// kTypeValue varstring varstring
+// kTypeDeletion varstring
+// kTypeSingleDeletion varstring
+// kTypeRangeDeletion varstring varstring
+// kTypeMerge varstring varstring
+// kTypeColumnFamilyValue varint32 varstring varstring
+// kTypeColumnFamilyDeletion varint32 varstring
+// kTypeColumnFamilySingleDeletion varint32 varstring
+// kTypeColumnFamilyRangeDeletion varint32 varstring varstring
+// kTypeColumnFamilyMerge varint32 varstring varstring
+// kTypeBeginPrepareXID
+// kTypeEndPrepareXID varstring
+// kTypeCommitXID varstring
+// kTypeCommitXIDAndTimestamp varstring varstring
+// kTypeRollbackXID varstring
+// kTypeBeginPersistedPrepareXID
+// kTypeBeginUnprepareXID
+// kTypeWideColumnEntity varstring varstring
+// kTypeColumnFamilyWideColumnEntity varint32 varstring varstring
+// kTypeNoop
+// varstring :=
+// len: varint32
+// data: uint8[len]
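+//
+// Illustrative example (editorial addition, not part of the original file):
+// a batch holding the single operation Put("k", "v") on the default column
+// family is laid out as
+//   8 bytes     sequence (fixed64; zero until assigned at write time)
+//   4 bytes     count (fixed32; here 1)
+//   1 byte      kTypeValue tag
+//   0x01 'k'    varstring key
+//   0x01 'v'    varstring value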
+
+#include "rocksdb/write_batch.h"
+
+#include <algorithm>
+#include <limits>
+#include <map>
+#include <stack>
+#include <stdexcept>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+#include "db/column_family.h"
+#include "db/db_impl/db_impl.h"
+#include "db/dbformat.h"
+#include "db/flush_scheduler.h"
+#include "db/kv_checksum.h"
+#include "db/memtable.h"
+#include "db/merge_context.h"
+#include "db/snapshot_impl.h"
+#include "db/trim_history_scheduler.h"
+#include "db/wide/wide_column_serialization.h"
+#include "db/write_batch_internal.h"
+#include "monitoring/perf_context_imp.h"
+#include "monitoring/statistics.h"
+#include "port/lang.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/system_clock.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+#include "util/coding.h"
+#include "util/duplicate_detector.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// anon namespace for file-local types
+namespace {
+
+enum ContentFlags : uint32_t {
+ DEFERRED = 1 << 0,
+ HAS_PUT = 1 << 1,
+ HAS_DELETE = 1 << 2,
+ HAS_SINGLE_DELETE = 1 << 3,
+ HAS_MERGE = 1 << 4,
+ HAS_BEGIN_PREPARE = 1 << 5,
+ HAS_END_PREPARE = 1 << 6,
+ HAS_COMMIT = 1 << 7,
+ HAS_ROLLBACK = 1 << 8,
+ HAS_DELETE_RANGE = 1 << 9,
+ HAS_BLOB_INDEX = 1 << 10,
+ HAS_BEGIN_UNPREPARE = 1 << 11,
+ HAS_PUT_ENTITY = 1 << 12,
+};
+
+struct BatchContentClassifier : public WriteBatch::Handler {
+ uint32_t content_flags = 0;
+
+ Status PutCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_PUT;
+ return Status::OK();
+ }
+
+ Status PutEntityCF(uint32_t /* column_family_id */, const Slice& /* key */,
+ const Slice& /* entity */) override {
+ content_flags |= ContentFlags::HAS_PUT_ENTITY;
+ return Status::OK();
+ }
+
+ Status DeleteCF(uint32_t, const Slice&) override {
+ content_flags |= ContentFlags::HAS_DELETE;
+ return Status::OK();
+ }
+
+ Status SingleDeleteCF(uint32_t, const Slice&) override {
+ content_flags |= ContentFlags::HAS_SINGLE_DELETE;
+ return Status::OK();
+ }
+
+ Status DeleteRangeCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_DELETE_RANGE;
+ return Status::OK();
+ }
+
+ Status MergeCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_MERGE;
+ return Status::OK();
+ }
+
+ Status PutBlobIndexCF(uint32_t, const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_BLOB_INDEX;
+ return Status::OK();
+ }
+
+ Status MarkBeginPrepare(bool unprepare) override {
+ content_flags |= ContentFlags::HAS_BEGIN_PREPARE;
+ if (unprepare) {
+ content_flags |= ContentFlags::HAS_BEGIN_UNPREPARE;
+ }
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice&) override {
+ content_flags |= ContentFlags::HAS_END_PREPARE;
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice&) override {
+ content_flags |= ContentFlags::HAS_COMMIT;
+ return Status::OK();
+ }
+
+ Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+ content_flags |= ContentFlags::HAS_COMMIT;
+ return Status::OK();
+ }
+
+ Status MarkRollback(const Slice&) override {
+ content_flags |= ContentFlags::HAS_ROLLBACK;
+ return Status::OK();
+ }
+};
+
+} // anonymous namespace
+
+struct SavePoints {
+ std::stack<SavePoint, autovector<SavePoint>> stack;
+};
+
+WriteBatch::WriteBatch(size_t reserved_bytes, size_t max_bytes,
+ size_t protection_bytes_per_key, size_t default_cf_ts_sz)
+ : content_flags_(0),
+ max_bytes_(max_bytes),
+ default_cf_ts_sz_(default_cf_ts_sz),
+ rep_() {
+ // Currently `protection_bytes_per_key` can only be enabled at 8 bytes per
+ // entry.
+ assert(protection_bytes_per_key == 0 || protection_bytes_per_key == 8);
+ if (protection_bytes_per_key != 0) {
+ prot_info_.reset(new WriteBatch::ProtectionInfo());
+ }
+ rep_.reserve((reserved_bytes > WriteBatchInternal::kHeader)
+ ? reserved_bytes
+ : WriteBatchInternal::kHeader);
+ rep_.resize(WriteBatchInternal::kHeader);
+}
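+
+// Editorial usage note (not part of the original file): given the constructor
+// above, per-key integrity protection is requested by passing 8 for
+// protection_bytes_per_key, e.g.
+//
+//   WriteBatch batch(/* reserved_bytes */ 0, /* max_bytes */ 0,
+//                    /* protection_bytes_per_key */ 8,
+//                    /* default_cf_ts_sz */ 0);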
+
+WriteBatch::WriteBatch(const std::string& rep)
+ : content_flags_(ContentFlags::DEFERRED), max_bytes_(0), rep_(rep) {}
+
+WriteBatch::WriteBatch(std::string&& rep)
+ : content_flags_(ContentFlags::DEFERRED),
+ max_bytes_(0),
+ rep_(std::move(rep)) {}
+
+WriteBatch::WriteBatch(const WriteBatch& src)
+ : wal_term_point_(src.wal_term_point_),
+ content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+ max_bytes_(src.max_bytes_),
+ default_cf_ts_sz_(src.default_cf_ts_sz_),
+ rep_(src.rep_) {
+ if (src.save_points_ != nullptr) {
+ save_points_.reset(new SavePoints());
+ save_points_->stack = src.save_points_->stack;
+ }
+ if (src.prot_info_ != nullptr) {
+ prot_info_.reset(new WriteBatch::ProtectionInfo());
+ prot_info_->entries_ = src.prot_info_->entries_;
+ }
+}
+
+WriteBatch::WriteBatch(WriteBatch&& src) noexcept
+ : save_points_(std::move(src.save_points_)),
+ wal_term_point_(std::move(src.wal_term_point_)),
+ content_flags_(src.content_flags_.load(std::memory_order_relaxed)),
+ max_bytes_(src.max_bytes_),
+ prot_info_(std::move(src.prot_info_)),
+ default_cf_ts_sz_(src.default_cf_ts_sz_),
+ rep_(std::move(src.rep_)) {}
+
+WriteBatch& WriteBatch::operator=(const WriteBatch& src) {
+ if (&src != this) {
+ this->~WriteBatch();
+ new (this) WriteBatch(src);
+ }
+ return *this;
+}
+
+WriteBatch& WriteBatch::operator=(WriteBatch&& src) {
+ if (&src != this) {
+ this->~WriteBatch();
+ new (this) WriteBatch(std::move(src));
+ }
+ return *this;
+}
+
+WriteBatch::~WriteBatch() {}
+
+WriteBatch::Handler::~Handler() {}
+
+void WriteBatch::Handler::LogData(const Slice& /*blob*/) {
+ // If the user has not specified something to do with blobs, then we ignore
+ // them.
+}
+
+bool WriteBatch::Handler::Continue() { return true; }
+
+void WriteBatch::Clear() {
+ rep_.clear();
+ rep_.resize(WriteBatchInternal::kHeader);
+
+ content_flags_.store(0, std::memory_order_relaxed);
+
+ if (save_points_ != nullptr) {
+ while (!save_points_->stack.empty()) {
+ save_points_->stack.pop();
+ }
+ }
+
+ if (prot_info_ != nullptr) {
+ prot_info_->entries_.clear();
+ }
+ wal_term_point_.clear();
+ default_cf_ts_sz_ = 0;
+}
+
+uint32_t WriteBatch::Count() const { return WriteBatchInternal::Count(this); }
+
+uint32_t WriteBatch::ComputeContentFlags() const {
+ auto rv = content_flags_.load(std::memory_order_relaxed);
+ if ((rv & ContentFlags::DEFERRED) != 0) {
+ BatchContentClassifier classifier;
+ // Should we handle status here?
+ Iterate(&classifier).PermitUncheckedError();
+ rv = classifier.content_flags;
+
+ // this method is conceptually const, because it is performing a lazy
+ // computation that doesn't affect the abstract state of the batch.
+ // content_flags_ is marked mutable so that we can perform the
+ // following assignment
+ content_flags_.store(rv, std::memory_order_relaxed);
+ }
+ return rv;
+}
+
+void WriteBatch::MarkWalTerminationPoint() {
+ wal_term_point_.size = GetDataSize();
+ wal_term_point_.count = Count();
+ wal_term_point_.content_flags = content_flags_;
+}
+
+size_t WriteBatch::GetProtectionBytesPerKey() const {
+ if (prot_info_ != nullptr) {
+ return prot_info_->GetBytesPerKey();
+ }
+ return 0;
+}
+
+bool WriteBatch::HasPut() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_PUT) != 0;
+}
+
+bool WriteBatch::HasPutEntity() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_PUT_ENTITY) != 0;
+}
+
+bool WriteBatch::HasDelete() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_DELETE) != 0;
+}
+
+bool WriteBatch::HasSingleDelete() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_SINGLE_DELETE) != 0;
+}
+
+bool WriteBatch::HasDeleteRange() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_DELETE_RANGE) != 0;
+}
+
+bool WriteBatch::HasMerge() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_MERGE) != 0;
+}
+
+bool ReadKeyFromWriteBatchEntry(Slice* input, Slice* key, bool cf_record) {
+ assert(input != nullptr && key != nullptr);
+ // Skip tag byte
+ input->remove_prefix(1);
+
+ if (cf_record) {
+ // Skip column_family bytes
+ uint32_t cf;
+ if (!GetVarint32(input, &cf)) {
+ return false;
+ }
+ }
+
+ // Extract key
+ return GetLengthPrefixedSlice(input, key);
+}
+
+bool WriteBatch::HasBeginPrepare() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_BEGIN_PREPARE) != 0;
+}
+
+bool WriteBatch::HasEndPrepare() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_END_PREPARE) != 0;
+}
+
+bool WriteBatch::HasCommit() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_COMMIT) != 0;
+}
+
+bool WriteBatch::HasRollback() const {
+ return (ComputeContentFlags() & ContentFlags::HAS_ROLLBACK) != 0;
+}
+
+Status ReadRecordFromWriteBatch(Slice* input, char* tag,
+ uint32_t* column_family, Slice* key,
+ Slice* value, Slice* blob, Slice* xid) {
+ assert(key != nullptr && value != nullptr);
+ *tag = (*input)[0];
+ input->remove_prefix(1);
+ *column_family = 0; // default
+ switch (*tag) {
+ case kTypeColumnFamilyValue:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Put");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeValue:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch Put");
+ }
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeColumnFamilySingleDeletion:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Delete");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeDeletion:
+ case kTypeSingleDeletion:
+ if (!GetLengthPrefixedSlice(input, key)) {
+ return Status::Corruption("bad WriteBatch Delete");
+ }
+ break;
+ case kTypeColumnFamilyRangeDeletion:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch DeleteRange");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeRangeDeletion:
+ // for range delete, "key" is begin_key, "value" is end_key
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch DeleteRange");
+ }
+ break;
+ case kTypeColumnFamilyMerge:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch Merge");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeMerge:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch Merge");
+ }
+ break;
+ case kTypeColumnFamilyBlobIndex:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch BlobIndex");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeBlobIndex:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch BlobIndex");
+ }
+ break;
+ case kTypeLogData:
+ assert(blob != nullptr);
+ if (!GetLengthPrefixedSlice(input, blob)) {
+ return Status::Corruption("bad WriteBatch Blob");
+ }
+ break;
+ case kTypeNoop:
+ case kTypeBeginPrepareXID:
+ // This indicates that the prepared batch is also persisted in the db.
+ // This is used in WritePreparedTxn
+ case kTypeBeginPersistedPrepareXID:
+ // This is used in WriteUnpreparedTxn
+ case kTypeBeginUnprepareXID:
+ break;
+ case kTypeEndPrepareXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad EndPrepare XID");
+ }
+ break;
+ case kTypeCommitXIDAndTimestamp:
+ if (!GetLengthPrefixedSlice(input, key)) {
+ return Status::Corruption("bad commit timestamp");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeCommitXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad Commit XID");
+ }
+ break;
+ case kTypeRollbackXID:
+ if (!GetLengthPrefixedSlice(input, xid)) {
+ return Status::Corruption("bad Rollback XID");
+ }
+ break;
+ case kTypeColumnFamilyWideColumnEntity:
+ if (!GetVarint32(input, column_family)) {
+ return Status::Corruption("bad WriteBatch PutEntity");
+ }
+ FALLTHROUGH_INTENDED;
+ case kTypeWideColumnEntity:
+ if (!GetLengthPrefixedSlice(input, key) ||
+ !GetLengthPrefixedSlice(input, value)) {
+ return Status::Corruption("bad WriteBatch PutEntity");
+ }
+ break;
+ default:
+ return Status::Corruption("unknown WriteBatch tag");
+ }
+ return Status::OK();
+}
+
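+// Replays the batch contents through the given handler, starting right after
+// the fixed-size header. A minimal usage sketch (illustrative only, not part
+// of this file): a Handler subclass overriding the callbacks it cares about
+// can be passed here to inspect a batch, e.g.
+//
+//   struct KeyCounter : public WriteBatch::Handler {
+//     size_t keys = 0;
+//     Status PutCF(uint32_t, const Slice&, const Slice&) override {
+//       ++keys;
+//       return Status::OK();
+//     }
+//     Status DeleteCF(uint32_t, const Slice&) override {
+//       ++keys;
+//       return Status::OK();
+//     }
+//   };
+//   // KeyCounter counter; batch.Iterate(&counter);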
+Status WriteBatch::Iterate(Handler* handler) const {
+ if (rep_.size() < WriteBatchInternal::kHeader) {
+ return Status::Corruption("malformed WriteBatch (too small)");
+ }
+
+ return WriteBatchInternal::Iterate(this, handler, WriteBatchInternal::kHeader,
+ rep_.size());
+}
+
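+// Dispatches the records in rep_[begin, end) to the handler. A handler may
+// return Status::TryAgain to have the same record re-dispatched once (two
+// consecutive TryAgain results are treated as corruption). When the whole
+// batch is iterated, the number of handled records is cross-checked against
+// the count stored in the header.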
+Status WriteBatchInternal::Iterate(const WriteBatch* wb,
+ WriteBatch::Handler* handler, size_t begin,
+ size_t end) {
+ if (begin > wb->rep_.size() || end > wb->rep_.size() || end < begin) {
+ return Status::Corruption("Invalid start/end bounds for Iterate");
+ }
+ assert(begin <= end);
+ Slice input(wb->rep_.data() + begin, static_cast<size_t>(end - begin));
+ bool whole_batch =
+ (begin == WriteBatchInternal::kHeader) && (end == wb->rep_.size());
+
+ Slice key, value, blob, xid;
+
+ // Sometimes a sub-batch starts with a Noop. We want to exclude such Noops
+ // from being treated as batch boundary markers, otherwise we would miscount
+ // the number of batches. We do that by checking whether the accumulated
+ // batch is empty before seeing the next Noop.
+ bool empty_batch = true;
+ uint32_t found = 0;
+ Status s;
+ char tag = 0;
+ uint32_t column_family = 0; // default
+ bool last_was_try_again = false;
+ bool handler_continue = true;
+ while (((s.ok() && !input.empty()) || UNLIKELY(s.IsTryAgain()))) {
+ handler_continue = handler->Continue();
+ if (!handler_continue) {
+ break;
+ }
+
+ if (LIKELY(!s.IsTryAgain())) {
+ last_was_try_again = false;
+ tag = 0;
+ column_family = 0; // default
+
+ s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+ &blob, &xid);
+ if (!s.ok()) {
+ return s;
+ }
+ } else {
+ assert(s.IsTryAgain());
+ assert(!last_was_try_again); // to detect infinite loop bugs
+ if (UNLIKELY(last_was_try_again)) {
+ return Status::Corruption(
+ "two consecutive TryAgain in WriteBatch handler; this is either a "
+ "software bug or data corruption.");
+ }
+ last_was_try_again = true;
+ s = Status::OK();
+ }
+
+ switch (tag) {
+ case kTypeColumnFamilyValue:
+ case kTypeValue:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_PUT));
+ s = handler->PutCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE));
+ s = handler->DeleteCF(column_family, key);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilySingleDeletion:
+ case kTypeSingleDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_SINGLE_DELETE));
+ s = handler->SingleDeleteCF(column_family, key);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyRangeDeletion:
+ case kTypeRangeDeletion:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_DELETE_RANGE));
+ s = handler->DeleteRangeCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyMerge:
+ case kTypeMerge:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_MERGE));
+ s = handler->MergeCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ found++;
+ }
+ break;
+ case kTypeColumnFamilyBlobIndex:
+ case kTypeBlobIndex:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BLOB_INDEX));
+ s = handler->PutBlobIndexCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ found++;
+ }
+ break;
+ case kTypeLogData:
+ handler->LogData(blob);
+ // A batch might have nothing but LogData. It is still a batch.
+ empty_batch = false;
+ break;
+ case kTypeBeginPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
+ s = handler->MarkBeginPrepare();
+ assert(s.ok());
+ empty_batch = false;
+ if (handler->WriteAfterCommit() ==
+ WriteBatch::Handler::OptionState::kDisabled) {
+ s = Status::NotSupported(
+ "WriteCommitted txn tag when write_after_commit_ is disabled (in "
+ "WritePrepared/WriteUnprepared mode). If it is not due to "
+ "corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ if (handler->WriteBeforePrepare() ==
+ WriteBatch::Handler::OptionState::kEnabled) {
+ s = Status::NotSupported(
+ "WriteCommitted txn tag when write_before_prepare_ is enabled "
+ "(in WriteUnprepared mode). If it is not due to corruption, the "
+ "WAL must be emptied before changing the WritePolicy.");
+ }
+ break;
+ case kTypeBeginPersistedPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_PREPARE));
+ s = handler->MarkBeginPrepare();
+ assert(s.ok());
+ empty_batch = false;
+ if (handler->WriteAfterCommit() ==
+ WriteBatch::Handler::OptionState::kEnabled) {
+ s = Status::NotSupported(
+ "WritePrepared/WriteUnprepared txn tag when write_after_commit_ "
+ "is enabled (in default WriteCommitted mode). If it is not due "
+ "to corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ break;
+ case kTypeBeginUnprepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_BEGIN_UNPREPARE));
+ s = handler->MarkBeginPrepare(true /* unprepared */);
+ assert(s.ok());
+ empty_batch = false;
+ if (handler->WriteAfterCommit() ==
+ WriteBatch::Handler::OptionState::kEnabled) {
+ s = Status::NotSupported(
+ "WriteUnprepared txn tag when write_after_commit_ is enabled (in "
+ "default WriteCommitted mode). If it is not due to corruption, "
+ "the WAL must be emptied before changing the WritePolicy.");
+ }
+ if (handler->WriteBeforePrepare() ==
+ WriteBatch::Handler::OptionState::kDisabled) {
+ s = Status::NotSupported(
+ "WriteUnprepared txn tag when write_before_prepare_ is disabled "
+ "(in WriteCommitted/WritePrepared mode). If it is not due to "
+ "corruption, the WAL must be emptied before changing the "
+ "WritePolicy.");
+ }
+ break;
+ case kTypeEndPrepareXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_END_PREPARE));
+ s = handler->MarkEndPrepare(xid);
+ assert(s.ok());
+ empty_batch = true;
+ break;
+ case kTypeCommitXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT));
+ s = handler->MarkCommit(xid);
+ assert(s.ok());
+ empty_batch = true;
+ break;
+ case kTypeCommitXIDAndTimestamp:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_COMMIT));
+ // key stores the commit timestamp.
+ assert(!key.empty());
+ s = handler->MarkCommitWithTimestamp(xid, key);
+ if (LIKELY(s.ok())) {
+ empty_batch = true;
+ }
+ break;
+ case kTypeRollbackXID:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_ROLLBACK));
+ s = handler->MarkRollback(xid);
+ assert(s.ok());
+ empty_batch = true;
+ break;
+ case kTypeNoop:
+ s = handler->MarkNoop(empty_batch);
+ assert(s.ok());
+ empty_batch = true;
+ break;
+ case kTypeWideColumnEntity:
+ case kTypeColumnFamilyWideColumnEntity:
+ assert(wb->content_flags_.load(std::memory_order_relaxed) &
+ (ContentFlags::DEFERRED | ContentFlags::HAS_PUT_ENTITY));
+ s = handler->PutEntityCF(column_family, key, value);
+ if (LIKELY(s.ok())) {
+ empty_batch = false;
+ ++found;
+ }
+ break;
+ default:
+ return Status::Corruption("unknown WriteBatch tag");
+ }
+ }
+ if (!s.ok()) {
+ return s;
+ }
+ if (handler_continue && whole_batch &&
+ found != WriteBatchInternal::Count(wb)) {
+ return Status::Corruption("WriteBatch has wrong count");
+ } else {
+ return Status::OK();
+ }
+}
+
+bool WriteBatchInternal::IsLatestPersistentState(const WriteBatch* b) {
+ return b->is_latest_persistent_state_;
+}
+
+void WriteBatchInternal::SetAsLatestPersistentState(WriteBatch* b) {
+ b->is_latest_persistent_state_ = true;
+}
+
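+// The fixed-size header of rep_ (WriteBatchInternal::kHeader, 12 bytes) holds
+// an 8-byte fixed-width sequence number at offset 0 followed by a 4-byte
+// fixed-width record count at offset 8; the accessors below read and write
+// those two fields in place.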
+uint32_t WriteBatchInternal::Count(const WriteBatch* b) {
+ return DecodeFixed32(b->rep_.data() + 8);
+}
+
+void WriteBatchInternal::SetCount(WriteBatch* b, uint32_t n) {
+ EncodeFixed32(&b->rep_[8], n);
+}
+
+SequenceNumber WriteBatchInternal::Sequence(const WriteBatch* b) {
+ return SequenceNumber(DecodeFixed64(b->rep_.data()));
+}
+
+void WriteBatchInternal::SetSequence(WriteBatch* b, SequenceNumber seq) {
+ EncodeFixed64(&b->rep_[0], seq);
+}
+
+size_t WriteBatchInternal::GetFirstOffset(WriteBatch* /*b*/) {
+ return WriteBatchInternal::kHeader;
+}
+
+std::tuple<Status, uint32_t, size_t>
+WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(
+ WriteBatch* b, ColumnFamilyHandle* column_family) {
+ uint32_t cf_id = GetColumnFamilyID(column_family);
+ size_t ts_sz = 0;
+ Status s;
+ if (column_family) {
+ const Comparator* const ucmp = column_family->GetComparator();
+ if (ucmp) {
+ ts_sz = ucmp->timestamp_size();
+ if (0 == cf_id && b->default_cf_ts_sz_ != ts_sz) {
+ s = Status::InvalidArgument("Default cf timestamp size mismatch");
+ }
+ }
+ } else if (b->default_cf_ts_sz_ > 0) {
+ ts_sz = b->default_cf_ts_sz_;
+ }
+ return std::make_tuple(s, cf_id, ts_sz);
+}
+
+namespace {
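+// Checks that the column family handle is non-null, that its comparator has
+// user-defined timestamps enabled, and that the supplied timestamp matches
+// the configured timestamp size.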
+Status CheckColumnFamilyTimestampSize(ColumnFamilyHandle* column_family,
+ const Slice& ts) {
+ if (!column_family) {
+ return Status::InvalidArgument("column family handle cannot be null");
+ }
+ const Comparator* const ucmp = column_family->GetComparator();
+ assert(ucmp);
+ size_t cf_ts_sz = ucmp->timestamp_size();
+ if (0 == cf_ts_sz) {
+ return Status::InvalidArgument("timestamp disabled");
+ }
+ if (cf_ts_sz != ts.size()) {
+ return Status::InvalidArgument("timestamp size mismatch");
+ }
+ return Status::OK();
+}
+} // anonymous namespace
+
+Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("key is too large");
+ }
+ if (value.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("value is too large");
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeValue));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(
+ b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // Technically the optype could've been `kTypeColumnFamilyValue` with the
+ // CF ID encoded in the `WriteBatch`. That distinction is unimportant
+ // however since we verify CF ID is correct, as well as all other fields
+ // (a missing/extra encoded CF ID would corrupt another field). It is
+ // convenient to consolidate on `kTypeValue` here as that is what will be
+ // inserted into memtable.
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+ .ProtectKVO(key, value, kTypeValue)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::Put(this, cf_id, key, value);
+ }
+
+ needs_in_place_update_ts_ = true;
+ has_key_with_ts_ = true;
+ std::string dummy_ts(ts_sz, '\0');
+ std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+ return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
+ SliceParts(&value, 1));
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts, const Slice& value) {
+ const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ has_key_with_ts_ = true;
+ assert(column_family);
+ uint32_t cf_id = column_family->GetID();
+ std::array<Slice, 2> key_with_ts{{key, ts}};
+ return WriteBatchInternal::Put(this, cf_id, SliceParts(key_with_ts.data(), 2),
+ SliceParts(&value, 1));
+}
+
+Status WriteBatchInternal::CheckSlicePartsLength(const SliceParts& key,
+ const SliceParts& value) {
+ size_t total_key_bytes = 0;
+ for (int i = 0; i < key.num_parts; ++i) {
+ total_key_bytes += key.parts[i].size();
+ }
+ if (total_key_bytes >= size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("key is too large");
+ }
+
+ size_t total_value_bytes = 0;
+ for (int i = 0; i < value.num_parts; ++i) {
+ total_value_bytes += value.parts[i].size();
+ }
+ if (total_value_bytes >= size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("value is too large");
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value) {
+ Status s = CheckSlicePartsLength(key, value);
+ if (!s.ok()) {
+ return s;
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeValue));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ PutLengthPrefixedSliceParts(&b->rep_, value);
+ b->content_flags_.store(
+ b->content_flags_.load(std::memory_order_relaxed) | ContentFlags::HAS_PUT,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+ .ProtectKVO(key, value, kTypeValue)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Put(ColumnFamilyHandle* column_family, const SliceParts& key,
+ const SliceParts& value) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (ts_sz == 0) {
+ return WriteBatchInternal::Put(this, cf_id, key, value);
+ }
+
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::PutEntity(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key,
+ const WideColumns& columns) {
+ assert(b);
+
+ if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("key is too large");
+ }
+
+ WideColumns sorted_columns(columns);
+ std::sort(sorted_columns.begin(), sorted_columns.end(),
+ [](const WideColumn& lhs, const WideColumn& rhs) {
+ return lhs.name().compare(rhs.name()) < 0;
+ });
+
+ std::string entity;
+ const Status s = WideColumnSerialization::Serialize(sorted_columns, entity);
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (entity.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("wide column entity is too large");
+ }
+
+ LocalSavePoint save(b);
+
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeWideColumnEntity));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyWideColumnEntity));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, entity);
+
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_PUT_ENTITY,
+ std::memory_order_relaxed);
+
+ if (b->prot_info_ != nullptr) {
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key, entity, kTypeWideColumnEntity)
+ .ProtectC(column_family_id));
+ }
+
+ return save.commit();
+}
+
+Status WriteBatch::PutEntity(ColumnFamilyHandle* column_family,
+ const Slice& key, const WideColumns& columns) {
+ if (!column_family) {
+ return Status::InvalidArgument(
+ "Cannot call this method without a column family handle");
+ }
+
+ Status s;
+ uint32_t cf_id = 0;
+ size_t ts_sz = 0;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (ts_sz) {
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+ }
+
+ return WriteBatchInternal::PutEntity(this, cf_id, key, columns);
+}
+
+Status WriteBatchInternal::InsertNoop(WriteBatch* b) {
+ b->rep_.push_back(static_cast<char>(kTypeNoop));
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkEndPrepare(WriteBatch* b, const Slice& xid,
+ bool write_after_commit,
+ bool unprepared_batch) {
+ // a manually constructed batch can only contain one prepare section
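+ // (rep_[12] is the tag byte of the first record, immediately after the
+ // 12-byte sequence/count header.)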
+ assert(b->rep_[12] == static_cast<char>(kTypeNoop));
+
+ // all savepoints up to this point are cleared
+ if (b->save_points_ != nullptr) {
+ while (!b->save_points_->stack.empty()) {
+ b->save_points_->stack.pop();
+ }
+ }
+
+ // rewrite noop as begin marker
+ b->rep_[12] = static_cast<char>(
+ write_after_commit ? kTypeBeginPrepareXID
+ : (unprepared_batch ? kTypeBeginUnprepareXID
+ : kTypeBeginPersistedPrepareXID));
+ b->rep_.push_back(static_cast<char>(kTypeEndPrepareXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_END_PREPARE |
+ ContentFlags::HAS_BEGIN_PREPARE,
+ std::memory_order_relaxed);
+ if (unprepared_batch) {
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_BEGIN_UNPREPARE,
+ std::memory_order_relaxed);
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkCommit(WriteBatch* b, const Slice& xid) {
+ b->rep_.push_back(static_cast<char>(kTypeCommitXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_COMMIT,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkCommitWithTimestamp(WriteBatch* b,
+ const Slice& xid,
+ const Slice& commit_ts) {
+ assert(!commit_ts.empty());
+ b->rep_.push_back(static_cast<char>(kTypeCommitXIDAndTimestamp));
+ PutLengthPrefixedSlice(&b->rep_, commit_ts);
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_COMMIT,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::MarkRollback(WriteBatch* b, const Slice& xid) {
+ b->rep_.push_back(static_cast<char>(kTypeRollbackXID));
+ PutLengthPrefixedSlice(&b->rep_, xid);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_ROLLBACK,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key, "" /* value */, kTypeDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::Delete(this, cf_id, key);
+ }
+
+ needs_in_place_update_ts_ = true;
+ has_key_with_ts_ = true;
+ std::string dummy_ts(ts_sz, '\0');
+ std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+ return WriteBatchInternal::Delete(this, cf_id,
+ SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts) {
+ const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(column_family);
+ has_key_with_ts_ = true;
+ uint32_t cf_id = column_family->GetID();
+ std::array<Slice, 2> key_with_ts{{key, ts}};
+ return WriteBatchInternal::Delete(this, cf_id,
+ SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatchInternal::Delete(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key,
+ SliceParts(nullptr /* _parts */, 0 /* _num_parts */),
+ kTypeDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::Delete(this, cf_id, key);
+ }
+
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::SingleDelete(WriteBatch* b,
+ uint32_t column_family_id,
+ const Slice& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_SINGLE_DELETE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key, "" /* value */, kTypeSingleDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::SingleDelete(this, cf_id, key);
+ }
+
+ needs_in_place_update_ts_ = true;
+ has_key_with_ts_ = true;
+ std::string dummy_ts(ts_sz, '\0');
+ std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+ return WriteBatchInternal::SingleDelete(this, cf_id,
+ SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+ const Slice& key, const Slice& ts) {
+ const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ has_key_with_ts_ = true;
+ assert(column_family);
+ uint32_t cf_id = column_family->GetID();
+ std::array<Slice, 2> key_with_ts{{key, ts}};
+ return WriteBatchInternal::SingleDelete(this, cf_id,
+ SliceParts(key_with_ts.data(), 2));
+}
+
+Status WriteBatchInternal::SingleDelete(WriteBatch* b,
+ uint32_t column_family_id,
+ const SliceParts& key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeSingleDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilySingleDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_SINGLE_DELETE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key,
+ SliceParts(nullptr /* _parts */,
+ 0 /* _num_parts */) /* value */,
+ kTypeSingleDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::SingleDelete(this, cf_id, key);
+ }
+
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const Slice& begin_key,
+ const Slice& end_key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, begin_key);
+ PutLengthPrefixedSlice(&b->rep_, end_key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE_RANGE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ // In `DeleteRange()`, the end key is treated as the value.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(begin_key, end_key, kTypeRangeDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+ }
+
+ needs_in_place_update_ts_ = true;
+ has_key_with_ts_ = true;
+ std::string dummy_ts(ts_sz, '\0');
+ std::array<Slice, 2> begin_key_with_ts{{begin_key, dummy_ts}};
+ std::array<Slice, 2> end_key_with_ts{{end_key, dummy_ts}};
+ return WriteBatchInternal::DeleteRange(
+ this, cf_id, SliceParts(begin_key_with_ts.data(), 2),
+ SliceParts(end_key_with_ts.data(), 2));
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+ const Slice& begin_key, const Slice& end_key,
+ const Slice& ts) {
+ const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ assert(column_family);
+ has_key_with_ts_ = true;
+ uint32_t cf_id = column_family->GetID();
+ std::array<Slice, 2> key_with_ts{{begin_key, ts}};
+ std::array<Slice, 2> end_key_with_ts{{end_key, ts}};
+ return WriteBatchInternal::DeleteRange(this, cf_id,
+ SliceParts(key_with_ts.data(), 2),
+ SliceParts(end_key_with_ts.data(), 2));
+}
+
+Status WriteBatchInternal::DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeRangeDeletion));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyRangeDeletion));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, begin_key);
+ PutLengthPrefixedSliceParts(&b->rep_, end_key);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_DELETE_RANGE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ // In `DeleteRange()`, the end key is treated as the value.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(begin_key, end_key, kTypeRangeDeletion)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::DeleteRange(this, cf_id, begin_key, end_key);
+ }
+
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ if (key.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("key is too large");
+ }
+ if (value.size() > size_t{std::numeric_limits<uint32_t>::max()}) {
+ return Status::InvalidArgument("value is too large");
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeMerge));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_MERGE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+ .ProtectKVO(key, value, kTypeMerge)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& value) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::Merge(this, cf_id, key, value);
+ }
+
+ needs_in_place_update_ts_ = true;
+ has_key_with_ts_ = true;
+ std::string dummy_ts(ts_sz, '\0');
+ std::array<Slice, 2> key_with_ts{{key, dummy_ts}};
+
+ return WriteBatchInternal::Merge(
+ this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family, const Slice& key,
+ const Slice& ts, const Slice& value) {
+ const Status s = CheckColumnFamilyTimestampSize(column_family, ts);
+ if (!s.ok()) {
+ return s;
+ }
+ has_key_with_ts_ = true;
+ assert(column_family);
+ uint32_t cf_id = column_family->GetID();
+ std::array<Slice, 2> key_with_ts{{key, ts}};
+ return WriteBatchInternal::Merge(
+ this, cf_id, SliceParts(key_with_ts.data(), 2), SliceParts(&value, 1));
+}
+
+Status WriteBatchInternal::Merge(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& key,
+ const SliceParts& value) {
+ Status s = CheckSlicePartsLength(key, value);
+ if (!s.ok()) {
+ return s;
+ }
+
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeMerge));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyMerge));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSliceParts(&b->rep_, key);
+ PutLengthPrefixedSliceParts(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_MERGE,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(ProtectionInfo64()
+ .ProtectKVO(key, value, kTypeMerge)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
+Status WriteBatch::Merge(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ size_t ts_sz = 0;
+ uint32_t cf_id = 0;
+ Status s;
+
+ std::tie(s, cf_id, ts_sz) =
+ WriteBatchInternal::GetColumnFamilyIdAndTimestampSize(this,
+ column_family);
+
+ if (!s.ok()) {
+ return s;
+ }
+
+ if (0 == ts_sz) {
+ return WriteBatchInternal::Merge(this, cf_id, key, value);
+ }
+
+ return Status::InvalidArgument(
+ "Cannot call this method on column family enabling timestamp");
+}
+
+Status WriteBatchInternal::PutBlobIndex(WriteBatch* b,
+ uint32_t column_family_id,
+ const Slice& key, const Slice& value) {
+ LocalSavePoint save(b);
+ WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
+ if (column_family_id == 0) {
+ b->rep_.push_back(static_cast<char>(kTypeBlobIndex));
+ } else {
+ b->rep_.push_back(static_cast<char>(kTypeColumnFamilyBlobIndex));
+ PutVarint32(&b->rep_, column_family_id);
+ }
+ PutLengthPrefixedSlice(&b->rep_, key);
+ PutLengthPrefixedSlice(&b->rep_, value);
+ b->content_flags_.store(b->content_flags_.load(std::memory_order_relaxed) |
+ ContentFlags::HAS_BLOB_INDEX,
+ std::memory_order_relaxed);
+ if (b->prot_info_ != nullptr) {
+ // See comment in first `WriteBatchInternal::Put()` overload concerning the
+ // `ValueType` argument passed to `ProtectKVO()`.
+ b->prot_info_->entries_.emplace_back(
+ ProtectionInfo64()
+ .ProtectKVO(key, value, kTypeBlobIndex)
+ .ProtectC(column_family_id));
+ }
+ return save.commit();
+}
+
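+// LogData records carry opaque user-supplied blobs: they are replayed to
+// Handler::LogData() during iteration but are never applied to memtables and
+// do not increment the record count in the header.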
+Status WriteBatch::PutLogData(const Slice& blob) {
+ LocalSavePoint save(this);
+ rep_.push_back(static_cast<char>(kTypeLogData));
+ PutLengthPrefixedSlice(&rep_, blob);
+ return save.commit();
+}
+
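+// Savepoints nest as a stack. Illustrative usage (a sketch, not from this
+// file; `cf` is a hypothetical column family handle):
+//
+//   WriteBatch wb;
+//   Status s = wb.Put(cf, "k", "v");
+//   wb.SetSavePoint();
+//   s = wb.Delete(cf, "k");
+//   s = wb.RollbackToSavePoint();  // undoes the Delete, keeps the Put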
+void WriteBatch::SetSavePoint() {
+ if (save_points_ == nullptr) {
+ save_points_.reset(new SavePoints());
+ }
+ // Record length and count of current batch of writes.
+ save_points_->stack.push(SavePoint(
+ GetDataSize(), Count(), content_flags_.load(std::memory_order_relaxed)));
+}
+
+Status WriteBatch::RollbackToSavePoint() {
+ if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+ return Status::NotFound();
+ }
+
+ // Pop the most recent savepoint off the stack
+ SavePoint savepoint = save_points_->stack.top();
+ save_points_->stack.pop();
+
+ assert(savepoint.size <= rep_.size());
+ assert(static_cast<uint32_t>(savepoint.count) <= Count());
+
+ if (savepoint.size == rep_.size()) {
+ // No changes to rollback
+ } else if (savepoint.size == 0) {
+ // Rollback everything
+ Clear();
+ } else {
+ rep_.resize(savepoint.size);
+ if (prot_info_ != nullptr) {
+ prot_info_->entries_.resize(savepoint.count);
+ }
+ WriteBatchInternal::SetCount(this, savepoint.count);
+ content_flags_.store(savepoint.content_flags, std::memory_order_relaxed);
+ }
+
+ return Status::OK();
+}
+
+Status WriteBatch::PopSavePoint() {
+ if (save_points_ == nullptr || save_points_->stack.size() == 0) {
+ return Status::NotFound();
+ }
+
+ // Pop the most recent savepoint off the stack
+ save_points_->stack.pop();
+
+ return Status::OK();
+}
+
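+// Rewrites the timestamp portion of every key in the batch in place with
+// `ts` (per-column-family timestamp sizes resolved via ts_sz_func), and
+// updates the per-entry protection info when it is enabled.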
+Status WriteBatch::UpdateTimestamps(
+ const Slice& ts, std::function<size_t(uint32_t)> ts_sz_func) {
+ TimestampUpdater<decltype(ts_sz_func)> ts_updater(prot_info_.get(),
+ std::move(ts_sz_func), ts);
+ const Status s = Iterate(&ts_updater);
+ if (s.ok()) {
+ needs_in_place_update_ts_ = false;
+ }
+ return s;
+}
+
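+// Re-reads every record in the batch and verifies it against the per-key
+// protection info collected at insertion time; record types that carry no
+// protection info (XID markers, Noop, LogData, timestamped deletions and
+// commits) are skipped. A no-op when protection info is disabled.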
+Status WriteBatch::VerifyChecksum() const {
+ if (prot_info_ == nullptr) {
+ return Status::OK();
+ }
+ Slice input(rep_.data() + WriteBatchInternal::kHeader,
+ rep_.size() - WriteBatchInternal::kHeader);
+ Slice key, value, blob, xid;
+ char tag = 0;
+ uint32_t column_family = 0; // default
+ Status s;
+ size_t prot_info_idx = 0;
+ bool checksum_protected = true;
+ while (!input.empty() && prot_info_idx < prot_info_->entries_.size()) {
+ // In case key/value/column_family are not updated by
+ // ReadRecordFromWriteBatch
+ key.clear();
+ value.clear();
+ column_family = 0;
+ s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
+ &blob, &xid);
+ if (!s.ok()) {
+ return s;
+ }
+ checksum_protected = true;
+ // Write batch checksum uses op_type without ColumnFamily (e.g., if op_type
+ // in the write batch is kTypeColumnFamilyValue, kTypeValue is used to
+ // compute the checksum), and encodes column family id separately. See
+ // comment in first `WriteBatchInternal::Put()` for more detail.
+ switch (tag) {
+ case kTypeColumnFamilyValue:
+ case kTypeValue:
+ tag = kTypeValue;
+ break;
+ case kTypeColumnFamilyDeletion:
+ case kTypeDeletion:
+ tag = kTypeDeletion;
+ break;
+ case kTypeColumnFamilySingleDeletion:
+ case kTypeSingleDeletion:
+ tag = kTypeSingleDeletion;
+ break;
+ case kTypeColumnFamilyRangeDeletion:
+ case kTypeRangeDeletion:
+ tag = kTypeRangeDeletion;
+ break;
+ case kTypeColumnFamilyMerge:
+ case kTypeMerge:
+ tag = kTypeMerge;
+ break;
+ case kTypeColumnFamilyBlobIndex:
+ case kTypeBlobIndex:
+ tag = kTypeBlobIndex;
+ break;
+ case kTypeLogData:
+ case kTypeBeginPrepareXID:
+ case kTypeEndPrepareXID:
+ case kTypeCommitXID:
+ case kTypeRollbackXID:
+ case kTypeNoop:
+ case kTypeBeginPersistedPrepareXID:
+ case kTypeBeginUnprepareXID:
+ case kTypeDeletionWithTimestamp:
+ case kTypeCommitXIDAndTimestamp:
+ checksum_protected = false;
+ break;
+ case kTypeColumnFamilyWideColumnEntity:
+ case kTypeWideColumnEntity:
+ tag = kTypeWideColumnEntity;
+ break;
+ default:
+ return Status::Corruption(
+ "unknown WriteBatch tag",
+ std::to_string(static_cast<unsigned int>(tag)));
+ }
+ if (checksum_protected) {
+ s = prot_info_->entries_[prot_info_idx++]
+ .StripC(column_family)
+ .StripKVO(key, value, static_cast<ValueType>(tag))
+ .GetStatus();
+ if (!s.ok()) {
+ return s;
+ }
+ }
+ }
+
+ if (prot_info_idx != WriteBatchInternal::Count(this)) {
+ return Status::Corruption("WriteBatch has wrong count");
+ }
+ assert(WriteBatchInternal::Count(this) == prot_info_->entries_.size());
+ return Status::OK();
+}
+
+namespace {
+
+class MemTableInserter : public WriteBatch::Handler {
+ SequenceNumber sequence_;
+ ColumnFamilyMemTables* const cf_mems_;
+ FlushScheduler* const flush_scheduler_;
+ TrimHistoryScheduler* const trim_history_scheduler_;
+ const bool ignore_missing_column_families_;
+ const uint64_t recovering_log_number_;
+ // Log number that all memtables this batch is inserted into should reference
+ uint64_t log_number_ref_;
+ DBImpl* db_;
+ const bool concurrent_memtable_writes_;
+ bool post_info_created_;
+ const WriteBatch::ProtectionInfo* prot_info_;
+ size_t prot_info_idx_;
+
+ bool* has_valid_writes_;
+ // On some (!) platforms, default-constructing a map is too expensive in the
+ // Write() path because it causes memory allocations even when the map goes
+ // unused. Make its creation optional, but avoid the extra allocation a
+ // std::unique_ptr would incur.
+ using MemPostInfoMap = std::map<MemTable*, MemTablePostProcessInfo>;
+ using PostMapType = std::aligned_storage<sizeof(MemPostInfoMap)>::type;
+ PostMapType mem_post_info_map_;
+ // current recovered transaction we are rebuilding (recovery)
+ WriteBatch* rebuilding_trx_;
+ SequenceNumber rebuilding_trx_seq_;
+ // Increase the seq number once per write batch; otherwise increase it once
+ // per key.
+ bool seq_per_batch_;
+ // Whether the memtable write will be done only after the commit
+ bool write_after_commit_;
+ // Whether memtable write can be done before prepare
+ bool write_before_prepare_;
+ // Whether this batch was unprepared or not
+ bool unprepared_batch_;
+ using DupDetector = std::aligned_storage<sizeof(DuplicateDetector)>::type;
+ DupDetector duplicate_detector_;
+ bool dup_dectector_on_;
+
+ bool hint_per_batch_;
+ bool hint_created_;
+ // Hints for this batch
+ using HintMap = std::unordered_map<MemTable*, void*>;
+ using HintMapType = std::aligned_storage<sizeof(HintMap)>::type;
+ HintMapType hint_;
+
+ HintMap& GetHintMap() {
+ assert(hint_per_batch_);
+ if (!hint_created_) {
+ new (&hint_) HintMap();
+ hint_created_ = true;
+ }
+ return *reinterpret_cast<HintMap*>(&hint_);
+ }
+
+ MemPostInfoMap& GetPostMap() {
+ assert(concurrent_memtable_writes_);
+ if (!post_info_created_) {
+ new (&mem_post_info_map_) MemPostInfoMap();
+ post_info_created_ = true;
+ }
+ return *reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_);
+ }
+
+ bool IsDuplicateKeySeq(uint32_t column_family_id, const Slice& key) {
+ assert(!write_after_commit_);
+ assert(rebuilding_trx_ != nullptr);
+ if (!dup_dectector_on_) {
+ new (&duplicate_detector_) DuplicateDetector(db_);
+ dup_dectector_on_ = true;
+ }
+ return reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+ ->IsDuplicateKeySeq(column_family_id, key, sequence_);
+ }
+
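+ // Returns the protection info entry for the next record and advances the
+ // cursor, or nullptr when per-key protection is disabled.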
+ const ProtectionInfoKVOC64* NextProtectionInfo() {
+ const ProtectionInfoKVOC64* res = nullptr;
+ if (prot_info_ != nullptr) {
+ assert(prot_info_idx_ < prot_info_->entries_.size());
+ res = &prot_info_->entries_[prot_info_idx_];
+ ++prot_info_idx_;
+ }
+ return res;
+ }
+
+ void DecrementProtectionInfoIdxForTryAgain() {
+ if (prot_info_ != nullptr) --prot_info_idx_;
+ }
+
+ void ResetProtectionInfo() {
+ prot_info_idx_ = 0;
+ prot_info_ = nullptr;
+ }
+
+ protected:
+ Handler::OptionState WriteBeforePrepare() const override {
+ return write_before_prepare_ ? Handler::OptionState::kEnabled
+ : Handler::OptionState::kDisabled;
+ }
+ Handler::OptionState WriteAfterCommit() const override {
+ return write_after_commit_ ? Handler::OptionState::kEnabled
+ : Handler::OptionState::kDisabled;
+ }
+
+ public:
+ // cf_mems should not be shared with concurrent inserters
+ MemTableInserter(SequenceNumber _sequence, ColumnFamilyMemTables* cf_mems,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families,
+ uint64_t recovering_log_number, DB* db,
+ bool concurrent_memtable_writes,
+ const WriteBatch::ProtectionInfo* prot_info,
+ bool* has_valid_writes = nullptr, bool seq_per_batch = false,
+ bool batch_per_txn = true, bool hint_per_batch = false)
+ : sequence_(_sequence),
+ cf_mems_(cf_mems),
+ flush_scheduler_(flush_scheduler),
+ trim_history_scheduler_(trim_history_scheduler),
+ ignore_missing_column_families_(ignore_missing_column_families),
+ recovering_log_number_(recovering_log_number),
+ log_number_ref_(0),
+ db_(static_cast_with_check<DBImpl>(db)),
+ concurrent_memtable_writes_(concurrent_memtable_writes),
+ post_info_created_(false),
+ prot_info_(prot_info),
+ prot_info_idx_(0),
+ has_valid_writes_(has_valid_writes),
+ rebuilding_trx_(nullptr),
+ rebuilding_trx_seq_(0),
+ seq_per_batch_(seq_per_batch),
+ // Write after commit currently uses one seq per key (instead of per
+ // batch). So seq_per_batch being false indicates write_after_commit
+ // approach.
+ write_after_commit_(!seq_per_batch),
+ // WriteUnprepared can write WriteBatches per transaction, so
+ // batch_per_txn being false indicates write_before_prepare.
+ write_before_prepare_(!batch_per_txn),
+ unprepared_batch_(false),
+ duplicate_detector_(),
+ dup_dectector_on_(false),
+ hint_per_batch_(hint_per_batch),
+ hint_created_(false) {
+ assert(cf_mems_);
+ }
+
+ ~MemTableInserter() override {
+ if (dup_dectector_on_) {
+ reinterpret_cast<DuplicateDetector*>(&duplicate_detector_)
+ ->~DuplicateDetector();
+ }
+ if (post_info_created_) {
+ reinterpret_cast<MemPostInfoMap*>(&mem_post_info_map_)->~MemPostInfoMap();
+ }
+ if (hint_created_) {
+ for (auto iter : GetHintMap()) {
+ delete[] reinterpret_cast<char*>(iter.second);
+ }
+ reinterpret_cast<HintMap*>(&hint_)->~HintMap();
+ }
+ delete rebuilding_trx_;
+ }
+
+ MemTableInserter(const MemTableInserter&) = delete;
+ MemTableInserter& operator=(const MemTableInserter&) = delete;
+
+ // The batch seq is regularly restarted; in normal mode it is set when
+ // MemTableInserter is constructed in the write thread, and in recovery mode
+ // it is set when a batch, which is tagged with seq, is read from the WAL.
+ // Within a sequenced batch, which could be a merge of multiple batches, we
+ // have two policies to advance the seq: i) seq_per_key (default) and
+ // ii) seq_per_batch. To implement the latter we need to mark the boundary
+ // between the individual batches: 1) use the terminating markers
+ // (kTypeEndPrepareXID, kTypeCommitXID, kTypeRollbackXID) to indicate the
+ // boundary, and 2) terminate a batch with kTypeNoop in the absence of a
+ // natural boundary marker.
+ void MaybeAdvanceSeq(bool batch_boundry = false) {
+ if (batch_boundry == seq_per_batch_) {
+ sequence_++;
+ }
+ }
+
+ void set_log_number_ref(uint64_t log) { log_number_ref_ = log; }
+ void set_prot_info(const WriteBatch::ProtectionInfo* prot_info) {
+ prot_info_ = prot_info;
+ prot_info_idx_ = 0;
+ }
+
+ SequenceNumber sequence() const { return sequence_; }
+
+ void PostProcess() {
+ assert(concurrent_memtable_writes_);
+ // If post info was not created, there is nothing to process and no need to
+ // create it on demand.
+ if (post_info_created_) {
+ for (auto& pair : GetPostMap()) {
+ pair.first->BatchPostProcess(pair.second);
+ }
+ }
+ }
+
+ bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
+ // If we are in a concurrent mode, it is the caller's responsibility
+ // to clone the original ColumnFamilyMemTables so that each thread
+ // has its own instance. Otherwise, it must be guaranteed that there
+ // is no concurrent access
+ bool found = cf_mems_->Seek(column_family_id);
+ if (!found) {
+ if (ignore_missing_column_families_) {
+ *s = Status::OK();
+ } else {
+ *s = Status::InvalidArgument(
+ "Invalid column family specified in write batch");
+ }
+ return false;
+ }
+ if (recovering_log_number_ != 0 &&
+ recovering_log_number_ < cf_mems_->GetLogNumber()) {
+ // This is true only in the recovery environment (recovering_log_number_
+ // is always 0 in the non-recovery, regular write code-path).
+ // * If recovering_log_number_ < cf_mems_->GetLogNumber(), the column
+ // family already contains updates from this log. We can't apply the
+ // updates twice because of update-in-place or merge workloads -- ignore
+ // the update.
+ *s = Status::OK();
+ return false;
+ }
+
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+
+ if (log_number_ref_ > 0) {
+ cf_mems_->GetMemTable()->RefLogContainingPrepSection(log_number_ref_);
+ }
+
+ return true;
+ }
+
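+ // Shared insertion path for value-like records (Put, PutEntity, blob
+ // index). When rebuilding a recovered prepared transaction the record is
+ // appended to rebuilding_trx_ instead; otherwise it is added to (or, with
+ // inplace_update_support, updated in) the current memtable, honoring the
+ // seq_per_batch TryAgain protocol.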
+ Status PutCFImpl(uint32_t column_family_id, const Slice& key,
+ const Slice& value, ValueType value_type,
+ const ProtectionInfoKVOS64* kv_prot_info) {
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ return WriteBatchInternal::Put(rebuilding_trx_, column_family_id, key,
+ value);
+ // else insert the values into the memtable right away
+ }
+
+ Status ret_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+ if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+ // The CF is probably flushed and hence no need for insert but we still
+ // need to keep track of the keys for upcoming rollback/commit.
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id,
+ key, value);
+ if (ret_status.ok()) {
+ MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+ }
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq(false /* batch_boundary */);
+ }
+ return ret_status;
+ }
+ assert(ret_status.ok());
+
+ MemTable* mem = cf_mems_->GetMemTable();
+ auto* moptions = mem->GetImmutableMemTableOptions();
+ // inplace_update_support is inconsistent with snapshots, and therefore with
+ // any kind of transactions including the ones that use seq_per_batch
+ assert(!seq_per_batch_ || !moptions->inplace_update_support);
+ if (!moptions->inplace_update_support) {
+ ret_status =
+ mem->Add(sequence_, value_type, key, value, kv_prot_info,
+ concurrent_memtable_writes_, get_post_process_info(mem),
+ hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+ } else if (moptions->inplace_callback == nullptr ||
+ value_type != kTypeValue) {
+ assert(!concurrent_memtable_writes_);
+ ret_status = mem->Update(sequence_, value_type, key, value, kv_prot_info);
+ } else {
+ assert(!concurrent_memtable_writes_);
+ assert(value_type == kTypeValue);
+ ret_status = mem->UpdateCallback(sequence_, key, value, kv_prot_info);
+ if (ret_status.IsNotFound()) {
+ // key not found in memtable. Do sst get, update, add
+ SnapshotImpl read_from_snapshot;
+ read_from_snapshot.number_ = sequence_;
+ ReadOptions ropts;
+ // It's going to be overwritten for sure, so there is no point caching the
+ // data block containing the old version
+ ropts.fill_cache = false;
+ ropts.snapshot = &read_from_snapshot;
+
+ std::string prev_value;
+ std::string merged_value;
+
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ Status get_status = Status::NotSupported();
+ if (db_ != nullptr && recovering_log_number_ == 0) {
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ // TODO (yanqin): fix when user-defined timestamp is enabled.
+ get_status = db_->Get(ropts, cf_handle, key, &prev_value);
+ }
+ // Intentionally overwrites the `NotFound` in `ret_status`.
+ if (!get_status.ok() && !get_status.IsNotFound()) {
+ ret_status = get_status;
+ } else {
+ ret_status = Status::OK();
+ }
+ if (ret_status.ok()) {
+ UpdateStatus update_status;
+ char* prev_buffer = const_cast<char*>(prev_value.c_str());
+ uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
+ if (get_status.ok()) {
+ update_status = moptions->inplace_callback(prev_buffer, &prev_size,
+ value, &merged_value);
+ } else {
+ update_status = moptions->inplace_callback(
+ nullptr /* existing_value */, nullptr /* existing_value_size */,
+ value, &merged_value);
+ }
+ if (update_status == UpdateStatus::UPDATED_INPLACE) {
+ assert(get_status.ok());
+ if (kv_prot_info != nullptr) {
+ ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+ updated_kv_prot_info.UpdateV(value,
+ Slice(prev_buffer, prev_size));
+ // prev_value is updated in-place with final value.
+ ret_status = mem->Add(sequence_, value_type, key,
+ Slice(prev_buffer, prev_size),
+ &updated_kv_prot_info);
+ } else {
+ ret_status = mem->Add(sequence_, value_type, key,
+ Slice(prev_buffer, prev_size),
+ nullptr /* kv_prot_info */);
+ }
+ if (ret_status.ok()) {
+ RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+ }
+ } else if (update_status == UpdateStatus::UPDATED) {
+ if (kv_prot_info != nullptr) {
+ ProtectionInfoKVOS64 updated_kv_prot_info(*kv_prot_info);
+ updated_kv_prot_info.UpdateV(value, merged_value);
+ // merged_value contains the final value.
+ ret_status = mem->Add(sequence_, value_type, key,
+ Slice(merged_value), &updated_kv_prot_info);
+ } else {
+ // merged_value contains the final value.
+ ret_status =
+ mem->Add(sequence_, value_type, key, Slice(merged_value),
+ nullptr /* kv_prot_info */);
+ }
+ if (ret_status.ok()) {
+ RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
+ }
+ }
+ }
+ }
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ assert(seq_per_batch_);
+ const bool kBatchBoundary = true;
+ MaybeAdvanceSeq(kBatchBoundary);
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ }
+ // optimize for non-recovery mode
+ // If `ret_status` is `TryAgain` then the next (successful) try will add
+ // the key to the rebuilding transaction object. If `ret_status` is
+ // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+ // away. So we only need to add to it when `ret_status.ok()`.
+ if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::Put(rebuilding_trx_, column_family_id,
+ key, value);
+ }
+ return ret_status;
+ }
+
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ Status ret_status;
+ if (kv_prot_info != nullptr) {
+ // Memtable needs seqno, doesn't need CF ID
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
+ &mem_kv_prot_info);
+ } else {
+ ret_status = PutCFImpl(column_family_id, key, value, kTypeValue,
+ nullptr /* kv_prot_info */);
+ }
+ // TODO: this assumes that if TryAgain status is returned to the caller,
+ // the operation is actually tried again. The proper way to do this is to
+ // pass a `try_again` parameter to the operation itself and decrement
+ // prot_info_idx_ based on that
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ Status PutEntityCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+
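+    // Wide-column entities share the plain Put code path; only the value type
+    // written to the memtable differs.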
+ Status s;
+ if (kv_prot_info) {
+ // Memtable needs seqno, doesn't need CF ID
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity,
+ &mem_kv_prot_info);
+ } else {
+ s = PutCFImpl(column_family_id, key, value, kTypeWideColumnEntity,
+ /* kv_prot_info */ nullptr);
+ }
+
+ if (UNLIKELY(s.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+
+ return s;
+ }
+
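+  // Shared helper for point, single, and range deletions; the caller picks
+  // the entry type and, for range deletions, passes the end key as the value.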
+ Status DeleteImpl(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value, ValueType delete_type,
+ const ProtectionInfoKVOS64* kv_prot_info) {
+ Status ret_status;
+ MemTable* mem = cf_mems_->GetMemTable();
+ ret_status =
+ mem->Add(sequence_, delete_type, key, value, kv_prot_info,
+ concurrent_memtable_writes_, get_post_process_info(mem),
+ hint_per_batch_ ? &GetHintMap()[mem] : nullptr);
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ assert(seq_per_batch_);
+ const bool kBatchBoundary = true;
+ MaybeAdvanceSeq(kBatchBoundary);
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ }
+ return ret_status;
+ }
+
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ return WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+      // else insert the values into the memtable right away
+ }
+
+ Status ret_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+ if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+        // The CF is probably flushed and hence there is no need to insert, but
+        // we still need to keep track of the keys for upcoming rollback/commit.
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status =
+ WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+ if (ret_status.ok()) {
+ MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+ }
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq(false /* batch_boundary */);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ ColumnFamilyData* cfd = cf_mems_->current();
+ assert(!cfd || cfd->user_comparator());
+ const size_t ts_sz = (cfd && cfd->user_comparator())
+ ? cfd->user_comparator()->timestamp_size()
+ : 0;
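+    // When the column family uses user-defined timestamps (ts_sz > 0), record
+    // the deletion as kTypeDeletionWithTimestamp rather than plain
+    // kTypeDeletion.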
+ const ValueType delete_type =
+ (0 == ts_sz) ? kTypeDeletion : kTypeDeletionWithTimestamp;
+ if (kv_prot_info != nullptr) {
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ mem_kv_prot_info.UpdateO(kTypeDeletion, delete_type);
+ ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type,
+ &mem_kv_prot_info);
+ } else {
+ ret_status = DeleteImpl(column_family_id, key, Slice(), delete_type,
+ nullptr /* kv_prot_info */);
+ }
+ // optimize for non-recovery mode
+ // If `ret_status` is `TryAgain` then the next (successful) try will add
+ // the key to the rebuilding transaction object. If `ret_status` is
+ // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+ // away. So we only need to add to it when `ret_status.ok()`.
+ if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status =
+ WriteBatchInternal::Delete(rebuilding_trx_, column_family_id, key);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ return WriteBatchInternal::SingleDelete(rebuilding_trx_, column_family_id,
+ key);
+      // else insert the values into the memtable right away
+ }
+
+ Status ret_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+ if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+        // The CF is probably flushed and hence there is no need to insert, but
+        // we still need to keep track of the keys for upcoming rollback/commit.
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_,
+ column_family_id, key);
+ if (ret_status.ok()) {
+ MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+ }
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq(false /* batch_boundary */);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+ assert(ret_status.ok());
+
+ if (kv_prot_info != nullptr) {
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ ret_status = DeleteImpl(column_family_id, key, Slice(),
+ kTypeSingleDeletion, &mem_kv_prot_info);
+ } else {
+ ret_status = DeleteImpl(column_family_id, key, Slice(),
+ kTypeSingleDeletion, nullptr /* kv_prot_info */);
+ }
+ // optimize for non-recovery mode
+ // If `ret_status` is `TryAgain` then the next (successful) try will add
+ // the key to the rebuilding transaction object. If `ret_status` is
+ // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+ // away. So we only need to add to it when `ret_status.ok()`.
+ if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::SingleDelete(rebuilding_trx_,
+ column_family_id, key);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ return WriteBatchInternal::DeleteRange(rebuilding_trx_, column_family_id,
+ begin_key, end_key);
+      // else insert the values into the memtable right away
+ }
+
+ Status ret_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+ if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+        // The CF is probably flushed and hence there is no need to insert, but
+        // we still need to keep track of the keys for upcoming rollback/commit.
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::DeleteRange(
+ rebuilding_trx_, column_family_id, begin_key, end_key);
+ if (ret_status.ok()) {
+ MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, begin_key));
+ }
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq(false /* batch_boundary */);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+ assert(ret_status.ok());
+
+ if (db_ != nullptr) {
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ auto* cfd =
+ static_cast_with_check<ColumnFamilyHandleImpl>(cf_handle)->cfd();
+ if (!cfd->is_delete_range_supported()) {
+ // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+ ret_status.PermitUncheckedError();
+ return Status::NotSupported(
+ std::string("DeleteRange not supported for table type ") +
+ cfd->ioptions()->table_factory->Name() + " in CF " +
+ cfd->GetName());
+ }
+ int cmp =
+ cfd->user_comparator()->CompareWithoutTimestamp(begin_key, end_key);
+ if (cmp > 0) {
+ // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+ ret_status.PermitUncheckedError();
+ // It's an empty range where endpoints appear mistaken. Don't bother
+ // applying it to the DB, and return an error to the user.
+ return Status::InvalidArgument("end key comes before start key");
+ } else if (cmp == 0) {
+ // TODO(ajkr): refactor `SeekToColumnFamily()` so it returns a `Status`.
+ ret_status.PermitUncheckedError();
+ // It's an empty range. Don't bother applying it to the DB.
+ return Status::OK();
+ }
+ }
+
+ if (kv_prot_info != nullptr) {
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ ret_status = DeleteImpl(column_family_id, begin_key, end_key,
+ kTypeRangeDeletion, &mem_kv_prot_info);
+ } else {
+ ret_status = DeleteImpl(column_family_id, begin_key, end_key,
+ kTypeRangeDeletion, nullptr /* kv_prot_info */);
+ }
+ // optimize for non-recovery mode
+ // If `ret_status` is `TryAgain` then the next (successful) try will add
+ // the key to the rebuilding transaction object. If `ret_status` is
+ // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+ // away. So we only need to add to it when `ret_status.ok()`.
+ if (UNLIKELY(!ret_status.IsTryAgain() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::DeleteRange(
+ rebuilding_trx_, column_family_id, begin_key, end_key);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ // optimize for non-recovery mode
+ if (UNLIKELY(write_after_commit_ && rebuilding_trx_ != nullptr)) {
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ return WriteBatchInternal::Merge(rebuilding_trx_, column_family_id, key,
+ value);
+      // else insert the values into the memtable right away
+ }
+
+ Status ret_status;
+ if (UNLIKELY(!SeekToColumnFamily(column_family_id, &ret_status))) {
+ if (ret_status.ok() && rebuilding_trx_ != nullptr) {
+ assert(!write_after_commit_);
+        // The CF is probably flushed and hence there is no need to insert, but
+        // we still need to keep track of the keys for upcoming rollback/commit.
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::Merge(rebuilding_trx_,
+ column_family_id, key, value);
+ if (ret_status.ok()) {
+ MaybeAdvanceSeq(IsDuplicateKeySeq(column_family_id, key));
+ }
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq(false /* batch_boundary */);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+ assert(ret_status.ok());
+
+ MemTable* mem = cf_mems_->GetMemTable();
+ auto* moptions = mem->GetImmutableMemTableOptions();
+ if (moptions->merge_operator == nullptr) {
+ return Status::InvalidArgument(
+ "Merge requires `ColumnFamilyOptions::merge_operator != nullptr`");
+ }
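+    // perform_merge is set when the key already has at least
+    // max_successive_merges merge operands at the head of the memtable; in
+    // that case we read the current value, apply the merge now, and store the
+    // result as a plain Put instead of yet another operand.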
+ bool perform_merge = false;
+ assert(!concurrent_memtable_writes_ ||
+ moptions->max_successive_merges == 0);
+
+    // If we pass the DB through and options.max_successive_merges is hit
+    // during recovery, a Get() will be issued, which will try to acquire the
+    // DB mutex and cause a deadlock, as the DB mutex is already held.
+    // So we disable merge in recovery.
+ if (moptions->max_successive_merges > 0 && db_ != nullptr &&
+ recovering_log_number_ == 0) {
+ assert(!concurrent_memtable_writes_);
+ LookupKey lkey(key, sequence_);
+
+ // Count the number of successive merges at the head
+ // of the key in the memtable
+ size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
+
+ if (num_merges >= moptions->max_successive_merges) {
+ perform_merge = true;
+ }
+ }
+
+ if (perform_merge) {
+ // 1) Get the existing value
+ std::string get_value;
+
+ // Pass in the sequence number so that we also include previous merge
+ // operations in the same batch.
+ SnapshotImpl read_from_snapshot;
+ read_from_snapshot.number_ = sequence_;
+ ReadOptions read_options;
+ read_options.snapshot = &read_from_snapshot;
+
+ auto cf_handle = cf_mems_->GetColumnFamilyHandle();
+ if (cf_handle == nullptr) {
+ cf_handle = db_->DefaultColumnFamily();
+ }
+ Status get_status = db_->Get(read_options, cf_handle, key, &get_value);
+ if (!get_status.ok()) {
+ // Failed to read a key we know exists. Store the delta in memtable.
+ perform_merge = false;
+ } else {
+ Slice get_value_slice = Slice(get_value);
+
+ // 2) Apply this merge
+ auto merge_operator = moptions->merge_operator;
+ assert(merge_operator);
+
+ std::string new_value;
+ Status merge_status = MergeHelper::TimedFullMerge(
+ merge_operator, key, &get_value_slice, {value}, &new_value,
+ moptions->info_log, moptions->statistics,
+ SystemClock::Default().get(), /* result_operand */ nullptr,
+ /* update_num_ops_stats */ false);
+
+ if (!merge_status.ok()) {
+ // Failed to merge!
+ // Store the delta in memtable
+ perform_merge = false;
+ } else {
+ // 3) Add value to memtable
+ assert(!concurrent_memtable_writes_);
+ if (kv_prot_info != nullptr) {
+ auto merged_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ merged_kv_prot_info.UpdateV(value, new_value);
+ merged_kv_prot_info.UpdateO(kTypeMerge, kTypeValue);
+ ret_status = mem->Add(sequence_, kTypeValue, key, new_value,
+ &merged_kv_prot_info);
+ } else {
+ ret_status = mem->Add(sequence_, kTypeValue, key, new_value,
+ nullptr /* kv_prot_info */);
+ }
+ }
+ }
+ }
+
+ if (!perform_merge) {
+ assert(ret_status.ok());
+ // Add merge operand to memtable
+ if (kv_prot_info != nullptr) {
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ ret_status =
+ mem->Add(sequence_, kTypeMerge, key, value, &mem_kv_prot_info,
+ concurrent_memtable_writes_, get_post_process_info(mem));
+ } else {
+ ret_status = mem->Add(
+ sequence_, kTypeMerge, key, value, nullptr /* kv_prot_info */,
+ concurrent_memtable_writes_, get_post_process_info(mem));
+ }
+ }
+
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ assert(seq_per_batch_);
+ const bool kBatchBoundary = true;
+ MaybeAdvanceSeq(kBatchBoundary);
+ } else if (ret_status.ok()) {
+ MaybeAdvanceSeq();
+ CheckMemtableFull();
+ }
+ // optimize for non-recovery mode
+ // If `ret_status` is `TryAgain` then the next (successful) try will add
+ // the key to the rebuilding transaction object. If `ret_status` is
+ // another non-OK `Status`, then the `rebuilding_trx_` will be thrown
+ // away. So we only need to add to it when `ret_status.ok()`.
+ if (UNLIKELY(ret_status.ok() && rebuilding_trx_ != nullptr)) {
+ assert(!write_after_commit_);
+ // TODO(ajkr): propagate `ProtectionInfoKVOS64`.
+ ret_status = WriteBatchInternal::Merge(rebuilding_trx_, column_family_id,
+ key, value);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ Status PutBlobIndexCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ const auto* kv_prot_info = NextProtectionInfo();
+ Status ret_status;
+ if (kv_prot_info != nullptr) {
+ // Memtable needs seqno, doesn't need CF ID
+ auto mem_kv_prot_info =
+ kv_prot_info->StripC(column_family_id).ProtectS(sequence_);
+ // Same as PutCF except for value type.
+ ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex,
+ &mem_kv_prot_info);
+ } else {
+ ret_status = PutCFImpl(column_family_id, key, value, kTypeBlobIndex,
+ nullptr /* kv_prot_info */);
+ }
+ if (UNLIKELY(ret_status.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+ return ret_status;
+ }
+
+ void CheckMemtableFull() {
+ if (flush_scheduler_ != nullptr) {
+ auto* cfd = cf_mems_->current();
+ assert(cfd != nullptr);
+ if (cfd->mem()->ShouldScheduleFlush() &&
+ cfd->mem()->MarkFlushScheduled()) {
+ // MarkFlushScheduled only returns true if we are the one that
+ // should take action, so no need to dedup further
+ flush_scheduler_->ScheduleWork(cfd);
+ }
+ }
+ // check if memtable_list size exceeds max_write_buffer_size_to_maintain
+ if (trim_history_scheduler_ != nullptr) {
+ auto* cfd = cf_mems_->current();
+
+ assert(cfd);
+ assert(cfd->ioptions());
+
+ const size_t size_to_maintain = static_cast<size_t>(
+ cfd->ioptions()->max_write_buffer_size_to_maintain);
+
+ if (size_to_maintain > 0) {
+ MemTableList* const imm = cfd->imm();
+ assert(imm);
+
+ if (imm->HasHistory()) {
+ const MemTable* const mem = cfd->mem();
+ assert(mem);
+
+ if (mem->MemoryAllocatedBytes() +
+ imm->MemoryAllocatedBytesExcludingLast() >=
+ size_to_maintain &&
+ imm->MarkTrimHistoryNeeded()) {
+ trim_history_scheduler_->ScheduleWork(cfd);
+ }
+ }
+ }
+ }
+ }
+
+ // The write batch handler calls MarkBeginPrepare with unprepare set to true
+ // if it encounters the kTypeBeginUnprepareXID marker.
+ Status MarkBeginPrepare(bool unprepare) override {
+ assert(rebuilding_trx_ == nullptr);
+ assert(db_);
+
+ if (recovering_log_number_ != 0) {
+ db_->mutex()->AssertHeld();
+ // during recovery we rebuild a hollow transaction
+ // from all encountered prepare sections of the wal
+ if (db_->allow_2pc() == false) {
+ return Status::NotSupported(
+ "WAL contains prepared transactions. Open with "
+ "TransactionDB::Open().");
+ }
+
+ // we are now iterating through a prepared section
+ rebuilding_trx_ = new WriteBatch();
+ rebuilding_trx_seq_ = sequence_;
+ // Verify that we have matching MarkBeginPrepare/MarkEndPrepare markers.
+ // unprepared_batch_ should be false because it is false by default, and
+ // gets reset to false in MarkEndPrepare.
+ assert(!unprepared_batch_);
+ unprepared_batch_ = unprepare;
+
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+ }
+
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice& name) override {
+ assert(db_);
+ assert((rebuilding_trx_ != nullptr) == (recovering_log_number_ != 0));
+
+ if (recovering_log_number_ != 0) {
+ db_->mutex()->AssertHeld();
+ assert(db_->allow_2pc());
+ size_t batch_cnt =
+ write_after_commit_
+ ? 0 // 0 will disable further checks
+ : static_cast<size_t>(sequence_ - rebuilding_trx_seq_ + 1);
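+      // Stash the rebuilt batch so a later commit or rollback marker in the
+      // WAL can resolve it during recovery.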
+ db_->InsertRecoveredTransaction(recovering_log_number_, name.ToString(),
+ rebuilding_trx_, rebuilding_trx_seq_,
+ batch_cnt, unprepared_batch_);
+ unprepared_batch_ = false;
+ rebuilding_trx_ = nullptr;
+ } else {
+ assert(rebuilding_trx_ == nullptr);
+ }
+    const bool batch_boundary = true;
+    MaybeAdvanceSeq(batch_boundary);
+
+ return Status::OK();
+ }
+
+ Status MarkNoop(bool empty_batch) override {
+ if (recovering_log_number_ != 0) {
+ db_->mutex()->AssertHeld();
+ }
+    // A hack in pessimistic transactions could result in a noop at the start
+    // of the write batch, which should be ignored.
+ if (!empty_batch) {
+ // In the absence of Prepare markers, a kTypeNoop tag indicates the end of
+      // a batch. This happens when a write batch commits while skipping the
+      // prepare phase.
+      const bool batch_boundary = true;
+      MaybeAdvanceSeq(batch_boundary);
+ }
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice& name) override {
+ assert(db_);
+
+ Status s;
+
+ if (recovering_log_number_ != 0) {
+ // We must hold db mutex in recovery.
+ db_->mutex()->AssertHeld();
+ // in recovery when we encounter a commit marker
+      // we look up this transaction in our set of rebuilt transactions
+ // and commit.
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+ // the log containing the prepared section may have
+ // been released in the last incarnation because the
+ // data was flushed to L0
+ if (trx != nullptr) {
+ // at this point individual CF lognumbers will prevent
+ // duplicate re-insertion of values.
+ assert(log_number_ref_ == 0);
+ if (write_after_commit_) {
+ // write_after_commit_ can only have one batch in trx.
+ assert(trx->batches_.size() == 1);
+ const auto& batch_info = trx->batches_.begin()->second;
+ // all inserts must reference this trx log number
+ log_number_ref_ = batch_info.log_number_;
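+          // The protection info set on this inserter belongs to the outer
+          // batch, not the recovered one, so reset it before replaying.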
+ ResetProtectionInfo();
+ s = batch_info.batch_->Iterate(this);
+ log_number_ref_ = 0;
+ }
+ // else the values are already inserted before the commit
+
+ if (s.ok()) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ if (has_valid_writes_ != nullptr) {
+ *has_valid_writes_ = true;
+ }
+ }
+ } else {
+ // When writes are not delayed until commit, there is no disconnect
+ // between a memtable write and the WAL that supports it. So the commit
+ // need not reference any log as the only log to which it depends.
+ assert(!write_after_commit_ || log_number_ref_ > 0);
+ }
+    const bool batch_boundary = true;
+    MaybeAdvanceSeq(batch_boundary);
+
+ if (UNLIKELY(s.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+
+ return s;
+ }
+
+ Status MarkCommitWithTimestamp(const Slice& name,
+ const Slice& commit_ts) override {
+ assert(db_);
+
+ Status s;
+
+ if (recovering_log_number_ != 0) {
+ // In recovery, db mutex must be held.
+ db_->mutex()->AssertHeld();
+ // in recovery when we encounter a commit marker
+      // we look up this transaction in our set of rebuilt transactions
+ // and commit.
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+ // the log containing the prepared section may have
+ // been released in the last incarnation because the
+ // data was flushed to L0
+ if (trx) {
+ // at this point individual CF lognumbers will prevent
+ // duplicate re-insertion of values.
+ assert(0 == log_number_ref_);
+ if (write_after_commit_) {
+ // write_after_commit_ can only have one batch in trx.
+ assert(trx->batches_.size() == 1);
+ const auto& batch_info = trx->batches_.begin()->second;
+ // all inserts must reference this trx log number
+ log_number_ref_ = batch_info.log_number_;
+
+ s = batch_info.batch_->UpdateTimestamps(
+ commit_ts, [this](uint32_t cf) {
+ assert(db_);
+ VersionSet* const vset = db_->GetVersionSet();
+ assert(vset);
+ ColumnFamilySet* const cf_set = vset->GetColumnFamilySet();
+ assert(cf_set);
+ ColumnFamilyData* cfd = cf_set->GetColumnFamily(cf);
+ assert(cfd);
+ const auto* const ucmp = cfd->user_comparator();
+ assert(ucmp);
+ return ucmp->timestamp_size();
+ });
+ if (s.ok()) {
+ ResetProtectionInfo();
+ s = batch_info.batch_->Iterate(this);
+ log_number_ref_ = 0;
+ }
+ }
+ // else the values are already inserted before the commit
+
+ if (s.ok()) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ if (has_valid_writes_) {
+ *has_valid_writes_ = true;
+ }
+ }
+ } else {
+ // When writes are not delayed until commit, there is no connection
+ // between a memtable write and the WAL that supports it. So the commit
+ // need not reference any log as the only log to which it depends.
+ assert(!write_after_commit_ || log_number_ref_ > 0);
+ }
+ constexpr bool batch_boundary = true;
+ MaybeAdvanceSeq(batch_boundary);
+
+ if (UNLIKELY(s.IsTryAgain())) {
+ DecrementProtectionInfoIdxForTryAgain();
+ }
+
+ return s;
+ }
+
+ Status MarkRollback(const Slice& name) override {
+ assert(db_);
+
+ if (recovering_log_number_ != 0) {
+ auto trx = db_->GetRecoveredTransaction(name.ToString());
+
+      // the log containing the transaction's prep section
+ // may have been released in the previous incarnation
+ // because we knew it had been rolled back
+ if (trx != nullptr) {
+ db_->DeleteRecoveredTransaction(name.ToString());
+ }
+ } else {
+      // in non-recovery mode we simply ignore this tag
+ }
+
+    const bool batch_boundary = true;
+    MaybeAdvanceSeq(batch_boundary);
+
+ return Status::OK();
+ }
+
+ private:
+ MemTablePostProcessInfo* get_post_process_info(MemTable* mem) {
+ if (!concurrent_memtable_writes_) {
+ // No need to batch counters locally if we don't use concurrent mode.
+ return nullptr;
+ }
+ return &GetPostMap()[mem];
+ }
+};
+
+} // anonymous namespace
+
+// This function can only be called in these conditions:
+// 1) During Recovery()
+// 2) During Write(), in a single-threaded write thread
+// 3) During Write(), in a concurrent context where memtables have been cloned
+// The reason is that it calls memtables->Seek(), which has a stateful cache.
+Status WriteBatchInternal::InsertInto(
+ WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t recovery_log_number, DB* db,
+ bool concurrent_memtable_writes, bool seq_per_batch, bool batch_per_txn) {
+ MemTableInserter inserter(
+ sequence, memtables, flush_scheduler, trim_history_scheduler,
+ ignore_missing_column_families, recovery_log_number, db,
+ concurrent_memtable_writes, nullptr /* prot_info */,
+ nullptr /*has_valid_writes*/, seq_per_batch, batch_per_txn);
+ for (auto w : write_group) {
+ if (w->CallbackFailed()) {
+ continue;
+ }
+ w->sequence = inserter.sequence();
+ if (!w->ShouldWriteToMemtable()) {
+ // In seq_per_batch_ mode this advances the seq by one.
+ inserter.MaybeAdvanceSeq(true);
+ continue;
+ }
+ SetSequence(w->batch, inserter.sequence());
+ inserter.set_log_number_ref(w->log_ref);
+ inserter.set_prot_info(w->batch->prot_info_.get());
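+    // Replaying the batch through the inserter applies each entry to the
+    // corresponding column family's memtable.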
+ w->status = w->batch->Iterate(&inserter);
+ if (!w->status.ok()) {
+ return w->status;
+ }
+ assert(!seq_per_batch || w->batch_cnt != 0);
+ assert(!seq_per_batch || inserter.sequence() - w->sequence == w->batch_cnt);
+ }
+ return Status::OK();
+}
+
+Status WriteBatchInternal::InsertInto(
+ WriteThread::Writer* writer, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t log_number, DB* db,
+ bool concurrent_memtable_writes, bool seq_per_batch, size_t batch_cnt,
+ bool batch_per_txn, bool hint_per_batch) {
+#ifdef NDEBUG
+ (void)batch_cnt;
+#endif
+ assert(writer->ShouldWriteToMemtable());
+ MemTableInserter inserter(sequence, memtables, flush_scheduler,
+ trim_history_scheduler,
+ ignore_missing_column_families, log_number, db,
+ concurrent_memtable_writes, nullptr /* prot_info */,
+ nullptr /*has_valid_writes*/, seq_per_batch,
+ batch_per_txn, hint_per_batch);
+ SetSequence(writer->batch, sequence);
+ inserter.set_log_number_ref(writer->log_ref);
+ inserter.set_prot_info(writer->batch->prot_info_.get());
+ Status s = writer->batch->Iterate(&inserter);
+ assert(!seq_per_batch || batch_cnt != 0);
+ assert(!seq_per_batch || inserter.sequence() - sequence == batch_cnt);
+ if (concurrent_memtable_writes) {
+ inserter.PostProcess();
+ }
+ return s;
+}
+
+Status WriteBatchInternal::InsertInto(
+ const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families, uint64_t log_number, DB* db,
+ bool concurrent_memtable_writes, SequenceNumber* next_seq,
+ bool* has_valid_writes, bool seq_per_batch, bool batch_per_txn) {
+ MemTableInserter inserter(Sequence(batch), memtables, flush_scheduler,
+ trim_history_scheduler,
+ ignore_missing_column_families, log_number, db,
+ concurrent_memtable_writes, batch->prot_info_.get(),
+ has_valid_writes, seq_per_batch, batch_per_txn);
+ Status s = batch->Iterate(&inserter);
+ if (next_seq != nullptr) {
+ *next_seq = inserter.sequence();
+ }
+ if (concurrent_memtable_writes) {
+ inserter.PostProcess();
+ }
+ return s;
+}
+
+namespace {
+
+// This class updates protection info for a WriteBatch.
+class ProtectionInfoUpdater : public WriteBatch::Handler {
+ public:
+ explicit ProtectionInfoUpdater(WriteBatch::ProtectionInfo* prot_info)
+ : prot_info_(prot_info) {}
+
+ ~ProtectionInfoUpdater() override {}
+
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& val) override {
+ return UpdateProtInfo(cf, key, val, kTypeValue);
+ }
+
+ Status PutEntityCF(uint32_t cf, const Slice& key,
+ const Slice& entity) override {
+ return UpdateProtInfo(cf, key, entity, kTypeWideColumnEntity);
+ }
+
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ return UpdateProtInfo(cf, key, "", kTypeDeletion);
+ }
+
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+ return UpdateProtInfo(cf, key, "", kTypeSingleDeletion);
+ }
+
+ Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
+ const Slice& end_key) override {
+ return UpdateProtInfo(cf, begin_key, end_key, kTypeRangeDeletion);
+ }
+
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice& val) override {
+ return UpdateProtInfo(cf, key, val, kTypeMerge);
+ }
+
+ Status PutBlobIndexCF(uint32_t cf, const Slice& key,
+ const Slice& val) override {
+ return UpdateProtInfo(cf, key, val, kTypeBlobIndex);
+ }
+
+ Status MarkBeginPrepare(bool /* unprepare */) override {
+ return Status::OK();
+ }
+
+ Status MarkEndPrepare(const Slice& /* xid */) override {
+ return Status::OK();
+ }
+
+ Status MarkCommit(const Slice& /* xid */) override { return Status::OK(); }
+
+ Status MarkCommitWithTimestamp(const Slice& /* xid */,
+ const Slice& /* ts */) override {
+ return Status::OK();
+ }
+
+ Status MarkRollback(const Slice& /* xid */) override { return Status::OK(); }
+
+ Status MarkNoop(bool /* empty_batch */) override { return Status::OK(); }
+
+ private:
+ Status UpdateProtInfo(uint32_t cf, const Slice& key, const Slice& val,
+ const ValueType op_type) {
+ if (prot_info_) {
+ prot_info_->entries_.emplace_back(
+ ProtectionInfo64().ProtectKVO(key, val, op_type).ProtectC(cf));
+ }
+ return Status::OK();
+ }
+
+ // No copy or move.
+ ProtectionInfoUpdater(const ProtectionInfoUpdater&) = delete;
+ ProtectionInfoUpdater(ProtectionInfoUpdater&&) = delete;
+ ProtectionInfoUpdater& operator=(const ProtectionInfoUpdater&) = delete;
+ ProtectionInfoUpdater& operator=(ProtectionInfoUpdater&&) = delete;
+
+ WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
+};
+
+} // anonymous namespace
+
+Status WriteBatchInternal::SetContents(WriteBatch* b, const Slice& contents) {
+ assert(contents.size() >= WriteBatchInternal::kHeader);
+ assert(b->prot_info_ == nullptr);
+
+ b->rep_.assign(contents.data(), contents.size());
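+  // Content flags for the new payload are unknown; mark them DEFERRED so they
+  // are recomputed lazily on demand.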
+ b->content_flags_.store(ContentFlags::DEFERRED, std::memory_order_relaxed);
+ return Status::OK();
+}
+
+Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
+ const bool wal_only) {
+ assert(dst->Count() == 0 ||
+ (dst->prot_info_ == nullptr) == (src->prot_info_ == nullptr));
+ if ((src->prot_info_ != nullptr &&
+ src->prot_info_->entries_.size() != src->Count()) ||
+ (dst->prot_info_ != nullptr &&
+ dst->prot_info_->entries_.size() != dst->Count())) {
+ return Status::Corruption(
+ "Write batch has inconsistent count and number of checksums");
+ }
+
+ size_t src_len;
+ int src_count;
+ uint32_t src_flags;
+
+ const SavePoint& batch_end = src->GetWalTerminationPoint();
+
+ if (wal_only && !batch_end.is_cleared()) {
+ src_len = batch_end.size - WriteBatchInternal::kHeader;
+ src_count = batch_end.count;
+ src_flags = batch_end.content_flags;
+ } else {
+ src_len = src->rep_.size() - WriteBatchInternal::kHeader;
+ src_count = Count(src);
+ src_flags = src->content_flags_.load(std::memory_order_relaxed);
+ }
+
+ if (src->prot_info_ != nullptr) {
+ if (dst->prot_info_ == nullptr) {
+ dst->prot_info_.reset(new WriteBatch::ProtectionInfo());
+ }
+ std::copy(src->prot_info_->entries_.begin(),
+ src->prot_info_->entries_.begin() + src_count,
+ std::back_inserter(dst->prot_info_->entries_));
+ } else if (dst->prot_info_ != nullptr) {
+    // dst has empty prot_info->entries
+    // In this special case, we allow a write batch without prot_info to
+    // be appended to a write batch with empty prot_info
+ dst->prot_info_ = nullptr;
+ }
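+  // Bump dst's entry count and splice in src's serialized records, skipping
+  // src's (sequence number + count) header.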
+ SetCount(dst, Count(dst) + src_count);
+ assert(src->rep_.size() >= WriteBatchInternal::kHeader);
+ dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len);
+ dst->content_flags_.store(
+ dst->content_flags_.load(std::memory_order_relaxed) | src_flags,
+ std::memory_order_relaxed);
+ return Status::OK();
+}
+
+size_t WriteBatchInternal::AppendedByteSize(size_t leftByteSize,
+ size_t rightByteSize) {
+ if (leftByteSize == 0 || rightByteSize == 0) {
+ return leftByteSize + rightByteSize;
+ } else {
+ return leftByteSize + rightByteSize - WriteBatchInternal::kHeader;
+ }
+}
+
+Status WriteBatchInternal::UpdateProtectionInfo(WriteBatch* wb,
+ size_t bytes_per_key,
+ uint64_t* checksum) {
+ if (bytes_per_key == 0) {
+ if (wb->prot_info_ != nullptr) {
+ wb->prot_info_.reset();
+ return Status::OK();
+ } else {
+ // Already not protected.
+ return Status::OK();
+ }
+ } else if (bytes_per_key == 8) {
+ if (wb->prot_info_ == nullptr) {
+ wb->prot_info_.reset(new WriteBatch::ProtectionInfo());
+ ProtectionInfoUpdater prot_info_updater(wb->prot_info_.get());
+ Status s = wb->Iterate(&prot_info_updater);
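+      // If the caller supplied a whole-batch checksum, verify the serialized
+      // contents before trusting the freshly computed per-key protection info.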
+ if (s.ok() && checksum != nullptr) {
+ uint64_t expected_hash = XXH3_64bits(wb->rep_.data(), wb->rep_.size());
+ if (expected_hash != *checksum) {
+ return Status::Corruption("Write batch content corrupted.");
+ }
+ }
+ return s;
+ } else {
+ // Already protected.
+ return Status::OK();
+ }
+ }
+ return Status::NotSupported(
+ "WriteBatch protection info must be zero or eight bytes/key");
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_base.cc b/src/rocksdb/db/write_batch_base.cc
new file mode 100644
index 000000000..e4c0e74bd
--- /dev/null
+++ b/src/rocksdb/db/write_batch_base.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "rocksdb/write_batch_base.h"
+
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Simple implementations of the SliceParts variants of Put(). Child classes
+// can override these methods with more performant solutions if they choose.
+Status WriteBatchBase::Put(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Put(column_family, key_slice, value_slice);
+}
+
+Status WriteBatchBase::Put(const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Put(key_slice, value_slice);
+}
+
+Status WriteBatchBase::Delete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return Delete(column_family, key_slice);
+}
+
+Status WriteBatchBase::Delete(const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return Delete(key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(ColumnFamilyHandle* column_family,
+ const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return SingleDelete(column_family, key_slice);
+}
+
+Status WriteBatchBase::SingleDelete(const SliceParts& key) {
+ std::string key_buf;
+ Slice key_slice(key, &key_buf);
+ return SingleDelete(key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(ColumnFamilyHandle* column_family,
+ const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ std::string begin_key_buf, end_key_buf;
+ Slice begin_key_slice(begin_key, &begin_key_buf);
+ Slice end_key_slice(end_key, &end_key_buf);
+ return DeleteRange(column_family, begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::DeleteRange(const SliceParts& begin_key,
+ const SliceParts& end_key) {
+ std::string begin_key_buf, end_key_buf;
+ Slice begin_key_slice(begin_key, &begin_key_buf);
+ Slice end_key_slice(end_key, &end_key_buf);
+ return DeleteRange(begin_key_slice, end_key_slice);
+}
+
+Status WriteBatchBase::Merge(ColumnFamilyHandle* column_family,
+ const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Merge(column_family, key_slice, value_slice);
+}
+
+Status WriteBatchBase::Merge(const SliceParts& key, const SliceParts& value) {
+ std::string key_buf, value_buf;
+ Slice key_slice(key, &key_buf);
+ Slice value_slice(value, &value_buf);
+
+ return Merge(key_slice, value_slice);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_internal.h b/src/rocksdb/db/write_batch_internal.h
new file mode 100644
index 000000000..1be0bd140
--- /dev/null
+++ b/src/rocksdb/db/write_batch_internal.h
@@ -0,0 +1,401 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#pragma once
+#include <array>
+#include <vector>
+
+#include "db/flush_scheduler.h"
+#include "db/kv_checksum.h"
+#include "db/trim_history_scheduler.h"
+#include "db/write_thread.h"
+#include "rocksdb/db.h"
+#include "rocksdb/options.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class MemTable;
+class FlushScheduler;
+class ColumnFamilyData;
+
+class ColumnFamilyMemTables {
+ public:
+ virtual ~ColumnFamilyMemTables() {}
+ virtual bool Seek(uint32_t column_family_id) = 0;
+ // returns true if the update to memtable should be ignored
+ // (useful when recovering from log whose updates have already
+ // been processed)
+ virtual uint64_t GetLogNumber() const = 0;
+ virtual MemTable* GetMemTable() const = 0;
+ virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
+ virtual ColumnFamilyData* current() { return nullptr; }
+};
+
+class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
+ public:
+ explicit ColumnFamilyMemTablesDefault(MemTable* mem)
+ : ok_(false), mem_(mem) {}
+
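+  // Only the default column family (ID 0) is supported; Seek() fails for any
+  // other ID.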
+ bool Seek(uint32_t column_family_id) override {
+ ok_ = (column_family_id == 0);
+ return ok_;
+ }
+
+ uint64_t GetLogNumber() const override { return 0; }
+
+ MemTable* GetMemTable() const override {
+ assert(ok_);
+ return mem_;
+ }
+
+ ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
+
+ private:
+ bool ok_;
+ MemTable* mem_;
+};
+
+struct WriteBatch::ProtectionInfo {
+ // `WriteBatch` usually doesn't contain a huge number of keys so protecting
+ // with a fixed, non-configurable eight bytes per key may work well enough.
+ autovector<ProtectionInfoKVOC64> entries_;
+
+ size_t GetBytesPerKey() const { return 8; }
+};
+
+// WriteBatchInternal provides static methods for manipulating a
+// WriteBatch that we don't want in the public WriteBatch interface.
+class WriteBatchInternal {
+ public:
+ // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
+ static constexpr size_t kHeader = 12;
+
+ // WriteBatch methods with column_family_id instead of ColumnFamilyHandle*
+ static Status Put(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status Put(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value);
+
+ static Status PutEntity(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const WideColumns& columns);
+
+ static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key);
+
+ static Status Delete(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key);
+
+ static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key);
+
+ static Status SingleDelete(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key);
+
+ static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const Slice& begin_key, const Slice& end_key);
+
+ static Status DeleteRange(WriteBatch* b, uint32_t column_family_id,
+ const SliceParts& begin_key,
+ const SliceParts& end_key);
+
+ static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status Merge(WriteBatch* batch, uint32_t column_family_id,
+ const SliceParts& key, const SliceParts& value);
+
+ static Status PutBlobIndex(WriteBatch* batch, uint32_t column_family_id,
+ const Slice& key, const Slice& value);
+
+ static Status MarkEndPrepare(WriteBatch* batch, const Slice& xid,
+ const bool write_after_commit = true,
+ const bool unprepared_batch = false);
+
+ static Status MarkRollback(WriteBatch* batch, const Slice& xid);
+
+ static Status MarkCommit(WriteBatch* batch, const Slice& xid);
+
+ static Status MarkCommitWithTimestamp(WriteBatch* batch, const Slice& xid,
+ const Slice& commit_ts);
+
+ static Status InsertNoop(WriteBatch* batch);
+
+ // Return the number of entries in the batch.
+ static uint32_t Count(const WriteBatch* batch);
+
+ // Set the count for the number of entries in the batch.
+ static void SetCount(WriteBatch* batch, uint32_t n);
+
+ // Return the sequence number for the start of this batch.
+ static SequenceNumber Sequence(const WriteBatch* batch);
+
+ // Store the specified number as the sequence number for the start of
+ // this batch.
+ static void SetSequence(WriteBatch* batch, SequenceNumber seq);
+
+ // Returns the offset of the first entry in the batch.
+ // This offset is only valid if the batch is not empty.
+ static size_t GetFirstOffset(WriteBatch* batch);
+
+ static Slice Contents(const WriteBatch* batch) { return Slice(batch->rep_); }
+
+ static size_t ByteSize(const WriteBatch* batch) { return batch->rep_.size(); }
+
+ static Status SetContents(WriteBatch* batch, const Slice& contents);
+
+ static Status CheckSlicePartsLength(const SliceParts& key,
+ const SliceParts& value);
+
+ // Inserts batches[i] into memtable, for i in 0..num_batches-1 inclusive.
+ //
+  // If ignore_missing_column_families == true, a WriteBatch
+  // referencing a non-existing column family will be ignored.
+ // If ignore_missing_column_families == false, processing of the
+ // batches will be stopped if a reference is found to a non-existing
+ // column family and InvalidArgument() will be returned. The writes
+ // in batches may be only partially applied at that point.
+ //
+ // If log_number is non-zero, the memtable will be updated only if
+ // memtables->GetLogNumber() >= log_number.
+ //
+ // If flush_scheduler is non-null, it will be invoked if the memtable
+ // should be flushed.
+ //
+ // Under concurrent use, the caller is responsible for making sure that
+ // the memtables object itself is thread-local.
+ static Status InsertInto(
+ WriteThread::WriteGroup& write_group, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false, uint64_t log_number = 0,
+ DB* db = nullptr, bool concurrent_memtable_writes = false,
+ bool seq_per_batch = false, bool batch_per_txn = true);
+
+  // Convenience form of InsertInto when you have only one batch.
+  // next_seq returns the seq after the last sequence number used in the
+  // MemTable insert.
+ static Status InsertInto(
+ const WriteBatch* batch, ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false, uint64_t log_number = 0,
+ DB* db = nullptr, bool concurrent_memtable_writes = false,
+ SequenceNumber* next_seq = nullptr, bool* has_valid_writes = nullptr,
+ bool seq_per_batch = false, bool batch_per_txn = true);
+
+ static Status InsertInto(WriteThread::Writer* writer, SequenceNumber sequence,
+ ColumnFamilyMemTables* memtables,
+ FlushScheduler* flush_scheduler,
+ TrimHistoryScheduler* trim_history_scheduler,
+ bool ignore_missing_column_families = false,
+ uint64_t log_number = 0, DB* db = nullptr,
+ bool concurrent_memtable_writes = false,
+ bool seq_per_batch = false, size_t batch_cnt = 0,
+ bool batch_per_txn = true,
+ bool hint_per_batch = false);
+
+ // Appends src write batch to dst write batch and updates count in dst
+  // write batch. Returns OK if the append is successful. Checks the number of
+  // checksums against the entry count in both the dst and src write batches,
+  // and returns Corruption if they are inconsistent.
+ static Status Append(WriteBatch* dst, const WriteBatch* src,
+ const bool WAL_only = false);
+
+ // Returns the byte size of appending a WriteBatch with ByteSize
+ // leftByteSize and a WriteBatch with ByteSize rightByteSize
+ static size_t AppendedByteSize(size_t leftByteSize, size_t rightByteSize);
+
+ // Iterate over [begin, end) range of a write batch
+ static Status Iterate(const WriteBatch* wb, WriteBatch::Handler* handler,
+ size_t begin, size_t end);
+
+ // This write batch includes the latest state that should be persisted. Such
+  // state is meant to be used only during recovery.
+ static void SetAsLatestPersistentState(WriteBatch* b);
+ static bool IsLatestPersistentState(const WriteBatch* b);
+
+ static std::tuple<Status, uint32_t, size_t> GetColumnFamilyIdAndTimestampSize(
+ WriteBatch* b, ColumnFamilyHandle* column_family);
+
+ static bool TimestampsUpdateNeeded(const WriteBatch& wb) {
+ return wb.needs_in_place_update_ts_;
+ }
+
+ static bool HasKeyWithTimestamp(const WriteBatch& wb) {
+ return wb.has_key_with_ts_;
+ }
+
+ // Update per-key value protection information on this write batch.
+  // If checksum is provided, the batch content is verified against the checksum.
+ static Status UpdateProtectionInfo(WriteBatch* wb, size_t bytes_per_key,
+ uint64_t* checksum = nullptr);
+};
+
+// LocalSavePoint is similar to a scope guard
+class LocalSavePoint {
+ public:
+ explicit LocalSavePoint(WriteBatch* batch)
+ : batch_(batch),
+ savepoint_(batch->GetDataSize(), batch->Count(),
+ batch->content_flags_.load(std::memory_order_relaxed))
+#ifndef NDEBUG
+ ,
+ committed_(false)
+#endif
+ {
+ }
+
+#ifndef NDEBUG
+ ~LocalSavePoint() { assert(committed_); }
+#endif
+ Status commit() {
+#ifndef NDEBUG
+ committed_ = true;
+#endif
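+    // If the batch has grown past its configured size limit, roll it back to
+    // the state captured at construction and report MemoryLimit.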
+ if (batch_->max_bytes_ && batch_->rep_.size() > batch_->max_bytes_) {
+ batch_->rep_.resize(savepoint_.size);
+ WriteBatchInternal::SetCount(batch_, savepoint_.count);
+ if (batch_->prot_info_ != nullptr) {
+ batch_->prot_info_->entries_.resize(savepoint_.count);
+ }
+ batch_->content_flags_.store(savepoint_.content_flags,
+ std::memory_order_relaxed);
+ return Status::MemoryLimit();
+ }
+ return Status::OK();
+ }
+
+ private:
+ WriteBatch* batch_;
+ SavePoint savepoint_;
+#ifndef NDEBUG
+ bool committed_;
+#endif
+};
+
+template <typename TimestampSizeFuncType>
+class TimestampUpdater : public WriteBatch::Handler {
+ public:
+ explicit TimestampUpdater(WriteBatch::ProtectionInfo* prot_info,
+ TimestampSizeFuncType&& ts_sz_func, const Slice& ts)
+ : prot_info_(prot_info),
+ ts_sz_func_(std::move(ts_sz_func)),
+ timestamp_(ts) {
+ assert(!timestamp_.empty());
+ }
+
+ ~TimestampUpdater() override {}
+
+ Status PutCF(uint32_t cf, const Slice& key, const Slice&) override {
+ return UpdateTimestamp(cf, key);
+ }
+
+ Status DeleteCF(uint32_t cf, const Slice& key) override {
+ return UpdateTimestamp(cf, key);
+ }
+
+ Status SingleDeleteCF(uint32_t cf, const Slice& key) override {
+ return UpdateTimestamp(cf, key);
+ }
+
+ Status DeleteRangeCF(uint32_t cf, const Slice& begin_key,
+ const Slice& end_key) override {
+ Status s = UpdateTimestamp(cf, begin_key, true /* is_key */);
+ if (s.ok()) {
+ s = UpdateTimestamp(cf, end_key, false /* is_key */);
+ }
+ return s;
+ }
+
+ Status MergeCF(uint32_t cf, const Slice& key, const Slice&) override {
+ return UpdateTimestamp(cf, key);
+ }
+
+ Status PutBlobIndexCF(uint32_t cf, const Slice& key, const Slice&) override {
+ return UpdateTimestamp(cf, key);
+ }
+
+ Status MarkBeginPrepare(bool) override { return Status::OK(); }
+
+ Status MarkEndPrepare(const Slice&) override { return Status::OK(); }
+
+ Status MarkCommit(const Slice&) override { return Status::OK(); }
+
+ Status MarkCommitWithTimestamp(const Slice&, const Slice&) override {
+ return Status::OK();
+ }
+
+ Status MarkRollback(const Slice&) override { return Status::OK(); }
+
+ Status MarkNoop(bool /*empty_batch*/) override { return Status::OK(); }
+
+ private:
+ // @param is_key specifies whether the update is for key or value.
+ Status UpdateTimestamp(uint32_t cf, const Slice& buf, bool is_key = true) {
+ Status s = UpdateTimestampImpl(cf, buf, idx_, is_key);
+ ++idx_;
+ return s;
+ }
+
+ Status UpdateTimestampImpl(uint32_t cf, const Slice& buf, size_t /*idx*/,
+ bool is_key) {
+ if (timestamp_.empty()) {
+ return Status::InvalidArgument("Timestamp is empty");
+ }
+ size_t cf_ts_sz = ts_sz_func_(cf);
+ if (0 == cf_ts_sz) {
+ // Skip this column family.
+ return Status::OK();
+ } else if (std::numeric_limits<size_t>::max() == cf_ts_sz) {
+ // Column family timestamp info not found.
+ return Status::NotFound();
+ } else if (cf_ts_sz != timestamp_.size()) {
+ return Status::InvalidArgument("timestamp size mismatch");
+ }
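+    // Keep any per-key protection info consistent with the new timestamp,
+    // then overwrite the trailing timestamp bytes of the buffer in place.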
+ UpdateProtectionInformationIfNeeded(buf, timestamp_, is_key);
+
+ char* ptr = const_cast<char*>(buf.data() + buf.size() - cf_ts_sz);
+ assert(ptr);
+ memcpy(ptr, timestamp_.data(), timestamp_.size());
+ return Status::OK();
+ }
+
+ void UpdateProtectionInformationIfNeeded(const Slice& buf, const Slice& ts,
+ bool is_key) {
+ if (prot_info_ != nullptr) {
+ const size_t ts_sz = ts.size();
+ SliceParts old(&buf, 1);
+ Slice old_no_ts(buf.data(), buf.size() - ts_sz);
+ std::array<Slice, 2> new_key_cmpts{{old_no_ts, ts}};
+ SliceParts new_parts(new_key_cmpts.data(), 2);
+ if (is_key) {
+ prot_info_->entries_[idx_].UpdateK(old, new_parts);
+ } else {
+ prot_info_->entries_[idx_].UpdateV(old, new_parts);
+ }
+ }
+ }
+
+ // No copy or move.
+ TimestampUpdater(const TimestampUpdater&) = delete;
+ TimestampUpdater(TimestampUpdater&&) = delete;
+ TimestampUpdater& operator=(const TimestampUpdater&) = delete;
+ TimestampUpdater& operator=(TimestampUpdater&&) = delete;
+
+ WriteBatch::ProtectionInfo* const prot_info_ = nullptr;
+ const TimestampSizeFuncType ts_sz_func_{};
+ const Slice timestamp_;
+ size_t idx_ = 0;
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_batch_test.cc b/src/rocksdb/db/write_batch_test.cc
new file mode 100644
index 000000000..d233853e2
--- /dev/null
+++ b/src/rocksdb/db/write_batch_test.cc
@@ -0,0 +1,1114 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#include <memory>
+
+#include "db/column_family.h"
+#include "db/db_test_util.h"
+#include "db/memtable.h"
+#include "db/write_batch_internal.h"
+#include "rocksdb/comparator.h"
+#include "rocksdb/db.h"
+#include "rocksdb/env.h"
+#include "rocksdb/memtablerep.h"
+#include "rocksdb/utilities/write_batch_with_index.h"
+#include "rocksdb/write_buffer_manager.h"
+#include "table/scoped_arena_iterator.h"
+#include "test_util/testharness.h"
+#include "test_util/testutil.h"
+#include "util/string_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+static std::string PrintContents(WriteBatch* b,
+ bool merge_operator_supported = true) {
+ InternalKeyComparator cmp(BytewiseComparator());
+ auto factory = std::make_shared<SkipListFactory>();
+ Options options;
+ options.memtable_factory = factory;
+ if (merge_operator_supported) {
+ options.merge_operator.reset(new TestPutOperator());
+ }
+ ImmutableOptions ioptions(options);
+ WriteBufferManager wb(options.db_write_buffer_size);
+ MemTable* mem = new MemTable(cmp, ioptions, MutableCFOptions(options), &wb,
+ kMaxSequenceNumber, 0 /* column_family_id */);
+ mem->Ref();
+ std::string state;
+ ColumnFamilyMemTablesDefault cf_mems_default(mem);
+ Status s =
+ WriteBatchInternal::InsertInto(b, &cf_mems_default, nullptr, nullptr);
+ uint32_t count = 0;
+ int put_count = 0;
+ int delete_count = 0;
+ int single_delete_count = 0;
+ int delete_range_count = 0;
+ int merge_count = 0;
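+  // Two passes over the memtable: the first iterates point entries, the
+  // second iterates range tombstones.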
+ for (int i = 0; i < 2; ++i) {
+ Arena arena;
+ ScopedArenaIterator arena_iter_guard;
+ std::unique_ptr<InternalIterator> iter_guard;
+ InternalIterator* iter;
+ if (i == 0) {
+ iter = mem->NewIterator(ReadOptions(), &arena);
+ arena_iter_guard.set(iter);
+ } else {
+ iter = mem->NewRangeTombstoneIterator(ReadOptions(),
+ kMaxSequenceNumber /* read_seq */,
+ false /* immutable_memtable */);
+ iter_guard.reset(iter);
+ }
+ if (iter == nullptr) {
+ continue;
+ }
+ EXPECT_OK(iter->status());
+ for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
+ ParsedInternalKey ikey;
+ ikey.clear();
+ EXPECT_OK(ParseInternalKey(iter->key(), &ikey, true /* log_err_key */));
+ switch (ikey.type) {
+ case kTypeValue:
+ state.append("Put(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ put_count++;
+ break;
+ case kTypeDeletion:
+ state.append("Delete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ delete_count++;
+ break;
+ case kTypeSingleDeletion:
+ state.append("SingleDelete(");
+ state.append(ikey.user_key.ToString());
+ state.append(")");
+ count++;
+ single_delete_count++;
+ break;
+ case kTypeRangeDeletion:
+ state.append("DeleteRange(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ delete_range_count++;
+ break;
+ case kTypeMerge:
+ state.append("Merge(");
+ state.append(ikey.user_key.ToString());
+ state.append(", ");
+ state.append(iter->value().ToString());
+ state.append(")");
+ count++;
+ merge_count++;
+ break;
+ default:
+ assert(false);
+ break;
+ }
+ state.append("@");
+ state.append(std::to_string(ikey.sequence));
+ }
+ EXPECT_OK(iter->status());
+ }
+ if (s.ok()) {
+ EXPECT_EQ(b->HasPut(), put_count > 0);
+ EXPECT_EQ(b->HasDelete(), delete_count > 0);
+ EXPECT_EQ(b->HasSingleDelete(), single_delete_count > 0);
+ EXPECT_EQ(b->HasDeleteRange(), delete_range_count > 0);
+ EXPECT_EQ(b->HasMerge(), merge_count > 0);
+ if (count != WriteBatchInternal::Count(b)) {
+ state.append("CountMismatch()");
+ }
+ } else {
+ state.append(s.ToString());
+ }
+ delete mem->Unref();
+ return state;
+}
+
+class WriteBatchTest : public testing::Test {};
+
+TEST_F(WriteBatchTest, Empty) {
+ WriteBatch batch;
+ ASSERT_EQ("", PrintContents(&batch));
+ ASSERT_EQ(0u, WriteBatchInternal::Count(&batch));
+ ASSERT_EQ(0u, batch.Count());
+}
+
+TEST_F(WriteBatchTest, Multiple) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Delete(Slice("box")));
+ ASSERT_OK(batch.DeleteRange(Slice("bar"), Slice("foo")));
+ ASSERT_OK(batch.Put(Slice("baz"), Slice("boo")));
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ(100U, WriteBatchInternal::Sequence(&batch));
+ ASSERT_EQ(4u, WriteBatchInternal::Count(&batch));
+ ASSERT_EQ(
+ "Put(baz, boo)@103"
+ "Delete(box)@101"
+ "Put(foo, bar)@100"
+ "DeleteRange(bar, foo)@102",
+ PrintContents(&batch));
+ ASSERT_EQ(4u, batch.Count());
+}
+
+TEST_F(WriteBatchTest, Corruption) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Delete(Slice("box")));
+ WriteBatchInternal::SetSequence(&batch, 200);
+ Slice contents = WriteBatchInternal::Contents(&batch);
+ ASSERT_OK(WriteBatchInternal::SetContents(
+ &batch, Slice(contents.data(), contents.size() - 1)));
+ ASSERT_EQ(
+ "Put(foo, bar)@200"
+ "Corruption: bad WriteBatch Delete",
+ PrintContents(&batch));
+}
+
+TEST_F(WriteBatchTest, Append) {
+ WriteBatch b1, b2;
+ WriteBatchInternal::SetSequence(&b1, 200);
+ WriteBatchInternal::SetSequence(&b2, 300);
+ ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+ ASSERT_EQ("", PrintContents(&b1));
+ ASSERT_EQ(0u, b1.Count());
+ ASSERT_OK(b2.Put("a", "va"));
+ ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+ ASSERT_EQ("Put(a, va)@200", PrintContents(&b1));
+ ASSERT_EQ(1u, b1.Count());
+ b2.Clear();
+ ASSERT_OK(b2.Put("b", "vb"));
+ ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+ ASSERT_EQ(
+ "Put(a, va)@200"
+ "Put(b, vb)@201",
+ PrintContents(&b1));
+ ASSERT_EQ(2u, b1.Count());
+ ASSERT_OK(b2.Delete("foo"));
+ ASSERT_OK(WriteBatchInternal::Append(&b1, &b2));
+ ASSERT_EQ(
+ "Put(a, va)@200"
+ "Put(b, vb)@202"
+ "Put(b, vb)@201"
+ "Delete(foo)@203",
+ PrintContents(&b1));
+ ASSERT_EQ(4u, b1.Count());
+ b2.Clear();
+ ASSERT_OK(b2.Put("c", "cc"));
+ ASSERT_OK(b2.Put("d", "dd"));
+ b2.MarkWalTerminationPoint();
+ ASSERT_OK(b2.Put("e", "ee"));
+ ASSERT_OK(WriteBatchInternal::Append(&b1, &b2, /*wal only*/ true));
+ ASSERT_EQ(
+ "Put(a, va)@200"
+ "Put(b, vb)@202"
+ "Put(b, vb)@201"
+ "Put(c, cc)@204"
+ "Put(d, dd)@205"
+ "Delete(foo)@203",
+ PrintContents(&b1));
+ ASSERT_EQ(6u, b1.Count());
+ ASSERT_EQ(
+ "Put(c, cc)@0"
+ "Put(d, dd)@1"
+ "Put(e, ee)@2",
+ PrintContents(&b2));
+ ASSERT_EQ(3u, b2.Count());
+}
+
+TEST_F(WriteBatchTest, SingleDeletion) {
+ WriteBatch batch;
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ("", PrintContents(&batch));
+ ASSERT_EQ(0u, batch.Count());
+ ASSERT_OK(batch.Put("a", "va"));
+ ASSERT_EQ("Put(a, va)@100", PrintContents(&batch));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_OK(batch.SingleDelete("a"));
+ ASSERT_EQ(
+ "SingleDelete(a)@101"
+ "Put(a, va)@100",
+ PrintContents(&batch));
+ ASSERT_EQ(2u, batch.Count());
+}
+
+namespace {
+struct TestHandler : public WriteBatch::Handler {
+ std::string seen;
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ if (column_family_id == 0) {
+ seen += "Put(" + key.ToString() + ", " + value.ToString() + ")";
+ } else {
+ seen += "PutCF(" + std::to_string(column_family_id) + ", " +
+ key.ToString() + ", " + value.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ if (column_family_id == 0) {
+ seen += "Delete(" + key.ToString() + ")";
+ } else {
+ seen += "DeleteCF(" + std::to_string(column_family_id) + ", " +
+ key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t column_family_id, const Slice& key) override {
+ if (column_family_id == 0) {
+ seen += "SingleDelete(" + key.ToString() + ")";
+ } else {
+ seen += "SingleDeleteCF(" + std::to_string(column_family_id) + ", " +
+ key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status DeleteRangeCF(uint32_t column_family_id, const Slice& begin_key,
+ const Slice& end_key) override {
+ if (column_family_id == 0) {
+ seen += "DeleteRange(" + begin_key.ToString() + ", " +
+ end_key.ToString() + ")";
+ } else {
+ seen += "DeleteRangeCF(" + std::to_string(column_family_id) + ", " +
+ begin_key.ToString() + ", " + end_key.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ if (column_family_id == 0) {
+ seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")";
+ } else {
+ seen += "MergeCF(" + std::to_string(column_family_id) + ", " +
+ key.ToString() + ", " + value.ToString() + ")";
+ }
+ return Status::OK();
+ }
+ void LogData(const Slice& blob) override {
+ seen += "LogData(" + blob.ToString() + ")";
+ }
+ Status MarkBeginPrepare(bool unprepare) override {
+ seen +=
+ "MarkBeginPrepare(" + std::string(unprepare ? "true" : "false") + ")";
+ return Status::OK();
+ }
+ Status MarkEndPrepare(const Slice& xid) override {
+ seen += "MarkEndPrepare(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+ Status MarkNoop(bool empty_batch) override {
+ seen += "MarkNoop(" + std::string(empty_batch ? "true" : "false") + ")";
+ return Status::OK();
+ }
+ Status MarkCommit(const Slice& xid) override {
+ seen += "MarkCommit(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+ Status MarkCommitWithTimestamp(const Slice& xid, const Slice& ts) override {
+ seen += "MarkCommitWithTimestamp(" + xid.ToString() + ", " +
+ ts.ToString(true) + ")";
+ return Status::OK();
+ }
+ Status MarkRollback(const Slice& xid) override {
+ seen += "MarkRollback(" + xid.ToString() + ")";
+ return Status::OK();
+ }
+};
+} // anonymous namespace
+
+TEST_F(WriteBatchTest, PutNotImplemented) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Put(k1, v1)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, DeleteNotImplemented) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Delete(Slice("k2")));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Delete(k2)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, SingleDeleteNotImplemented) {
+ WriteBatch batch;
+ ASSERT_OK(batch.SingleDelete(Slice("k2")));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("SingleDelete(k2)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, MergeNotImplemented) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ("Merge(foo, bar)@0", PrintContents(&batch));
+
+ WriteBatch::Handler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+}
+
+TEST_F(WriteBatchTest, MergeWithoutOperatorInsertionFailure) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+ ASSERT_EQ(1u, batch.Count());
+ ASSERT_EQ(
+ "Invalid argument: Merge requires `ColumnFamilyOptions::merge_operator "
+ "!= nullptr`",
+ PrintContents(&batch, false /* merge_operator_supported */));
+}
+
+TEST_F(WriteBatchTest, Blob) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+ ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
+ ASSERT_OK(batch.Put(Slice("k3"), Slice("v3")));
+ ASSERT_OK(batch.PutLogData(Slice("blob1")));
+ ASSERT_OK(batch.Delete(Slice("k2")));
+ ASSERT_OK(batch.SingleDelete(Slice("k3")));
+ ASSERT_OK(batch.PutLogData(Slice("blob2")));
+ ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+ ASSERT_EQ(6u, batch.Count());
+ ASSERT_EQ(
+ "Merge(foo, bar)@5"
+ "Put(k1, v1)@0"
+ "Delete(k2)@3"
+ "Put(k2, v2)@1"
+ "SingleDelete(k3)@4"
+ "Put(k3, v3)@2",
+ PrintContents(&batch));
+
+ TestHandler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "Put(k3, v3)"
+ "LogData(blob1)"
+ "Delete(k2)"
+ "SingleDelete(k3)"
+ "LogData(blob2)"
+ "Merge(foo, bar)",
+ handler.seen);
+}
+
+TEST_F(WriteBatchTest, PrepareCommit) {
+ WriteBatch batch;
+ ASSERT_OK(WriteBatchInternal::InsertNoop(&batch));
+ ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+ ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
+ batch.SetSavePoint();
+ ASSERT_OK(WriteBatchInternal::MarkEndPrepare(&batch, Slice("xid1")));
+ Status s = batch.RollbackToSavePoint();
+ ASSERT_EQ(s, Status::NotFound());
+ ASSERT_OK(WriteBatchInternal::MarkCommit(&batch, Slice("xid1")));
+ ASSERT_OK(WriteBatchInternal::MarkRollback(&batch, Slice("xid1")));
+ ASSERT_EQ(2u, batch.Count());
+
+ TestHandler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(
+ "MarkBeginPrepare(false)"
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "MarkEndPrepare(xid1)"
+ "MarkCommit(xid1)"
+ "MarkRollback(xid1)",
+ handler.seen);
+}
+
+// The test requires more than 30GB of memory to run, with a single memory
+// allocation of more than 30GB. Not all platforms can run it, and it also
+// takes a long time, so it is disabled.
+TEST_F(WriteBatchTest, DISABLED_ManyUpdates) {
+ // Insert roughly 3.2 billion tiny (4-byte) keys and values, pushing the
+ // total batch size past 30GB.
+ static const size_t kKeyValueSize = 4u;
+ static const uint32_t kNumUpdates = uint32_t{3} << 30;
+ std::string raw(kKeyValueSize, 'A');
+ WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u);
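+ // (The constructor argument reserves kNumUpdates * 12 bytes, roughly 36GiB,
+ // up front in the batch's internal buffer.)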
+ char c = 'A';
+ for (uint32_t i = 0; i < kNumUpdates; i++) {
+ if (c > 'Z') {
+ c = 'A';
+ }
+ raw[0] = c;
+ raw[raw.length() - 1] = c;
+ c++;
+ ASSERT_OK(batch.Put(raw, raw));
+ }
+
+ ASSERT_EQ(kNumUpdates, batch.Count());
+
+ struct NoopHandler : public WriteBatch::Handler {
+ uint32_t num_seen = 0;
+ char expected_char = 'A';
+ Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value) override {
+ EXPECT_EQ(kKeyValueSize, key.size());
+ EXPECT_EQ(kKeyValueSize, value.size());
+ EXPECT_EQ(expected_char, key[0]);
+ EXPECT_EQ(expected_char, value[0]);
+ EXPECT_EQ(expected_char, key[kKeyValueSize - 1]);
+ EXPECT_EQ(expected_char, value[kKeyValueSize - 1]);
+ expected_char++;
+ if (expected_char > 'Z') {
+ expected_char = 'A';
+ }
+ ++num_seen;
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
+ bool Continue() override { return num_seen < kNumUpdates; }
+ } handler;
+
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(kNumUpdates, handler.num_seen);
+}
+
+// The test requires more than 18GB of memory to run, with a single memory
+// allocation of more than 12GB. Not all platforms can run it, so it is
+// disabled.
+TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) {
+ // Insert keys and values of 3GB each, pushing the total batch size to 12GB.
+ static const size_t kKeyValueSize = 3221225472u;
+ std::string raw(kKeyValueSize, 'A');
+ WriteBatch batch(size_t(12884901888ull + 1024u));
+ for (char i = 0; i < 2; i++) {
+ raw[0] = 'A' + i;
+ raw[raw.length() - 1] = 'A' - i;
+ ASSERT_OK(batch.Put(raw, raw));
+ }
+
+ ASSERT_EQ(2u, batch.Count());
+
+ struct NoopHandler : public WriteBatch::Handler {
+ int num_seen = 0;
+ Status PutCF(uint32_t /*column_family_id*/, const Slice& key,
+ const Slice& value) override {
+ EXPECT_EQ(kKeyValueSize, key.size());
+ EXPECT_EQ(kKeyValueSize, value.size());
+ EXPECT_EQ('A' + num_seen, key[0]);
+ EXPECT_EQ('A' + num_seen, value[0]);
+ EXPECT_EQ('A' - num_seen, key[kKeyValueSize - 1]);
+ EXPECT_EQ('A' - num_seen, value[kKeyValueSize - 1]);
+ ++num_seen;
+ return Status::OK();
+ }
+ Status DeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status SingleDeleteCF(uint32_t /*column_family_id*/,
+ const Slice& /*key*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ Status MergeCF(uint32_t /*column_family_id*/, const Slice& /*key*/,
+ const Slice& /*value*/) override {
+ ADD_FAILURE();
+ return Status::OK();
+ }
+ void LogData(const Slice& /*blob*/) override { ADD_FAILURE(); }
+ bool Continue() override { return num_seen < 2; }
+ } handler;
+
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(2, handler.num_seen);
+}
+
+TEST_F(WriteBatchTest, Continue) {
+ WriteBatch batch;
+
+ struct Handler : public TestHandler {
+ int num_seen = 0;
+ Status PutCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ ++num_seen;
+ return TestHandler::PutCF(column_family_id, key, value);
+ }
+ Status DeleteCF(uint32_t column_family_id, const Slice& key) override {
+ ++num_seen;
+ return TestHandler::DeleteCF(column_family_id, key);
+ }
+ Status SingleDeleteCF(uint32_t column_family_id,
+ const Slice& key) override {
+ ++num_seen;
+ return TestHandler::SingleDeleteCF(column_family_id, key);
+ }
+ Status MergeCF(uint32_t column_family_id, const Slice& key,
+ const Slice& value) override {
+ ++num_seen;
+ return TestHandler::MergeCF(column_family_id, key, value);
+ }
+ void LogData(const Slice& blob) override {
+ ++num_seen;
+ TestHandler::LogData(blob);
+ }
+ bool Continue() override { return num_seen < 5; }
+ } handler;
+
+ ASSERT_OK(batch.Put(Slice("k1"), Slice("v1")));
+ ASSERT_OK(batch.Put(Slice("k2"), Slice("v2")));
+ ASSERT_OK(batch.PutLogData(Slice("blob1")));
+ ASSERT_OK(batch.Delete(Slice("k1")));
+ ASSERT_OK(batch.SingleDelete(Slice("k2")));
+ ASSERT_OK(batch.PutLogData(Slice("blob2")));
+ ASSERT_OK(batch.Merge(Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(
+ "Put(k1, v1)"
+ "Put(k2, v2)"
+ "LogData(blob1)"
+ "Delete(k1)"
+ "SingleDelete(k2)",
+ handler.seen);
+}
+
+TEST_F(WriteBatchTest, PutGatherSlices) {
+ WriteBatch batch;
+ ASSERT_OK(batch.Put(Slice("foo"), Slice("bar")));
+
+ {
+ // Try a write where the key is one slice but the value is two
+ Slice key_slice("baz");
+ Slice value_slices[2] = {Slice("header"), Slice("payload")};
+ ASSERT_OK(
+ batch.Put(SliceParts(&key_slice, 1), SliceParts(value_slices, 2)));
+ }
+
+ {
+ // One where the key is composite but the value is a single slice
+ Slice key_slices[3] = {Slice("key"), Slice("part2"), Slice("part3")};
+ Slice value_slice("value");
+ ASSERT_OK(
+ batch.Put(SliceParts(key_slices, 3), SliceParts(&value_slice, 1)));
+ }
+
+ WriteBatchInternal::SetSequence(&batch, 100);
+ ASSERT_EQ(
+ "Put(baz, headerpayload)@101"
+ "Put(foo, bar)@100"
+ "Put(keypart2part3, value)@102",
+ PrintContents(&batch));
+ ASSERT_EQ(3u, batch.Count());
+}
+
+namespace {
+class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
+ public:
+ explicit ColumnFamilyHandleImplDummy(int id)
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
+ explicit ColumnFamilyHandleImplDummy(int id, const Comparator* ucmp)
+ : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr),
+ id_(id),
+ ucmp_(ucmp) {}
+ uint32_t GetID() const override { return id_; }
+ const Comparator* GetComparator() const override { return ucmp_; }
+
+ private:
+ uint32_t id_;
+ const Comparator* const ucmp_ = BytewiseComparator();
+};
+} // anonymous namespace
+
+TEST_F(WriteBatchTest, ColumnFamiliesBatchTest) {
+ WriteBatch batch;
+ ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+ ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2")));
+ ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8")));
+ ASSERT_OK(batch.Delete(&eight, Slice("eightfoo")));
+ ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo")));
+ ASSERT_OK(batch.DeleteRange(&two, Slice("3foo"), Slice("4foo")));
+ ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three")));
+ ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom")));
+
+ TestHandler handler;
+ ASSERT_OK(batch.Iterate(&handler));
+ ASSERT_EQ(
+ "Put(foo, bar)"
+ "PutCF(2, twofoo, bar2)"
+ "PutCF(8, eightfoo, bar8)"
+ "DeleteCF(8, eightfoo)"
+ "SingleDeleteCF(2, twofoo)"
+ "DeleteRangeCF(2, 3foo, 4foo)"
+ "MergeCF(3, threethree, 3three)"
+ "Put(foo, bar)"
+ "Merge(omom, nom)",
+ handler.seen);
+}
+
+#ifndef ROCKSDB_LITE
+TEST_F(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
+ WriteBatchWithIndex batch;
+ ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
+ ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Put(&two, Slice("twofoo"), Slice("bar2")));
+ ASSERT_OK(batch.Put(&eight, Slice("eightfoo"), Slice("bar8")));
+ ASSERT_OK(batch.Delete(&eight, Slice("eightfoo")));
+ ASSERT_OK(batch.SingleDelete(&two, Slice("twofoo")));
+ ASSERT_OK(batch.Merge(&three, Slice("threethree"), Slice("3three")));
+ ASSERT_OK(batch.Put(&zero, Slice("foo"), Slice("bar")));
+ ASSERT_OK(batch.Merge(Slice("omom"), Slice("nom")));
+
+ std::unique_ptr<WBWIIterator> iter;
+
+ iter.reset(batch.NewIterator(&eight));
+ iter->Seek("eightfoo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar8", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type);
+ ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator(&two));
+ iter->Seek("twofoo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar2", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kSingleDeleteRecord, iter->Entry().type);
+ ASSERT_EQ("twofoo", iter->Entry().key.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator());
+ iter->Seek("gggg");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+ ASSERT_EQ("omom", iter->Entry().key.ToString());
+ ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ iter.reset(batch.NewIterator(&zero));
+ iter->Seek("foo");
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("foo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
+ ASSERT_EQ("foo", iter->Entry().key.ToString());
+ ASSERT_EQ("bar", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(iter->Valid());
+ ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
+ ASSERT_EQ("omom", iter->Entry().key.ToString());
+ ASSERT_EQ("nom", iter->Entry().value.ToString());
+
+ iter->Next();
+ ASSERT_OK(iter->status());
+ ASSERT_TRUE(!iter->Valid());
+
+ TestHandler handler;
+ ASSERT_OK(batch.GetWriteBatch()->Iterate(&handler));
+ ASSERT_EQ(
+ "Put(foo, bar)"
+ "PutCF(2, twofoo, bar2)"
+ "PutCF(8, eightfoo, bar8)"
+ "DeleteCF(8, eightfoo)"
+ "SingleDeleteCF(2, twofoo)"
+ "MergeCF(3, threethree, 3three)"
+ "Put(foo, bar)"
+ "Merge(omom, nom)",
+ handler.seen);
+}
+#endif // !ROCKSDB_LITE
+
+TEST_F(WriteBatchTest, SavePointTest) {
+ Status s;
+ WriteBatch batch;
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.Put("A", "a"));
+ ASSERT_OK(batch.Put("B", "b"));
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.Put("C", "c"));
+ ASSERT_OK(batch.Delete("A"));
+ batch.SetSavePoint();
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@3"
+ "Put(A, a)@0"
+ "Put(B, b)@1"
+ "Put(C, c)@2",
+ PrintContents(&batch));
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Put(A, a)@0"
+ "Put(B, b)@1",
+ PrintContents(&batch));
+
+ ASSERT_OK(batch.Delete("A"));
+ ASSERT_OK(batch.Put("B", "bb"));
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ("", PrintContents(&batch));
+
+ s = batch.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch));
+
+ ASSERT_OK(batch.Put("D", "d"));
+ ASSERT_OK(batch.Delete("A"));
+
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.Put("A", "aaa"));
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ batch.SetSavePoint();
+
+ ASSERT_OK(batch.Put("D", "d"));
+ ASSERT_OK(batch.Delete("A"));
+
+ ASSERT_OK(batch.RollbackToSavePoint());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ s = batch.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ(
+ "Delete(A)@1"
+ "Put(D, d)@0",
+ PrintContents(&batch));
+
+ WriteBatch batch2;
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ ASSERT_OK(batch2.Delete("A"));
+ batch2.SetSavePoint();
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(A)@0", PrintContents(&batch2));
+
+ batch2.Clear();
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ batch2.SetSavePoint();
+
+ ASSERT_OK(batch2.Delete("B"));
+ ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+ batch2.SetSavePoint();
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(B)@0", PrintContents(&batch2));
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ s = batch2.RollbackToSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch2));
+
+ WriteBatch batch3;
+
+ s = batch3.PopSavePoint();
+ ASSERT_TRUE(s.IsNotFound());
+ ASSERT_EQ("", PrintContents(&batch3));
+
+ batch3.SetSavePoint();
+ ASSERT_OK(batch3.Delete("A"));
+
+ s = batch3.PopSavePoint();
+ ASSERT_OK(s);
+ ASSERT_EQ("Delete(A)@0", PrintContents(&batch3));
+}
+
+TEST_F(WriteBatchTest, MemoryLimitTest) {
+ Status s;
+ // The header size is 12 bytes, and each of the two Puts takes 8 bytes, which
+ // gives a total of 12 + 8 * 2 = 28 bytes.
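+ // (Rough breakdown, assuming the standard WriteBatch record encoding: each
+ // Put here is a 1-byte kTypeValue tag, a 1-byte varint key length, 1 byte of
+ // key, a 1-byte varint value length, and 4 bytes of value = 8 bytes; the
+ // 12-byte header is an 8-byte sequence number plus a 4-byte count.)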
+ WriteBatch batch(0, 28);
+
+ ASSERT_OK(batch.Put("a", "...."));
+ ASSERT_OK(batch.Put("b", "...."));
+ s = batch.Put("c", "....");
+ ASSERT_TRUE(s.IsMemoryLimit());
+}
+
+namespace {
+class TimestampChecker : public WriteBatch::Handler {
+ public:
+ explicit TimestampChecker(
+ std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps, Slice ts)
+ : cf_to_ucmps_(std::move(cf_to_ucmps)), timestamp_(std::move(ts)) {}
+ Status PutCF(uint32_t cf, const Slice& key, const Slice& /*value*/) override {
+ auto cf_iter = cf_to_ucmps_.find(cf);
+ if (cf_iter == cf_to_ucmps_.end()) {
+ return Status::Corruption();
+ }
+ const Comparator* const ucmp = cf_iter->second;
+ assert(ucmp);
+ size_t ts_sz = ucmp->timestamp_size();
+ if (ts_sz == 0) {
+ return Status::OK();
+ }
+ if (key.size() < ts_sz) {
+ return Status::Corruption();
+ }
+ Slice ts = ExtractTimestampFromUserKey(key, ts_sz);
+ if (ts.compare(timestamp_) != 0) {
+ return Status::Corruption();
+ }
+ return Status::OK();
+ }
+
+ private:
+ std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps_;
+ Slice timestamp_;
+};
+
+Status CheckTimestampsInWriteBatch(
+ WriteBatch& wb, Slice timestamp,
+ std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps) {
+ TimestampChecker ts_checker(cf_to_ucmps, timestamp);
+ return wb.Iterate(&ts_checker);
+}
+} // anonymous namespace
+
+TEST_F(WriteBatchTest, SanityChecks) {
+ ColumnFamilyHandleImplDummy cf0(0,
+ test::BytewiseComparatorWithU64TsWrapper());
+ ColumnFamilyHandleImplDummy cf4(4);
+
+ WriteBatch wb(0, 0, 0, /*default_cf_ts_sz=*/sizeof(uint64_t));
+
+ // Sanity checks for the new WriteBatch APIs with extra 'ts' arg.
+ ASSERT_TRUE(wb.Put(nullptr, "key", "ts", "value").IsInvalidArgument());
+ ASSERT_TRUE(wb.Delete(nullptr, "key", "ts").IsInvalidArgument());
+ ASSERT_TRUE(wb.SingleDelete(nullptr, "key", "ts").IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge(nullptr, "key", "ts", "value").IsInvalidArgument());
+ ASSERT_TRUE(wb.DeleteRange(nullptr, "begin_key", "end_key", "ts")
+ .IsInvalidArgument());
+
+ ASSERT_TRUE(wb.Put(&cf4, "key", "ts", "value").IsInvalidArgument());
+ ASSERT_TRUE(wb.Delete(&cf4, "key", "ts").IsInvalidArgument());
+ ASSERT_TRUE(wb.SingleDelete(&cf4, "key", "ts").IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge(&cf4, "key", "ts", "value").IsInvalidArgument());
+ ASSERT_TRUE(
+ wb.DeleteRange(&cf4, "begin_key", "end_key", "ts").IsInvalidArgument());
+
+ constexpr size_t wrong_ts_sz = 1 + sizeof(uint64_t);
+ std::string ts(wrong_ts_sz, '\0');
+
+ ASSERT_TRUE(wb.Put(&cf0, "key", ts, "value").IsInvalidArgument());
+ ASSERT_TRUE(wb.Delete(&cf0, "key", ts).IsInvalidArgument());
+ ASSERT_TRUE(wb.SingleDelete(&cf0, "key", ts).IsInvalidArgument());
+ ASSERT_TRUE(wb.Merge(&cf0, "key", ts, "value").IsInvalidArgument());
+ ASSERT_TRUE(
+ wb.DeleteRange(&cf0, "begin_key", "end_key", ts).IsInvalidArgument());
+
+ // Sanity checks for the new WriteBatch APIs without extra 'ts' arg.
+ WriteBatch wb1(0, 0, 0, wrong_ts_sz);
+ ASSERT_TRUE(wb1.Put(&cf0, "key", "value").IsInvalidArgument());
+ ASSERT_TRUE(wb1.Delete(&cf0, "key").IsInvalidArgument());
+ ASSERT_TRUE(wb1.SingleDelete(&cf0, "key").IsInvalidArgument());
+ ASSERT_TRUE(wb1.Merge(&cf0, "key", "value").IsInvalidArgument());
+ ASSERT_TRUE(
+ wb1.DeleteRange(&cf0, "begin_key", "end_key").IsInvalidArgument());
+}
+
+TEST_F(WriteBatchTest, UpdateTimestamps) {
+ // We assume the last eight bytes of each key are reserved for the timestamp.
+ // Therefore, we must make sure each key is longer than eight bytes.
+ constexpr size_t key_size = 16;
+ constexpr size_t num_of_keys = 10;
+ std::vector<std::string> key_strs(num_of_keys, std::string(key_size, '\0'));
+
+ ColumnFamilyHandleImplDummy cf0(0);
+ ColumnFamilyHandleImplDummy cf4(4,
+ test::BytewiseComparatorWithU64TsWrapper());
+ ColumnFamilyHandleImplDummy cf5(5,
+ test::BytewiseComparatorWithU64TsWrapper());
+
+ const std::unordered_map<uint32_t, const Comparator*> cf_to_ucmps = {
+ {0, cf0.GetComparator()},
+ {4, cf4.GetComparator()},
+ {5, cf5.GetComparator()}};
+
+ static constexpr size_t timestamp_size = sizeof(uint64_t);
+
+ {
+ WriteBatch wb1, wb2, wb3, wb4, wb5, wb6, wb7;
+ ASSERT_OK(wb1.Put(&cf0, "key", "value"));
+ ASSERT_FALSE(WriteBatchInternal::HasKeyWithTimestamp(wb1));
+ ASSERT_OK(wb2.Put(&cf4, "key", "value"));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb2));
+ ASSERT_OK(wb3.Put(&cf4, "key", /*ts=*/std::string(timestamp_size, '\xfe'),
+ "value"));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb3));
+ ASSERT_OK(wb4.Delete(&cf4, "key",
+ /*ts=*/std::string(timestamp_size, '\xfe')));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb4));
+ ASSERT_OK(wb5.Delete(&cf4, "key"));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb5));
+ ASSERT_OK(wb6.SingleDelete(&cf4, "key"));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb6));
+ ASSERT_OK(wb7.SingleDelete(&cf4, "key",
+ /*ts=*/std::string(timestamp_size, '\xfe')));
+ ASSERT_TRUE(WriteBatchInternal::HasKeyWithTimestamp(wb7));
+ }
+
+ WriteBatch batch;
+ // Write to the batch. We will assign timestamps later.
+ for (const auto& key_str : key_strs) {
+ ASSERT_OK(batch.Put(&cf0, key_str, "value"));
+ ASSERT_OK(batch.Put(&cf4, key_str, "value"));
+ ASSERT_OK(batch.Put(&cf5, key_str, "value"));
+ }
+
+ const auto checker1 = [](uint32_t cf) {
+ if (cf == 4 || cf == 5) {
+ return timestamp_size;
+ } else if (cf == 0) {
+ return static_cast<size_t>(0);
+ } else {
+ return std::numeric_limits<size_t>::max();
+ }
+ };
+ ASSERT_OK(
+ batch.UpdateTimestamps(std::string(timestamp_size, '\xfe'), checker1));
+ ASSERT_OK(CheckTimestampsInWriteBatch(
+ batch, std::string(timestamp_size, '\xfe'), cf_to_ucmps));
+
+ // We use indexed_cf_to_ucmps, non_indexed_cfs_with_ts and timestamp_size to
+ // simulate the case in which a transaction enables indexing for some writes
+ // while disabling indexing for others. A transaction uses a
+ // WriteBatchWithIndex object to buffer writes (we consider the
+ // write-committed policy only). If indexing is enabled, writes go through the
+ // WriteBatchWithIndex API, which populates a WBWI-internal data structure,
+ // i.e. a mapping from cf to user comparator. If indexing is disabled, the
+ // transaction writes directly to the underlying raw WriteBatch, so we must
+ // separately track the comparator information for the column families that
+ // receive un-indexed writes. When calling the UpdateTimestamps API of
+ // WriteBatch, we need indexed_cf_to_ucmps, non_indexed_cfs_with_ts, and
+ // timestamp_size to perform the checking.
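+ // In this test, cf0 and cf4 play the role of indexed column families (their
+ // comparators are in indexed_cf_to_ucmps), while cf5 receives the un-indexed
+ // writes, so its timestamp size has to be supplied via
+ // non_indexed_cfs_with_ts.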
+ std::unordered_map<uint32_t, const Comparator*> indexed_cf_to_ucmps = {
+ {0, cf0.GetComparator()}, {4, cf4.GetComparator()}};
+ std::unordered_set<uint32_t> non_indexed_cfs_with_ts = {cf5.GetID()};
+ const auto checker2 = [&indexed_cf_to_ucmps,
+ &non_indexed_cfs_with_ts](uint32_t cf) {
+ if (non_indexed_cfs_with_ts.count(cf) > 0) {
+ return timestamp_size;
+ }
+ auto cf_iter = indexed_cf_to_ucmps.find(cf);
+ if (cf_iter == indexed_cf_to_ucmps.end()) {
+ assert(false);
+ return std::numeric_limits<size_t>::max();
+ }
+ const Comparator* const ucmp = cf_iter->second;
+ assert(ucmp);
+ return ucmp->timestamp_size();
+ };
+ ASSERT_OK(
+ batch.UpdateTimestamps(std::string(timestamp_size, '\xef'), checker2));
+ ASSERT_OK(CheckTimestampsInWriteBatch(
+ batch, std::string(timestamp_size, '\xef'), cf_to_ucmps));
+}
+
+TEST_F(WriteBatchTest, CommitWithTimestamp) {
+ WriteBatch wb;
+ const std::string txn_name = "xid1";
+ std::string ts;
+ constexpr uint64_t commit_ts = 23;
+ PutFixed64(&ts, commit_ts);
+ ASSERT_OK(WriteBatchInternal::MarkCommitWithTimestamp(&wb, txn_name, ts));
+ TestHandler handler;
+ ASSERT_OK(wb.Iterate(&handler));
+ ASSERT_EQ("MarkCommitWithTimestamp(" + txn_name + ", " +
+ Slice(ts).ToString(true) + ")",
+ handler.seen);
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_callback.h b/src/rocksdb/db/write_callback.h
new file mode 100644
index 000000000..106d02041
--- /dev/null
+++ b/src/rocksdb/db/write_callback.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "rocksdb/status.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DB;
+
+class WriteCallback {
+ public:
+ virtual ~WriteCallback() {}
+
+ // Will be called while on the write thread before the write executes. If
+ // this function returns a non-OK status, the write will be aborted and this
+ // status will be returned to the caller of DB::Write().
+ virtual Status Callback(DB* db) = 0;
+
+ // Return true if writes with this callback can be batched with other writes.
+ virtual bool AllowWriteBatching() = 0;
+};
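+
+// Illustrative sketch only (not part of this header): a WriteCallback that
+// rejects the write unless an application-level flag is still set.
+//
+//   class AbortIfClosing : public WriteCallback {
+//    public:
+//     explicit AbortIfClosing(std::atomic<bool>* open) : open_(open) {}
+//     Status Callback(DB* /*db*/) override {
+//       return open_->load() ? Status::OK() : Status::Busy("db closing");
+//     }
+//     bool AllowWriteBatching() override { return true; }
+//
+//    private:
+//     std::atomic<bool>* open_;  // assumed to outlive the callback
+//   };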
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_callback_test.cc b/src/rocksdb/db/write_callback_test.cc
new file mode 100644
index 000000000..e6ebaae08
--- /dev/null
+++ b/src/rocksdb/db/write_callback_test.cc
@@ -0,0 +1,465 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#ifndef ROCKSDB_LITE
+
+#include "db/write_callback.h"
+
+#include <atomic>
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "db/db_impl/db_impl.h"
+#include "port/port.h"
+#include "rocksdb/db.h"
+#include "rocksdb/write_batch.h"
+#include "test_util/sync_point.h"
+#include "test_util/testharness.h"
+#include "util/random.h"
+
+using std::string;
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteCallbackTest : public testing::Test {
+ public:
+ string dbname;
+
+ WriteCallbackTest() {
+ dbname = test::PerThreadDBPath("write_callback_testdb");
+ }
+};
+
+class WriteCallbackTestWriteCallback1 : public WriteCallback {
+ public:
+ bool was_called = false;
+
+ Status Callback(DB* db) override {
+ was_called = true;
+
+ // Make sure db is a DBImpl
+ DBImpl* db_impl = dynamic_cast<DBImpl*>(db);
+ if (db_impl == nullptr) {
+ return Status::InvalidArgument("");
+ }
+
+ return Status::OK();
+ }
+
+ bool AllowWriteBatching() override { return true; }
+};
+
+class WriteCallbackTestWriteCallback2 : public WriteCallback {
+ public:
+ Status Callback(DB* /*db*/) override { return Status::Busy(); }
+ bool AllowWriteBatching() override { return true; }
+};
+
+class MockWriteCallback : public WriteCallback {
+ public:
+ bool should_fail_ = false;
+ bool allow_batching_ = false;
+ std::atomic<bool> was_called_{false};
+
+ MockWriteCallback() {}
+
+ MockWriteCallback(const MockWriteCallback& other) {
+ should_fail_ = other.should_fail_;
+ allow_batching_ = other.allow_batching_;
+ was_called_.store(other.was_called_.load());
+ }
+
+ Status Callback(DB* /*db*/) override {
+ was_called_.store(true);
+ if (should_fail_) {
+ return Status::Busy();
+ } else {
+ return Status::OK();
+ }
+ }
+
+ bool AllowWriteBatching() override { return allow_batching_; }
+};
+
+#if !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+class WriteCallbackPTest
+ : public WriteCallbackTest,
+ public ::testing::WithParamInterface<
+ std::tuple<bool, bool, bool, bool, bool, bool, bool>> {
+ public:
+ WriteCallbackPTest() {
+ std::tie(unordered_write_, seq_per_batch_, two_queues_, allow_parallel_,
+ allow_batching_, enable_WAL_, enable_pipelined_write_) =
+ GetParam();
+ }
+
+ protected:
+ bool unordered_write_;
+ bool seq_per_batch_;
+ bool two_queues_;
+ bool allow_parallel_;
+ bool allow_batching_;
+ bool enable_WAL_;
+ bool enable_pipelined_write_;
+};
+
+TEST_P(WriteCallbackPTest, WriteWithCallbackTest) {
+ struct WriteOP {
+ WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; }
+
+ void Put(const string& key, const string& val) {
+ kvs_.push_back(std::make_pair(key, val));
+ ASSERT_OK(write_batch_.Put(key, val));
+ }
+
+ void Clear() {
+ kvs_.clear();
+ write_batch_.Clear();
+ callback_.was_called_.store(false);
+ }
+
+ MockWriteCallback callback_;
+ WriteBatch write_batch_;
+ std::vector<std::pair<string, string>> kvs_;
+ };
+
+ // In each scenario we'll launch multiple threads to write.
+ // The size of each array equals the number of threads, and
+ // each boolean in it denotes whether the callback of the
+ // corresponding thread should succeed or fail.
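+ // For example, {true, false} launches two writer threads: the first thread's
+ // callback fails (returns Status::Busy()), while the second succeeds.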
+ std::vector<std::vector<WriteOP>> write_scenarios = {
+ {true},
+ {false},
+ {false, false},
+ {true, true},
+ {true, false},
+ {false, true},
+ {false, false, false},
+ {true, true, true},
+ {false, true, false},
+ {true, false, true},
+ {true, false, false, false, false},
+ {false, false, false, false, true},
+ {false, false, true, false, true},
+ };
+
+ for (auto& write_group : write_scenarios) {
+ Options options;
+ options.create_if_missing = true;
+ options.unordered_write = unordered_write_;
+ options.allow_concurrent_memtable_write = allow_parallel_;
+ options.enable_pipelined_write = enable_pipelined_write_;
+ options.two_write_queues = two_queues_;
+ // Skip unsupported combinations
+ if (options.enable_pipelined_write && seq_per_batch_) {
+ continue;
+ }
+ if (options.enable_pipelined_write && options.two_write_queues) {
+ continue;
+ }
+ if (options.unordered_write && !options.allow_concurrent_memtable_write) {
+ continue;
+ }
+ if (options.unordered_write && options.enable_pipelined_write) {
+ continue;
+ }
+
+ ReadOptions read_options;
+ DB* db;
+ DBImpl* db_impl;
+
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ DBOptions db_options(options);
+ ColumnFamilyOptions cf_options(options);
+ std::vector<ColumnFamilyDescriptor> column_families;
+ column_families.push_back(
+ ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
+ std::vector<ColumnFamilyHandle*> handles;
+ auto open_s = DBImpl::Open(db_options, dbname, column_families, &handles,
+ &db, seq_per_batch_, true /* batch_per_txn */);
+ ASSERT_OK(open_s);
+ assert(handles.size() == 1);
+ delete handles[0];
+
+ db_impl = dynamic_cast<DBImpl*>(db);
+ ASSERT_TRUE(db_impl);
+
+ // Writers that have called JoinBatchGroup.
+ std::atomic<uint64_t> threads_joining(0);
+ // Writers that have linked to the queue
+ std::atomic<uint64_t> threads_linked(0);
+ // Writers that pass WriteThread::JoinBatchGroup:Wait sync-point.
+ std::atomic<uint64_t> threads_verified(0);
+
+ std::atomic<uint64_t> seq(db_impl->GetLatestSequenceNumber());
+ ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0);
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Start", [&](void*) {
+ uint64_t cur_threads_joining = threads_joining.fetch_add(1);
+ // Wait for the last joined writer to link to the queue.
+ // In this way the writers link to the queue one by one.
+ // This allows us to confidently detect the first writer
+ // who increases threads_linked as the leader.
+ while (threads_linked.load() < cur_threads_joining) {
+ }
+ });
+
+ // Verification once writers call JoinBatchGroup.
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:Wait", [&](void* arg) {
+ uint64_t cur_threads_linked = threads_linked.fetch_add(1);
+ bool is_leader = false;
+ bool is_last = false;
+
+ // who am i
+ is_leader = (cur_threads_linked == 0);
+ is_last = (cur_threads_linked == write_group.size() - 1);
+
+ // check my state
+ auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+ if (is_leader) {
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_GROUP_LEADER);
+ } else {
+ ASSERT_TRUE(writer->state == WriteThread::State::STATE_INIT);
+ }
+
+ // (meta test) the first WriteOP should indeed be the first
+ // and the last should be the last (all others can be out of
+ // order)
+ if (is_leader) {
+ ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+ !write_group.front().callback_.should_fail_);
+ } else if (is_last) {
+ ASSERT_TRUE(writer->callback->Callback(nullptr).ok() ==
+ !write_group.back().callback_.should_fail_);
+ }
+
+ threads_verified.fetch_add(1);
+ // Wait here until all verification in this sync-point
+ // callback finish for all writers.
+ while (threads_verified.load() < write_group.size()) {
+ }
+ });
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+ "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) {
+ // check my state
+ auto* writer = reinterpret_cast<WriteThread::Writer*>(arg);
+
+ if (!allow_batching_) {
+ // no batching so everyone should be a leader
+ ASSERT_TRUE(writer->state ==
+ WriteThread::State::STATE_GROUP_LEADER);
+ } else if (!allow_parallel_) {
+ ASSERT_TRUE(writer->state == WriteThread::State::STATE_COMPLETED ||
+ (enable_pipelined_write_ &&
+ writer->state ==
+ WriteThread::State::STATE_MEMTABLE_WRITER_LEADER));
+ }
+ });
+
+ std::atomic<uint32_t> thread_num(0);
+ std::atomic<char> dummy_key(0);
+
+ // Each write thread creates a random write batch and writes it to the DB
+ // with a write callback.
+ std::function<void()> write_with_callback_func = [&]() {
+ uint32_t i = thread_num.fetch_add(1);
+ Random rnd(i);
+
+ // leaders gotta lead
+ while (i > 0 && threads_verified.load() < 1) {
+ }
+
+ // loser has to lose
+ while (i == write_group.size() - 1 &&
+ threads_verified.load() < write_group.size() - 1) {
+ }
+
+ auto& write_op = write_group.at(i);
+ write_op.Clear();
+ write_op.callback_.allow_batching_ = allow_batching_;
+
+ // insert some keys
+ for (uint32_t j = 0; j < rnd.Next() % 50; j++) {
+ // grab unique key
+ char my_key = dummy_key.fetch_add(1);
+
+ string skey(5, my_key);
+ string sval(10, my_key);
+ write_op.Put(skey, sval);
+
+ if (!write_op.callback_.should_fail_ && !seq_per_batch_) {
+ seq.fetch_add(1);
+ }
+ }
+ if (!write_op.callback_.should_fail_ && seq_per_batch_) {
+ seq.fetch_add(1);
+ }
+
+ WriteOptions woptions;
+ woptions.disableWAL = !enable_WAL_;
+ woptions.sync = enable_WAL_;
+ if (woptions.protection_bytes_per_key > 0) {
+ ASSERT_OK(WriteBatchInternal::UpdateProtectionInfo(
+ &write_op.write_batch_, woptions.protection_bytes_per_key));
+ }
+ Status s;
+ if (seq_per_batch_) {
+ class PublishSeqCallback : public PreReleaseCallback {
+ public:
+ PublishSeqCallback(DBImpl* db_impl_in) : db_impl_(db_impl_in) {}
+ Status Callback(SequenceNumber last_seq, bool /*not used*/, uint64_t,
+ size_t /*index*/, size_t /*total*/) override {
+ db_impl_->SetLastPublishedSequence(last_seq);
+ return Status::OK();
+ }
+ DBImpl* db_impl_;
+ } publish_seq_callback(db_impl);
+ // seq_per_batch_ requires a natural batch separator or Noop
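+ // (InsertNoop appends a kTypeNoop record, which serves as that separator.)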
+ ASSERT_OK(WriteBatchInternal::InsertNoop(&write_op.write_batch_));
+ const size_t ONE_BATCH = 1;
+ s = db_impl->WriteImpl(woptions, &write_op.write_batch_,
+ &write_op.callback_, nullptr, 0, false, nullptr,
+ ONE_BATCH,
+ two_queues_ ? &publish_seq_callback : nullptr);
+ } else {
+ s = db_impl->WriteWithCallback(woptions, &write_op.write_batch_,
+ &write_op.callback_);
+ }
+
+ if (write_op.callback_.should_fail_) {
+ ASSERT_TRUE(s.IsBusy());
+ } else {
+ ASSERT_OK(s);
+ }
+ };
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+
+ // do all the writes
+ std::vector<port::Thread> threads;
+ for (uint32_t i = 0; i < write_group.size(); i++) {
+ threads.emplace_back(write_with_callback_func);
+ }
+ for (auto& t : threads) {
+ t.join();
+ }
+
+ ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
+
+ // check for keys
+ string value;
+ for (auto& w : write_group) {
+ ASSERT_TRUE(w.callback_.was_called_.load());
+ for (auto& kvp : w.kvs_) {
+ if (w.callback_.should_fail_) {
+ ASSERT_TRUE(db->Get(read_options, kvp.first, &value).IsNotFound());
+ } else {
+ ASSERT_OK(db->Get(read_options, kvp.first, &value));
+ ASSERT_EQ(value, kvp.second);
+ }
+ }
+ }
+
+ ASSERT_EQ(seq.load(), db_impl->TEST_GetLastVisibleSequence());
+
+ delete db;
+ ASSERT_OK(DestroyDB(dbname, options));
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(WriteCallbackPTest, WriteCallbackPTest,
+ ::testing::Combine(::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool(), ::testing::Bool(),
+ ::testing::Bool()));
+#endif // !defined(ROCKSDB_VALGRIND_RUN) || defined(ROCKSDB_FULL_VALGRIND_RUN)
+
+TEST_F(WriteCallbackTest, WriteCallBackTest) {
+ Options options;
+ WriteOptions write_options;
+ ReadOptions read_options;
+ string value;
+ DB* db;
+ DBImpl* db_impl;
+
+ ASSERT_OK(DestroyDB(dbname, options));
+
+ options.create_if_missing = true;
+ Status s = DB::Open(options, dbname, &db);
+ ASSERT_OK(s);
+
+ db_impl = dynamic_cast<DBImpl*>(db);
+ ASSERT_TRUE(db_impl);
+
+ WriteBatch wb;
+
+ ASSERT_OK(wb.Put("a", "value.a"));
+ ASSERT_OK(wb.Delete("x"));
+
+ // Test a simple Write
+ s = db->Write(write_options, &wb);
+ ASSERT_OK(s);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a", value);
+
+ // Test WriteWithCallback
+ WriteCallbackTestWriteCallback1 callback1;
+ WriteBatch wb2;
+
+ ASSERT_OK(wb2.Put("a", "value.a2"));
+
+ s = db_impl->WriteWithCallback(write_options, &wb2, &callback1);
+ ASSERT_OK(s);
+ ASSERT_TRUE(callback1.was_called);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a2", value);
+
+ // Test WriteWithCallback for a callback that fails
+ WriteCallbackTestWriteCallback2 callback2;
+ WriteBatch wb3;
+
+ ASSERT_OK(wb3.Put("a", "value.a3"));
+
+ s = db_impl->WriteWithCallback(write_options, &wb3, &callback2);
+ ASSERT_NOK(s);
+
+ s = db->Get(read_options, "a", &value);
+ ASSERT_OK(s);
+ ASSERT_EQ("value.a2", value);
+
+ delete db;
+ ASSERT_OK(DestroyDB(dbname, options));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
+
+#else
+#include <stdio.h>
+
+int main(int /*argc*/, char** /*argv*/) {
+ fprintf(stderr,
+ "SKIPPED as WriteWithCallback is not supported in ROCKSDB_LITE\n");
+ return 0;
+}
+
+#endif // !ROCKSDB_LITE
diff --git a/src/rocksdb/db/write_controller.cc b/src/rocksdb/db/write_controller.cc
new file mode 100644
index 000000000..c5f744375
--- /dev/null
+++ b/src/rocksdb/db/write_controller.cc
@@ -0,0 +1,121 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_controller.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <ratio>
+
+#include "rocksdb/system_clock.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
+ ++total_stopped_;
+ return std::unique_ptr<WriteControllerToken>(new StopWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken(
+ uint64_t write_rate) {
+ if (0 == total_delayed_++) {
+ // Starting delay, so reset counters.
+ next_refill_time_ = 0;
+ credit_in_bytes_ = 0;
+ }
+ // NOTE: for simplicity, any current credit_in_bytes_ or "debt" in
+ // next_refill_time_ will be based on an old rate. This rate will apply
+ // for subsequent additional debts and for the next refill.
+ set_delayed_write_rate(write_rate);
+ return std::unique_ptr<WriteControllerToken>(new DelayWriteToken(this));
+}
+
+std::unique_ptr<WriteControllerToken>
+WriteController::GetCompactionPressureToken() {
+ ++total_compaction_pressure_;
+ return std::unique_ptr<WriteControllerToken>(
+ new CompactionPressureToken(this));
+}
+
+bool WriteController::IsStopped() const {
+ return total_stopped_.load(std::memory_order_relaxed) > 0;
+}
+// This is called while holding the DB mutex, so we can't sleep and need to
+// minimize how often we read the time.
+// If it turns out to be a performance issue, we can redesign the thread
+// synchronization model here.
+// The function trusts that the caller will sleep for the number of
+// microseconds it returns.
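+// Worked example (sketch): with delayed_write_rate_ = 10MB/s, no accumulated
+// credit, and a 20MB write, the returned delay is roughly 2 seconds; the
+// "debt" is recorded by pushing next_refill_time_ that far into the future.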
+uint64_t WriteController::GetDelay(SystemClock* clock, uint64_t num_bytes) {
+ if (total_stopped_.load(std::memory_order_relaxed) > 0) {
+ return 0;
+ }
+ if (total_delayed_.load(std::memory_order_relaxed) == 0) {
+ return 0;
+ }
+
+ if (credit_in_bytes_ >= num_bytes) {
+ credit_in_bytes_ -= num_bytes;
+ return 0;
+ }
+ // As a result, time is read inside the DB mutex less than once per refill
+ // interval.
+ auto time_now = NowMicrosMonotonic(clock);
+
+ const uint64_t kMicrosPerSecond = 1000000;
+ // Refill every 1 ms
+ const uint64_t kMicrosPerRefill = 1000;
+
+ if (next_refill_time_ == 0) {
+ // Start with an initial allotment of bytes for one interval
+ next_refill_time_ = time_now;
+ }
+ if (next_refill_time_ <= time_now) {
+ // Refill based on time interval plus any extra elapsed
+ uint64_t elapsed = time_now - next_refill_time_ + kMicrosPerRefill;
+ credit_in_bytes_ += static_cast<uint64_t>(
+ 1.0 * elapsed / kMicrosPerSecond * delayed_write_rate_ + 0.999999);
+ next_refill_time_ = time_now + kMicrosPerRefill;
+
+ if (credit_in_bytes_ >= num_bytes) {
+ // Avoid delay if possible, to reduce DB mutex release & re-acquire.
+ credit_in_bytes_ -= num_bytes;
+ return 0;
+ }
+ }
+
+ // We need to delay to avoid exceeding write rate.
+ assert(num_bytes > credit_in_bytes_);
+ uint64_t bytes_over_budget = num_bytes - credit_in_bytes_;
+ uint64_t needed_delay = static_cast<uint64_t>(
+ 1.0 * bytes_over_budget / delayed_write_rate_ * kMicrosPerSecond);
+
+ credit_in_bytes_ = 0;
+ next_refill_time_ += needed_delay;
+
+ // Enforce a minimum delay of one refill interval to reduce DB mutex
+ // contention.
+ return std::max(next_refill_time_ - time_now, kMicrosPerRefill);
+}
+
+uint64_t WriteController::NowMicrosMonotonic(SystemClock* clock) {
+ return clock->NowNanos() / std::milli::den;
+}
+
+StopWriteToken::~StopWriteToken() {
+ assert(controller_->total_stopped_ >= 1);
+ --controller_->total_stopped_;
+}
+
+DelayWriteToken::~DelayWriteToken() {
+ controller_->total_delayed_--;
+ assert(controller_->total_delayed_.load() >= 0);
+}
+
+CompactionPressureToken::~CompactionPressureToken() {
+ controller_->total_compaction_pressure_--;
+ assert(controller_->total_compaction_pressure_ >= 0);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller.h b/src/rocksdb/db/write_controller.h
new file mode 100644
index 000000000..bcead165b
--- /dev/null
+++ b/src/rocksdb/db/write_controller.h
@@ -0,0 +1,148 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <stdint.h>
+
+#include <atomic>
+#include <memory>
+
+#include "rocksdb/rate_limiter.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class SystemClock;
+class WriteControllerToken;
+
+// WriteController controls write stalls in our write code-path. Write stalls
+// happen when compaction can't keep up with the write rate.
+// All of the methods here (including WriteControllerToken's destructors) need
+// to be called while holding the DB mutex.
+class WriteController {
+ public:
+ explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u,
+ int64_t low_pri_rate_bytes_per_sec = 1024 * 1024)
+ : total_stopped_(0),
+ total_delayed_(0),
+ total_compaction_pressure_(0),
+ credit_in_bytes_(0),
+ next_refill_time_(0),
+ low_pri_rate_limiter_(
+ NewGenericRateLimiter(low_pri_rate_bytes_per_sec)) {
+ set_max_delayed_write_rate(_delayed_write_rate);
+ }
+ ~WriteController() = default;
+
+ // When an actor (column family) requests a stop token, all writes will be
+ // stopped until the stop token is released (deleted)
+ std::unique_ptr<WriteControllerToken> GetStopToken();
+ // When an actor (column family) requests a delay token, the total write rate
+ // to the DB will be kept under the delayed write rate. Every write needs to
+ // call GetDelay() with the number of bytes it is writing to the DB, and the
+ // call returns the number of microseconds to sleep.
+ std::unique_ptr<WriteControllerToken> GetDelayToken(
+ uint64_t delayed_write_rate);
+ // When an actor (column family) requests a compaction pressure token, the
+ // number of compaction threads will be increased.
+ std::unique_ptr<WriteControllerToken> GetCompactionPressureToken();
+
+ // These three methods query the state of the WriteController.
+ bool IsStopped() const;
+ bool NeedsDelay() const { return total_delayed_.load() > 0; }
+ bool NeedSpeedupCompaction() const {
+ return IsStopped() || NeedsDelay() || total_compaction_pressure_.load() > 0;
+ }
+ // Returns how many microseconds the caller needs to sleep after the call.
+ // num_bytes: how many bytes are about to be written to the DB.
+ // Prerequisite: DB mutex held.
+ uint64_t GetDelay(SystemClock* clock, uint64_t num_bytes);
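+ // Typical use (sketch; in RocksDB the caller is the DB write path):
+ //   uint64_t micros = controller.GetDelay(clock, bytes_to_write);
+ //   if (micros > 0) { /* release the DB mutex, sleep, then re-acquire */ }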
+ void set_delayed_write_rate(uint64_t write_rate) {
+ // avoid division by zero
+ if (write_rate == 0) {
+ write_rate = 1u;
+ } else if (write_rate > max_delayed_write_rate()) {
+ write_rate = max_delayed_write_rate();
+ }
+ delayed_write_rate_ = write_rate;
+ }
+
+ void set_max_delayed_write_rate(uint64_t write_rate) {
+ // avoid division by zero
+ if (write_rate == 0) {
+ write_rate = 1u;
+ }
+ max_delayed_write_rate_ = write_rate;
+ // update delayed_write_rate_ as well
+ delayed_write_rate_ = write_rate;
+ }
+
+ uint64_t delayed_write_rate() const { return delayed_write_rate_; }
+
+ uint64_t max_delayed_write_rate() const { return max_delayed_write_rate_; }
+
+ RateLimiter* low_pri_rate_limiter() { return low_pri_rate_limiter_.get(); }
+
+ private:
+ uint64_t NowMicrosMonotonic(SystemClock* clock);
+
+ friend class WriteControllerToken;
+ friend class StopWriteToken;
+ friend class DelayWriteToken;
+ friend class CompactionPressureToken;
+
+ std::atomic<int> total_stopped_;
+ std::atomic<int> total_delayed_;
+ std::atomic<int> total_compaction_pressure_;
+
+ // Number of bytes allowed to write without delay
+ uint64_t credit_in_bytes_;
+ // Next time that we can add more credit of bytes
+ uint64_t next_refill_time_;
+ // Write rate set at initialization or changed by `DBImpl::SetDBOptions`
+ uint64_t max_delayed_write_rate_;
+ // Current write rate (bytes / second)
+ uint64_t delayed_write_rate_;
+
+ std::unique_ptr<RateLimiter> low_pri_rate_limiter_;
+};
+
+class WriteControllerToken {
+ public:
+ explicit WriteControllerToken(WriteController* controller)
+ : controller_(controller) {}
+ virtual ~WriteControllerToken() {}
+
+ protected:
+ WriteController* controller_;
+
+ private:
+ // no copying allowed
+ WriteControllerToken(const WriteControllerToken&) = delete;
+ void operator=(const WriteControllerToken&) = delete;
+};
+
+class StopWriteToken : public WriteControllerToken {
+ public:
+ explicit StopWriteToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~StopWriteToken();
+};
+
+class DelayWriteToken : public WriteControllerToken {
+ public:
+ explicit DelayWriteToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~DelayWriteToken();
+};
+
+class CompactionPressureToken : public WriteControllerToken {
+ public:
+ explicit CompactionPressureToken(WriteController* controller)
+ : WriteControllerToken(controller) {}
+ virtual ~CompactionPressureToken();
+};
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_controller_test.cc b/src/rocksdb/db/write_controller_test.cc
new file mode 100644
index 000000000..b6321a3bc
--- /dev/null
+++ b/src/rocksdb/db/write_controller_test.cc
@@ -0,0 +1,248 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+//
+#include "db/write_controller.h"
+
+#include <array>
+#include <ratio>
+
+#include "rocksdb/system_clock.h"
+#include "test_util/testharness.h"
+
+namespace ROCKSDB_NAMESPACE {
+namespace {
+class TimeSetClock : public SystemClockWrapper {
+ public:
+ explicit TimeSetClock() : SystemClockWrapper(nullptr) {}
+ const char* Name() const override { return "TimeSetClock"; }
+ uint64_t now_micros_ = 6666;
+ uint64_t NowNanos() override { return now_micros_ * std::milli::den; }
+};
+} // anonymous namespace
+class WriteControllerTest : public testing::Test {
+ public:
+ WriteControllerTest() { clock_ = std::make_shared<TimeSetClock>(); }
+ std::shared_ptr<TimeSetClock> clock_;
+};
+
+// Make tests easier to read
+#define MILLION *1000000u
+#define MB MILLION
+#define MBPS MILLION
+#define SECS MILLION // in microseconds
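+// For example, `40 MBPS` expands to 40 * 1000000u bytes per second, and
+// `2 SECS` to 2 * 1000000u microseconds.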
+
+TEST_F(WriteControllerTest, BasicAPI) {
+ WriteController controller(40 MBPS); // also set max delayed rate
+ EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+ EXPECT_FALSE(controller.IsStopped());
+ EXPECT_FALSE(controller.NeedsDelay());
+ EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+
+ // set, get
+ controller.set_delayed_write_rate(20 MBPS);
+ EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS);
+ EXPECT_FALSE(controller.IsStopped());
+ EXPECT_FALSE(controller.NeedsDelay());
+ EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+
+ {
+ // set with token, get
+ auto delay_token_0 = controller.GetDelayToken(10 MBPS);
+ EXPECT_EQ(controller.delayed_write_rate(), 10 MBPS);
+ EXPECT_FALSE(controller.IsStopped());
+ EXPECT_TRUE(controller.NeedsDelay());
+ // test with delay
+ EXPECT_EQ(2 SECS, controller.GetDelay(clock_.get(), 20 MB));
+ clock_->now_micros_ += 2 SECS; // pay the "debt"
+
+ auto delay_token_1 = controller.GetDelayToken(2 MBPS);
+ EXPECT_EQ(10 SECS, controller.GetDelay(clock_.get(), 20 MB));
+ clock_->now_micros_ += 10 SECS; // pay the "debt"
+
+ auto delay_token_2 = controller.GetDelayToken(1 MBPS);
+ EXPECT_EQ(20 SECS, controller.GetDelay(clock_.get(), 20 MB));
+ clock_->now_micros_ += 20 SECS; // pay the "debt"
+
+ auto delay_token_3 = controller.GetDelayToken(20 MBPS);
+ EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 20 MB));
+ clock_->now_micros_ += 1 SECS; // pay the "debt"
+
+ // 60M is more than the max rate of 40M. Max rate will be used.
+ EXPECT_EQ(controller.delayed_write_rate(), 20 MBPS);
+ auto delay_token_4 =
+ controller.GetDelayToken(controller.delayed_write_rate() * 3);
+ EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+ EXPECT_EQ(static_cast<uint64_t>(0.5 SECS),
+ controller.GetDelay(clock_.get(), 20 MB));
+
+ EXPECT_FALSE(controller.IsStopped());
+ EXPECT_TRUE(controller.NeedsDelay());
+
+ // Test stop tokens
+ {
+ auto stop_token_1 = controller.GetStopToken();
+ EXPECT_TRUE(controller.IsStopped());
+ EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+ {
+ auto stop_token_2 = controller.GetStopToken();
+ EXPECT_TRUE(controller.IsStopped());
+ EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+ }
+ EXPECT_TRUE(controller.IsStopped());
+ EXPECT_EQ(0, controller.GetDelay(clock_.get(), 100 MB));
+ }
+ // Stop tokens released
+ EXPECT_FALSE(controller.IsStopped());
+ EXPECT_TRUE(controller.NeedsDelay());
+ EXPECT_EQ(controller.delayed_write_rate(), 40 MBPS);
+ // pay the previous "debt"
+ clock_->now_micros_ += static_cast<uint64_t>(0.5 SECS);
+ EXPECT_EQ(1 SECS, controller.GetDelay(clock_.get(), 40 MB));
+ }
+
+ // Delay tokens released
+ EXPECT_FALSE(controller.NeedsDelay());
+}
+
+TEST_F(WriteControllerTest, StartFilled) {
+ WriteController controller(10 MBPS);
+
+ // Attempt to write two things that combined would be allowed within
+ // a single refill interval
+ auto delay_token_0 =
+ controller.GetDelayToken(controller.delayed_write_rate());
+
+ // Verify no delay because write rate has not been exceeded within
+ // refill interval.
+ EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+ EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+
+ // Allow refill (kMicrosPerRefill)
+ clock_->now_micros_ += 1000;
+
+ // Again
+ EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+ EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 2000u /*bytes*/));
+
+ // Control: something bigger that would exceed write rate within interval
+ uint64_t delay = controller.GetDelay(clock_.get(), 10 MB);
+ EXPECT_GT(1.0 * delay, 0.999 SECS);
+ EXPECT_LT(1.0 * delay, 1.001 SECS);
+}
+
+TEST_F(WriteControllerTest, DebtAccumulation) {
+ WriteController controller(10 MBPS);
+
+ std::array<std::unique_ptr<WriteControllerToken>, 10> tokens;
+
+ // Accumulate a time delay debt with no passage of time, like many column
+ // families delaying writes simultaneously. (Old versions of WriteController
+ // would reset the debt on every GetDelayToken.)
+ uint64_t debt = 0;
+ for (unsigned i = 0; i < tokens.size(); ++i) {
+ tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+ uint64_t delay = controller.GetDelay(clock_.get(), 63 MB);
+ ASSERT_GT(delay, debt);
+ uint64_t incremental = delay - debt;
+ ASSERT_EQ(incremental, (63 SECS) / (i + 1u));
+ debt += incremental;
+ }
+
+ // Pay down the debt
+ clock_->now_micros_ += debt;
+ debt = 0;
+
+ // Now accumulate debt with some passage of time.
+ for (unsigned i = 0; i < tokens.size(); ++i) {
+ // Debt is accumulated in time, not in bytes, so this new write
+ // limit is not applied to prior requested delays, even if they are
+ // in progress.
+ tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+ uint64_t delay = controller.GetDelay(clock_.get(), 63 MB);
+ ASSERT_GT(delay, debt);
+ uint64_t incremental = delay - debt;
+ ASSERT_EQ(incremental, (63 SECS) / (i + 1u));
+ debt += incremental;
+ uint64_t credit = debt / 2;
+ clock_->now_micros_ += credit;
+ debt -= credit;
+ }
+
+ // Pay down the debt
+ clock_->now_micros_ += debt;
+ debt = 0; // consistent state
+ (void)debt; // appease clang-analyze
+
+ // Verify paid down
+ EXPECT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+
+ // Accumulate another debt without paying it down, releasing tokens as we go
+ for (unsigned i = 0; i < tokens.size(); ++i) {
+ // Big and small are delayed
+ ASSERT_LT(0U, controller.GetDelay(clock_.get(), 63 MB));
+ ASSERT_LT(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+ tokens[i].reset();
+ }
+ // All tokens released.
+ // Verify that releasing all tokens pays down debt, even with no time passage.
+ tokens[0] = controller.GetDelayToken(1 MBPS);
+ ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 100u /*small bytes*/));
+}
+
+// This may or may not be a "good" feature, but it's an old feature
+TEST_F(WriteControllerTest, CreditAccumulation) {
+ WriteController controller(10 MBPS);
+
+ std::array<std::unique_ptr<WriteControllerToken>, 10> tokens;
+
+ // Ensure started
+ tokens[0] = controller.GetDelayToken(1 MBPS);
+ ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+ clock_->now_micros_ += 10 SECS;
+
+ // Accumulate a credit
+ uint64_t credit = 1000 SECS /* see below: * 1 MB / 1 SEC */;
+ clock_->now_micros_ += credit;
+
+ // Spend some credit (burst of I/O)
+ for (unsigned i = 0; i < tokens.size(); ++i) {
+ tokens[i] = controller.GetDelayToken((i + 1u) MBPS);
+ ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 63 MB));
+ // In WriteController, credit is accumulated in bytes, not in time.
+ // After an "unnecessary" delay, all of our time credit will be
+ // translated to bytes on the next operation, in this case with
+ // setting 1 MBPS. So regardless of the rate at delay time, we just
+ // account for the bytes.
+ credit -= 63 MB;
+ }
+ // Spend remaining credit
+ tokens[0] = controller.GetDelayToken(1 MBPS);
+ ASSERT_EQ(0U, controller.GetDelay(clock_.get(), credit));
+ // Verify
+ ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+ clock_->now_micros_ += 10 SECS;
+
+ // Accumulate a credit, no accounting
+ clock_->now_micros_ += 1000 SECS;
+
+ // Spend a small amount, releasing tokens
+ for (unsigned i = 0; i < tokens.size(); ++i) {
+ ASSERT_EQ(0U, controller.GetDelay(clock_.get(), 3 MB));
+ tokens[i].reset();
+ }
+
+ // All tokens released.
+ // Verify credit is wiped away on new delay.
+ tokens[0] = controller.GetDelayToken(1 MBPS);
+ ASSERT_EQ(10 SECS, controller.GetDelay(clock_.get(), 10 MB));
+}
+
+} // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+ ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/src/rocksdb/db/write_thread.cc b/src/rocksdb/db/write_thread.cc
new file mode 100644
index 000000000..cc8645f37
--- /dev/null
+++ b/src/rocksdb/db/write_thread.cc
@@ -0,0 +1,815 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/write_thread.h"
+
+#include <chrono>
+#include <thread>
+
+#include "db/column_family.h"
+#include "monitoring/perf_context_imp.h"
+#include "port/port.h"
+#include "test_util/sync_point.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+WriteThread::WriteThread(const ImmutableDBOptions& db_options)
+ : max_yield_usec_(db_options.enable_write_thread_adaptive_yield
+ ? db_options.write_thread_max_yield_usec
+ : 0),
+ slow_yield_usec_(db_options.write_thread_slow_yield_usec),
+ allow_concurrent_memtable_write_(
+ db_options.allow_concurrent_memtable_write),
+ enable_pipelined_write_(db_options.enable_pipelined_write),
+ max_write_batch_group_size_bytes(
+ db_options.max_write_batch_group_size_bytes),
+ newest_writer_(nullptr),
+ newest_memtable_writer_(nullptr),
+ last_sequence_(0),
+ write_stall_dummy_(),
+ stall_mu_(),
+ stall_cv_(&stall_mu_) {}
+
+uint8_t WriteThread::BlockingAwaitState(Writer* w, uint8_t goal_mask) {
+ // We're going to block. Lazily create the mutex. We guarantee
+ // propagation of this construction to the waker via the
+ // STATE_LOCKED_WAITING state. The waker won't try to touch the mutex
+ // or the condvar unless they CAS away the STATE_LOCKED_WAITING that
+ // we install below.
+ w->CreateMutex();
+
+ auto state = w->state.load(std::memory_order_acquire);
+ assert(state != STATE_LOCKED_WAITING);
+ if ((state & goal_mask) == 0 &&
+ w->state.compare_exchange_strong(state, STATE_LOCKED_WAITING)) {
+ // we have permission (and an obligation) to use StateMutex
+ std::unique_lock<std::mutex> guard(w->StateMutex());
+ w->StateCV().wait(guard, [w] {
+ return w->state.load(std::memory_order_relaxed) != STATE_LOCKED_WAITING;
+ });
+ state = w->state.load(std::memory_order_relaxed);
+ }
+ // else tricky. Goal is met or CAS failed. In the latter case the waker
+ // must have changed the state, and compare_exchange_strong has updated
+ // our local variable with the new one. At the moment WriteThread never
+ // waits for a transition across intermediate states, so we know that
+ // since a state change has occurred the goal must have been met.
+ assert((state & goal_mask) != 0);
+ return state;
+}
+
+uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
+ AdaptationContext* ctx) {
+ uint8_t state = 0;
+
+ // 1. Busy loop using "pause" for 1 micro sec
+ // 2. Else SOMETIMES busy loop using "yield" for 100 micro sec (default)
+ // 3. Else blocking wait
+
+ // On a modern Xeon each loop takes about 7 nanoseconds (most of which
+ // is the effect of the pause instruction), so 200 iterations is a bit
+ // more than a microsecond. This is long enough that waits longer than
+ // this can amortize the cost of accessing the clock and yielding.
+ for (uint32_t tries = 0; tries < 200; ++tries) {
+ state = w->state.load(std::memory_order_acquire);
+ if ((state & goal_mask) != 0) {
+ return state;
+ }
+ port::AsmVolatilePause();
+ }
+
+ // This is below the fast path, so that the stat is zero when all writes are
+ // from the same thread.
+ PERF_TIMER_GUARD(write_thread_wait_nanos);
+
+ // If we're only going to end up waiting a short period of time,
+ // it can be a lot more efficient to call std::this_thread::yield()
+ // in a loop than to block in StateMutex(). For reference, on my 4.0
+ // SELinux test server with support for syscall auditing enabled, the
+ // minimum latency between FUTEX_WAKE to returning from FUTEX_WAIT is
+ // 2.7 usec, and the average is more like 10 usec. That can be a big
+ // drag on RocksDB's single-writer design. Of course, spinning is a
+ // bad idea if other threads are waiting to run or if we're going to
+ // wait for a long time. How do we decide?
+ //
+ // We break waiting into 3 categories: short-uncontended,
+ // short-contended, and long. If we had an oracle, then we would always
+ // spin for short-uncontended, always block for long, and our choice for
+ // short-contended might depend on whether we were trying to optimize
+ // RocksDB throughput or avoid being greedy with system resources.
+ //
+ // Bucketing into short or long is easy by measuring elapsed time.
+ // Differentiating short-uncontended from short-contended is a bit
+ // trickier, but not too bad. We could look for involuntary context
+ // switches using getrusage(RUSAGE_THREAD, ..), but it's less work
+ // (portability code and CPU) to just look for yield calls that take
+ // longer than we expect. sched_yield() doesn't actually result in any
+ // context switch overhead if there are no other runnable processes
+ // on the current core, in which case it usually takes less than
+ // a microsecond.
+ //
+ // There are two primary tunables here: the threshold between "short"
+ // and "long" waits, and the threshold at which we suspect that a yield
+ // is slow enough to indicate we should probably block. If these
+ // thresholds are chosen well then CPU-bound workloads that don't
+ // have more threads than cores will experience few context switches
+ // (voluntary or involuntary), and the total number of context switches
+ // (voluntary and involuntary) will not be dramatically larger (maybe
+ // 2x) than the number of voluntary context switches that occur when
+ // --max_yield_wait_micros=0.
+ //
+ // There's another constant, which is the number of slow yields we will
+ // tolerate before reversing our previous decision. Solitary slow
+ // yields are pretty common (low-priority small jobs ready to run),
+ // so this should be at least 2. We set this conservatively to 3 so
+ // that we can also immediately schedule a ctx adaptation, rather than
+ // waiting for the next update_ctx.
+
+ const size_t kMaxSlowYieldsWhileSpinning = 3;
+
+ // Whether the yield approach has any credit in this context. The credit is
+ // added when a yield succeeds before timing out, and decreased otherwise.
+ auto& yield_credit = ctx->value;
+ // Update the yield_credit based on sample runs or right after a hard failure
+ bool update_ctx = false;
+ // Should we reinforce the yield credit
+ bool would_spin_again = false;
+ // The sampling base for updating the yield credit. The sampling rate is
+ // 1/sampling_base.
+ const int sampling_base = 256;
+
+ if (max_yield_usec_ > 0) {
+ update_ctx = Random::GetTLSInstance()->OneIn(sampling_base);
+
+ if (update_ctx || yield_credit.load(std::memory_order_relaxed) >= 0) {
+ // we're updating the adaptation statistics, or spinning has >
+ // 50% chance of being shorter than max_yield_usec_ and causing no
+ // involuntary context switches
+ auto spin_begin = std::chrono::steady_clock::now();
+
+ // this variable doesn't include the final yield (if any) that
+ // causes the goal to be met
+ size_t slow_yield_count = 0;
+
+ auto iter_begin = spin_begin;
+ while ((iter_begin - spin_begin) <=
+ std::chrono::microseconds(max_yield_usec_)) {
+ std::this_thread::yield();
+
+ state = w->state.load(std::memory_order_acquire);
+ if ((state & goal_mask) != 0) {
+ // success
+ would_spin_again = true;
+ break;
+ }
+
+ auto now = std::chrono::steady_clock::now();
+ if (now == iter_begin ||
+ now - iter_begin >= std::chrono::microseconds(slow_yield_usec_)) {
+ // conservatively count it as a slow yield if our clock isn't
+ // accurate enough to measure the yield duration
+ ++slow_yield_count;
+ if (slow_yield_count >= kMaxSlowYieldsWhileSpinning) {
+ // Not just one involuntary context switch, but several. Immediately
+ // update yield_credit and fall back to blocking
+ update_ctx = true;
+ break;
+ }
+ }
+ iter_begin = now;
+ }
+ }
+ }
+
+ if ((state & goal_mask) == 0) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::AwaitState:BlockingWaiting", w);
+ state = BlockingAwaitState(w, goal_mask);
+ }
+
+ if (update_ctx) {
+ // Since our update is sample based, it is ok if a thread overwrites the
+ // updates by other threads. Thus the update does not have to be atomic.
+ auto v = yield_credit.load(std::memory_order_relaxed);
+ // fixed point exponential decay with decay constant 1/1024, with +1
+ // and -1 scaled to avoid overflow for int32_t
+ //
+ // On each update the positive credit is decayed by a factor of 1/1024 (i.e.,
+ // 0.1%). If the sampled yield was successful, the credit is also increased
+ // by X. Setting X=2^17 ensures that the credit never exceeds
+ // 2^17*2^10=2^27, which is lower than 2^31, the upper bound of int32_t. The
+ // same logic applies to negative credits.
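+ // As a worked example: starting from v = 0, a single successful sampled
+ // yield moves the credit to +131072 (2^17); repeated successes converge
+ // toward 131072 * 1024 = 2^27, matching the bound described above.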
+ v = v - (v / 1024) + (would_spin_again ? 1 : -1) * 131072;
+ yield_credit.store(v, std::memory_order_relaxed);
+ }
+
+ assert((state & goal_mask) != 0);
+ return state;
+}
+
+void WriteThread::SetState(Writer* w, uint8_t new_state) {
+ assert(w);
+ auto state = w->state.load(std::memory_order_acquire);
+ if (state == STATE_LOCKED_WAITING ||
+ !w->state.compare_exchange_strong(state, new_state)) {
+ assert(state == STATE_LOCKED_WAITING);
+
+ std::lock_guard<std::mutex> guard(w->StateMutex());
+ assert(w->state.load(std::memory_order_relaxed) != new_state);
+ w->state.store(new_state, std::memory_order_relaxed);
+ w->StateCV().notify_one();
+ }
+}
+
+bool WriteThread::LinkOne(Writer* w, std::atomic<Writer*>* newest_writer) {
+ assert(newest_writer != nullptr);
+ assert(w->state == STATE_INIT);
+ Writer* writers = newest_writer->load(std::memory_order_relaxed);
+ while (true) {
+ // If a write stall is in effect and w->no_slowdown is not true,
+ // block here until the stall is cleared. If it is true, return
+ // immediately.
+ if (writers == &write_stall_dummy_) {
+ if (w->no_slowdown) {
+ w->status = Status::Incomplete("Write stall");
+ SetState(w, STATE_COMPLETED);
+ return false;
+ }
+ // Since no_slowdown is false, wait here to be notified of the write
+ // stall clearing
+ {
+ MutexLock lock(&stall_mu_);
+ writers = newest_writer->load(std::memory_order_relaxed);
+ if (writers == &write_stall_dummy_) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::WriteStall::Wait", w);
+ stall_cv_.Wait();
+ // Load newest_writers_ again since it may have changed
+ writers = newest_writer->load(std::memory_order_relaxed);
+ continue;
+ }
+ }
+ }
+ w->link_older = writers;
+ if (newest_writer->compare_exchange_weak(writers, w)) {
+ return (writers == nullptr);
+ }
+ }
+}
+
+bool WriteThread::LinkGroup(WriteGroup& write_group,
+ std::atomic<Writer*>* newest_writer) {
+ assert(newest_writer != nullptr);
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+ Writer* w = last_writer;
+ while (true) {
+ // Unset link_newer pointers to make sure that when we call
+ // CreateMissingNewerLinks later it creates all missing links.
+ w->link_newer = nullptr;
+ w->write_group = nullptr;
+ if (w == leader) {
+ break;
+ }
+ w = w->link_older;
+ }
+ Writer* newest = newest_writer->load(std::memory_order_relaxed);
+ while (true) {
+ leader->link_older = newest;
+ if (newest_writer->compare_exchange_weak(newest, last_writer)) {
+ return (newest == nullptr);
+ }
+ }
+}
+
+void WriteThread::CreateMissingNewerLinks(Writer* head) {
+ while (true) {
+ Writer* next = head->link_older;
+ if (next == nullptr || next->link_newer != nullptr) {
+ assert(next == nullptr || next->link_newer == head);
+ break;
+ }
+ next->link_newer = head;
+ head = next;
+ }
+}
+
+void WriteThread::CompleteLeader(WriteGroup& write_group) {
+ assert(write_group.size > 0);
+ Writer* leader = write_group.leader;
+ if (write_group.size == 1) {
+ write_group.leader = nullptr;
+ write_group.last_writer = nullptr;
+ } else {
+ assert(leader->link_newer != nullptr);
+ leader->link_newer->link_older = nullptr;
+ write_group.leader = leader->link_newer;
+ }
+ write_group.size -= 1;
+ SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::CompleteFollower(Writer* w, WriteGroup& write_group) {
+ assert(write_group.size > 1);
+ assert(w != write_group.leader);
+ if (w == write_group.last_writer) {
+ w->link_older->link_newer = nullptr;
+ write_group.last_writer = w->link_older;
+ } else {
+ w->link_older->link_newer = w->link_newer;
+ w->link_newer->link_older = w->link_older;
+ }
+ write_group.size -= 1;
+ SetState(w, STATE_COMPLETED);
+}
+
+void WriteThread::BeginWriteStall() {
+ LinkOne(&write_stall_dummy_, &newest_writer_);
+
+ // Walk writer list until w->write_group != nullptr. The current write group
+ // will not have a mix of slowdown/no_slowdown, so it's ok to stop at that
+ // point.
+ Writer* w = write_stall_dummy_.link_older;
+ Writer* prev = &write_stall_dummy_;
+ while (w != nullptr && w->write_group == nullptr) {
+ if (w->no_slowdown) {
+ prev->link_older = w->link_older;
+ w->status = Status::Incomplete("Write stall");
+ SetState(w, STATE_COMPLETED);
+ // Only update `link_newer` if it's already set.
+ // `CreateMissingNewerLinks()` will update the nullptr `link_newer` later,
+ // which assumes that the first non-nullptr `link_newer` is the last
+ // nullptr link in the writer list.
+ // If `link_newer` is set here, `CreateMissingNewerLinks()` may stop
+ // updating the whole list when it sees the first non-nullptr link.
+ if (prev->link_older && prev->link_older->link_newer) {
+ prev->link_older->link_newer = prev;
+ }
+ w = prev->link_older;
+ } else {
+ prev = w;
+ w = w->link_older;
+ }
+ }
+}
+
+void WriteThread::EndWriteStall() {
+ MutexLock lock(&stall_mu_);
+
+ // Unlink write_stall_dummy_ from the write queue. This will unblock
+ // pending write threads to enqueue themselves
+ assert(newest_writer_.load(std::memory_order_relaxed) == &write_stall_dummy_);
+ assert(write_stall_dummy_.link_older != nullptr);
+ write_stall_dummy_.link_older->link_newer = write_stall_dummy_.link_newer;
+ newest_writer_.exchange(write_stall_dummy_.link_older);
+
+ // Wake up writers
+ stall_cv_.SignalAll();
+}
+
+static WriteThread::AdaptationContext jbg_ctx("JoinBatchGroup");
+void WriteThread::JoinBatchGroup(Writer* w) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Start", w);
+ assert(w->batch != nullptr);
+
+ bool linked_as_leader = LinkOne(w, &newest_writer_);
+
+ if (linked_as_leader) {
+ SetState(w, STATE_GROUP_LEADER);
+ }
+
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w);
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait2", w);
+
+ if (!linked_as_leader) {
+ /**
+ * Wait until:
+ * 1) An existing leader picks us as the new leader when it finishes, or
+ * 2) An existing leader picks us as its follower and
+ * 2.1) finishes the memtable writes on our behalf, or
+ * 2.2) tells us to finish the memtable writes in parallel, or
+ * 3) (pipelined write) An existing leader picks us as its follower,
+ * finishes book-keeping and the WAL write for us, enqueues us as a pending
+ * memtable writer, and
+ * 3.1) we become the memtable writer group leader, or
+ * 3.2) an existing memtable writer group leader tells us to finish memtable
+ * writes in parallel.
+ */
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:BeganWaiting", w);
+ AwaitState(w,
+ STATE_GROUP_LEADER | STATE_MEMTABLE_WRITER_LEADER |
+ STATE_PARALLEL_MEMTABLE_WRITER | STATE_COMPLETED,
+ &jbg_ctx);
+ TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w);
+ }
+}
+
+size_t WriteThread::EnterAsBatchGroupLeader(Writer* leader,
+ WriteGroup* write_group) {
+ assert(leader->link_older == nullptr);
+ assert(leader->batch != nullptr);
+ assert(write_group != nullptr);
+
+ size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+ // Allow the group to grow up to a maximum size, but if the
+ // original write is small, limit the growth so we do not slow
+ // down the small write too much.
+ size_t max_size = max_write_batch_group_size_bytes;
+ const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+ if (size <= min_batch_size_bytes) {
+ max_size = size + min_batch_size_bytes;
+ }
+
+ leader->write_group = write_group;
+ write_group->leader = leader;
+ write_group->last_writer = leader;
+ write_group->size = 1;
+ Writer* newest_writer = newest_writer_.load(std::memory_order_acquire);
+
+ // This is safe regardless of any db mutex status of the caller. Previous
+ // calls to ExitAsBatchGroupLeader either didn't call CreateMissingNewerLinks
+ // (they emptied the list and then we added ourselves as leader) or had to
+ // explicitly wake us up (the list was non-empty when we added ourselves,
+ // so we have already received our MarkJoined).
+ CreateMissingNewerLinks(newest_writer);
+
+ // Tricky. Iteration start (leader) is exclusive and finish
+ // (newest_writer) is inclusive. Iteration goes from old to new.
+ Writer* w = leader;
+ while (w != newest_writer) {
+ assert(w->link_newer);
+ w = w->link_newer;
+
+ if (w->sync && !leader->sync) {
+ // Do not include a sync write into a batch handled by a non-sync write.
+ break;
+ }
+
+ if (w->no_slowdown != leader->no_slowdown) {
+ // Do not mix writes that are ok with delays with the ones that
+ // request fail on delays.
+ break;
+ }
+
+ if (w->disable_wal != leader->disable_wal) {
+ // Do not mix writes that enable the WAL with the ones whose
+ // WAL is disabled.
+ break;
+ }
+
+ if (w->protection_bytes_per_key != leader->protection_bytes_per_key) {
+ // Do not mix writes with different levels of integrity protection.
+ break;
+ }
+
+ if (w->rate_limiter_priority != leader->rate_limiter_priority) {
+ // Do not mix writes with different rate limiter priorities.
+ break;
+ }
+
+ if (w->batch == nullptr) {
+ // Do not include writes with a nullptr batch. Those are not writes;
+ // they are something else and want to be alone.
+ break;
+ }
+
+ if (w->callback != nullptr && !w->callback->AllowWriteBatching()) {
+ // don't batch writes that don't want to be batched
+ break;
+ }
+
+ auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+ if (size + batch_size > max_size) {
+ // Do not make batch too big
+ break;
+ }
+
+ w->write_group = write_group;
+ size += batch_size;
+ write_group->last_writer = w;
+ write_group->size++;
+ }
+ TEST_SYNC_POINT_CALLBACK("WriteThread::EnterAsBatchGroupLeader:End", w);
+ return size;
+}
+
+void WriteThread::EnterAsMemTableWriter(Writer* leader,
+ WriteGroup* write_group) {
+ assert(leader != nullptr);
+ assert(leader->link_older == nullptr);
+ assert(leader->batch != nullptr);
+ assert(write_group != nullptr);
+
+ size_t size = WriteBatchInternal::ByteSize(leader->batch);
+
+ // Allow the group to grow up to a maximum size, but if the
+ // original write is small, limit the growth so we do not slow
+ // down the small write too much.
+ size_t max_size = max_write_batch_group_size_bytes;
+ const uint64_t min_batch_size_bytes = max_write_batch_group_size_bytes / 8;
+ if (size <= min_batch_size_bytes) {
+ max_size = size + min_batch_size_bytes;
+ }
+
+ leader->write_group = write_group;
+ write_group->leader = leader;
+ write_group->size = 1;
+ Writer* last_writer = leader;
+
+ if (!allow_concurrent_memtable_write_ || !leader->batch->HasMerge()) {
+ Writer* newest_writer = newest_memtable_writer_.load();
+ CreateMissingNewerLinks(newest_writer);
+
+ Writer* w = leader;
+ while (w != newest_writer) {
+ assert(w->link_newer);
+ w = w->link_newer;
+
+ if (w->batch == nullptr) {
+ break;
+ }
+
+ if (w->batch->HasMerge()) {
+ break;
+ }
+
+ if (!allow_concurrent_memtable_write_) {
+ auto batch_size = WriteBatchInternal::ByteSize(w->batch);
+ if (size + batch_size > max_size) {
+ // Do not make batch too big
+ break;
+ }
+ size += batch_size;
+ }
+
+ w->write_group = write_group;
+ last_writer = w;
+ write_group->size++;
+ }
+ }
+
+ write_group->last_writer = last_writer;
+ write_group->last_sequence =
+ last_writer->sequence + WriteBatchInternal::Count(last_writer->batch) - 1;
+}
+
+void WriteThread::ExitAsMemTableWriter(Writer* /*self*/,
+ WriteGroup& write_group) {
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+
+ Writer* newest_writer = last_writer;
+ if (!newest_memtable_writer_.compare_exchange_strong(newest_writer,
+ nullptr)) {
+ CreateMissingNewerLinks(newest_writer);
+ Writer* next_leader = last_writer->link_newer;
+ assert(next_leader != nullptr);
+ next_leader->link_older = nullptr;
+ SetState(next_leader, STATE_MEMTABLE_WRITER_LEADER);
+ }
+ Writer* w = leader;
+ while (true) {
+ if (!write_group.status.ok()) {
+ w->status = write_group.status;
+ }
+ Writer* next = w->link_newer;
+ if (w != leader) {
+ SetState(w, STATE_COMPLETED);
+ }
+ if (w == last_writer) {
+ break;
+ }
+ assert(next);
+ w = next;
+ }
+ // Note that leader has to exit last, since it owns the write group.
+ SetState(leader, STATE_COMPLETED);
+}
+
+void WriteThread::LaunchParallelMemTableWriters(WriteGroup* write_group) {
+ assert(write_group != nullptr);
+ write_group->running.store(write_group->size);
+ for (auto w : *write_group) {
+ SetState(w, STATE_PARALLEL_MEMTABLE_WRITER);
+ }
+}
+
+static WriteThread::AdaptationContext cpmtw_ctx(
+ "CompleteParallelMemTableWriter");
+// This method is called by both the leader and parallel followers
+bool WriteThread::CompleteParallelMemTableWriter(Writer* w) {
+ auto* write_group = w->write_group;
+ if (!w->status.ok()) {
+ std::lock_guard<std::mutex> guard(write_group->leader->StateMutex());
+ write_group->status = w->status;
+ }
+
+ if (write_group->running-- > 1) {
+ // we're not the last one
+ AwaitState(w, STATE_COMPLETED, &cpmtw_ctx);
+ return false;
+ }
+ // else we're the last parallel worker and should perform exit duties.
+ w->status = write_group->status;
+ // Callers of this function must ensure w->status is checked.
+ write_group->status.PermitUncheckedError();
+ return true;
+}
+
+void WriteThread::ExitAsBatchGroupFollower(Writer* w) {
+ auto* write_group = w->write_group;
+
+ assert(w->state == STATE_PARALLEL_MEMTABLE_WRITER);
+ assert(write_group->status.ok());
+ ExitAsBatchGroupLeader(*write_group, write_group->status);
+ assert(w->status.ok());
+ assert(w->state == STATE_COMPLETED);
+ SetState(write_group->leader, STATE_COMPLETED);
+}
+
+static WriteThread::AdaptationContext eabgl_ctx("ExitAsBatchGroupLeader");
+void WriteThread::ExitAsBatchGroupLeader(WriteGroup& write_group,
+ Status& status) {
+ TEST_SYNC_POINT_CALLBACK("WriteThread::ExitAsBatchGroupLeader:Start",
+ &write_group);
+
+ Writer* leader = write_group.leader;
+ Writer* last_writer = write_group.last_writer;
+ assert(leader->link_older == nullptr);
+
+ // If status is non-ok already, then write_group.status won't have the chance
+ // of being propagated to caller.
+ if (!status.ok()) {
+ write_group.status.PermitUncheckedError();
+ }
+
+ // Propagate memtable write error to the whole group.
+ if (status.ok() && !write_group.status.ok()) {
+ status = write_group.status;
+ }
+
+ if (enable_pipelined_write_) {
+ // We insert a dummy Writer right before our current write_group. This
+ // allows us to unlink our write_group without the risk that a subsequent
+ // writer becomes a new leader and might overtake us and add itself to the
+ // memtable-writer-list before we can do so. This ensures that writers are
+ // added to the memtable-writer-list in the exact same order in which they
+ // were in the newest_writer list.
+ // This must happen before completing the writers from our group to prevent
+ // a race where the owning thread of one of these writers can start a new
+ // write operation.
+ Writer dummy;
+ Writer* head = newest_writer_.load(std::memory_order_acquire);
+ if (head != last_writer ||
+ !newest_writer_.compare_exchange_strong(head, &dummy)) {
+ // Either last_writer wasn't the head during the load(), or it was the
+ // head during the load() but somebody else pushed onto the list before
+ // we did the compare_exchange_strong (causing it to fail). In the latter
+ // case compare_exchange_strong has the effect of re-reading its first
+ // param (head). No need to retry a failing CAS, because only a departing
+ // leader (which we are at the moment) can remove nodes from the list.
+ assert(head != last_writer);
+
+ // After walking link_older starting from head (if not already done) we
+ // will be able to traverse w->link_newer below.
+ CreateMissingNewerLinks(head);
+ assert(last_writer->link_newer != nullptr);
+ last_writer->link_newer->link_older = &dummy;
+ dummy.link_newer = last_writer->link_newer;
+ }
+
+ // Complete writers that don't write to memtable
+ for (Writer* w = last_writer; w != leader;) {
+ Writer* next = w->link_older;
+ w->status = status;
+ if (!w->ShouldWriteToMemtable()) {
+ CompleteFollower(w, write_group);
+ }
+ w = next;
+ }
+ if (!leader->ShouldWriteToMemtable()) {
+ CompleteLeader(write_group);
+ }
+
+ TEST_SYNC_POINT_CALLBACK(
+ "WriteThread::ExitAsBatchGroupLeader:AfterCompleteWriters",
+ &write_group);
+
+ // Link the remainder of the group to the memtable writer list.
+ // We have to link our group to the memtable writer queue before waking up
+ // the next leader or setting newest_writer_ to null; otherwise the next
+ // leader can run ahead of us and link to the memtable writer queue before
+ // we do.
+ if (write_group.size > 0) {
+ if (LinkGroup(write_group, &newest_memtable_writer_)) {
+ // The leader can now be different from current writer.
+ SetState(write_group.leader, STATE_MEMTABLE_WRITER_LEADER);
+ }
+ }
+
+ // Unlink the dummy writer from the list and identify the new leader
+ head = newest_writer_.load(std::memory_order_acquire);
+ if (head != &dummy ||
+ !newest_writer_.compare_exchange_strong(head, nullptr)) {
+ CreateMissingNewerLinks(head);
+ Writer* new_leader = dummy.link_newer;
+ assert(new_leader != nullptr);
+ new_leader->link_older = nullptr;
+ SetState(new_leader, STATE_GROUP_LEADER);
+ }
+
+ AwaitState(leader,
+ STATE_MEMTABLE_WRITER_LEADER | STATE_PARALLEL_MEMTABLE_WRITER |
+ STATE_COMPLETED,
+ &eabgl_ctx);
+ } else {
+ Writer* head = newest_writer_.load(std::memory_order_acquire);
+ if (head != last_writer ||
+ !newest_writer_.compare_exchange_strong(head, nullptr)) {
+ // Either last_writer wasn't the head during the load(), or it was the
+ // head during the load() but somebody else pushed onto the list before
+ // we did the compare_exchange_strong (causing it to fail). In the
+ // latter case compare_exchange_strong has the effect of re-reading
+ // its first param (head). No need to retry a failing CAS, because
+ // only a departing leader (which we are at the moment) can remove
+ // nodes from the list.
+ assert(head != last_writer);
+
+ // After walking link_older starting from head (if not already done)
+ // we will be able to traverse w->link_newer below. This function can
+ // only be called from an active leader; only a leader can clear
+ // newest_writer_, and we didn't; and only a cleared newest_writer_
+ // could cause the next leader to start its work without a call to
+ // MarkJoined. So we can definitely conclude that no other leader
+ // work is going on here (with or without the db mutex).
+ CreateMissingNewerLinks(head);
+ assert(last_writer->link_newer != nullptr);
+ assert(last_writer->link_newer->link_older == last_writer);
+ last_writer->link_newer->link_older = nullptr;
+
+ // Next leader didn't self-identify, because newest_writer_ wasn't
+ // nullptr when they enqueued (we were definitely enqueued before them
+ // and are still in the list). That means leader handoff occurs when
+ // we call MarkJoined
+ SetState(last_writer->link_newer, STATE_GROUP_LEADER);
+ }
+ // else nobody else was waiting, although there might already be a new
+ // leader now
+
+ while (last_writer != leader) {
+ assert(last_writer);
+ last_writer->status = status;
+ // we need to read link_older before calling SetState, because as soon
+ // as it is marked committed the other thread's Await may return and
+ // deallocate the Writer.
+ auto next = last_writer->link_older;
+ SetState(last_writer, STATE_COMPLETED);
+
+ last_writer = next;
+ }
+ }
+}
+
+static WriteThread::AdaptationContext eu_ctx("EnterUnbatched");
+void WriteThread::EnterUnbatched(Writer* w, InstrumentedMutex* mu) {
+ assert(w != nullptr && w->batch == nullptr);
+ mu->Unlock();
+ bool linked_as_leader = LinkOne(w, &newest_writer_);
+ if (!linked_as_leader) {
+ TEST_SYNC_POINT("WriteThread::EnterUnbatched:Wait");
+ // Last leader will not pick us as a follower since our batch is nullptr
+ AwaitState(w, STATE_GROUP_LEADER, &eu_ctx);
+ }
+ if (enable_pipelined_write_) {
+ WaitForMemTableWriters();
+ }
+ mu->Lock();
+}
+
+void WriteThread::ExitUnbatched(Writer* w) {
+ assert(w != nullptr);
+ Writer* newest_writer = w;
+ if (!newest_writer_.compare_exchange_strong(newest_writer, nullptr)) {
+ CreateMissingNewerLinks(newest_writer);
+ Writer* next_leader = w->link_newer;
+ assert(next_leader != nullptr);
+ next_leader->link_older = nullptr;
+ SetState(next_leader, STATE_GROUP_LEADER);
+ }
+}
+
+static WriteThread::AdaptationContext wfmw_ctx("WaitForMemTableWriters");
+void WriteThread::WaitForMemTableWriters() {
+ assert(enable_pipelined_write_);
+ if (newest_memtable_writer_.load() == nullptr) {
+ return;
+ }
+ Writer w;
+ if (!LinkOne(&w, &newest_memtable_writer_)) {
+ AwaitState(&w, STATE_MEMTABLE_WRITER_LEADER, &wfmw_ctx);
+ }
+ newest_memtable_writer_.store(nullptr);
+}
+
+} // namespace ROCKSDB_NAMESPACE
diff --git a/src/rocksdb/db/write_thread.h b/src/rocksdb/db/write_thread.h
new file mode 100644
index 000000000..0ea51d922
--- /dev/null
+++ b/src/rocksdb/db/write_thread.h
@@ -0,0 +1,440 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <condition_variable>
+#include <cstdint>
+#include <mutex>
+#include <type_traits>
+#include <vector>
+
+#include "db/dbformat.h"
+#include "db/post_memtable_callback.h"
+#include "db/pre_release_callback.h"
+#include "db/write_callback.h"
+#include "monitoring/instrumented_mutex.h"
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "rocksdb/write_batch.h"
+#include "util/autovector.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class WriteThread {
+ public:
+ enum State : uint8_t {
+ // The initial state of a writer. This is a Writer that is
+ // waiting in JoinBatchGroup. This state can be left when another
+ // thread informs the waiter that it has become a group leader
+ // (-> STATE_GROUP_LEADER), when a leader that has chosen to be
+ // non-parallel informs a follower that its writes have been committed
+ // (-> STATE_COMPLETED), or when a leader that has chosen to perform
+ // updates in parallel and needs this Writer to apply its batch (->
+ // STATE_PARALLEL_MEMTABLE_WRITER).
+ STATE_INIT = 1,
+
+ // The state used to inform a waiting Writer that it has become the
+ // leader, and it should now build a write batch group. Tricky:
+ // this state is not used if newest_writer_ is empty when a writer
+ // enqueues itself, because there is no need to wait (or even to
+ // create the mutex and condvar used to wait) in that case. This is
+ // a terminal state unless the leader chooses to make this a parallel
+ // batch, in which case the last parallel worker to finish will move
+ // the leader to STATE_COMPLETED.
+ STATE_GROUP_LEADER = 2,
+
+ // The state used to inform a waiting writer that it has become the
+ // leader of memtable writer group. The leader will either write
+ // memtable for the whole group, or launch a parallel group write
+ // to memtable by calling LaunchParallelMemTableWriters.
+ STATE_MEMTABLE_WRITER_LEADER = 4,
+
+ // The state used to inform a waiting writer that it has become a
+ // parallel memtable writer. It can be the group leader who launched the
+ // parallel writer group, or one of the followers. The writer should then
+ // apply its batch to the memtable concurrently and call
+ // CompleteParallelMemTableWriter.
+ STATE_PARALLEL_MEMTABLE_WRITER = 8,
+
+ // A follower whose writes have been applied, or a parallel leader
+ // whose followers have all finished their work. This is a terminal
+ // state.
+ STATE_COMPLETED = 16,
+
+ // A state indicating that the thread may be waiting using StateMutex()
+ // and StateCondVar()
+ STATE_LOCKED_WAITING = 32,
+ };
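+ // Note: the State values are distinct bits so that AwaitState() can wait
+ // on a goal_mask that is the bitwise OR of several acceptable states.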
+
+ struct Writer;
+
+ struct WriteGroup {
+ Writer* leader = nullptr;
+ Writer* last_writer = nullptr;
+ SequenceNumber last_sequence;
+ // Before `running` reaches zero, `status` must be accessed while
+ // holding leader->StateMutex().
+ Status status;
+ std::atomic<size_t> running;
+ size_t size = 0;
+
+ struct Iterator {
+ Writer* writer;
+ Writer* last_writer;
+
+ explicit Iterator(Writer* w, Writer* last)
+ : writer(w), last_writer(last) {}
+
+ Writer* operator*() const { return writer; }
+
+ Iterator& operator++() {
+ assert(writer != nullptr);
+ if (writer == last_writer) {
+ writer = nullptr;
+ } else {
+ writer = writer->link_newer;
+ }
+ return *this;
+ }
+
+ bool operator!=(const Iterator& other) const {
+ return writer != other.writer;
+ }
+ };
+
+ Iterator begin() const { return Iterator(leader, last_writer); }
+ Iterator end() const { return Iterator(nullptr, nullptr); }
+ };
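+ // The Iterator above allows range-iteration over a group, e.g.
+ //   for (Writer* w : write_group) { /* ... */ }
+ // visiting the leader first and last_writer last.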
+
+ // Information kept for every waiting writer.
+ struct Writer {
+ WriteBatch* batch;
+ bool sync;
+ bool no_slowdown;
+ bool disable_wal;
+ Env::IOPriority rate_limiter_priority;
+ bool disable_memtable;
+ size_t batch_cnt; // if non-zero, number of sub-batches in the write batch
+ size_t protection_bytes_per_key;
+ PreReleaseCallback* pre_release_callback;
+ PostMemTableCallback* post_memtable_callback;
+ uint64_t log_used; // log number that this batch was inserted into
+ uint64_t log_ref; // log number that memtable insert should reference
+ WriteCallback* callback;
+ bool made_waitable; // records lazy construction of mutex and cv
+ std::atomic<uint8_t> state; // write under StateMutex() or pre-link
+ WriteGroup* write_group;
+ SequenceNumber sequence; // the sequence number to use for the first key
+ Status status;
+ Status callback_status; // status returned by callback->Callback()
+
+ std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
+ std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;
+ Writer* link_older; // read/write only before linking, or as leader
+ Writer* link_newer; // lazy, read/write only before linking, or as leader
+
+ Writer()
+ : batch(nullptr),
+ sync(false),
+ no_slowdown(false),
+ disable_wal(false),
+ rate_limiter_priority(Env::IOPriority::IO_TOTAL),
+ disable_memtable(false),
+ batch_cnt(0),
+ protection_bytes_per_key(0),
+ pre_release_callback(nullptr),
+ post_memtable_callback(nullptr),
+ log_used(0),
+ log_ref(0),
+ callback(nullptr),
+ made_waitable(false),
+ state(STATE_INIT),
+ write_group(nullptr),
+ sequence(kMaxSequenceNumber),
+ link_older(nullptr),
+ link_newer(nullptr) {}
+
+ Writer(const WriteOptions& write_options, WriteBatch* _batch,
+ WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
+ size_t _batch_cnt = 0,
+ PreReleaseCallback* _pre_release_callback = nullptr,
+ PostMemTableCallback* _post_memtable_callback = nullptr)
+ : batch(_batch),
+ sync(write_options.sync),
+ no_slowdown(write_options.no_slowdown),
+ disable_wal(write_options.disableWAL),
+ rate_limiter_priority(write_options.rate_limiter_priority),
+ disable_memtable(_disable_memtable),
+ batch_cnt(_batch_cnt),
+ protection_bytes_per_key(_batch->GetProtectionBytesPerKey()),
+ pre_release_callback(_pre_release_callback),
+ post_memtable_callback(_post_memtable_callback),
+ log_used(0),
+ log_ref(_log_ref),
+ callback(_callback),
+ made_waitable(false),
+ state(STATE_INIT),
+ write_group(nullptr),
+ sequence(kMaxSequenceNumber),
+ link_older(nullptr),
+ link_newer(nullptr) {}
+
+ ~Writer() {
+ if (made_waitable) {
+ StateMutex().~mutex();
+ StateCV().~condition_variable();
+ }
+ status.PermitUncheckedError();
+ callback_status.PermitUncheckedError();
+ }
+
+ bool CheckCallback(DB* db) {
+ if (callback != nullptr) {
+ callback_status = callback->Callback(db);
+ }
+ return callback_status.ok();
+ }
+
+ void CreateMutex() {
+ if (!made_waitable) {
+ // Note that made_waitable is tracked separately from state
+ // transitions, because we can't atomically create the mutex and
+ // link into the list.
+ made_waitable = true;
+ new (&state_mutex_bytes) std::mutex;
+ new (&state_cv_bytes) std::condition_variable;
+ }
+ }
+
+ // returns the aggregate status of this Writer
+ Status FinalStatus() {
+ if (!status.ok()) {
+ // a non-ok memtable write status takes precedence
+ assert(callback == nullptr || callback_status.ok());
+ return status;
+ } else if (!callback_status.ok()) {
+ // if the callback failed then that is the status we want
+ // because a memtable insert should not have been attempted
+ assert(callback != nullptr);
+ assert(status.ok());
+ return callback_status;
+ } else {
+ // if there is no callback then we only care about
+ // the memtable insert status
+ assert(callback == nullptr || callback_status.ok());
+ return status;
+ }
+ }
+
+ bool CallbackFailed() {
+ return (callback != nullptr) && !callback_status.ok();
+ }
+
+ bool ShouldWriteToMemtable() {
+ return status.ok() && !CallbackFailed() && !disable_memtable;
+ }
+
+ bool ShouldWriteToWAL() {
+ return status.ok() && !CallbackFailed() && !disable_wal;
+ }
+
+ // No other mutexes may be acquired while holding StateMutex(); it is
+ // always last in the lock order.
+ std::mutex& StateMutex() {
+ assert(made_waitable);
+ return *static_cast<std::mutex*>(static_cast<void*>(&state_mutex_bytes));
+ }
+
+ std::condition_variable& StateCV() {
+ assert(made_waitable);
+ return *static_cast<std::condition_variable*>(
+ static_cast<void*>(&state_cv_bytes));
+ }
+ };
+
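+ // Per-call-site statistics used by AwaitState() to decide between spinning
+ // (std::this_thread::yield) and blocking. `value` holds the accumulated
+ // yield credit; a non-negative credit means spinning has recently paid off
+ // at this call site.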
+ struct AdaptationContext {
+ const char* name;
+ std::atomic<int32_t> value;
+
+ explicit AdaptationContext(const char* name0) : name(name0), value(0) {}
+ };
+
+ explicit WriteThread(const ImmutableDBOptions& db_options);
+
+ virtual ~WriteThread() = default;
+
+ // IMPORTANT: None of the methods in this class rely on the db mutex
+ // for correctness. All of the methods except JoinBatchGroup and
+ // EnterUnbatched may be called either with or without the db mutex held.
+ // Correctness is maintained by ensuring that only a single thread is
+ // a leader at a time.
+
+ // Registers w as ready to become part of a batch group, waits until the
+ // caller should perform some work, and returns the current state of the
+ // writer. If w has become the leader of a write batch group, returns
+ // STATE_GROUP_LEADER. If w has been made part of a sequential batch
+ // group and the leader has performed the write, returns STATE_COMPLETED.
+ // If w has been made part of a parallel batch group and is responsible
+ // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER.
+ //
+ // The db mutex SHOULD NOT be held when calling this function, because
+ // it will block.
+ //
+ // Writer* w: Writer to be executed as part of a batch group
+ void JoinBatchGroup(Writer* w);
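+ //
+ // Illustrative (simplified) leader-side flow; not the actual DBImpl code,
+ // which also handles parallel memtable writes, pipelining, and error paths.
+ // The names write_thread, write_options, and my_batch are placeholders:
+ //
+ //   WriteThread::Writer w(write_options, my_batch, /*callback=*/nullptr,
+ //                         /*log_ref=*/0, /*disable_memtable=*/false);
+ //   write_thread.JoinBatchGroup(&w);
+ //   if (w.state == WriteThread::STATE_GROUP_LEADER) {
+ //     WriteThread::WriteGroup group;
+ //     size_t group_bytes = write_thread.EnterAsBatchGroupLeader(&w, &group);
+ //     Status s;  // ... write the WAL and memtables for `group` ...
+ //     write_thread.ExitAsBatchGroupLeader(group, s);
+ //   }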
+
+ // Constructs a write batch group led by leader, which should be a
+ // Writer passed to JoinBatchGroup on the current thread.
+ //
+ // Writer* leader: Writer that is STATE_GROUP_LEADER
+ // WriteGroup* write_group: Out-param of group members
+ // returns: Total batch group byte size
+ size_t EnterAsBatchGroupLeader(Writer* leader, WriteGroup* write_group);
+
+ // Unlinks the Writer-s in a batch group, wakes up the non-leaders,
+ // and wakes up the next leader (if any).
+ //
+ // WriteGroup* write_group: the write group
+ // Status status: Status of write operation
+ void ExitAsBatchGroupLeader(WriteGroup& write_group, Status& status);
+
+ // Exit batch group on behalf of batch group leader.
+ void ExitAsBatchGroupFollower(Writer* w);
+
+ // Constructs a write batch group led by leader from newest_memtable_writers_
+ // list. The leader should either write memtable for the whole group and
+ // call ExitAsMemTableWriter, or launch parallel memtable write through
+ // LaunchParallelMemTableWriters.
+ void EnterAsMemTableWriter(Writer* leader, WriteGroup* write_group);
+
+ // Memtable writer group leader, or the last finished writer in a parallel
+ // write group, exit from the newest_memtable_writers_ list, and wake up
+ // the next leader if needed.
+ void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group);
+
+ // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all of
+ // the non-leader members of this write batch group. Sets Writer::sequence
+ // before waking them up.
+ //
+ // WriteGroup* write_group: Extra state used to coordinate the parallel add
+ void LaunchParallelMemTableWriters(WriteGroup* write_group);
+
+ // Reports the completion of w's batch to the parallel group leader, and
+ // waits for the rest of the parallel batch to complete. Returns true
+ // if this thread is the last to complete, and hence should advance
+ // the sequence number and then exit the write group (e.g. via
+ // ExitAsBatchGroupFollower), false if someone else has already taken
+ // responsibility for that.
+ bool CompleteParallelMemTableWriter(Writer* w);
+
+ // Waits for all preceding writers (unlocking mu while waiting), then
+ // registers w as the currently proceeding writer.
+ //
+ // Writer* w: A Writer not eligible for batching
+ // InstrumentedMutex* mu: The db mutex, to unlock while waiting
+ // REQUIRES: db mutex held
+ void EnterUnbatched(Writer* w, InstrumentedMutex* mu);
+
+ // Completes a Writer begun with EnterUnbatched, unblocking subsequent
+ // writers.
+ void ExitUnbatched(Writer* w);
+
+ // Wait for all parallel memtable writers to finish, in case pipelined
+ // write is enabled.
+ void WaitForMemTableWriters();
+
+ SequenceNumber UpdateLastSequence(SequenceNumber sequence) {
+ if (sequence > last_sequence_) {
+ last_sequence_ = sequence;
+ }
+ return last_sequence_;
+ }
+
+ // Insert a dummy writer at the tail of the write queue to indicate a write
+ // stall, and fail any writers in the queue with no_slowdown set to true
+ void BeginWriteStall();
+
+ // Remove the dummy writer and wake up waiting writers
+ void EndWriteStall();
+
+ private:
+ // See AwaitState.
+ const uint64_t max_yield_usec_;
+ const uint64_t slow_yield_usec_;
+
+ // Allow multiple writers write to memtable concurrently.
+ const bool allow_concurrent_memtable_write_;
+
+ // Enable pipelined write to WAL and memtable.
+ const bool enable_pipelined_write_;
+
+ // The maximum number of bytes written in a single batch of WAL or
+ // memtable writes. It is enforced when the leader's write size is
+ // larger than 1/8 of this limit.
+ const uint64_t max_write_batch_group_size_bytes;
+
+ // Points to the newest pending writer. Only the leader can remove
+ // elements; adding can be done lock-free by anybody.
+ std::atomic<Writer*> newest_writer_;
+
+ // Points to the newest pending memtable writer. Used only when pipelined
+ // write is enabled.
+ std::atomic<Writer*> newest_memtable_writer_;
+
+ // The last sequence that has been consumed by a writer. The sequence
+ // is not necessarily visible to reads because the write can still be ongoing.
+ SequenceNumber last_sequence_;
+
+ // A dummy writer to indicate a write stall condition. This will be inserted
+ // at the tail of the writer queue by the leader, so newer writers can just
+ // check for this and bail
+ Writer write_stall_dummy_;
+
+ // Mutex and condvar for writers to block on a write stall. During a write
+ // stall, writers with no_slowdown set to false will wait on this rather
+ // than on the writer queue.
+ port::Mutex stall_mu_;
+ port::CondVar stall_cv_;
+
+ // Waits for w->state & goal_mask using w->StateMutex(). Returns
+ // the state that satisfies goal_mask.
+ uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);
+
+ // Blocks until w->state & goal_mask, returning the state value
+ // that satisfied the predicate. Uses ctx to adaptively use
+ // std::this_thread::yield() to avoid mutex overheads. ctx should be
+ // a context-dependent static.
+ uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx);
+
+ // Set writer state and wake the writer up if it is waiting.
+ void SetState(Writer* w, uint8_t new_state);
+
+ // Links w into the newest_writer list. Return true if w was linked directly
+ // into the leader position. Safe to call from multiple threads without
+ // external locking.
+ bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer);
+
+ // Link write group into the newest_writer list as a whole, while keeping the
+ // order of the writers unchanged. Return true if the group was linked
+ // directly into the leader position.
+ bool LinkGroup(WriteGroup& write_group, std::atomic<Writer*>* newest_writer);
+
+ // Computes any missing link_newer links. Should not be called
+ // concurrently with itself.
+ void CreateMissingNewerLinks(Writer* head);
+
+ // Set the leader in write_group to completed state and remove it from the
+ // write group.
+ void CompleteLeader(WriteGroup& write_group);
+
+ // Set a follower in write_group to completed state and remove it from the
+ // write group.
+ void CompleteFollower(Writer* w, WriteGroup& write_group);
+};
+
+} // namespace ROCKSDB_NAMESPACE